From a9e6d44ddeccd3522670e641f1ed9b068e746ff7 Mon Sep 17 00:00:00 2001 From: Sven Joachim Date: Fri, 26 Jan 2018 10:38:01 +0100 Subject: [PATCH 01/82] ssb: Do not disable PCI host on non-Mips After upgrading an old laptop to 4.15-rc9, I found that the eth0 and wlan0 interfaces had disappeared. It turns out that the b43 and b44 drivers require SSB_PCIHOST_POSSIBLE which depends on PCI_DRIVERS_LEGACY, a config option that only exists on Mips. Fixes: 58eae1416b80 ("ssb: Disable PCI host for PCI_DRIVERS_GENERIC") Cc: stable@vger.org Signed-off-by: Sven Joachim Reviewed-by: James Hogan Signed-off-by: Kalle Valo --- drivers/ssb/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig index 71c73766ee22..65af12c3bdb2 100644 --- a/drivers/ssb/Kconfig +++ b/drivers/ssb/Kconfig @@ -32,7 +32,7 @@ config SSB_BLOCKIO config SSB_PCIHOST_POSSIBLE bool - depends on SSB && (PCI = y || PCI = SSB) && PCI_DRIVERS_LEGACY + depends on SSB && (PCI = y || PCI = SSB) && (PCI_DRIVERS_LEGACY || !MIPS) default y config SSB_PCIHOST From d71ef28636e435079028c1ed255fa92d8ff6ed76 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 27 Jan 2018 16:02:03 +0100 Subject: [PATCH 02/82] mt76: implement AP_LINK_PS With software A-MPDU reordering in place, frames that notify mac80211 of powersave changes are reordered as well, which can cause connection stalls. Fix this by implementing powersave state processing in the driver. Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code") Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mac80211.c | 52 ++++++++++++++++++- drivers/net/wireless/mediatek/mt76/mt76.h | 10 ++++ drivers/net/wireless/mediatek/mt76/mt76x2.h | 2 + .../net/wireless/mediatek/mt76/mt76x2_init.c | 1 + .../net/wireless/mediatek/mt76/mt76x2_main.c | 28 +++++----- 5 files changed, 78 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index 5fcb2deb89a2..85f8d324ebf8 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -276,6 +276,7 @@ int mt76_register_device(struct mt76_dev *dev, bool vht, ieee80211_hw_set(hw, TX_AMSDU); ieee80211_hw_set(hw, TX_FRAG_LIST); ieee80211_hw_set(hw, MFP_CAPABLE); + ieee80211_hw_set(hw, AP_LINK_PS); wiphy->flags |= WIPHY_FLAG_IBSS_RSN; @@ -470,6 +471,53 @@ mt76_check_ccmp_pn(struct sk_buff *skb) return 0; } +static void +mt76_check_ps(struct mt76_dev *dev, struct sk_buff *skb) +{ + struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_sta *sta; + struct mt76_wcid *wcid = status->wcid; + bool ps; + + if (!wcid || !wcid->sta) + return; + + sta = container_of((void *) wcid, struct ieee80211_sta, drv_priv); + + if (!test_bit(MT_WCID_FLAG_CHECK_PS, &wcid->flags)) + return; + + if (ieee80211_is_pspoll(hdr->frame_control)) { + ieee80211_sta_pspoll(sta); + return; + } + + if (ieee80211_has_morefrags(hdr->frame_control) || + !(ieee80211_is_mgmt(hdr->frame_control) || + ieee80211_is_data(hdr->frame_control))) + return; + + ps = ieee80211_has_pm(hdr->frame_control); + + if (ps && (ieee80211_is_data_qos(hdr->frame_control) || + ieee80211_is_qos_nullfunc(hdr->frame_control))) + ieee80211_sta_uapsd_trigger(sta, status->tid); + + if (!!test_bit(MT_WCID_FLAG_PS, &wcid->flags) == ps) + return; + + if (ps) { + set_bit(MT_WCID_FLAG_PS, &wcid->flags); + mt76_stop_tx_queues(dev, sta, true); + } else { + clear_bit(MT_WCID_FLAG_PS, &wcid->flags); + } + + ieee80211_sta_ps_transition(sta, ps); + dev->drv->sta_ps(dev, sta, ps); +} + void mt76_rx_complete(struct mt76_dev *dev, struct sk_buff_head *frames, int queue) { @@ -498,8 +546,10 @@ void mt76_rx_poll_complete(struct mt76_dev *dev, enum mt76_rxq_id q) __skb_queue_head_init(&frames); - while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL) + while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL) { + mt76_check_ps(dev, skb); mt76_rx_aggr_reorder(skb, &frames); + } mt76_rx_complete(dev, &frames, q); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 129015c9d116..d2ce15093edd 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -121,11 +121,18 @@ struct mt76_queue_ops { void (*kick)(struct mt76_dev *dev, struct mt76_queue *q); }; +enum mt76_wcid_flags { + MT_WCID_FLAG_CHECK_PS, + MT_WCID_FLAG_PS, +}; + struct mt76_wcid { struct mt76_rx_tid __rcu *aggr[IEEE80211_NUM_TIDS]; struct work_struct aggr_work; + unsigned long flags; + u8 idx; u8 hw_key_idx; @@ -206,6 +213,9 @@ struct mt76_driver_ops { struct sk_buff *skb); void (*rx_poll_complete)(struct mt76_dev *dev, enum mt76_rxq_id q); + + void (*sta_ps)(struct mt76_dev *dev, struct ieee80211_sta *sta, + bool ps); }; struct mt76_channel_state { diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2.h b/drivers/net/wireless/mediatek/mt76/mt76x2.h index 17df17afd9bf..e62131b88102 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x2.h @@ -218,6 +218,8 @@ void mt76x2_rx_poll_complete(struct mt76_dev *mdev, enum mt76_rxq_id q); void mt76x2_queue_rx_skb(struct mt76_dev *mdev, enum mt76_rxq_id q, struct sk_buff *skb); +void mt76x2_sta_ps(struct mt76_dev *dev, struct ieee80211_sta *sta, bool ps); + void mt76x2_update_channel(struct mt76_dev *mdev); s8 mt76x2_tx_get_max_txpwr_adj(struct mt76x2_dev *dev, diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c index 1b00ae4465a2..9dbf94947324 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c @@ -630,6 +630,7 @@ struct mt76x2_dev *mt76x2_alloc_device(struct device *pdev) .tx_complete_skb = mt76x2_tx_complete_skb, .rx_skb = mt76x2_queue_rx_skb, .rx_poll_complete = mt76x2_rx_poll_complete, + .sta_ps = mt76x2_sta_ps, }; struct ieee80211_hw *hw; struct mt76x2_dev *dev; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c index bf26284b9989..205043b470b2 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c @@ -282,6 +282,9 @@ mt76x2_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif, for (i = 0; i < ARRAY_SIZE(sta->txq); i++) mt76x2_txq_init(dev, sta->txq[i]); + if (vif->type == NL80211_IFTYPE_AP) + set_bit(MT_WCID_FLAG_CHECK_PS, &msta->wcid.flags); + rcu_assign_pointer(dev->wcid[idx], &msta->wcid); out: @@ -311,23 +314,14 @@ mt76x2_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif, return 0; } -static void -mt76x2_sta_notify(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - enum sta_notify_cmd cmd, struct ieee80211_sta *sta) +void +mt76x2_sta_ps(struct mt76_dev *mdev, struct ieee80211_sta *sta, bool ps) { struct mt76x2_sta *msta = (struct mt76x2_sta *) sta->drv_priv; - struct mt76x2_dev *dev = hw->priv; + struct mt76x2_dev *dev = container_of(mdev, struct mt76x2_dev, mt76); int idx = msta->wcid.idx; - switch (cmd) { - case STA_NOTIFY_SLEEP: - mt76x2_mac_wcid_set_drop(dev, idx, true); - mt76_stop_tx_queues(&dev->mt76, sta, true); - break; - case STA_NOTIFY_AWAKE: - mt76x2_mac_wcid_set_drop(dev, idx, false); - break; - } + mt76x2_mac_wcid_set_drop(dev, idx, ps); } static int @@ -549,6 +543,12 @@ static void mt76x2_set_coverage_class(struct ieee80211_hw *hw, mutex_unlock(&dev->mutex); } +static int +mt76x2_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set) +{ + return 0; +} + const struct ieee80211_ops mt76x2_ops = { .tx = mt76x2_tx, .start = mt76x2_start, @@ -560,7 +560,6 @@ const struct ieee80211_ops mt76x2_ops = { .bss_info_changed = mt76x2_bss_info_changed, .sta_add = mt76x2_sta_add, .sta_remove = mt76x2_sta_remove, - .sta_notify = mt76x2_sta_notify, .set_key = mt76x2_set_key, .conf_tx = mt76x2_conf_tx, .sw_scan_start = mt76x2_sw_scan, @@ -573,5 +572,6 @@ const struct ieee80211_ops mt76x2_ops = { .release_buffered_frames = mt76_release_buffered_frames, .set_coverage_class = mt76x2_set_coverage_class, .get_survey = mt76_get_survey, + .set_tim = mt76x2_set_tim, }; From 17cf68b702a60aee61432d59098b1ba6ceab2f98 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 27 Jan 2018 16:02:04 +0100 Subject: [PATCH 03/82] mt76: implement processing of BlockAckReq frames Avoids timeouts on reordered A-MPDU rx frames Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code") Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/agg-rx.c | 34 ++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c index 8027bb7c03c2..e9784b50e2af 100644 --- a/drivers/net/wireless/mediatek/mt76/agg-rx.c +++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c @@ -113,6 +113,33 @@ mt76_rx_aggr_reorder_work(struct work_struct *work) local_bh_enable(); } +static void +mt76_rx_aggr_check_ctl(struct sk_buff *skb, struct sk_buff_head *frames) +{ + struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb; + struct ieee80211_bar *bar = (struct ieee80211_bar *) skb->data; + struct mt76_wcid *wcid = status->wcid; + struct mt76_rx_tid *tid; + u16 seqno; + + if (!ieee80211_is_ctl(bar->frame_control)) + return; + + if (!ieee80211_is_back_req(bar->frame_control)) + return; + + status->tid = le16_to_cpu(bar->control) >> 12; + seqno = le16_to_cpu(bar->start_seq_num) >> 4; + tid = rcu_dereference(wcid->aggr[status->tid]); + if (!tid) + return; + + spin_lock_bh(&tid->lock); + mt76_rx_aggr_release_frames(tid, frames, seqno); + mt76_rx_aggr_release_head(tid, frames); + spin_unlock_bh(&tid->lock); +} + void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames) { struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb; @@ -126,9 +153,14 @@ void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames) __skb_queue_tail(frames, skb); sta = wcid_to_sta(wcid); - if (!sta || !status->aggr) + if (!sta) return; + if (!status->aggr) { + mt76_rx_aggr_check_ctl(skb, frames); + return; + } + tid = rcu_dereference(wcid->aggr[status->tid]); if (!tid) return; From fb208dc73ff1667191ba26d87610bba983ea1535 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 27 Jan 2018 16:02:05 +0100 Subject: [PATCH 04/82] mt76: avoid re-queueing A-MPDU rx reorder work if no frames are pending Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code") Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/agg-rx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c index e9784b50e2af..fcb208d1f276 100644 --- a/drivers/net/wireless/mediatek/mt76/agg-rx.c +++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c @@ -98,6 +98,7 @@ mt76_rx_aggr_reorder_work(struct work_struct *work) reorder_work.work); struct mt76_dev *dev = tid->dev; struct sk_buff_head frames; + int nframes; __skb_queue_head_init(&frames); @@ -105,9 +106,12 @@ mt76_rx_aggr_reorder_work(struct work_struct *work) spin_lock(&tid->lock); mt76_rx_aggr_check_release(tid, &frames); + nframes = tid->nframes; spin_unlock(&tid->lock); - ieee80211_queue_delayed_work(tid->dev->hw, &tid->reorder_work, REORDER_TIMEOUT); + if (nframes) + ieee80211_queue_delayed_work(tid->dev->hw, &tid->reorder_work, + REORDER_TIMEOUT); mt76_rx_complete(dev, &frames, -1); local_bh_enable(); From cbbde7e8d98542dc1777ac42ec2a08875aea26ee Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 27 Jan 2018 16:02:06 +0100 Subject: [PATCH 05/82] mt76: do not set status->aggr for NULL data frames Avoids data connection stalls when the client toggles powersave mode Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code") Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt76x2_mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c index 6c30b5eaa9ca..7ea3d841918e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c @@ -341,7 +341,7 @@ int mt76x2_mac_process_rx(struct mt76x2_dev *dev, struct sk_buff *skb, mt76x2_remove_hdr_pad(skb, pad_len); - if (rxinfo & MT_RXINFO_BA) + if ((rxinfo & MT_RXINFO_BA) && !(rxinfo & MT_RXINFO_NULL)) status->aggr = true; if (WARN_ON_ONCE(len > skb->len)) From 0537250fdc6c876ed4cbbe874c739aebef493ee2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 30 Jan 2018 11:30:11 -0800 Subject: [PATCH 06/82] netfilter: x_tables: make allocation less aggressive syzbot has noticed that xt_alloc_table_info can allocate a lot of memory. This is an admin only interface but an admin in a namespace is sufficient as well. eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_table_info()") has changed the opencoded kmalloc->vmalloc fallback into kvmalloc. It has dropped __GFP_NORETRY on the way because vmalloc has simply never fully supported __GFP_NORETRY semantic. This is still the case because e.g. page tables backing the vmalloc area are hardcoded GFP_KERNEL. Revert back to __GFP_NORETRY as a poors man defence against excessively large allocation request here. We will not rule out the OOM killer completely but __GFP_NORETRY should at least stop the large request in most cases. [akpm@linux-foundation.org: coding-style fixes] Fixes: eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_tableLink: http://lkml.kernel.org/r/20180130140104.GE21609@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Florian Westphal Reviewed-by: Andrew Morton Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8fa4d37141a7..2f685ee1f9c8 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1008,7 +1008,12 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if ((size >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - info = kvmalloc(sz, GFP_KERNEL); + /* __GFP_NORETRY is not fully supported by kvmalloc but it should + * work reasonably well if sz is too large and bail out rather + * than shoot all processes down before realizing there is nothing + * more to reclaim. + */ + info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY); if (!info) return NULL; From ea23d5e3bf340e413b8e05c13da233c99c64142b Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 31 Jan 2018 04:50:01 -0700 Subject: [PATCH 07/82] netfilter: ipv6: nf_defrag: Kill frag queue on RFC2460 failure Failures were seen in ICMPv6 fragmentation timeout tests if they were run after the RFC2460 failure tests. Kernel was not sending out the ICMPv6 fragment reassembly time exceeded packet after the fragmentation reassembly timeout of 1 minute had elapsed. This happened because the frag queue was not released if an error in IPv6 fragmentation header was detected by RFC2460. Fixes: 83f1999caeb1 ("netfilter: ipv6: nf_defrag: Pass on packets to stack per RFC2460") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index ce53dcfda88a..b84ce3e6d728 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -264,6 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); + inet_frag_kill(&fq->q, &nf_frags); return -EPROTO; } if (end > fq->q.len) { From 6be3bcd75afb673a37a82e18ba46d50430f172c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 31 Jan 2018 18:13:39 +0100 Subject: [PATCH 08/82] netfilter: flowtable infrastructure depends on NETFILTER_INGRESS config NF_FLOW_TABLE depends on NETFILTER_INGRESS. If users forget to enable this toggle, flowtable registration fails with EOPNOTSUPP. Moreover, turn 'select NF_FLOW_TABLE' in every flowtable family flavour into dependency instead, otherwise this new dependency on NETFILTER_INGRESS causes a warning. This also allows us to remove the explicit dependency between family flowtables <-> NF_TABLES and NF_CONNTRACK, given they depend on the NF_FLOW_TABLE core that already expresses the general dependencies for this new infrastructure. Moreover, NF_FLOW_TABLE_INET depends on NF_FLOW_TABLE_IPV4 and NF_FLOWTABLE_IPV6, which already depends on NF_FLOW_TABLE. So we can get rid of direct dependency with NF_FLOW_TABLE. In general, let's avoid 'select', it just makes things more complicated. Reported-by: John Crispin Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 3 +-- net/ipv6/netfilter/Kconfig | 3 +-- net/netfilter/Kconfig | 8 +++++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 5f52236780b4..dfe6fa4ea554 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -80,8 +80,7 @@ endif # NF_TABLES config NF_FLOW_TABLE_IPV4 tristate "Netfilter flow table IPv4 module" - depends on NF_CONNTRACK && NF_TABLES - select NF_FLOW_TABLE + depends on NF_FLOW_TABLE help This option adds the flow table IPv4 support. diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 4a634b7a2c80..d395d1590699 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -73,8 +73,7 @@ endif # NF_TABLES config NF_FLOW_TABLE_IPV6 tristate "Netfilter flow table IPv6 module" - depends on NF_CONNTRACK && NF_TABLES - select NF_FLOW_TABLE + depends on NF_FLOW_TABLE help This option adds the flow table IPv6 support. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9019fa98003d..d3220b43c832 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -666,8 +666,8 @@ endif # NF_TABLES config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" - depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6 - select NF_FLOW_TABLE + depends on NF_FLOW_TABLE_IPV4 + depends on NF_FLOW_TABLE_IPV6 help This option adds the flow table mixed IPv4/IPv6 support. @@ -675,7 +675,9 @@ config NF_FLOW_TABLE_INET config NF_FLOW_TABLE tristate "Netfilter flow table module" - depends on NF_CONNTRACK && NF_TABLES + depends on NETFILTER_INGRESS + depends on NF_CONNTRACK + depends on NF_TABLES help This option adds the flow table core infrastructure. From ba7cd5d95f25cc6005f687dabdb4e7a6063adda9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 31 Jan 2018 15:02:47 -0800 Subject: [PATCH 09/82] netfilter: xt_cgroup: initialize info->priv in cgroup_mt_check_v1() xt_cgroup_info_v1->priv is an internal pointer only used for kernel, we should not trust what user-space provides. Reported-by: Fixes: c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") Cc: Pablo Neira Ayuso Signed-off-by: Cong Wang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 1db1ce59079f..891f4e7e8ea7 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -52,6 +52,7 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { From c7f0030b5b67866c588845abde7bf011de25b98a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 1 Feb 2018 18:49:00 +0100 Subject: [PATCH 10/82] netfilter: nft_flow_offload: wait for garbage collector to run after cleanup If netdevice goes down, then flowtable entries are scheduled to be removed. Wait for garbage collector to have a chance to run so it can delete them from the hashtable. The flush call might sleep, so hold the nfnl mutex from nft_flow_table_iterate() instead of rcu read side lock. The use of the nfnl mutex is also implicitly fixing races between updates via nfnetlink and netdevice event. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++---- net/netfilter/nft_flow_offload.c | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0791813a1e7d..07dd1fac78a8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5006,13 +5006,13 @@ void nft_flow_table_iterate(struct net *net, struct nft_flowtable *flowtable; const struct nft_table *table; - rcu_read_lock(); - list_for_each_entry_rcu(table, &net->nft.tables, list) { - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { iter(&flowtable->data, data); } } - rcu_read_unlock(); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_flow_table_iterate); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 4503b8dcf9c0..1739ff8ca21f 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -208,6 +208,7 @@ static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable, void *data) { nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data); + flush_delayed_work(&flowtable->gc_work); } static int flow_offload_netdev_event(struct notifier_block *this, From 992cfc7c5d105094da7c21c9c74d97ac26bb1e56 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 1 Feb 2018 18:49:01 +0100 Subject: [PATCH 11/82] netfilter: nft_flow_offload: no need to flush entries on module removal nft_flow_offload module removal does not require to flush existing flowtables, it is valid to remove this module while keeping flowtables around. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 1739ff8ca21f..e5c45c7ac02a 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -247,14 +247,8 @@ register_expr: static void __exit nft_flow_offload_module_exit(void) { - struct net *net; - nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); - rtnl_lock(); - for_each_net(net) - nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL); - rtnl_unlock(); } module_init(nft_flow_offload_module_init); From 09584b406742413ac4c8d7e030374d4daa045b69 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 2 Feb 2018 22:37:15 -0800 Subject: [PATCH 12/82] bpf: fix selftests/bpf test_kmod.sh failure when CONFIG_BPF_JIT_ALWAYS_ON=y With CONFIG_BPF_JIT_ALWAYS_ON is defined in the config file, tools/testing/selftests/bpf/test_kmod.sh failed like below: [root@localhost bpf]# ./test_kmod.sh sysctl: setting key "net.core.bpf_jit_enable": Invalid argument [ JIT enabled:0 hardened:0 ] [ 132.175681] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096 [ 132.458834] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed] [ JIT enabled:1 hardened:0 ] [ 133.456025] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096 [ 133.730935] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed] [ JIT enabled:1 hardened:1 ] [ 134.769730] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096 [ 135.050864] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed] [ JIT enabled:1 hardened:2 ] [ 136.442882] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096 [ 136.821810] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed] [root@localhost bpf]# The test_kmod.sh load/remove test_bpf.ko multiple times with different settings for sysctl net.core.bpf_jit_{enable,harden}. The failed test #297 of test_bpf.ko is designed such that JIT always fails. Commit 290af86629b2 (bpf: introduce BPF_JIT_ALWAYS_ON config) introduced the following tightening logic: ... if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); #ifdef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { *err = -ENOTSUPP; return fp; } #endif ... With this logic, Test #297 always gets return value -ENOTSUPP when CONFIG_BPF_JIT_ALWAYS_ON is defined, causing the test failure. This patch fixed the failure by marking Test #297 as expected failure when CONFIG_BPF_JIT_ALWAYS_ON is defined. Fixes: 290af86629b2 (bpf: introduce BPF_JIT_ALWAYS_ON config) Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- lib/test_bpf.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 4cd9ea9b3449..b4e22345963f 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -83,6 +83,7 @@ struct bpf_test { __u32 result; } test[MAX_SUBTESTS]; int (*fill_helper)(struct bpf_test *self); + int expected_errcode; /* used when FLAG_EXPECTED_FAIL is set in the aux */ __u8 frag_data[MAX_DATA]; int stack_depth; /* for eBPF only, since tests don't call verifier */ }; @@ -2026,7 +2027,9 @@ static struct bpf_test tests[] = { }, CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, { }, - { } + { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { "check: div_k_0", @@ -2036,7 +2039,9 @@ static struct bpf_test tests[] = { }, CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, { }, - { } + { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { "check: unknown insn", @@ -2047,7 +2052,9 @@ static struct bpf_test tests[] = { }, CLASSIC | FLAG_EXPECTED_FAIL, { }, - { } + { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { "check: out of range spill/fill", @@ -2057,7 +2064,9 @@ static struct bpf_test tests[] = { }, CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, { }, - { } + { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { "JUMPS + HOLES", @@ -2149,6 +2158,8 @@ static struct bpf_test tests[] = { CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, { }, { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { "check: LDX + RET X", @@ -2159,6 +2170,8 @@ static struct bpf_test tests[] = { CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, { }, { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { /* Mainly checking JIT here. */ "M[]: alt STX + LDX", @@ -2333,6 +2346,8 @@ static struct bpf_test tests[] = { CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, { }, { }, + .fill_helper = NULL, + .expected_errcode = -EINVAL, }, { /* Passes checker but fails during runtime. */ "LD [SKF_AD_OFF-1]", @@ -5395,6 +5410,7 @@ static struct bpf_test tests[] = { { }, { }, .fill_helper = bpf_fill_maxinsns4, + .expected_errcode = -EINVAL, }, { /* Mainly checking JIT here. */ "BPF_MAXINSNS: Very long jump", @@ -5450,10 +5466,15 @@ static struct bpf_test tests[] = { { "BPF_MAXINSNS: Jump, gap, jump, ...", { }, +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL, +#else CLASSIC | FLAG_NO_DATA, +#endif { }, { { 0, 0xababcbac } }, .fill_helper = bpf_fill_maxinsns11, + .expected_errcode = -ENOTSUPP, }, { "BPF_MAXINSNS: ld_abs+get_processor_id", @@ -6344,7 +6365,7 @@ static struct bpf_prog *generate_filter(int which, int *err) *err = bpf_prog_create(&fp, &fprog); if (tests[which].aux & FLAG_EXPECTED_FAIL) { - if (*err == -EINVAL) { + if (*err == tests[which].expected_errcode) { pr_cont("PASS\n"); /* Verifier rejected filter as expected. */ *err = 0; From 7b4eb53d95c30bdc55c44647c6968f24227fd1ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 5 Feb 2018 16:25:24 -0800 Subject: [PATCH 13/82] tools/bpf: fix batch-mode test failure of test_xdp_redirect.sh The tests at tools/testing/selftests/bpf can run in patch mode, e.g., make -C tools/testing/selftests/bpf run_tests With the batch mode, I experimented intermittent test failure of test_xdp_redirect.sh. .... selftests: test_xdp_redirect [PASS] selftests: test_xdp_redirect.sh [PASS] RTNETLINK answers: File exists selftests: test_xdp_meta [FAILED] selftests: test_xdp_meta.sh [FAIL] .... The following illustrates what caused the failure: (1). test_xdp_redirect creates veth pairs (veth1,veth11) and (veth2,veth22), and assign veth11 and veth22 to namespace ns1 and ns2 respectively. (2). at the end of test_xdp_redirect test, ns1 and ns2 are deleted. During this process, the deletion of actual namespace resources, including deletion of veth1{1} and veth2{2}, is put into a workqueue to be processed asynchronously. (3). test_xdp_meta tries to create veth pair (veth1, veth2). The previous veth deletions in step (2) have not finished yet, and veth1 or veth2 may be still valid in the kernel, thus causing the failure. The fix is to explicitly delete the veth pair before test_xdp_redirect exits. Only one end of veth needs deletion as the kernel will delete the other end automatically. Also test_xdp_meta is also fixed in similar manner to avoid future potential issues. Fixes: 996139e801fd ("selftests: bpf: add a test for XDP redirect") Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_xdp_meta.sh | 1 + tools/testing/selftests/bpf/test_xdp_redirect.sh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh index 307aa856cee3..637fcf4fe4e3 100755 --- a/tools/testing/selftests/bpf/test_xdp_meta.sh +++ b/tools/testing/selftests/bpf/test_xdp_meta.sh @@ -9,6 +9,7 @@ cleanup() fi set +e + ip link del veth1 2> /dev/null ip netns del ns1 2> /dev/null ip netns del ns2 2> /dev/null } diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh index 344a3656dea6..c4b17e08d431 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh @@ -19,6 +19,8 @@ cleanup() fi set +e + ip link del veth1 2> /dev/null + ip link del veth2 2> /dev/null ip netns del ns1 2> /dev/null ip netns del ns2 2> /dev/null } From b11a632c442eef34a0afeba61fab923241f317e9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 5 Feb 2018 10:17:43 -0800 Subject: [PATCH 14/82] net: add a UID to use for ULP socket assignment Create a UID field and enum that can be used to assign ULPs to sockets. This saves a set of string comparisons if the ULP id is known. For sockmap, which is added in the next patches, a ULP is used to hook into TCP sockets close state. In this case the ULP being added is done at map insert time and the ULP is known and done on the kernel side. In this case the named lookup is not needed. Because we don't want to expose psock internals to user space socket options a user visible flag is also added. For TLS this is set for BPF it will be cleared. Alos remove pr_notice, user gets an error code back and should check that rather than rely on logs. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 6 +++++ net/ipv4/tcp_ulp.c | 59 ++++++++++++++++++++++++++++++++++++++++++---- net/tls/tls_main.c | 2 ++ 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 58278669cc55..a58292d31e12 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1983,6 +1983,10 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) +enum { + TCP_ULP_TLS, +}; + struct tcp_ulp_ops { struct list_head list; @@ -1991,7 +1995,9 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); + int uid; char name[TCP_ULP_NAME_MAX]; + bool user_visible; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 6bb9e14c710a..622caa4039e0 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -29,6 +29,18 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) return NULL; } +static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) +{ + struct tcp_ulp_ops *e; + + list_for_each_entry_rcu(e, &tcp_ulp_list, list) { + if (e->uid == ulp) + return e; + } + + return NULL; +} + static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; @@ -51,6 +63,18 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) return ulp; } +static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) +{ + const struct tcp_ulp_ops *ulp; + + rcu_read_lock(); + ulp = tcp_ulp_find_id(uid); + if (!ulp || !try_module_get(ulp->owner)) + ulp = NULL; + rcu_read_unlock(); + return ulp; +} + /* Attach new upper layer protocol to the list * of available protocols. */ @@ -59,13 +83,10 @@ int tcp_register_ulp(struct tcp_ulp_ops *ulp) int ret = 0; spin_lock(&tcp_ulp_list_lock); - if (tcp_ulp_find(ulp->name)) { - pr_notice("%s already registered or non-unique name\n", - ulp->name); + if (tcp_ulp_find(ulp->name)) ret = -EEXIST; - } else { + else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); - } spin_unlock(&tcp_ulp_list_lock); return ret; @@ -124,6 +145,34 @@ int tcp_set_ulp(struct sock *sk, const char *name) if (!ulp_ops) return -ENOENT; + if (!ulp_ops->user_visible) { + module_put(ulp_ops->owner); + return -ENOENT; + } + + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } + + icsk->icsk_ulp_ops = ulp_ops; + return 0; +} + +int tcp_set_ulp_id(struct sock *sk, int ulp) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_ulp_ops *ulp_ops; + int err; + + if (icsk->icsk_ulp_ops) + return -EEXIST; + + ulp_ops = __tcp_ulp_lookup(ulp); + if (!ulp_ops) + return -ENOENT; + err = ulp_ops->init(sk); if (err) { module_put(ulp_ops->owner); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 736719c8314e..b0d5fcea47e7 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -484,6 +484,8 @@ out: static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", + .uid = TCP_ULP_TLS, + .user_visible = true, .owner = THIS_MODULE, .init = tls_init, }; From 1aa12bdf1bfb95db7e75bfecf0e39a65f4e8fbf8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 5 Feb 2018 10:17:49 -0800 Subject: [PATCH 15/82] bpf: sockmap, add sock close() hook to remove socks The selftests test_maps program was leaving dangling BPF sockmap programs around because not all psock elements were removed from the map. The elements in turn hold a reference on the BPF program they are attached to causing BPF programs to stay open even after test_maps has completed. The original intent was that sk_state_change() would be called when TCP socks went through TCP_CLOSE state. However, because socks may be in SOCK_DEAD state or the sock may be a listening socket the event is not always triggered. To resolve this use the ULP infrastructure and register our own proto close() handler. This fixes the above case. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Prashant Bhole Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 2 + kernel/bpf/sockmap.c | 168 ++++++++++++++++++++++++++----------------- 2 files changed, 103 insertions(+), 67 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a58292d31e12..e3fc667f9ac2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1985,6 +1985,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); enum { TCP_ULP_TLS, + TCP_ULP_BPF, }; struct tcp_ulp_ops { @@ -2003,6 +2004,7 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); +int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0314d1783d77..bd4a6d9c6709 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -86,9 +86,10 @@ struct smap_psock { struct work_struct tx_work; struct work_struct gc_work; + struct proto *sk_proto; + void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); - void (*save_state_change)(struct sock *sk); }; static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static struct proto tcp_bpf_proto; +static int bpf_tcp_init(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return -EINVAL; + } + + if (unlikely(psock->sk_proto)) { + rcu_read_unlock(); + return -EBUSY; + } + + psock->save_close = sk->sk_prot->close; + psock->sk_proto = sk->sk_prot; + sk->sk_prot = &tcp_bpf_proto; + rcu_read_unlock(); + return 0; +} + +static void bpf_tcp_release(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + + if (likely(psock)) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } + rcu_read_unlock(); +} + +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock_map_entry *e, *tmp; + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return sk->sk_prot->close(sk, timeout); + } + + /* The psock may be destroyed anytime after exiting the RCU critial + * section so by the time we use close_fun the psock may no longer + * be valid. However, bpf_tcp_close is called with the sock lock + * held so the close hook and sk are still valid. + */ + close_fun = psock->save_close; + + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + rcu_read_unlock(); + close_fun(sk, timeout); +} + enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, }; +static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { + .name = "bpf_tcp", + .uid = TCP_ULP_BPF, + .user_visible = false, + .owner = NULL, + .init = bpf_tcp_init, + .release = bpf_tcp_release, +}; + +static int bpf_tcp_ulp_register(void) +{ + tcp_bpf_proto = tcp_prot; + tcp_bpf_proto.close = bpf_tcp_close; + return tcp_register_ulp(&bpf_tcp_ulp_ops); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - -/* Called with lock_sock(sk) held */ -static void smap_state_change(struct sock *sk) -{ - struct smap_psock_map_entry *e, *tmp; - struct smap_psock *psock; - struct socket_wq *wq; - struct sock *osk; - - rcu_read_lock(); - - /* Allowing transitions into an established syn_recv states allows - * for early binding sockets to a smap object before the connection - * is established. - */ - switch (sk->sk_state) { - case TCP_SYN_SENT: - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - break; - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - case TCP_LAST_ACK: - case TCP_FIN_WAIT1: - case TCP_FIN_WAIT2: - case TCP_LISTEN: - break; - case TCP_CLOSE: - /* Only release if the map entry is in fact the sock in - * question. There is a case where the operator deletes - * the sock from the map, but the TCP sock is closed before - * the psock is detached. Use cmpxchg to verify correct - * sock is removed. - */ - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - write_lock_bh(&sk->sk_callback_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); - } - } - write_unlock_bh(&sk->sk_callback_lock); - break; - default: - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - smap_report_sk_error(psock, EPIPE); - break; - } - - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_all(&wq->wait); - rcu_read_unlock(); -} - static void smap_read_sock_strparser(struct strparser *strp, struct sk_buff *skb) { @@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; - sk->sk_state_change = psock->save_state_change; psock->save_data_ready = NULL; psock->save_write_space = NULL; - psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; } @@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) if (psock->refcnt) return; + tcp_cleanup_ulp(sock); smap_stop_sock(psock, sock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); @@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk) return; psock->save_data_ready = sk->sk_data_ready; psock->save_write_space = sk->sk_write_space; - psock->save_state_change = sk->sk_state_change; sk->sk_data_ready = smap_data_ready; sk->sk_write_space = smap_write_space; - sk->sk_state_change = smap_state_change; psock->strp_enabled = true; } @@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (attr->value_size > KMALLOC_MAX_SIZE) return ERR_PTR(-E2BIG); + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + stab = kzalloc(sizeof(*stab), GFP_USER); if (!stab) return ERR_PTR(-ENOMEM); @@ -754,6 +784,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_progs; + set_bit(SMAP_TX_RUNNING, &psock->state); } From 3d9e952697de89b53227f06d4241f275eb99cfc4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 5 Feb 2018 10:17:54 -0800 Subject: [PATCH 16/82] bpf: sockmap, fix leaking maps with attached but not detached progs When a program is attached to a map we increment the program refcnt to ensure that the program is not removed while it is potentially being referenced from sockmap side. However, if this same program also references the map (this is a reasonably common pattern in my programs) then the verifier will also increment the maps refcnt from the verifier. This is to ensure the map doesn't get garbage collected while the program has a reference to it. So we are left in a state where the map holds the refcnt on the program stopping it from being removed and releasing the map refcnt. And vice versa the program holds a refcnt on the map stopping it from releasing the refcnt on the prog. All this is fine as long as users detach the program while the map fd is still around. But, if the user omits this detach command we are left with a dangling map we can no longer release. To resolve this when the map fd is released decrement the program references and remove any reference from the map to the program. This fixes the issue with possibly dangling map and creates a user side API constraint. That is, the map fd must be held open for programs to be attached to a map. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index bd4a6d9c6709..48c33417d13c 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -620,11 +620,6 @@ static void sock_map_free(struct bpf_map *map) } rcu_read_unlock(); - if (stab->bpf_verdict) - bpf_prog_put(stab->bpf_verdict); - if (stab->bpf_parse) - bpf_prog_put(stab->bpf_parse); - sock_map_remove_complete(stab); } @@ -900,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } +static void sock_map_release(struct bpf_map *map, struct file *map_file) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + orig = xchg(&stab->bpf_parse, NULL); + if (orig) + bpf_prog_put(orig); + orig = xchg(&stab->bpf_verdict, NULL); + if (orig) + bpf_prog_put(orig); +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -907,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, + .map_release = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, From 7dc68e98757a8eccf8ca7a53a29b896f1eef1f76 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 5 Feb 2018 14:41:45 -0800 Subject: [PATCH 17/82] netfilter: xt_RATEEST: acquire xt_rateest_mutex for hash insert rateest_hash is supposed to be protected by xt_rateest_mutex, and, as suggested by Eric, lookup and insert should be atomic, so we should acquire the xt_rateest_mutex once for both. So introduce a non-locking helper for internal use and keep the locking one for external. Reported-by: Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target") Signed-off-by: Cong Wang Reviewed-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_RATEEST.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 498b54fd04d7..141c295191f6 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -39,23 +39,31 @@ static void xt_rateest_hash_insert(struct xt_rateest *est) hlist_add_head(&est->list, &rateest_hash[h]); } -struct xt_rateest *xt_rateest_lookup(const char *name) +static struct xt_rateest *__xt_rateest_lookup(const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); - mutex_lock(&xt_rateest_mutex); hlist_for_each_entry(est, &rateest_hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; - mutex_unlock(&xt_rateest_mutex); return est; } } - mutex_unlock(&xt_rateest_mutex); + return NULL; } + +struct xt_rateest *xt_rateest_lookup(const char *name) +{ + struct xt_rateest *est; + + mutex_lock(&xt_rateest_mutex); + est = __xt_rateest_lookup(name); + mutex_unlock(&xt_rateest_mutex); + return est; +} EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct xt_rateest *est) @@ -100,8 +108,10 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); - est = xt_rateest_lookup(info->name); + mutex_lock(&xt_rateest_mutex); + est = __xt_rateest_lookup(info->name); if (est) { + mutex_unlock(&xt_rateest_mutex); /* * If estimator parameters are specified, they must match the * existing estimator. @@ -139,11 +149,13 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) info->est = est; xt_rateest_hash_insert(est); + mutex_unlock(&xt_rateest_mutex); return 0; err2: kfree(est); err1: + mutex_unlock(&xt_rateest_mutex); return ret; } From c0ea1bcb39352b57ac5c4b6da8acd65bddeee2c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Feb 2018 13:22:44 +0100 Subject: [PATCH 18/82] netfilter: nft_flow_offload: move flowtable cleanup routines to nf_flow_table Move the flowtable cleanup routines to nf_flow_table and expose the nf_flow_table_cleanup() helper function. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 3 +++ net/netfilter/nf_flow_table.c | 24 ++++++++++++++++++++++++ net/netfilter/nft_flow_offload.c | 19 +------------------ 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b22b22082733..ed49cd169ecf 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -95,6 +95,9 @@ struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_t int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct flow_offload *flow, void *data), void *data); + +void nf_flow_table_cleanup(struct net *net, struct net_device *dev); + void nf_flow_offload_work_gc(struct work_struct *work); extern const struct rhashtable_params nf_flow_offload_rhash_params; diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c index 2f5099cb85b8..04c08f6b9015 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -425,5 +426,28 @@ int nf_flow_dnat_port(const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); +static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) +{ + struct net_device *dev = data; + + if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + return; + + flow_offload_dead(flow); +} + +static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, + void *data) +{ + nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data); + flush_delayed_work(&flowtable->gc_work); +} + +void nf_flow_table_cleanup(struct net *net, struct net_device *dev) +{ + nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev); +} +EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index e5c45c7ac02a..b65829b2be22 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -194,23 +194,6 @@ static struct nft_expr_type nft_flow_offload_type __read_mostly = { .owner = THIS_MODULE, }; -static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data) -{ - struct net_device *dev = data; - - if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) - return; - - flow_offload_dead(flow); -} - -static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable, - void *data) -{ - nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data); - flush_delayed_work(&flowtable->gc_work); -} - static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -219,7 +202,7 @@ static int flow_offload_netdev_event(struct notifier_block *this, if (event != NETDEV_DOWN) return NOTIFY_DONE; - nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev); + nf_flow_table_cleanup(dev_net(dev), dev); return NOTIFY_DONE; } From b408c5b04f82fe4e20bceb8e4f219453d4f21f02 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Feb 2018 13:22:47 +0100 Subject: [PATCH 19/82] netfilter: nf_tables: fix flowtable free Every flow_offload entry is added into the table twice. Because of this, rhashtable_free_and_destroy can't be used, since it would call kfree for each flow_offload object twice. This patch cleans up the flowtable via nf_flow_table_iterate() to schedule removal of entries by setting on the dying bit, then there is an explicitly invocation of the garbage collector to release resources. Based on patch from Felix Fietkau. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/ipv4/netfilter/nf_flow_table_ipv4.c | 1 + net/ipv6/netfilter/nf_flow_table_ipv6.c | 1 + net/netfilter/nf_flow_table.c | 25 +++++++++++++++++++------ net/netfilter/nf_flow_table_inet.c | 1 + net/netfilter/nf_tables_api.c | 9 ++------- 6 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ed49cd169ecf..020ae903066f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -14,6 +14,7 @@ struct nf_flowtable_type { struct list_head list; int family; void (*gc)(struct work_struct *work); + void (*free)(struct nf_flowtable *ft); const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; @@ -98,6 +99,7 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +void nf_flow_table_free(struct nf_flowtable *flow_table); void nf_flow_offload_work_gc(struct work_struct *work); extern const struct rhashtable_params nf_flow_offload_rhash_params; diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index b2d01eb25f2c..25d2975da156 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -260,6 +260,7 @@ static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, .params = &nf_flow_offload_rhash_params, .gc = nf_flow_offload_work_gc, + .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, }; diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index fff21602875a..d346705d6ee6 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -253,6 +253,7 @@ static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, .params = &nf_flow_offload_rhash_params, .gc = nf_flow_offload_work_gc, + .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c index 04c08f6b9015..c17f1af42daa 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table.c @@ -232,19 +232,16 @@ static inline bool nf_flow_is_dying(const struct flow_offload *flow) return flow->flags & FLOW_OFFLOAD_DYING; } -void nf_flow_offload_work_gc(struct work_struct *work) +static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table; struct rhashtable_iter hti; struct flow_offload *flow; int err; - flow_table = container_of(work, struct nf_flowtable, gc_work.work); - err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); if (err) - goto schedule; + return 0; rhashtable_walk_start(&hti); @@ -270,7 +267,16 @@ void nf_flow_offload_work_gc(struct work_struct *work) out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); -schedule: + + return 1; +} + +void nf_flow_offload_work_gc(struct work_struct *work) +{ + struct nf_flowtable *flow_table; + + flow_table = container_of(work, struct nf_flowtable, gc_work.work); + nf_flow_offload_gc_step(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); @@ -449,5 +455,12 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev) } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); +void nf_flow_table_free(struct nf_flowtable *flow_table) +{ + nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); + WARN_ON(!nf_flow_offload_gc_step(flow_table)); +} +EXPORT_SYMBOL_GPL(nf_flow_table_free); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 281209aeba8f..375a1881d93d 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -24,6 +24,7 @@ static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, .params = &nf_flow_offload_rhash_params, .gc = nf_flow_offload_work_gc, + .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 07dd1fac78a8..8b9fe30de0cd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5399,17 +5399,12 @@ err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); } -static void nft_flowtable_destroy(void *ptr, void *arg) -{ - kfree(ptr); -} - static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { cancel_delayed_work_sync(&flowtable->data.gc_work); kfree(flowtable->name); - rhashtable_free_and_destroy(&flowtable->data.rhashtable, - nft_flowtable_destroy, NULL); + flowtable->data.type->free(&flowtable->data); + rhashtable_destroy(&flowtable->data.rhashtable); module_put(flowtable->data.type->owner); } From d8ed9600581d40d818ae417b3086a333841b0559 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 7 Feb 2018 11:50:41 +0900 Subject: [PATCH 20/82] netfilter: remove useless prototype prototype nf_ct_nat_offset is not used anymore. Signed-off-by: Taehee Yoo --- include/net/netfilter/nf_conntrack.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f5223bf2c420..062dc19b5840 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -213,11 +213,6 @@ static inline bool nf_ct_kill(struct nf_conn *ct) return nf_ct_delete(ct, 0, 0); } -/* These are for NAT. Icky. */ -extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); - /* Set all unconfirmed conntrack as dying */ void nf_ct_unconfirmed_destroy(struct net *); From 0ff90b6c20340e57616a51ae1a1bf18156d6638a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 7 Feb 2018 09:49:02 +0100 Subject: [PATCH 21/82] netfilter: nf_flow_offload: fix use-after-free and a resource leak flow_offload_del frees the flow, so all associated resource must be freed before. Since the ct entry in struct flow_offload_entry was allocated by flow_offload_alloc, it should be freed by flow_offload_free to take care of the error handling path when flow_offload_add fails. While at it, make flow_offload_del static, since it should never be called directly, only from the gc step Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 - net/netfilter/nf_flow_table.c | 27 +++++++-------------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 020ae903066f..833752dd0c58 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -90,7 +90,6 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct, void flow_offload_free(struct flow_offload *flow); int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); -void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow); struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); int nf_flow_table_iterate(struct nf_flowtable *flow_table, diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c index c17f1af42daa..ec410cae9307 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table.c @@ -125,7 +125,9 @@ void flow_offload_free(struct flow_offload *flow) dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); e = container_of(flow, struct flow_offload_entry, flow); - kfree(e); + nf_ct_delete(e->ct, 0, 0); + nf_ct_put(e->ct); + kfree_rcu(e, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); @@ -149,11 +151,9 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_add); -void flow_offload_del(struct nf_flowtable *flow_table, - struct flow_offload *flow) +static void flow_offload_del(struct nf_flowtable *flow_table, + struct flow_offload *flow) { - struct flow_offload_entry *e; - rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, *flow_table->type->params); @@ -161,10 +161,8 @@ void flow_offload_del(struct nf_flowtable *flow_table, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, *flow_table->type->params); - e = container_of(flow, struct flow_offload_entry, flow); - kfree_rcu(e, rcu_head); + flow_offload_free(flow); } -EXPORT_SYMBOL_GPL(flow_offload_del); struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, @@ -175,15 +173,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table, } EXPORT_SYMBOL_GPL(flow_offload_lookup); -static void nf_flow_release_ct(const struct flow_offload *flow) -{ - struct flow_offload_entry *e; - - e = container_of(flow, struct flow_offload_entry, flow); - nf_ct_delete(e->ct, 0, 0); - nf_ct_put(e->ct); -} - int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct flow_offload *flow, void *data), void *data) @@ -259,10 +248,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); if (nf_flow_has_expired(flow) || - nf_flow_is_dying(flow)) { + nf_flow_is_dying(flow)) flow_offload_del(flow_table, flow); - nf_flow_release_ct(flow); - } } out: rhashtable_walk_stop(&hti); From c713fb071edc0efc01a955f65a006b0e1795d2eb Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 5 Feb 2018 12:38:11 -0600 Subject: [PATCH 22/82] rtlwifi: rtl8821ae: Fix connection lost problem correctly There has been a coding error in rtl8821ae since it was first introduced, namely that an 8-bit register was read using a 16-bit read in _rtl8821ae_dbi_read(). This error was fixed with commit 40b368af4b75 ("rtlwifi: Fix alignment issues"); however, this change led to instability in the connection. To restore stability, this change was reverted in commit b8b8b16352cd ("rtlwifi: rtl8821ae: Fix connection lost problem"). Unfortunately, the unaligned access causes machine checks in ARM architecture, and we were finally forced to find the actual cause of the problem on x86 platforms. Following a suggestion from Pkshih , it was found that increasing the ASPM L1 latency from 0 to 7 fixed the instability. This parameter was varied to see if a smaller value would work; however, it appears that 7 is the safest value. A new symbol is defined for this quantity, thus it can be easily changed if necessary. Fixes: b8b8b16352cd ("rtlwifi: rtl8821ae: Fix connection lost problem") Cc: Stable # 4.14+ Fix-suggested-by: Pkshih Signed-off-by: Larry Finger Tested-by: James Cameron # x86_64 OLPC NL3 Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c | 5 +++-- drivers/net/wireless/realtek/rtlwifi/wifi.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c index f20e77b4bb65..317c1b3101da 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c @@ -1123,7 +1123,7 @@ static u8 _rtl8821ae_dbi_read(struct rtl_priv *rtlpriv, u16 addr) } if (0 == tmp) { read_addr = REG_DBI_RDATA + addr % 4; - ret = rtl_read_word(rtlpriv, read_addr); + ret = rtl_read_byte(rtlpriv, read_addr); } return ret; } @@ -1165,7 +1165,8 @@ static void _rtl8821ae_enable_aspm_back_door(struct ieee80211_hw *hw) } tmp = _rtl8821ae_dbi_read(rtlpriv, 0x70f); - _rtl8821ae_dbi_write(rtlpriv, 0x70f, tmp | BIT(7)); + _rtl8821ae_dbi_write(rtlpriv, 0x70f, tmp | BIT(7) | + ASPM_L1_LATENCY << 3); tmp = _rtl8821ae_dbi_read(rtlpriv, 0x719); _rtl8821ae_dbi_write(rtlpriv, 0x719, tmp | BIT(3) | BIT(4)); diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h index a7aacbc3984e..46dcb7fef195 100644 --- a/drivers/net/wireless/realtek/rtlwifi/wifi.h +++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h @@ -99,6 +99,7 @@ #define RTL_USB_MAX_RX_COUNT 100 #define QBSS_LOAD_SIZE 5 #define MAX_WMMELE_LENGTH 64 +#define ASPM_L1_LATENCY 7 #define TOTAL_CAM_ENTRY 32 From 0a7fe718239e2f8d9c067070f59f53b509bb17dc Mon Sep 17 00:00:00 2001 From: Yu Wang Date: Tue, 30 Jan 2018 14:06:01 +0200 Subject: [PATCH 23/82] ath10k: correct the length of DRAM dump for QCA6174 hw3.x/QCA9377 hw1.1 The length of DRAM dump for QCA6174 hw3.0/hw3.2 and QCA9377 hw1.1 are less than the actual value, some coredump contents are missed. To fix it, change the length from 0x90000 to 0xa8000. Fixes: 703f261dd77f ("ath10k: add memory dump support for QCA6174/QCA9377") Signed-off-by: Yu Wang Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/coredump.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c index 4dde126dab17..7173b3743b43 100644 --- a/drivers/net/wireless/ath/ath10k/coredump.c +++ b/drivers/net/wireless/ath/ath10k/coredump.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2011-2017 Qualcomm Atheros, Inc. + * Copyright (c) 2018, The Linux Foundation. All rights reserved. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -616,7 +617,7 @@ static const struct ath10k_mem_region qca6174_hw30_mem_regions[] = { { .type = ATH10K_MEM_REGION_TYPE_DRAM, .start = 0x400000, - .len = 0x90000, + .len = 0xa8000, .name = "DRAM", .section_table = { .sections = NULL, From d5cc61119343b6f1b8716cfe591d4989ea0c5c28 Mon Sep 17 00:00:00 2001 From: Tobias Schramm Date: Tue, 30 Jan 2018 14:06:02 +0200 Subject: [PATCH 24/82] PCI: Add Ubiquiti Networks vendor ID Acked-by: Bjorn Helgaas Signed-off-by: Tobias Schramm Signed-off-by: Kalle Valo --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ab20dc5db423..35871adfdc86 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -149,6 +149,8 @@ #define PCI_VENDOR_ID_DYNALINK 0x0675 #define PCI_DEVICE_ID_DYNALINK_IS64PH 0x1702 +#define PCI_VENDOR_ID_UBIQUITI 0x0777 + #define PCI_VENDOR_ID_BERKOM 0x0871 #define PCI_DEVICE_ID_BERKOM_A1T 0xffa1 #define PCI_DEVICE_ID_BERKOM_T_CONCEPT 0xffa2 From 34f1cb339cae5c0b6b75094e2d5c79d19be424ed Mon Sep 17 00:00:00 2001 From: Tobias Schramm Date: Tue, 30 Jan 2018 14:06:05 +0200 Subject: [PATCH 25/82] ath10k: add support for Ubiquiti rebranded QCA988X v2 Some modern Ubiquiti devices contain a rebranded QCA988X rev2 with a custom Ubiquiti vendor and device id. This patch adds support for those devices, treating them as a QCA988X v2. Signed-off-by: Tobias Schramm [kvalo@codeaurora.org: rebase, add missing fields in hw_params, fix a long line in pci.c:61] Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/core.c | 29 ++++++++++++++++++++++++++ drivers/net/wireless/ath/ath10k/hw.h | 1 + drivers/net/wireless/ath/ath10k/pci.c | 6 ++++++ 3 files changed, 36 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index b0fdc1023619..6fb282f76804 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -90,6 +90,35 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = { .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, }, + { + .id = QCA988X_HW_2_0_VERSION, + .dev_id = QCA988X_2_0_DEVICE_ID_UBNT, + .name = "qca988x hw2.0 ubiquiti", + .patch_load_addr = QCA988X_HW_2_0_PATCH_LOAD_ADDR, + .uart_pin = 7, + .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_ALL, + .otp_exe_param = 0, + .channel_counters_freq_hz = 88000, + .max_probe_resp_desc_thres = 0, + .cal_data_len = 2116, + .fw = { + .dir = QCA988X_HW_2_0_FW_DIR, + .board = QCA988X_HW_2_0_BOARD_DATA_FILE, + .board_size = QCA988X_BOARD_DATA_SZ, + .board_ext_size = QCA988X_BOARD_EXT_DATA_SZ, + }, + .hw_ops = &qca988x_ops, + .decap_align_bytes = 4, + .spectral_bin_discard = 0, + .vht160_mcs_rx_highest = 0, + .vht160_mcs_tx_highest = 0, + .n_cipher_suites = 8, + .num_peers = TARGET_TLV_NUM_PEERS, + .ast_skid_limit = 0x10, + .num_wds_entries = 0x20, + .target_64bit = false, + .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, + }, { .id = QCA9887_HW_1_0_VERSION, .dev_id = QCA9887_1_0_DEVICE_ID, diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h index 6203bc65799b..413b1b4321f7 100644 --- a/drivers/net/wireless/ath/ath10k/hw.h +++ b/drivers/net/wireless/ath/ath10k/hw.h @@ -22,6 +22,7 @@ #define ATH10K_FW_DIR "ath10k" +#define QCA988X_2_0_DEVICE_ID_UBNT (0x11ac) #define QCA988X_2_0_DEVICE_ID (0x003c) #define QCA6164_2_1_DEVICE_ID (0x0041) #define QCA6174_2_1_DEVICE_ID (0x003e) diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index 355db6a0fcf3..1b266cd0c2ec 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -58,6 +58,9 @@ MODULE_PARM_DESC(reset_mode, "0: auto, 1: warm only (default: 0)"); #define ATH10K_DIAG_TRANSFER_LIMIT 0x5000 static const struct pci_device_id ath10k_pci_id_table[] = { + /* PCI-E QCA988X V2 (Ubiquiti branded) */ + { PCI_VDEVICE(UBIQUITI, QCA988X_2_0_DEVICE_ID_UBNT) }, + { PCI_VDEVICE(ATHEROS, QCA988X_2_0_DEVICE_ID) }, /* PCI-E QCA988X V2 */ { PCI_VDEVICE(ATHEROS, QCA6164_2_1_DEVICE_ID) }, /* PCI-E QCA6164 V2.1 */ { PCI_VDEVICE(ATHEROS, QCA6174_2_1_DEVICE_ID) }, /* PCI-E QCA6174 V2.1 */ @@ -74,6 +77,7 @@ static const struct ath10k_pci_supp_chip ath10k_pci_supp_chips[] = { * hacks. ath10k doesn't have them and these devices crash horribly * because of that. */ + { QCA988X_2_0_DEVICE_ID_UBNT, QCA988X_HW_2_0_CHIP_ID_REV }, { QCA988X_2_0_DEVICE_ID, QCA988X_HW_2_0_CHIP_ID_REV }, { QCA6164_2_1_DEVICE_ID, QCA6174_HW_2_1_CHIP_ID_REV }, @@ -2193,6 +2197,7 @@ static int ath10k_pci_get_num_banks(struct ath10k *ar) struct ath10k_pci *ar_pci = ath10k_pci_priv(ar); switch (ar_pci->pdev->device) { + case QCA988X_2_0_DEVICE_ID_UBNT: case QCA988X_2_0_DEVICE_ID: case QCA99X0_2_0_DEVICE_ID: case QCA9888_2_0_DEVICE_ID: @@ -3424,6 +3429,7 @@ static int ath10k_pci_probe(struct pci_dev *pdev, u32 (*targ_cpu_to_ce_addr)(struct ath10k *ar, u32 addr); switch (pci_dev->device) { + case QCA988X_2_0_DEVICE_ID_UBNT: case QCA988X_2_0_DEVICE_ID: hw_rev = ATH10K_HW_QCA988X; pci_ps = false; From b9607de6cf22a5cd268b9206177a9baafb6e8ac8 Mon Sep 17 00:00:00 2001 From: Wojciech Dubowik Date: Tue, 30 Jan 2018 14:06:07 +0200 Subject: [PATCH 26/82] ath9k: Fix get channel default noise floor Commit 8da58553cc63 ("ath9k: Use calibrated noise floor value when available") introduced regression in ath9k_hw_getchan_noise where per chain nominal noise floor has been taken instead default for channel. Revert to original default channel noise floor. Fixes: 8da58553cc63 ("ath9k: Use calibrated noise floor value when available") Reported-by: Sebastian Gottschall Signed-off-by: Wojciech Dubowik Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath9k/calib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/calib.c b/drivers/net/wireless/ath/ath9k/calib.c index 3d9447e21025..695c779ae8cf 100644 --- a/drivers/net/wireless/ath/ath9k/calib.c +++ b/drivers/net/wireless/ath/ath9k/calib.c @@ -72,7 +72,7 @@ static s16 ath9k_hw_get_default_nf(struct ath_hw *ah, s16 ath9k_hw_getchan_noise(struct ath_hw *ah, struct ath9k_channel *chan, s16 nf) { - s8 noise = ath9k_hw_get_default_nf(ah, chan, 0); + s8 noise = ATH_DEFAULT_NOISE_FLOOR; if (nf) { s8 delta = nf - ATH9K_NF_CAL_NOISE_THRESH - From 50e79e25250bf928369996277e85b00536b380c7 Mon Sep 17 00:00:00 2001 From: Yu Wang Date: Tue, 30 Jan 2018 14:06:08 +0200 Subject: [PATCH 27/82] ath10k: fix kernel panic issue during pci probe If device gone during chip reset, ar->normal_mode_fw.board is not initialized, but ath10k_debug_print_hwfw_info() will try to access its member, which will cause 'kernel NULL pointer' issue. This was found using a faulty device (pci link went down sometimes) in a random insmod/rmmod/other-op test. To fix it, check ar->normal_mode_fw.board before accessing the member. pci 0000:02:00.0: BAR 0: assigned [mem 0xf7400000-0xf75fffff 64bit] ath10k_pci 0000:02:00.0: enabling device (0000 -> 0002) ath10k_pci 0000:02:00.0: pci irq msi oper_irq_mode 2 irq_mode 0 reset_mode 0 ath10k_pci 0000:02:00.0: failed to read device register, device is gone ath10k_pci 0000:02:00.0: failed to wait for target init: -5 ath10k_pci 0000:02:00.0: failed to warm reset: -5 ath10k_pci 0000:02:00.0: firmware crashed during chip reset ath10k_pci 0000:02:00.0: firmware crashed! (uuid 5d018951-b8e1-404a-8fde-923078b4423a) ath10k_pci 0000:02:00.0: (null) target 0x00000000 chip_id 0x00340aff sub 0000:0000 ath10k_pci 0000:02:00.0: kconfig debug 1 debugfs 1 tracing 1 dfs 1 testmode 1 ath10k_pci 0000:02:00.0: firmware ver api 0 features crc32 00000000 ... BUG: unable to handle kernel NULL pointer dereference at 00000004 ... Call Trace: [] ath10k_print_driver_info+0x12/0x20 [ath10k_core] [] ath10k_pci_fw_crashed_dump+0x6d/0x4d0 [ath10k_pci] [] ? ath10k_pci_sleep.part.19+0x57/0xc0 [ath10k_pci] [] ath10k_pci_hif_power_up+0x14e/0x1b0 [ath10k_pci] [] ? do_page_fault+0xb/0x10 [] ath10k_core_register_work+0x24/0x840 [ath10k_core] [] ? netlbl_unlhsh_remove+0x178/0x410 [] ? __do_page_fault+0x480/0x480 [] process_one_work+0x114/0x3e0 [] worker_thread+0x37/0x4a0 [] kthread+0xa4/0xc0 [] ? create_worker+0x180/0x180 [] ? kthread_park+0x50/0x50 [] ret_from_fork+0x1b/0x28 Code: 78 80 b8 50 09 00 00 00 75 5d 8d 75 94 c7 44 24 08 aa d7 52 fb c7 44 24 04 64 00 00 00 89 34 24 e8 82 52 e2 c5 8b 83 dc 08 00 00 <8b> 50 04 8b 08 31 c0 e8 20 57 e3 c5 89 44 24 10 8b 83 58 09 00 EIP: []- ath10k_debug_print_board_info+0x34/0xb0 [ath10k_core] SS:ESP 0068:f4921d90 CR2: 0000000000000004 Signed-off-by: Yu Wang Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c index 6d836a26272f..554cd7856cb6 100644 --- a/drivers/net/wireless/ath/ath10k/debug.c +++ b/drivers/net/wireless/ath/ath10k/debug.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2005-2011 Atheros Communications Inc. * Copyright (c) 2011-2017 Qualcomm Atheros, Inc. + * Copyright (c) 2018, The Linux Foundation. All rights reserved. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -81,6 +82,8 @@ void ath10k_debug_print_hwfw_info(struct ath10k *ar) void ath10k_debug_print_board_info(struct ath10k *ar) { char boardinfo[100]; + const struct firmware *board; + u32 crc; if (ar->id.bmi_ids_valid) scnprintf(boardinfo, sizeof(boardinfo), "%d:%d", @@ -88,11 +91,16 @@ void ath10k_debug_print_board_info(struct ath10k *ar) else scnprintf(boardinfo, sizeof(boardinfo), "N/A"); + board = ar->normal_mode_fw.board; + if (!IS_ERR_OR_NULL(board)) + crc = crc32_le(0, board->data, board->size); + else + crc = 0; + ath10k_info(ar, "board_file api %d bmi_id %s crc32 %08x", ar->bd_api, boardinfo, - crc32_le(0, ar->normal_mode_fw.board->data, - ar->normal_mode_fw.board->size)); + crc); } void ath10k_debug_print_boot_info(struct ath10k *ar) From 4e12d654ba068df06c5e4c8322d7dcced41e48ee Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 30 Jan 2018 14:06:11 +0200 Subject: [PATCH 28/82] ath9k_htc: add Altai WA1011N-GU as reported in: https://github.com/qca/open-ath9k-htc-firmware/pull/71#issuecomment-361100070 Signed-off-by: Oleksij Rempel Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath9k/hif_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c index 56676eaff24c..cb0eef13af1c 100644 --- a/drivers/net/wireless/ath/ath9k/hif_usb.c +++ b/drivers/net/wireless/ath/ath9k/hif_usb.c @@ -24,6 +24,7 @@ static const struct usb_device_id ath9k_hif_usb_ids[] = { { USB_DEVICE(0x0cf3, 0x9271) }, /* Atheros */ { USB_DEVICE(0x0cf3, 0x1006) }, /* Atheros */ { USB_DEVICE(0x0846, 0x9030) }, /* Netgear N150 */ + { USB_DEVICE(0x07b8, 0x9271) }, /* Altai WA1011N-GU */ { USB_DEVICE(0x07D1, 0x3A10) }, /* Dlink Wireless 150 */ { USB_DEVICE(0x13D3, 0x3327) }, /* Azurewave */ { USB_DEVICE(0x13D3, 0x3328) }, /* Azurewave */ From 035d808f7cfd578dc210b584beb17e365f34d39a Mon Sep 17 00:00:00 2001 From: Naresh Kamboju Date: Wed, 7 Feb 2018 23:45:34 +0530 Subject: [PATCH 29/82] selftests: bpf: test_kmod.sh: check the module path before insmod test_kmod.sh reported false failure when module not present. Check test_bpf.ko is present in the path before loading it. Two cases to be addressed here, In the development process of test_bpf.c unit testing will be done by developers by using "insmod $SRC_TREE/lib/test_bpf.ko" On the other hand testers run full tests by installing modules on device under test (DUT) and followed by modprobe to insert the modules accordingly. Signed-off-by: Naresh Kamboju Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_kmod.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh index ed4774d8d6ed..35669ccd4d23 100755 --- a/tools/testing/selftests/bpf/test_kmod.sh +++ b/tools/testing/selftests/bpf/test_kmod.sh @@ -10,9 +10,21 @@ test_run() echo "[ JIT enabled:$1 hardened:$2 ]" dmesg -C - insmod $SRC_TREE/lib/test_bpf.ko 2> /dev/null - if [ $? -ne 0 ]; then - rc=1 + if [ -f ${SRC_TREE}/lib/test_bpf.ko ]; then + insmod ${SRC_TREE}/lib/test_bpf.ko 2> /dev/null + if [ $? -ne 0 ]; then + rc=1 + fi + else + # Use modprobe dry run to check for missing test_bpf module + if ! /sbin/modprobe -q -n test_bpf; then + echo "test_bpf: [SKIP]" + elif /sbin/modprobe -q test_bpf; then + echo "test_bpf: ok" + else + echo "test_bpf: [FAIL]" + rc=1 + fi fi rmmod test_bpf 2> /dev/null dmesg | grep FAIL From e729452ec33304260d4741e9ce67784a457ee1e6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 6 Feb 2018 21:17:17 +0100 Subject: [PATCH 30/82] cxgb4: Fix error handling path in 'init_one()' Commit baf5086840ab1 ("cxgb4: restructure VF mgmt code") has reordered some code but an error handling label has not been updated accordingly. So fix it and free 'adapter' if 't4_wait_dev_ready()' fails. Fixes: baf5086840ab1 ("cxgb4: restructure VF mgmt code") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 1ca2a39ed0f8..56bc626ef006 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5166,7 +5166,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->regs = regs; err = t4_wait_dev_ready(regs); if (err < 0) - goto out_unmap_bar0; + goto out_free_adapter; /* We control everything through one PF */ whoami = readl(regs + PL_WHOAMI_A); From 17e9e23b130e4e269fa53c2370325249f3ba75dd Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 6 Feb 2018 16:44:12 +0000 Subject: [PATCH 31/82] rxrpc: Fix received abort handling AF_RXRPC is incorrectly sending back to the server any abort it receives for a client connection. This is due to the final-ACK offload to the connection event processor patch. The abort code is copied into the last-call information on the connection channel and then the event processor is set. Instead, the following should be done: (1) In the case of a final-ACK for a successful call, the ACK should be scheduled as before. (2) In the case of a locally generated ABORT, the ABORT details should be cached for sending in response to further packets related to that call and no further action scheduled at call disconnect time. (3) In the case of an ACK received from the peer, the call should be considered dead, no ABORT should be transmitted at this time. In response to further non-ABORT packets from the peer relating to this call, an RX_USER_ABORT ABORT should be transmitted. (4) In the case of a call killed due to network error, an RX_USER_ABORT ABORT should be cached for transmission in response to further packets, but no ABORT should be sent at this time. Fixes: 3136ef49a14c ("rxrpc: Delay terminal ACK transmission on a client call") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/conn_client.c | 3 ++- net/rxrpc/conn_object.c | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 7f74ca3059f8..064175068059 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -834,7 +834,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) * can be skipped if we find a follow-on call. The first DATA packet * of the follow on call will implicitly ACK this call. */ - if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + if (call->completion == RXRPC_CALL_SUCCEEDED && + test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { unsigned long final_ack_at = jiffies + 2; WRITE_ONCE(chan->final_ack_at, final_ack_at); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c628351eb900..ccbac190add1 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -177,13 +177,21 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, * through the channel, whilst disposing of the actual call record. */ trace_rxrpc_disconnect_call(call); - if (call->abort_code) { - chan->last_abort = call->abort_code; - chan->last_type = RXRPC_PACKET_TYPE_ABORT; - } else { + switch (call->completion) { + case RXRPC_CALL_SUCCEEDED: chan->last_seq = call->rx_hard_ack; chan->last_type = RXRPC_PACKET_TYPE_ACK; + break; + case RXRPC_CALL_LOCALLY_ABORTED: + chan->last_abort = call->abort_code; + chan->last_type = RXRPC_PACKET_TYPE_ABORT; + break; + default: + chan->last_abort = RX_USER_ABORT; + chan->last_type = RXRPC_PACKET_TYPE_ABORT; + break; } + /* Sync with rxrpc_conn_retransmit(). */ smp_wmb(); chan->last_call = chan->call_id; From c861ef83d771362ed0475cd510eb56cf4126ef34 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 6 Feb 2018 11:34:23 -0800 Subject: [PATCH 32/82] sun: Add SPDX license tags to Sun network drivers Add the appropriate SPDX license tags to the Sun network drivers as outlined in Documentation/process/license-rules.rst. Signed-off-by: Shannon Nelson Reviewed-by: Zhu Yanjun Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/Kconfig | 1 + drivers/net/ethernet/sun/cassini.c | 1 + drivers/net/ethernet/sun/cassini.h | 1 + drivers/net/ethernet/sun/ldmvsw.c | 1 + drivers/net/ethernet/sun/niu.c | 1 + drivers/net/ethernet/sun/sunbmac.c | 1 + drivers/net/ethernet/sun/sungem.c | 1 + drivers/net/ethernet/sun/sunhme.c | 1 + drivers/net/ethernet/sun/sunqe.c | 1 + drivers/net/ethernet/sun/sunvnet.c | 1 + drivers/net/ethernet/sun/sunvnet_common.c | 1 + 11 files changed, 11 insertions(+) diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig index b2caf5132bd2..7b982e02ea3a 100644 --- a/drivers/net/ethernet/sun/Kconfig +++ b/drivers/net/ethernet/sun/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Sun network device configuration # diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index 113bd57e2ea0..9020b084b953 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* cassini.c: Sun Microsystems Cassini(+) ethernet driver. * * Copyright (C) 2004 Sun Microsystems Inc. diff --git a/drivers/net/ethernet/sun/cassini.h b/drivers/net/ethernet/sun/cassini.h index 882ce168a799..13f3860496a8 100644 --- a/drivers/net/ethernet/sun/cassini.h +++ b/drivers/net/ethernet/sun/cassini.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* $Id: cassini.h,v 1.16 2004/08/17 21:15:16 zaumen Exp $ * cassini.h: Definitions for Sun Microsystems Cassini(+) ethernet driver. * diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index 5ea037672e6f..a5dd627fe2f9 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* ldmvsw.c: Sun4v LDOM Virtual Switch Driver. * * Copyright (C) 2016-2017 Oracle. All rights reserved. diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 06001bacbe0f..8dd545fed30d 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* niu.c: Neptune ethernet driver. * * Copyright (C) 2007, 2008 David S. Miller (davem@davemloft.net) diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index 0b1f41f6bceb..f047b2797156 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* sunbmac.c: Driver for Sparc BigMAC 100baseT ethernet adapters. * * Copyright (C) 1997, 1998, 1999, 2003, 2008 David S. Miller (davem@davemloft.net) diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index a7afcee3c5ae..7a16d40a72d1 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* $Id: sungem.c,v 1.44.2.22 2002/03/13 01:18:12 davem Exp $ * sungem.c: Sun GEM ethernet driver. * diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index 0431f1e5f511..06da2f59fcbf 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* sunhme.c: Sparc HME/BigMac 10/100baseT half/full duplex auto switching, * auto carrier detecting ethernet driver. Also known as the * "Happy Meal Ethernet" found on SunSwift SBUS cards. diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c index a6bcdcdd947e..7fe0d5e33922 100644 --- a/drivers/net/ethernet/sun/sunqe.c +++ b/drivers/net/ethernet/sun/sunqe.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* sunqe.c: Sparc QuadEthernet 10baseT SBUS card driver. * Once again I am out to prove that every ethernet * controller out there can be most efficiently programmed diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index 27fb22638885..63d3d6b215f3 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* sunvnet.c: Sun LDOM Virtual Network Driver. * * Copyright (C) 2007, 2008 David S. Miller diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c index 8aa3ce46bb81..d8f4c3f28150 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.c +++ b/drivers/net/ethernet/sun/sunvnet_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* sunvnet.c: Sun LDOM Virtual Network Driver. * * Copyright (C) 2007, 2008 David S. Miller From 58e354c01b1906b603dcdb28ec35250b09eac625 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 6 Feb 2018 12:14:12 -0800 Subject: [PATCH 33/82] net/ipv6: Handle reject routes with onlink flag Verification of nexthops with onlink flag need to handle unreachable routes. The lookup is only intended to validate the gateway address is not a local address and if the gateway resolves the egress device must match the given device. Hence, hitting any default reject route is ok. Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb2d251c0500..69c43d289c69 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2488,7 +2488,8 @@ static int ip6_route_check_nh_onlink(struct net *net, err = 0; grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); if (grt) { - if (grt->rt6i_flags & flags || dev != grt->dst.dev) { + if (!grt->dst.error && + (grt->rt6i_flags & flags || dev != grt->dst.dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); err = -EINVAL; } From 44750f8483a1bc4bda58ec4f7b7af5b053833d49 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 6 Feb 2018 13:17:06 -0800 Subject: [PATCH 34/82] net/ipv6: onlink nexthop checks should default to main table Because of differences in how ipv4 and ipv6 handle fib lookups, verification of nexthops with onlink flag need to default to the main table rather than the local table used by IPv4. As it stands an address within a connected route on device 1 can be used with onlink on device 2. Updating the table properly rejects the route due to the egress device mismatch. Update the extack message as well to show it could be a device mismatch for the nexthop spec. Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 69c43d289c69..9dcfadddd800 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2479,7 +2479,7 @@ static int ip6_route_check_nh_onlink(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack) { - u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; + u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; struct rt6_info *grt; @@ -2490,7 +2490,8 @@ static int ip6_route_check_nh_onlink(struct net *net, if (grt) { if (!grt->dst.error && (grt->rt6i_flags & flags || dev != grt->dst.dev)) { - NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway or device mismatch"); err = -EINVAL; } From bc6d33c8d93f5999920e97a8c6330b8910053d4f Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 6 Feb 2018 13:15:20 -0800 Subject: [PATCH 35/82] i40e: Fix the number of queues available to be mapped for use Fix the number of queues per enabled TC and report available queues to the kernel without having to limit them to the max RSS limit so they are available to be mapped for XPS. This allows a queue per processing thread available for handling traffic for the given traffic class. Signed-off-by: Amritha Nambiar Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40e/i40e_main.c | 27 +++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f95ce9b5e4fb..e31adbc75f9c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1785,7 +1785,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, struct i40e_pf *pf = vsi->back; u16 sections = 0; u8 netdev_tc = 0; - u16 numtc = 0; + u16 numtc = 1; u16 qcount; u8 offset; u16 qmap; @@ -1795,9 +1795,11 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; offset = 0; + /* Number of queues per enabled TC */ + num_tc_qps = vsi->alloc_queue_pairs; if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { /* Find numtc from enabled TC bitmap */ - for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { + for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { if (enabled_tc & BIT(i)) /* TC is enabled */ numtc++; } @@ -1805,18 +1807,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, dev_warn(&pf->pdev->dev, "DCB is enabled but no TC enabled, forcing TC0\n"); numtc = 1; } - } else { - /* At least TC0 is enabled in non-DCB, non-MQPRIO case */ - numtc = 1; + num_tc_qps = num_tc_qps / numtc; + num_tc_qps = min_t(int, num_tc_qps, + i40e_pf_get_max_q_per_tc(pf)); } vsi->tc_config.numtc = numtc; vsi->tc_config.enabled_tc = enabled_tc ? enabled_tc : 1; - /* Number of queues per enabled TC */ - qcount = vsi->alloc_queue_pairs; - - num_tc_qps = qcount / numtc; - num_tc_qps = min_t(int, num_tc_qps, i40e_pf_get_max_q_per_tc(pf)); /* Do not allow use more TC queue pairs than MSI-X vectors exist */ if (pf->flags & I40E_FLAG_MSIX_ENABLED) @@ -1831,9 +1828,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, switch (vsi->type) { case I40E_VSI_MAIN: - qcount = min_t(int, pf->alloc_rss_size, - num_tc_qps); - break; + if (!(pf->flags & (I40E_FLAG_FD_SB_ENABLED | + I40E_FLAG_FD_ATR_ENABLED)) || + vsi->tc_config.enabled_tc != 1) { + qcount = min_t(int, pf->alloc_rss_size, + num_tc_qps); + break; + } case I40E_VSI_FDIR: case I40E_VSI_SRIOV: case I40E_VSI_VMDQ2: From 3468656fd7599b0cb1092bb1ee717d1a984e93ee Mon Sep 17 00:00:00 2001 From: John Allen Date: Tue, 6 Feb 2018 16:21:49 -0600 Subject: [PATCH 36/82] ibmvnic: Fix rx queue cleanup for non-fatal resets At some point, a check was added to exit the polling routine during resets. This makes sense for most reset conditions, but for a non-fatal error, we expect the polling routine to continue running to properly clean up the rx queues. This patch checks if we are performing a non-fatal reset and if we are, continues normal polling operation. Signed-off-by: John Allen Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index afaf29b201dc..8dc8fc503750 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1831,7 +1831,8 @@ restart_poll: u16 offset; u8 flags = 0; - if (unlikely(adapter->resetting)) { + if (unlikely(adapter->resetting && + adapter->reset_reason != VNIC_RESET_NON_FATAL)) { enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]); napi_complete_done(napi, frames_processed); return frames_processed; From b0992eca00c490c0923044b7d8b853c212b3cacc Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 6 Feb 2018 17:25:23 -0600 Subject: [PATCH 37/82] ibmvnic: Ensure that buffers are NULL after free This change will guard against a double free in the case that the buffers were previously freed at some other time, such as during a device reset. It resolves a kernel oops that occurred when changing the VNIC device's MTU. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 8dc8fc503750..8dabc9d9dfa6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -354,6 +354,8 @@ static void release_stats_buffers(struct ibmvnic_adapter *adapter) { kfree(adapter->tx_stats_buffers); kfree(adapter->rx_stats_buffers); + adapter->tx_stats_buffers = NULL; + adapter->rx_stats_buffers = NULL; } static int init_stats_buffers(struct ibmvnic_adapter *adapter) @@ -599,6 +601,8 @@ static void release_vpd_data(struct ibmvnic_adapter *adapter) kfree(adapter->vpd->buff); kfree(adapter->vpd); + + adapter->vpd = NULL; } static void release_tx_pools(struct ibmvnic_adapter *adapter) @@ -909,6 +913,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter) if (dma_mapping_error(dev, adapter->vpd->dma_addr)) { dev_err(dev, "Could not map VPD buffer\n"); kfree(adapter->vpd->buff); + adapter->vpd->buff = NULL; return -ENOMEM; } From 62f94c2101f35cd45775df00ba09bde77580e26a Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 6 Feb 2018 19:17:06 -0600 Subject: [PATCH 38/82] net: ethernet: ti: cpsw: fix net watchdog timeout It was discovered that simple program which indefinitely sends 200b UDP packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers network watchdog timeout in TI CPSW driver (<6 hours run). The network watchdog timeout is triggered due to race between cpsw_ndo_start_xmit() and cpsw_tx_handler() [NAPI] cpsw_ndo_start_xmit() if (unlikely(!cpdma_check_free_tx_desc(txch))) { txq = netdev_get_tx_queue(ndev, q_idx); netif_tx_stop_queue(txq); ^^ as per [1] barier has to be used after set_bit() otherwise new value might not be visible to other cpus } cpsw_tx_handler() if (unlikely(netif_tx_queue_stopped(txq))) netif_tx_wake_queue(txq); and when it happens ndev TX queue became disabled forever while driver's HW TX queue is empty. Fix this, by adding smp_mb__after_atomic() after netif_tx_stop_queue() calls and double check for free TX descriptors after stopping ndev TX queue - if there are free TX descriptors wake up ndev TX queue. [1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html Signed-off-by: Grygorii Strashko Reviewed-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 3c85a0885f9b..1b1b78fdc138 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1636,6 +1636,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb, q_idx = q_idx % cpsw->tx_ch_num; txch = cpsw->txv[q_idx].ch; + txq = netdev_get_tx_queue(ndev, q_idx); ret = cpsw_tx_packet_submit(priv, skb, txch); if (unlikely(ret != 0)) { cpsw_err(priv, tx_err, "desc submit failed\n"); @@ -1646,15 +1647,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb, * tell the kernel to stop sending us tx frames. */ if (unlikely(!cpdma_check_free_tx_desc(txch))) { - txq = netdev_get_tx_queue(ndev, q_idx); netif_tx_stop_queue(txq); + + /* Barrier, so that stop_queue visible to other cpus */ + smp_mb__after_atomic(); + + if (cpdma_check_free_tx_desc(txch)) + netif_tx_wake_queue(txq); } return NETDEV_TX_OK; fail: ndev->stats.tx_dropped++; - txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb)); netif_tx_stop_queue(txq); + + /* Barrier, so that stop_queue visible to other cpus */ + smp_mb__after_atomic(); + + if (cpdma_check_free_tx_desc(txch)) + netif_tx_wake_queue(txq); + return NETDEV_TX_BUSY; } From 043e337f555e610ad8237fd23522d97c968d72b9 Mon Sep 17 00:00:00 2001 From: "Md. Islam" Date: Tue, 6 Feb 2018 23:14:18 -0500 Subject: [PATCH 39/82] sch_netem: Bug fixing in calculating Netem interval In Kernel 4.15.0+, Netem does not work properly. Netem setup: tc qdisc add dev h1-eth0 root handle 1: netem delay 10ms 2ms Result: PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data. 64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=22.8 ms 64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=10.9 ms 64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=10.9 ms 64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=11.4 ms 64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms 64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=4303 ms 64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.2 ms 64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.3 ms 64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=4304 ms 64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=4303 ms Patch: (rnd % (2 * sigma)) - sigma was overflowing s32. After applying the patch, I found following output which is desirable. PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data. 64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=21.1 ms 64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=8.46 ms 64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=9.00 ms 64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=11.8 ms 64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=8.36 ms 64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms 64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=8.11 ms 64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=10.0 ms 64 bytes from 172.16.101.2: icmp_seq=9 ttl=64 time=11.3 ms 64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.5 ms 64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.2 ms Reviewed-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7bbc13b8ca47..7c179addebcd 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -327,7 +327,7 @@ static s64 tabledist(s64 mu, s32 sigma, /* default uniform distribution */ if (dist == NULL) - return (rnd % (2 * sigma)) - sigma + mu; + return ((rnd % (2 * sigma)) + mu) - sigma; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; From 5c487bb9adddbc1d23433e09d2548759375c2b52 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 6 Feb 2018 20:50:23 -0800 Subject: [PATCH 40/82] tcp: tracepoint: only call trace_tcp_send_reset with full socket tracepoint tcp_send_reset requires a full socket to work. However, it may be called when in TCP_TIME_WAIT: case TCP_TW_RST: tcp_v6_send_reset(sk, skb); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; To avoid this problem, this patch checks the socket with sk_fullsock() before calling trace_tcp_send_reset(). Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset") Signed-off-by: Song Liu Reviewed-by: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 95738aa0d8a6..f8ad397e285e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) */ if (sk) { arg.bound_dev_if = sk->sk_bound_dev_if; - trace_tcp_send_reset(sk, skb); + if (sk_fullsock(sk)) + trace_tcp_send_reset(sk, skb); } BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a1ab29e2ab3b..412139f4eccd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -942,7 +942,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (sk) { oif = sk->sk_bound_dev_if; - trace_tcp_send_reset(sk, skb); + if (sk_fullsock(sk)) + trace_tcp_send_reset(sk, skb); } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); From b7d99235473ad3a550f8eb05bd4469edadf1c8e6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Feb 2018 20:27:12 -0800 Subject: [PATCH 41/82] nfp: bpf: fix immed relocation for larger offsets Immed relocation is missing a shift which means for larger offsets the lower and higher part of the address would be ORed together. Fixes: ce4ebfd859c3 ("nfp: bpf: add helpers for updating immediate instructions") Signed-off-by: Jakub Kicinski Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_asm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c index 3f6952b66a49..1e597600c693 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c @@ -107,7 +107,7 @@ u16 immed_get_value(u64 instr) if (!unreg_is_imm(reg)) reg = FIELD_GET(OP_IMMED_B_SRC, instr); - return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr); + return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr) << 8; } void immed_set_value(u64 *instr, u16 immed) From 0badd331491097cc3702d05a6dd0264a434712b2 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 7 Feb 2018 20:27:13 -0800 Subject: [PATCH 42/82] libbpf: complete list of strings for guessing program type It seems that the type guessing feature for libbpf, based on the name of the ELF section the program is located in, was inspired from samples/bpf/prog_load.c, which was not used by any sample for loading programs of certain types such as TC actions and classifiers, or LWT-related types. As a consequence, libbpf is not able to guess the type of such programs and to load them automatically if type is not provided to the `bpf_load_prog()` function. Add ELF section names associated to those eBPF program types so that they can be loaded with e.g. bpftool as well. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 71ddc481f349..c64840365433 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1816,12 +1816,17 @@ static const struct { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), + BPF_PROG_SEC("classifier", BPF_PROG_TYPE_SCHED_CLS), + BPF_PROG_SEC("action", BPF_PROG_TYPE_SCHED_ACT), BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("cgroup/skb", BPF_PROG_TYPE_CGROUP_SKB), BPF_PROG_SEC("cgroup/sock", BPF_PROG_TYPE_CGROUP_SOCK), BPF_PROG_SEC("cgroup/dev", BPF_PROG_TYPE_CGROUP_DEVICE), + BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), + BPF_PROG_SEC("lwt_out", BPF_PROG_TYPE_LWT_OUT), + BPF_PROG_SEC("lwt_xmit", BPF_PROG_TYPE_LWT_XMIT), BPF_PROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS), BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), }; From 92426820f7616caebdf6ba665f749102a670b3d4 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 7 Feb 2018 20:27:14 -0800 Subject: [PATCH 43/82] tools: bpftool: exit doc Makefile early if rst2man is not available If rst2man is not available on the system, running `make doc` from the bpftool directory fails with an error message. However, it creates empty manual pages (.8 files in this case). A subsequent call to `make doc-install` would then succeed and install those empty man pages on the system. To prevent this, raise a Makefile error and exit immediately if rst2man is not available before generating the pages from the rst documentation. Fixes: ff69c21a85a4 ("tools: bpftool: add documentation") Reported-by: Jason van Aaardt Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Documentation/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index c462a928e03d..a9d47c1558bb 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -23,7 +23,12 @@ DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) man: man8 man8: $(DOC_MAN8) +RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) + $(OUTPUT)%.8: %.rst +ifndef RST2MAN_DEP + $(error "rst2man not found, but required to generate man pages") +endif $(QUIET_GEN)rst2man $< > $@ clean: From 2148481d5d3111e5e9aa5adc1905fa3539e548c8 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 7 Feb 2018 20:27:15 -0800 Subject: [PATCH 44/82] tools: bpftool: make syntax for program map update explicit in man page Specify in the documentation that when using bpftool to update a map of type BPF_MAP_TYPE_PROG_ARRAY, the syntax for the program used as a value should use the "id|tag|pinned" keywords convention, as used with "bpftool prog" commands. Fixes: ff69c21a85a4 ("tools: bpftool: add documentation") Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 0ab32b312aec..457e868bd32f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -31,7 +31,8 @@ MAP COMMANDS | **bpftool** **map help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } -| *VALUE* := { *BYTES* | *MAP* | *PROGRAM* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *VALUE* := { *BYTES* | *MAP* | *PROG* } | *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } DESCRIPTION From 55f3538c4923e9dfca132e99ebec370e8094afda Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 7 Feb 2018 20:27:16 -0800 Subject: [PATCH 45/82] tools: bpftool: add bash completion for `bpftool prog load` Add bash completion for bpftool command `prog load`. Completion for this command is easy, as it only takes existing file paths as arguments. Fixes: 49a086c201a9 ("bpftool: implement prog load command") Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/bash-completion/bpftool | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 0137866bb8f6..17d246418348 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -230,10 +230,14 @@ _bpftool() fi return 0 ;; + load) + _filedir + return 0 + ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'dump help pin show list' -- \ - "$cur" ) ) + COMPREPLY=( $( compgen -W 'dump help pin load \ + show list' -- "$cur" ) ) ;; esac ;; From a827a164b98be2acba6a2e7559643ac9a5745086 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 7 Feb 2018 20:27:17 -0800 Subject: [PATCH 46/82] tools: bpftool: add bash completion for cgroup commands Add bash completion for "bpftool cgroup" command family. While at it, also fix the formatting of some keywords in the man page for cgroups. Fixes: 5ccda64d38cc ("bpftool: implement cgroup bpf operations") Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- .../bpftool/Documentation/bpftool-cgroup.rst | 4 +- tools/bpf/bpftool/bash-completion/bpftool | 64 +++++++++++++++++-- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 2fe2a1bdbe3e..0e4e923235b6 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -26,8 +26,8 @@ MAP COMMANDS | **bpftool** **cgroup help** | | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } -| *ATTACH_TYPE* := { *ingress* | *egress* | *sock_create* | *sock_ops* | *device* } -| *ATTACH_FLAGS* := { *multi* | *override* } +| *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** } +| *ATTACH_FLAGS* := { **multi** | **override** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 17d246418348..08719c54a614 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -52,16 +52,24 @@ _bpftool_once_attr() done } -# Takes a list of words in argument; adds them all to COMPREPLY if none of them -# is already present on the command line. Returns no value. -_bpftool_one_of_list() +# Takes a list of words as argument; if any of those words is present on the +# command line, return 0. Otherwise, return 1. +_bpftool_search_list() { local w idx for w in $*; do for (( idx=3; idx < ${#words[@]}-1; idx++ )); do - [[ $w == ${words[idx]} ]] && return 1 + [[ $w == ${words[idx]} ]] && return 0 done done + return 1 +} + +# Takes a list of words in argument; adds them all to COMPREPLY if none of them +# is already present on the command line. Returns no value. +_bpftool_one_of_list() +{ + _bpftool_search_list $* && return 1 COMPREPLY+=( $( compgen -W "$*" -- "$cur" ) ) } @@ -351,6 +359,54 @@ _bpftool() ;; esac ;; + cgroup) + case $command in + show|list) + _filedir + return 0 + ;; + attach|detach) + local ATTACH_TYPES='ingress egress sock_create sock_ops \ + device' + local ATTACH_FLAGS='multi override' + local PROG_TYPE='id pinned tag' + case $prev in + $command) + _filedir + return 0 + ;; + ingress|egress|sock_create|sock_ops|device) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ + "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_prog_ids + return 0 + ;; + *) + if ! _bpftool_search_list "$ATTACH_TYPES"; then + COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- \ + "$cur" ) ) + elif [[ "$command" == "attach" ]]; then + # We have an attach type on the command line, + # but it is not the previous word, or + # "id|pinned|tag" (we already checked for + # that). This should only leave the case when + # we need attach flags for "attach" commamnd. + _bpftool_one_of_list "$ATTACH_FLAGS" + fi + return 0 + ;; + esac + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help attach detach \ + show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool From 9ce8b24aa96e983a540bb8244298a10322881ef4 Mon Sep 17 00:00:00 2001 From: Ryan Hsu Date: Wed, 7 Feb 2018 15:51:23 +0200 Subject: [PATCH 47/82] Revert "ath10k: add sanity check to ie_len before parsing fw/board ie" This reverts commit 9ed4f91628737c820af6a1815b65bc06bd31518f. The commit introduced a regression that over read the ie with the padding. - the expected IE information ath10k_pci 0000:03:00.0: found firmware features ie (1 B) ath10k_pci 0000:03:00.0: Enabling feature bit: 6 ath10k_pci 0000:03:00.0: Enabling feature bit: 7 ath10k_pci 0000:03:00.0: features ath10k_pci 0000:03:00.0: 00000000: c0 00 00 00 00 00 00 00 - the wrong IE with padding is read (0x77) ath10k_pci 0000:03:00.0: found firmware features ie (4 B) ath10k_pci 0000:03:00.0: Enabling feature bit: 6 ath10k_pci 0000:03:00.0: Enabling feature bit: 7 ath10k_pci 0000:03:00.0: Enabling feature bit: 8 ath10k_pci 0000:03:00.0: Enabling feature bit: 9 ath10k_pci 0000:03:00.0: Enabling feature bit: 10 ath10k_pci 0000:03:00.0: Enabling feature bit: 12 ath10k_pci 0000:03:00.0: Enabling feature bit: 13 ath10k_pci 0000:03:00.0: Enabling feature bit: 14 ath10k_pci 0000:03:00.0: Enabling feature bit: 16 ath10k_pci 0000:03:00.0: Enabling feature bit: 17 ath10k_pci 0000:03:00.0: Enabling feature bit: 18 ath10k_pci 0000:03:00.0: features ath10k_pci 0000:03:00.0: 00000000: c0 77 07 00 00 00 00 00 Tested-by: Mike Lothian Signed-off-by: Ryan Hsu Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 6fb282f76804..f3ec13b80b20 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -1305,10 +1305,7 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar, len -= sizeof(*hdr); data = hdr->data; - /* jump over the padding */ - ie_len = ALIGN(ie_len, 4); - - if (len < ie_len) { + if (len < ALIGN(ie_len, 4)) { ath10k_err(ar, "invalid length for board ie_id %d ie_len %zu len %zu\n", ie_id, ie_len, len); ret = -EINVAL; @@ -1347,6 +1344,9 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar, goto out; } + /* jump over the padding */ + ie_len = ALIGN(ie_len, 4); + len -= ie_len; data += ie_len; } @@ -1477,9 +1477,6 @@ int ath10k_core_fetch_firmware_api_n(struct ath10k *ar, const char *name, len -= sizeof(*hdr); data += sizeof(*hdr); - /* jump over the padding */ - ie_len = ALIGN(ie_len, 4); - if (len < ie_len) { ath10k_err(ar, "invalid length for FW IE %d (%zu < %zu)\n", ie_id, len, ie_len); @@ -1585,6 +1582,9 @@ int ath10k_core_fetch_firmware_api_n(struct ath10k *ar, const char *name, break; } + /* jump over the padding */ + ie_len = ALIGN(ie_len, 4); + len -= ie_len; data += ie_len; } From e3ac6c0737e2b17bf11210e3fd66565e9b818b87 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Feb 2018 20:55:22 -0800 Subject: [PATCH 48/82] nfp: bpf: require ETH table Upcoming changes will require all netdevs supporting TC offloads to have a full struct nfp_port. Require those for BPF offload. The operation without management FW reporting information about Ethernet ports is something we only support for very old and very basic NIC firmwares anyway. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Tested-by: Pieter Jansen van Vuuren Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 322027792fe8..61898dda11cf 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -35,6 +35,7 @@ #include "../nfpcore/nfp_cpp.h" #include "../nfpcore/nfp_nffw.h" +#include "../nfpcore/nfp_nsp.h" #include "../nfp_app.h" #include "../nfp_main.h" #include "../nfp_net.h" @@ -87,9 +88,20 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, struct nfp_net *nn) static int nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id) { + struct nfp_pf *pf = app->pf; struct nfp_bpf_vnic *bv; int err; + if (!pf->eth_tbl) { + nfp_err(pf->cpp, "No ETH table\n"); + return -EINVAL; + } + if (pf->max_data_vnics != pf->eth_tbl->count) { + nfp_err(pf->cpp, "ETH entries don't match vNICs (%d vs %d)\n", + pf->max_data_vnics, pf->eth_tbl->count); + return -EINVAL; + } + bv = kzalloc(sizeof(*bv), GFP_KERNEL); if (!bv) return -ENOMEM; From 0b9de4ca853b6ba2c92ff0b4602281001b166639 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Feb 2018 20:55:23 -0800 Subject: [PATCH 49/82] nfp: don't advertise hw-tc-offload on non-port netdevs nfp_port is a structure which represents an ASIC port, both PCIe vNIC (on a PF or a VF) or the external MAC port. vNIC netdev (struct nfp_net) and pure representor netdev (struct nfp_repr) both have a pointer to this structure. nfp_reprs always have a port associated. nfp_nets, however, only represent a device port in legacy mode, where they are considered the MAC port. In switchdev mode they are just the CPU's side of the PCIe link. By definition TC offloads only apply to device ports. Don't set the flag on vNICs without a port (i.e. in switchdev mode). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Tested-by: Pieter Jansen van Vuuren Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index c0fd351c86b1..fe77ea8b656c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3734,7 +3734,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn) netdev->features = netdev->hw_features; - if (nfp_app_has_tc(nn->app)) + if (nfp_app_has_tc(nn->app) && nn->port) netdev->hw_features |= NETIF_F_HW_TC; /* Advertise but disable TSO by default. */ From d692403e5cf8008f31f5664a6f3ce3e65d54f458 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Feb 2018 20:55:24 -0800 Subject: [PATCH 50/82] nfp: forbid disabling hw-tc-offload on representors while offload active All netdevs which can accept TC offloads must implement .ndo_set_features(). nfp_reprs currently do not do that, which means hw-tc-offload can be turned on and off even when offloads are active. Whether the offloads are active is really a question to nfp_ports, so remove the per-app tc_busy callback indirection thing, and simply count the number of offloaded items in nfp_port structure. Fixes: 8a2768732a4d ("nfp: provide infrastructure for offloading flower based TC filters") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Tested-by: Pieter Jansen van Vuuren Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 9 +-------- .../ethernet/netronome/nfp/flower/offload.c | 4 ++++ drivers/net/ethernet/netronome/nfp/nfp_app.h | 9 --------- .../ethernet/netronome/nfp/nfp_net_common.c | 7 +++---- .../net/ethernet/netronome/nfp/nfp_net_repr.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_port.c | 18 ++++++++++++++++++ drivers/net/ethernet/netronome/nfp/nfp_port.h | 6 ++++++ 7 files changed, 33 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 61898dda11cf..34e98aa6b956 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -182,6 +182,7 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type, return err; bv->tc_prog = cls_bpf->prog; + nn->port->tc_offload_cnt = !!bv->tc_prog; return 0; } @@ -219,13 +220,6 @@ static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev, } } -static bool nfp_bpf_tc_busy(struct nfp_app *app, struct nfp_net *nn) -{ - struct nfp_bpf_vnic *bv = nn->app_priv; - - return !!bv->tc_prog; -} - static int nfp_bpf_change_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu) { @@ -429,7 +423,6 @@ const struct nfp_app_type app_bpf = { .ctrl_msg_rx = nfp_bpf_ctrl_msg_rx, .setup_tc = nfp_bpf_setup_tc, - .tc_busy = nfp_bpf_tc_busy, .bpf = nfp_ndo_bpf, .xdp_offload = nfp_bpf_xdp_offload, }; diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 08c4c6dc5f7f..eb5c13dea8f5 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -349,6 +349,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, struct tc_cls_flower_offload *flow, bool egress) { enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE; + struct nfp_port *port = nfp_port_from_netdev(netdev); struct nfp_flower_priv *priv = app->priv; struct nfp_fl_payload *flow_pay; struct nfp_fl_key_ls *key_layer; @@ -390,6 +391,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev, INIT_HLIST_NODE(&flow_pay->link); flow_pay->tc_flower_cookie = flow->cookie; hash_add_rcu(priv->flow_table, &flow_pay->link, flow->cookie); + port->tc_offload_cnt++; /* Deallocate flow payload when flower rule has been destroyed. */ kfree(key_layer); @@ -421,6 +423,7 @@ static int nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev, struct tc_cls_flower_offload *flow) { + struct nfp_port *port = nfp_port_from_netdev(netdev); struct nfp_fl_payload *nfp_flow; int err; @@ -442,6 +445,7 @@ nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev, err_free_flow: hash_del_rcu(&nfp_flow->link); + port->tc_offload_cnt--; kfree(nfp_flow->action_data); kfree(nfp_flow->mask_data); kfree(nfp_flow->unmasked_data); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index 437964afa8ee..20546ae67909 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -92,7 +92,6 @@ extern const struct nfp_app_type app_flower; * @stop: stop application logic * @ctrl_msg_rx: control message handler * @setup_tc: setup TC ndo - * @tc_busy: TC HW offload busy (rules loaded) * @bpf: BPF ndo offload-related calls * @xdp_offload: offload an XDP program * @eswitch_mode_get: get SR-IOV eswitch mode @@ -135,7 +134,6 @@ struct nfp_app_type { int (*setup_tc)(struct nfp_app *app, struct net_device *netdev, enum tc_setup_type type, void *type_data); - bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn); int (*bpf)(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *xdp); int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn, @@ -301,13 +299,6 @@ static inline bool nfp_app_has_tc(struct nfp_app *app) return app && app->type->setup_tc; } -static inline bool nfp_app_tc_busy(struct nfp_app *app, struct nfp_net *nn) -{ - if (!app || !app->type->tc_busy) - return false; - return app->type->tc_busy(app, nn); -} - static inline int nfp_app_setup_tc(struct nfp_app *app, struct net_device *netdev, enum tc_setup_type type, void *type_data) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index fe77ea8b656c..19e989239af7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3210,10 +3210,9 @@ static int nfp_net_set_features(struct net_device *netdev, new_ctrl &= ~NFP_NET_CFG_CTRL_GATHER; } - if (changed & NETIF_F_HW_TC && nfp_app_tc_busy(nn->app, nn)) { - nn_err(nn, "Cannot disable HW TC offload while in use\n"); - return -EBUSY; - } + err = nfp_port_set_features(netdev, features); + if (err) + return err; nn_dbg(nn, "Feature change 0x%llx -> 0x%llx (changed=0x%llx)\n", netdev->features, features, changed); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index f67da6bde9da..619570524d2a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -265,6 +265,7 @@ const struct net_device_ops nfp_repr_netdev_ops = { .ndo_set_vf_spoofchk = nfp_app_set_vf_spoofchk, .ndo_get_vf_config = nfp_app_get_vf_config, .ndo_set_vf_link_state = nfp_app_set_vf_link_state, + .ndo_set_features = nfp_port_set_features, }; static void nfp_repr_clean(struct nfp_repr *repr) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c index 34a6e035fe9a..7bd8be5c833b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c @@ -32,6 +32,7 @@ */ #include +#include #include #include "nfpcore/nfp_cpp.h" @@ -100,6 +101,23 @@ int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, return nfp_app_setup_tc(port->app, netdev, type, type_data); } +int nfp_port_set_features(struct net_device *netdev, netdev_features_t features) +{ + struct nfp_port *port; + + port = nfp_port_from_netdev(netdev); + if (!port) + return 0; + + if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) && + port->tc_offload_cnt) { + netdev_err(netdev, "Cannot disable HW TC offload while offloads active\n"); + return -EBUSY; + } + + return 0; +} + struct nfp_port * nfp_port_from_id(struct nfp_pf *pf, enum nfp_port_type type, unsigned int id) { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index 21bd4aa32646..fa7e669a969c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -72,6 +72,8 @@ enum nfp_port_flags { * @netdev: backpointer to associated netdev * @type: what port type does the entity represent * @flags: port flags + * @tc_offload_cnt: number of active TC offloads, how offloads are counted + * is not defined, use as a boolean * @app: backpointer to the app structure * @dl_port: devlink port structure * @eth_id: for %NFP_PORT_PHYS_PORT port ID in NFP enumeration scheme @@ -87,6 +89,7 @@ struct nfp_port { enum nfp_port_type type; unsigned long flags; + unsigned long tc_offload_cnt; struct nfp_app *app; @@ -121,6 +124,9 @@ static inline bool nfp_port_is_vnic(const struct nfp_port *port) return port->type == NFP_PORT_PF_PORT || port->type == NFP_PORT_VF_PORT; } +int +nfp_port_set_features(struct net_device *netdev, netdev_features_t features); + struct nfp_port *nfp_port_from_netdev(struct net_device *netdev); struct nfp_port * nfp_port_from_id(struct nfp_pf *pf, enum nfp_port_type type, unsigned int id); From 0d592e52fbbda26155b686dd93878cf774a3ab0e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Feb 2018 20:55:25 -0800 Subject: [PATCH 51/82] nfp: limit the number of TSO segments Most FWs limit the number of TSO segments a frame can produce to 64. This is for fairness and efficiency (of FW datapath) reasons. If a frame with larger number of segments is submitted the FW will drop it. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++ drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 19e989239af7..a05be0ab2713 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3750,6 +3750,8 @@ static void nfp_net_netdev_init(struct nfp_net *nn) netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = nn->max_mtu; + netdev->gso_max_segs = NFP_NET_LSO_MAX_SEGS; + netif_carrier_off(netdev); nfp_net_set_ethtool_ops(netdev); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index eeecef2caac6..4499a7333078 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -59,9 +59,12 @@ #define NFP_NET_RX_OFFSET 32 /** - * Maximum header size supported for LSO frames + * LSO parameters + * %NFP_NET_LSO_MAX_HDR_SZ: Maximum header size supported for LSO frames + * %NFP_NET_LSO_MAX_SEGS: Maximum number of segments LSO frame can produce */ #define NFP_NET_LSO_MAX_HDR_SZ 255 +#define NFP_NET_LSO_MAX_SEGS 64 /** * Prepend field types From 1a5e8e35000577cb9100d22daa8b5ebcfa2be9b2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Feb 2018 20:55:26 -0800 Subject: [PATCH 52/82] nfp: populate MODULE_VERSION DKMS and similar out-of-tree module replacement services use module version to make sure the out-of-tree software is not older than the module shipped with the kernel. We use the kernel version in ethtool -i output, put it into MODULE_VERSION as well. Reported-by: Jan Gutter Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index cc570bb6563c..ab301d56430b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -649,3 +649,4 @@ MODULE_FIRMWARE("netronome/nic_AMDA0099-0001_2x25.nffw"); MODULE_AUTHOR("Netronome Systems "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("The Netronome Flow Processor (NFP) driver."); +MODULE_VERSION(UTS_RELEASE); From 8c2f826dc36314059ac146c78d3bf8056b626446 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Feb 2018 15:59:07 +0000 Subject: [PATCH 53/82] rxrpc: Don't put crypto buffers on the stack Don't put buffers of data to be handed to crypto on the stack as this may cause an assertion failure in the kernel (see below). Fix this by using an kmalloc'd buffer instead. kernel BUG at ./include/linux/scatterlist.h:147! ... RIP: 0010:rxkad_encrypt_response.isra.6+0x191/0x1b0 [rxrpc] RSP: 0018:ffffbe2fc06cfca8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff989277d59900 RCX: 0000000000000028 RDX: 0000259dc06cfd88 RSI: 0000000000000025 RDI: ffffbe30406cfd88 RBP: ffffbe2fc06cfd60 R08: ffffbe2fc06cfd08 R09: ffffbe2fc06cfd08 R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff7c5f80d9f95 R13: ffffbe2fc06cfd88 R14: ffff98927a3f7aa0 R15: ffffbe2fc06cfd08 FS: 0000000000000000(0000) GS:ffff98927fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055b1ff28f0f8 CR3: 000000001b412003 CR4: 00000000003606f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rxkad_respond_to_challenge+0x297/0x330 [rxrpc] rxrpc_process_connection+0xd1/0x690 [rxrpc] ? process_one_work+0x1c3/0x680 ? __lock_is_held+0x59/0xa0 process_one_work+0x249/0x680 worker_thread+0x3a/0x390 ? process_one_work+0x680/0x680 kthread+0x121/0x140 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x3a/0x50 Reported-by: Jonathan Billings Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Jonathan Billings Signed-off-by: David S. Miller --- net/rxrpc/conn_event.c | 1 + net/rxrpc/rxkad.c | 90 +++++++++++++++++++++++------------------- 2 files changed, 51 insertions(+), 40 deletions(-) diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 4ca11be6be3c..b1dfae107431 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -460,6 +460,7 @@ void rxrpc_process_connection(struct work_struct *work) case -EKEYEXPIRED: case -EKEYREJECTED: goto protocol_error; + case -ENOMEM: case -EAGAIN: goto requeue_and_leave; case -ECONNABORTED: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index c38b3a1de56c..77cb23c7bd0a 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -773,8 +773,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, { const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response resp - __attribute__((aligned(8))); /* must be aligned for crypto */ + struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); const char *eproto; u32 version, nonce, min_level, abort_code; @@ -818,26 +817,29 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, token = conn->params.key->payload.data[0]; /* build the response packet */ - memset(&resp, 0, sizeof(resp)); + resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); + if (!resp) + return -ENOMEM; - resp.version = htonl(RXKAD_VERSION); - resp.encrypted.epoch = htonl(conn->proto.epoch); - resp.encrypted.cid = htonl(conn->proto.cid); - resp.encrypted.securityIndex = htonl(conn->security_ix); - resp.encrypted.inc_nonce = htonl(nonce + 1); - resp.encrypted.level = htonl(conn->params.security_level); - resp.kvno = htonl(token->kad->kvno); - resp.ticket_len = htonl(token->kad->ticket_len); - - resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); - resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + resp->version = htonl(RXKAD_VERSION); + resp->encrypted.epoch = htonl(conn->proto.epoch); + resp->encrypted.cid = htonl(conn->proto.cid); + resp->encrypted.securityIndex = htonl(conn->security_ix); + resp->encrypted.inc_nonce = htonl(nonce + 1); + resp->encrypted.level = htonl(conn->params.security_level); + resp->kvno = htonl(token->kad->kvno); + resp->ticket_len = htonl(token->kad->ticket_len); + resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); /* calculate the response checksum and then do the encryption */ - rxkad_calc_response_checksum(&resp); - rxkad_encrypt_response(conn, &resp, token->kad); - return rxkad_send_response(conn, &sp->hdr, &resp, token->kad); + rxkad_calc_response_checksum(resp); + rxkad_encrypt_response(conn, resp, token->kad); + ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); + kfree(resp); + return ret; protocol_error: trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); @@ -1048,8 +1050,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) { - struct rxkad_response response - __attribute__((aligned(8))); /* must be aligned for crypto */ + struct rxkad_response *response; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; const char *eproto; @@ -1061,17 +1062,22 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + ret = -ENOMEM; + response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); + if (!response) + goto temporary_error; + eproto = tracepoint_string("rxkad_rsp_short"); abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &response, sizeof(response)) < 0) + response, sizeof(*response)) < 0) goto protocol_error; - if (!pskb_pull(skb, sizeof(response))) + if (!pskb_pull(skb, sizeof(*response))) BUG(); - version = ntohl(response.version); - ticket_len = ntohl(response.ticket_len); - kvno = ntohl(response.kvno); + version = ntohl(response->version); + ticket_len = ntohl(response->ticket_len); + kvno = ntohl(response->kvno); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", sp->hdr.serial, version, kvno, ticket_len); @@ -1105,31 +1111,31 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key, &expiry, _abort_code); if (ret < 0) - goto temporary_error_free; + goto temporary_error_free_resp; /* use the session key from inside the ticket to decrypt the * response */ - rxkad_decrypt_response(conn, &response, &session_key); + rxkad_decrypt_response(conn, response, &session_key); eproto = tracepoint_string("rxkad_rsp_param"); abort_code = RXKADSEALEDINCON; - if (ntohl(response.encrypted.epoch) != conn->proto.epoch) + if (ntohl(response->encrypted.epoch) != conn->proto.epoch) goto protocol_error_free; - if (ntohl(response.encrypted.cid) != conn->proto.cid) + if (ntohl(response->encrypted.cid) != conn->proto.cid) goto protocol_error_free; - if (ntohl(response.encrypted.securityIndex) != conn->security_ix) + if (ntohl(response->encrypted.securityIndex) != conn->security_ix) goto protocol_error_free; - csum = response.encrypted.checksum; - response.encrypted.checksum = 0; - rxkad_calc_response_checksum(&response); + csum = response->encrypted.checksum; + response->encrypted.checksum = 0; + rxkad_calc_response_checksum(response); eproto = tracepoint_string("rxkad_rsp_csum"); - if (response.encrypted.checksum != csum) + if (response->encrypted.checksum != csum) goto protocol_error_free; spin_lock(&conn->channel_lock); for (i = 0; i < RXRPC_MAXCALLS; i++) { struct rxrpc_call *call; - u32 call_id = ntohl(response.encrypted.call_id[i]); + u32 call_id = ntohl(response->encrypted.call_id[i]); eproto = tracepoint_string("rxkad_rsp_callid"); if (call_id > INT_MAX) @@ -1153,12 +1159,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, eproto = tracepoint_string("rxkad_rsp_seq"); abort_code = RXKADOUTOFSEQUENCE; - if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1) + if (ntohl(response->encrypted.inc_nonce) != conn->security_nonce + 1) goto protocol_error_free; eproto = tracepoint_string("rxkad_rsp_level"); abort_code = RXKADLEVELFAIL; - level = ntohl(response.encrypted.level); + level = ntohl(response->encrypted.level); if (level > RXRPC_SECURITY_ENCRYPT) goto protocol_error_free; conn->params.security_level = level; @@ -1168,9 +1174,10 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, * as for a client connection */ ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); if (ret < 0) - goto temporary_error_free; + goto temporary_error_free_ticket; kfree(ticket); + kfree(response); _leave(" = 0"); return 0; @@ -1179,12 +1186,15 @@ protocol_error_unlock: protocol_error_free: kfree(ticket); protocol_error: + kfree(response); trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); *_abort_code = abort_code; return -EPROTO; -temporary_error_free: +temporary_error_free_ticket: kfree(ticket); +temporary_error_free_resp: + kfree(response); temporary_error: /* Ignore the response packet if we got a temporary error such as * ENOMEM. We just want to send the challenge again. Note that we From cb9f7a9a5c96a773bbc9c70660dc600cfff82f82 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 6 Feb 2018 14:48:32 +0100 Subject: [PATCH 54/82] netlink: ensure to loop over all netns in genlmsg_multicast_allns() Nowadays, nlmsg_multicast() returns only 0 or -ESRCH but this was not the case when commit 134e63756d5f was pushed. However, there was no reason to stop the loop if a netns does not have listeners. Returns -ESRCH only if there was no listeners in all netns. To avoid having the same problem in the future, I didn't take the assumption that nlmsg_multicast() returns only 0 or -ESRCH. Fixes: 134e63756d5f ("genetlink: make netns aware") CC: Johannes Berg Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d444daf1ac04..6f02499ef007 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1081,6 +1081,7 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, { struct sk_buff *tmp; struct net *net, *prev = NULL; + bool delivered = false; int err; for_each_net_rcu(net) { @@ -1092,14 +1093,21 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, } err = nlmsg_multicast(prev->genl_sock, tmp, portid, group, flags); - if (err) + if (!err) + delivered = true; + else if (err != -ESRCH) goto error; } prev = net; } - return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); + err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); + if (!err) + delivered = true; + else if (err != -ESRCH) + goto error; + return delivered ? 0 : -ESRCH; error: kfree_skb(skb); return err; From 762c330d670e3d4b795cf7a8d761866fdd1eef49 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 7 Feb 2018 17:14:46 +0800 Subject: [PATCH 55/82] tuntap: add missing xdp flush When using devmap to redirect packets between interfaces, xdp_do_flush() is usually a must to flush any batched packets. Unfortunately this is missed in current tuntap implementation. Unlike most hardware driver which did XDP inside NAPI loop and call xdp_do_flush() at then end of each round of poll. TAP did it in the context of process e.g tun_get_user(). So fix this by count the pending redirected packets and flush when it exceeds NAPI_POLL_WEIGHT or MSG_MORE was cleared by sendmsg() caller. With this fix, xdp_redirect_map works again between two TAPs. Fixes: 761876c857cb ("tap: XDP support") Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0dc66e4fbb2c..17e496b88f81 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -181,6 +181,7 @@ struct tun_file { struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; + int xdp_pending_pkts; }; struct tun_flow_entry { @@ -1665,6 +1666,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, case XDP_REDIRECT: get_page(alloc_frag->page); alloc_frag->offset += buflen; + ++tfile->xdp_pending_pkts; err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); if (err) goto err_redirect; @@ -1986,6 +1988,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK, false); + if (tfile->xdp_pending_pkts) { + tfile->xdp_pending_pkts = 0; + xdp_do_flush_map(); + } + tun_put(tun); return result; } @@ -2322,6 +2329,13 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); + + if (tfile->xdp_pending_pkts >= NAPI_POLL_WEIGHT || + !(m->msg_flags & MSG_MORE)) { + tfile->xdp_pending_pkts = 0; + xdp_do_flush_map(); + } + tun_put(tun); return ret; } @@ -3153,6 +3167,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring)); + tfile->xdp_pending_pkts = 0; return 0; } From 4ff66cae7f10b65b028dc3bdaaad9cc2989ef6ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 7 Feb 2018 13:53:20 +0100 Subject: [PATCH 56/82] rtnetlink: require unique netns identifier Since we've added support for IFLA_IF_NETNSID for RTM_{DEL,GET,SET,NEW}LINK it is possible for userspace to send us requests with three different properties to identify a target network namespace. This affects at least RTM_{NEW,SET}LINK. Each of them could potentially refer to a different network namespace which is confusing. For legacy reasons the kernel will pick the IFLA_NET_NS_PID property first and then look for the IFLA_NET_NS_FD property but there is no reason to extend this type of behavior to network namespace ids. The regression potential is quite minimal since the rtnetlink requests in question either won't allow IFLA_IF_NETNSID requests before 4.16 is out (RTM_{NEW,SET}LINK) or don't support IFLA_NET_NS_{PID,FD} (RTM_{DEL,GET}LINK) in the first place. Signed-off-by: Christian Brauner Acked-by: Jiri Benc Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 56af8e41abfc..bc290413a49d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1951,6 +1951,38 @@ static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, return net; } +/* Verify that rtnetlink requests do not pass additional properties + * potentially referring to different network namespaces. + */ +static int rtnl_ensure_unique_netns(struct nlattr *tb[], + struct netlink_ext_ack *extack, + bool netns_id_only) +{ + + if (netns_id_only) { + if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) + return 0; + + NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); + return -EOPNOTSUPP; + } + + if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) + goto invalid_attr; + + if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD])) + goto invalid_attr; + + if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID])) + goto invalid_attr; + + return 0; + +invalid_attr: + NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); + return -EINVAL; +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -2553,6 +2585,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + err = rtnl_ensure_unique_netns(tb, extack, false); + if (err < 0) + goto errout; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2649,6 +2685,10 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err < 0) + return err; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); @@ -2802,6 +2842,10 @@ replay: if (err < 0) return err; + err = rtnl_ensure_unique_netns(tb, extack, false); + if (err < 0) + return err; + if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -3045,6 +3089,10 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err < 0) + return err; + if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); From 583133b35ecaea84590e92e2732d1a472747ca7d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 7 Feb 2018 10:17:29 -0600 Subject: [PATCH 57/82] atm: he: use 64-bit arithmetic instead of 32-bit Add suffix ULL to constants 272, 204, 136 and 68 in order to give the compiler complete information about the proper arithmetic to use. Notice that these constants are used in contexts that expect expressions of type unsigned long long (64 bits, unsigned). The following expressions are currently being evaluated using 32-bit arithmetic: 272 * mult 204 * mult 136 * mult 68 * mult Addresses-Coverity-ID: 201058 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- drivers/atm/he.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/atm/he.c b/drivers/atm/he.c index e58538c29377..29f102dcfec4 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -738,13 +738,13 @@ static int he_init_cs_block_rcm(struct he_dev *he_dev) #else /* this is pretty, but avoids _divdu3 and is mostly correct */ mult = he_dev->atm_dev->link_rate / ATM_OC3_PCR; - if (rate_cps > (272 * mult)) + if (rate_cps > (272ULL * mult)) buf = 4; - else if (rate_cps > (204 * mult)) + else if (rate_cps > (204ULL * mult)) buf = 3; - else if (rate_cps > (136 * mult)) + else if (rate_cps > (136ULL * mult)) buf = 2; - else if (rate_cps > (68 * mult)) + else if (rate_cps > (68ULL * mult)) buf = 1; else buf = 0; From ec95dffa408f0c24c0b358f3723c6ba262190965 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 7 Feb 2018 13:00:24 -0600 Subject: [PATCH 58/82] ibmvnic: queue reset when CRQ gets closed during reset While handling a driver reset we get a H_CLOSED return trying to send a CRQ event. When this occurs we need to queue up another reset attempt. Without doing this we see instances where the driver is left in a closed state because the reset failed and there is no further attempts to reset the driver. Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 8dabc9d9dfa6..dd4a2946e1da 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2914,8 +2914,12 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter, cpu_to_be64(u64_crq[1])); if (rc) { - if (rc == H_CLOSED) + if (rc == H_CLOSED) { dev_warn(dev, "CRQ Queue closed\n"); + if (adapter->resetting) + ibmvnic_reset(adapter, VNIC_RESET_FATAL); + } + dev_warn(dev, "Send error (rc=%d)\n", rc); } From e728789c52afccc1275cba1dd812f03abe16ea3c Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Wed, 7 Feb 2018 20:35:00 +0100 Subject: [PATCH 59/82] net: Extra '_get' in declaration of arch_get_platform_mac_address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit c7f5d105495a ("net: Add eth_platform_get_mac_address() helper."), two declarations were added: int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_get_mac_address(void); An extra '_get' was introduced in arch_get_platform_get_mac_address, remove it. Fix compile warning using W=1: CC net/ethernet/eth.o net/ethernet/eth.c:523:24: warning: no previous prototype for ‘arch_get_platform_mac_address’ [-Wmissing-prototypes] unsigned char * __weak arch_get_platform_mac_address(void) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ AR net/ethernet/built-in.o Signed-off-by: Mathieu Malaterre Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 263dbcad22fc..79563840c295 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -31,7 +31,7 @@ #ifdef __KERNEL__ struct device; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); -unsigned char *arch_get_platform_get_mac_address(void); +unsigned char *arch_get_platform_mac_address(void); u32 eth_get_headlen(void *data, unsigned int max_len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; From 79a8a642bf05cd0dced20621f6fef9d884124abd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Feb 2018 17:44:38 -0800 Subject: [PATCH 60/82] net: Whitelist the skbuff_head_cache "cb" field Most callers of put_cmsg() use a "sizeof(foo)" for the length argument. Within put_cmsg(), a copy_to_user() call is made with a dynamic size, as a result of the cmsg header calculations. This means that hardened usercopy will examine the copy, even though it was technically a fixed size and should be implicitly whitelisted. All the put_cmsg() calls being built from values in skbuff_head_cache are coming out of the protocol-defined "cb" field, so whitelist this field entirely instead of creating per-use bounce buffers, for which there are concerns about performance. Original report was: Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'skbuff_head_cache' (offset 64, size 16)! WARNING: CPU: 0 PID: 3663 at mm/usercopy.c:81 usercopy_warn+0xdb/0x100 mm/usercopy.c:76 ... __check_heap_object+0x89/0xc0 mm/slab.c:4426 check_heap_object mm/usercopy.c:236 [inline] __check_object_size+0x272/0x530 mm/usercopy.c:259 check_object_size include/linux/thread_info.h:112 [inline] check_copy_size include/linux/thread_info.h:143 [inline] copy_to_user include/linux/uaccess.h:154 [inline] put_cmsg+0x233/0x3f0 net/core/scm.c:242 sock_recv_errqueue+0x200/0x3e0 net/core/sock.c:2913 packet_recvmsg+0xb2e/0x17a0 net/packet/af_packet.c:3296 sock_recvmsg_nosec net/socket.c:803 [inline] sock_recvmsg+0xc9/0x110 net/socket.c:810 ___sys_recvmsg+0x2a4/0x640 net/socket.c:2179 __sys_recvmmsg+0x2a9/0xaf0 net/socket.c:2287 SYSC_recvmmsg net/socket.c:2368 [inline] SyS_recvmmsg+0xc4/0x160 net/socket.c:2352 entry_SYSCALL_64_fastpath+0x29/0xa0 Reported-by: syzbot+e2d6cfb305e9f3911dea@syzkaller.appspotmail.com Fixes: 6d07d1cd300f ("usercopy: Restrict non-usercopy caches to size 0") Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8c61c27c1b28..09bd89c90a71 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3894,10 +3894,12 @@ EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { - skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + offsetof(struct sk_buff, cb), + sizeof_field(struct sk_buff, cb), NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), From ebeeb1ad9b8adcc37c2ec21a96f39e9d35199b46 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Sat, 3 Feb 2018 04:26:51 -0800 Subject: [PATCH 61/82] rds: tcp: use rds_destroy_pending() to synchronize netns/module teardown and rds connection/workq management An rds_connection can get added during netns deletion between lines 528 and 529 of 506 static void rds_tcp_kill_sock(struct net *net) : /* code to pull out all the rds_connections that should be destroyed */ : 528 spin_unlock_irq(&rds_tcp_conn_lock); 529 list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) 530 rds_conn_destroy(tc->t_cpath->cp_conn); Such an rds_connection would miss out the rds_conn_destroy() loop (that cancels all pending work) and (if it was scheduled after netns deletion) could trigger the use-after-free. A similar race-window exists for the module unload path in rds_tcp_exit -> rds_tcp_destroy_conns Concurrency with netns deletion (rds_tcp_kill_sock()) must be handled by checking check_net() before enqueuing new work or adding new connections. Concurrency with module-unload is handled by maintaining a module specific flag that is set at the start of the module exit function, and must be checked before enqueuing new work or adding new connections. This commit refactors existing RDS_DESTROY_PENDING checks added by commit 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with connection teardown") and consolidates all the concurrency checks listed above into the function rds_destroy_pending(). Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/cong.c | 2 +- net/rds/connection.c | 15 +++++++++------ net/rds/ib.c | 17 +++++++++++++++++ net/rds/ib_cm.c | 1 + net/rds/rds.h | 7 +++++++ net/rds/send.c | 10 +++++----- net/rds/tcp.c | 42 ++++++++++++++++++++++++++++++------------ net/rds/tcp_connect.c | 2 +- net/rds/tcp_recv.c | 2 +- net/rds/tcp_send.c | 2 +- net/rds/threads.c | 6 +++--- 11 files changed, 76 insertions(+), 30 deletions(-) diff --git a/net/rds/cong.c b/net/rds/cong.c index 8d19fd25dce3..63da9d2f142d 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -223,7 +223,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) rcu_read_lock(); if (!test_and_set_bit(0, &conn->c_map_queued) && - !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + !rds_destroy_pending(cp->cp_conn)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): diff --git a/net/rds/connection.c b/net/rds/connection.c index b10c0ef36d8d..94e190febfdd 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -220,8 +220,13 @@ static struct rds_connection *__rds_conn_create(struct net *net, is_outgoing); conn->c_path[i].cp_index = i; } - ret = trans->conn_alloc(conn, gfp); + rcu_read_lock(); + if (rds_destroy_pending(conn)) + ret = -ENETDOWN; + else + ret = trans->conn_alloc(conn, gfp); if (ret) { + rcu_read_unlock(); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); @@ -283,6 +288,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, } } spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); out: return conn; @@ -382,13 +388,10 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; - set_bit(RDS_DESTROY_PENDING, &cp->cp_flags); - if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ - synchronize_rcu(); cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); @@ -691,7 +694,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); - if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } @@ -714,7 +717,7 @@ EXPORT_SYMBOL_GPL(rds_conn_drop); void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { rcu_read_lock(); - if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + if (rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } diff --git a/net/rds/ib.c b/net/rds/ib.c index ff0c98096af1..50a88f3e7e39 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -48,6 +48,7 @@ static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; +static atomic_t rds_ib_unloading; module_param(rds_ib_mr_1m_pool_size, int, 0444); MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); @@ -378,8 +379,23 @@ static void rds_ib_unregister_client(void) flush_workqueue(rds_wq); } +static void rds_ib_set_unloading(void) +{ + atomic_set(&rds_ib_unloading, 1); +} + +static bool rds_ib_is_unloading(struct rds_connection *conn) +{ + struct rds_conn_path *cp = &conn->c_path[0]; + + return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) || + atomic_read(&rds_ib_unloading) != 0); +} + void rds_ib_exit(void) { + rds_ib_set_unloading(); + synchronize_rcu(); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); @@ -413,6 +429,7 @@ struct rds_transport rds_ib_transport = { .flush_mrs = rds_ib_flush_mrs, .t_owner = THIS_MODULE, .t_name = "infiniband", + .t_unloading = rds_ib_is_unloading, .t_type = RDS_TRANS_IB }; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 80fb6f63e768..eea1d8611b20 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -117,6 +117,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even &conn->c_laddr, &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version)); + set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags); rds_conn_destroy(conn); return; } else { diff --git a/net/rds/rds.h b/net/rds/rds.h index 374ae83b60d4..7301b9b01890 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -518,6 +518,7 @@ struct rds_transport { void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); + bool (*t_unloading)(struct rds_connection *conn); }; struct rds_sock { @@ -862,6 +863,12 @@ static inline void rds_mr_put(struct rds_mr *mr) __rds_put_mr_final(mr); } +static inline bool rds_destroy_pending(struct rds_connection *conn) +{ + return !check_net(rds_conn_net(conn)) || + (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); +} + /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ diff --git a/net/rds/send.c b/net/rds/send.c index d3e32d1f3c7d..b1b0022b8370 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -162,7 +162,7 @@ restart: goto out; } - if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + if (rds_destroy_pending(cp->cp_conn)) { release_in_xmit(cp); ret = -ENETUNREACH; /* dont requeue send work */ goto out; @@ -444,7 +444,7 @@ over_batch: if (batch_count < send_batch_count) goto restart; rcu_read_lock(); - if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + if (rds_destroy_pending(cp->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cp->cp_send_w, 1); @@ -1162,7 +1162,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) else cpath = &conn->c_path[0]; - if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) { + if (rds_destroy_pending(conn)) { ret = -EAGAIN; goto out; } @@ -1209,7 +1209,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (ret == -ENOMEM || ret == -EAGAIN) { ret = 0; rcu_read_lock(); - if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) + if (rds_destroy_pending(cpath->cp_conn)) ret = -ENETUNREACH; else queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); @@ -1295,7 +1295,7 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, /* schedule the send work on rds_wq */ rcu_read_lock(); - if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_send_w, 1); rcu_read_unlock(); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9920d2f84eff..44c4652721af 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -49,6 +49,7 @@ static unsigned int rds_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); static LIST_HEAD(rds_tcp_conn_list); +static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; @@ -274,14 +275,13 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; - unsigned long flags; rdsdebug("freeing tc %p\n", tc); - spin_lock_irqsave(&rds_tcp_conn_lock, flags); + spin_lock_bh(&rds_tcp_conn_lock); if (!tc->t_tcp_node_detached) list_del(&tc->t_tcp_node); - spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); + spin_unlock_bh(&rds_tcp_conn_lock); kmem_cache_free(rds_tcp_conn_slab, tc); } @@ -296,7 +296,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); if (!tc) { ret = -ENOMEM; - break; + goto fail; } mutex_init(&tc->t_conn_path_lock); tc->t_sock = NULL; @@ -306,14 +306,19 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) conn->c_path[i].cp_transport_data = tc; tc->t_cpath = &conn->c_path[i]; + tc->t_tcp_node_detached = true; - spin_lock_irq(&rds_tcp_conn_lock); - tc->t_tcp_node_detached = false; - list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); - spin_unlock_irq(&rds_tcp_conn_lock); rdsdebug("rds_conn_path [%d] tc %p\n", i, conn->c_path[i].cp_transport_data); } + spin_lock_bh(&rds_tcp_conn_lock); + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + tc = conn->c_path[i].cp_transport_data; + tc->t_tcp_node_detached = false; + list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); + } + spin_unlock_bh(&rds_tcp_conn_lock); +fail: if (ret) { for (j = 0; j < i; j++) rds_tcp_conn_free(conn->c_path[j].cp_transport_data); @@ -332,6 +337,16 @@ static bool list_has_conn(struct list_head *list, struct rds_connection *conn) return false; } +static void rds_tcp_set_unloading(void) +{ + atomic_set(&rds_tcp_unloading, 1); +} + +static bool rds_tcp_is_unloading(struct rds_connection *conn) +{ + return atomic_read(&rds_tcp_unloading) != 0; +} + static void rds_tcp_destroy_conns(void) { struct rds_tcp_connection *tc, *_tc; @@ -370,6 +385,7 @@ struct rds_transport rds_tcp_transport = { .t_type = RDS_TRANS_TCP, .t_prefer_loopback = 1, .t_mp_capable = 1, + .t_unloading = rds_tcp_is_unloading, }; static unsigned int rds_tcp_netid; @@ -513,7 +529,7 @@ static void rds_tcp_kill_sock(struct net *net) rtn->rds_tcp_listen_sock = NULL; rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); - spin_lock_irq(&rds_tcp_conn_lock); + spin_lock_bh(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); @@ -526,7 +542,7 @@ static void rds_tcp_kill_sock(struct net *net) tc->t_tcp_node_detached = true; } } - spin_unlock_irq(&rds_tcp_conn_lock); + spin_unlock_bh(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); } @@ -574,7 +590,7 @@ static void rds_tcp_sysctl_reset(struct net *net) { struct rds_tcp_connection *tc, *_tc; - spin_lock_irq(&rds_tcp_conn_lock); + spin_lock_bh(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); @@ -584,7 +600,7 @@ static void rds_tcp_sysctl_reset(struct net *net) /* reconnect with new parameters */ rds_conn_path_drop(tc->t_cpath, false); } - spin_unlock_irq(&rds_tcp_conn_lock); + spin_unlock_bh(&rds_tcp_conn_lock); } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, @@ -607,6 +623,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, static void rds_tcp_exit(void) { + rds_tcp_set_unloading(); + synchronize_rcu(); rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); unregister_pernet_subsys(&rds_tcp_net_ops); if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 534c67aeb20f..d999e7075645 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) cp->cp_conn, tc, sock); if (sock) { - if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + if (rds_destroy_pending(cp->cp_conn)) rds_tcp_set_linger(sock); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index dd707b9e73e5..b9fbd2ee74ef 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -323,7 +323,7 @@ void rds_tcp_data_ready(struct sock *sk) if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { rcu_read_lock(); - if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); rcu_read_unlock(); } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 16f65744d984..7df869d37afd 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -204,7 +204,7 @@ void rds_tcp_write_space(struct sock *sk) rcu_read_lock(); if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && - !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + !rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_send_w, 0); rcu_read_unlock(); diff --git a/net/rds/threads.c b/net/rds/threads.c index eb76db1360b0..c52861d77a59 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -88,7 +88,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) cp->cp_reconnect_jiffies = 0; set_bit(0, &cp->cp_conn->c_map_queued); rcu_read_lock(); - if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) { + if (!rds_destroy_pending(cp->cp_conn)) { queue_delayed_work(rds_wq, &cp->cp_send_w, 0); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); } @@ -138,7 +138,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) if (cp->cp_reconnect_jiffies == 0) { cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; rcu_read_lock(); - if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); return; @@ -149,7 +149,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); - if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) + if (!rds_destroy_pending(cp->cp_conn)) queue_delayed_work(rds_wq, &cp->cp_conn_w, rand % cp->cp_reconnect_jiffies); rcu_read_unlock(); From 3968523f855050b8195134da951b87c20bd66130 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Feb 2018 22:34:24 -0800 Subject: [PATCH 62/82] mpls, nospec: Sanitize array index in mpls_label_ok() mpls_label_ok() validates that the 'platform_label' array index from a userspace netlink message payload is valid. Under speculation the mpls_label_ok() result may not resolve in the CPU pipeline until after the index is used to access an array element. Sanitize the index to zero to prevent userspace-controlled arbitrary out-of-bounds speculation, a precursor for a speculative execution side channel vulnerability. Cc: Cc: "David S. Miller" Cc: Eric W. Biederman Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 5dce8336d33f..e545a3c9365f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -935,24 +936,27 @@ errout: return err; } -static bool mpls_label_ok(struct net *net, unsigned int index, +static bool mpls_label_ok(struct net *net, unsigned int *index, struct netlink_ext_ack *extack) { + bool is_ok = true; + /* Reserved labels may not be set */ - if (index < MPLS_LABEL_FIRST_UNRESERVED) { + if (*index < MPLS_LABEL_FIRST_UNRESERVED) { NL_SET_ERR_MSG(extack, "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); - return false; + is_ok = false; } /* The full 20 bit range may not be supported. */ - if (index >= net->mpls.platform_labels) { + if (is_ok && *index >= net->mpls.platform_labels) { NL_SET_ERR_MSG(extack, "Label >= configured maximum in platform_labels"); - return false; + is_ok = false; } - return true; + *index = array_index_nospec(*index, net->mpls.platform_labels); + return is_ok; } static int mpls_route_add(struct mpls_route_config *cfg, @@ -975,7 +979,7 @@ static int mpls_route_add(struct mpls_route_config *cfg, index = find_free_label(net); } - if (!mpls_label_ok(net, index, extack)) + if (!mpls_label_ok(net, &index, extack)) goto errout; /* Append makes no sense with mpls */ @@ -1052,7 +1056,7 @@ static int mpls_route_del(struct mpls_route_config *cfg, index = cfg->rc_label; - if (!mpls_label_ok(net, index, extack)) + if (!mpls_label_ok(net, &index, extack)) goto errout; mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); @@ -1810,7 +1814,7 @@ static int rtm_to_route_config(struct sk_buff *skb, goto errout; if (!mpls_label_ok(cfg->rc_nlinfo.nl_net, - cfg->rc_label, extack)) + &cfg->rc_label, extack)) goto errout; break; } @@ -2137,7 +2141,7 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, goto errout; } - if (!mpls_label_ok(net, in_label, extack)) { + if (!mpls_label_ok(net, &in_label, extack)) { err = -EINVAL; goto errout; } From eb53f7af6f15285e2f6ada97285395343ce9f433 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 8 Feb 2018 16:10:39 +0100 Subject: [PATCH 63/82] net/sched: cls_u32: fix cls_u32 on filter replace The following sequence is currently broken: # tc qdisc add dev foo ingress # tc filter replace dev foo protocol all ingress \ u32 match u8 0 0 action mirred egress mirror dev bar1 # tc filter replace dev foo protocol all ingress \ handle 800::800 pref 49152 \ u32 match u8 0 0 action mirred egress mirror dev bar2 Error: cls_u32: Key node flags do not match passed flags. We have an error talking to the kernel, -1 The error comes from u32_change() when comparing new and existing flags. The existing ones always contains one of TCA_CLS_FLAGS_{,NOT}_IN_HW flag depending on offloading state. These flags cannot be passed from userspace so the condition (n->flags != flags) in u32_change() always fails. Fix the condition so the flags TCA_CLS_FLAGS_NOT_IN_HW and TCA_CLS_FLAGS_IN_HW are not taken into account. Fixes: 24d3dc6d27ea ("net/sched: cls_u32: Reflect HW offload status") Signed-off-by: Ivan Vecera Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 6311a548046b..c75e68e839c7 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -955,7 +955,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - if (n->flags != flags) { + if ((n->flags ^ flags) & + ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) { NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; } From 55b3280d1e471795c08dbbe17325720a843e104c Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Thu, 8 Feb 2018 17:16:25 +0100 Subject: [PATCH 64/82] tipc: fix skb truesize/datasize ratio control In commit d618d09a68e4 ("tipc: enforce valid ratio between skb truesize and contents") we introduced a test for ensuring that the condition truesize/datasize <= 4 is true for a received buffer. Unfortunately this test has two problems. - Because of the integer arithmetics the test if (skb->truesize / buf_roundup_len(skb) > 4) will miss all ratios [4 < ratio < 5], which was not the intention. - The buffer returned by skb_copy() inherits skb->truesize of the original buffer, which doesn't help the situation at all. In this commit, we change the ratio condition and replace skb_copy() with a call to skb_copy_expand() to finally get this right. Acked-by: Jon Maloy Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 55d8ba92291d..4e1c6f6450bb 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -208,8 +208,8 @@ bool tipc_msg_validate(struct sk_buff **_skb) int msz, hsz; /* Ensure that flow control ratio condition is satisfied */ - if (unlikely(skb->truesize / buf_roundup_len(skb) > 4)) { - skb = skb_copy(skb, GFP_ATOMIC); + if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) { + skb = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC); if (!skb) return false; kfree_skb(*_skb); From 88c991a91729b402bbfbf247fdba16ac21c369ab Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 8 Feb 2018 14:21:05 -0500 Subject: [PATCH 65/82] net: thunder: change q_len's type to handle max ring size The Cavium thunder nicvf driver supports rx/tx rings of up to 65536 entries per. The number of entires are stored in the q_len member of struct q_desc_mem. The problem is that q_len being a u16, results in 65536 becoming 0. In getting pointers to descriptors in the rings, the driver uses q_len minus 1 as a mask after incrementing the pointer, in order to go back to the beginning and not go past the end of the ring. With the q_len set to 0 the mask is no longer correct and the driver does go beyond the end of the ring, causing various ills. Usually the first thing that shows up is a "NETDEV WATCHDOG: enP2p1s0f1 (nicvf): transmit queue 7 timed out" warning. This patch remedies the problem by changing q_len to a u32. Signed-off-by: Dean Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h index 7d1e4e2aaad0..ce1eed7a6d63 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h @@ -213,7 +213,7 @@ struct rx_tx_queue_stats { struct q_desc_mem { dma_addr_t dma; u64 size; - u16 q_len; + u32 q_len; dma_addr_t phys_base; void *base; void *unalign_base; From 08f5138512180a479ce6b9d23b825c9f4cd3be77 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 8 Feb 2018 21:01:48 +0100 Subject: [PATCH 66/82] net: phy: fix phy_start to consider PHY_IGNORE_INTERRUPT This condition wasn't adjusted when PHY_IGNORE_INTERRUPT (-2) was added long ago. In case of PHY_IGNORE_INTERRUPT the MAC interrupt indicates also PHY state changes and we should do what the symbol says. Fixes: 84a527a41f38 ("net: phylib: fix interrupts re-enablement in phy_start") Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index f3313a129531..e3e29c2b028b 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -822,7 +822,7 @@ void phy_start(struct phy_device *phydev) phy_resume(phydev); /* make sure interrupts are re-enabled for the PHY */ - if (phydev->irq != PHY_POLL) { + if (phy_interrupt_is_valid(phydev)) { err = phy_enable_interrupts(phydev); if (err < 0) break; From 8c88181ed452707f3815827225517b1964463b95 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 8 Feb 2018 12:48:12 +0100 Subject: [PATCH 67/82] bpf: Sync kernel ABI header with tooling header for bpf_common.h I recently fixed up a lot of commits that forgot to keep the tooling headers in sync. And then I forgot to do the same thing in commit cb5f7334d479 ("bpf: add comments to BPF ld/ldx sizes"). Let correct that before people notice ;-). Lawrence did partly fix/sync this for bpf.h in commit d6d4f60c3a09 ("bpf: add selftest for tcpbpf"). Fixes: cb5f7334d479 ("bpf: add comments to BPF ld/ldx sizes") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf_common.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/bpf_common.h b/tools/include/uapi/linux/bpf_common.h index 18be90725ab0..ee97668bdadb 100644 --- a/tools/include/uapi/linux/bpf_common.h +++ b/tools/include/uapi/linux/bpf_common.h @@ -15,9 +15,10 @@ /* ld/ldx fields */ #define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 From 077c066a6c5445423147496e6c9b9a2c2d2ee762 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 8 Feb 2018 12:48:17 +0100 Subject: [PATCH 68/82] tools/libbpf: improve the pr_debug statements to contain section numbers While debugging a bpf ELF loading issue, I needed to correlate the ELF section number with the failed relocation section reference. Thus, add section numbers/index to the pr_debug. In debug mode, also print section that were skipped. This helped me identify that a section (.eh_frame) was skipped, and this was the reason the relocation section (.rel.eh_frame) could not find that section number. The section numbers corresponds to the readelf tools Section Headers [Nr]. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c64840365433..0c794f7fc050 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -319,8 +319,8 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx, prog->section_name = strdup(section_name); if (!prog->section_name) { - pr_warning("failed to alloc name for prog under section %s\n", - section_name); + pr_warning("failed to alloc name for prog under section(%d) %s\n", + idx, section_name); goto errout; } @@ -763,29 +763,29 @@ static int bpf_object__elf_collect(struct bpf_object *obj) idx++; if (gelf_getshdr(scn, &sh) != &sh) { - pr_warning("failed to get section header from %s\n", - obj->path); + pr_warning("failed to get section(%d) header from %s\n", + idx, obj->path); err = -LIBBPF_ERRNO__FORMAT; goto out; } name = elf_strptr(elf, ep->e_shstrndx, sh.sh_name); if (!name) { - pr_warning("failed to get section name from %s\n", - obj->path); + pr_warning("failed to get section(%d) name from %s\n", + idx, obj->path); err = -LIBBPF_ERRNO__FORMAT; goto out; } data = elf_getdata(scn, 0); if (!data) { - pr_warning("failed to get section data from %s(%s)\n", - name, obj->path); + pr_warning("failed to get section(%d) data from %s(%s)\n", + idx, name, obj->path); err = -LIBBPF_ERRNO__FORMAT; goto out; } - pr_debug("section %s, size %ld, link %d, flags %lx, type=%d\n", - name, (unsigned long)data->d_size, + pr_debug("section(%d) %s, size %ld, link %d, flags %lx, type=%d\n", + idx, name, (unsigned long)data->d_size, (int)sh.sh_link, (unsigned long)sh.sh_flags, (int)sh.sh_type); @@ -840,6 +840,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) obj->efile.reloc[n].shdr = sh; obj->efile.reloc[n].data = data; } + } else { + pr_debug("skip section(%d) %s\n", idx, name); } if (err) goto out; @@ -1119,8 +1121,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) prog = bpf_object__find_prog_by_idx(obj, idx); if (!prog) { - pr_warning("relocation failed: no %d section\n", - idx); + pr_warning("relocation failed: no section(%d)\n", idx); return -LIBBPF_ERRNO__RELOC; } From 864db336c677c288d81524c30a656804785902f9 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 8 Feb 2018 12:48:22 +0100 Subject: [PATCH 69/82] selftests/bpf: add test program for loading BPF ELF files V2: Moved program into selftests/bpf from tools/libbpf This program can be used on its own for testing/debugging if a BPF ELF-object file can be loaded with libbpf (from tools/lib/bpf). If something is wrong with the ELF object, the program have a --debug mode that will display the ELF sections and especially the skipped sections. This allows for quickly identifying the problematic ELF section number, which can be corrolated with the readelf tool. The program signal error via return codes, and also have a --quiet mode, which is practical for use in scripts like selftests/bpf. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 2 +- .../testing/selftests/bpf/test_libbpf_open.c | 150 ++++++++++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/test_libbpf_open.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 566d6adc172a..3f48da0866ba 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -20,7 +20,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ - sample_map_ret0.o test_tcpbpf_kern.o + sample_map_ret0.o test_tcpbpf_kern.o test_libbpf_open TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ test_offload.py diff --git a/tools/testing/selftests/bpf/test_libbpf_open.c b/tools/testing/selftests/bpf/test_libbpf_open.c new file mode 100644 index 000000000000..8fcd1c076add --- /dev/null +++ b/tools/testing/selftests/bpf/test_libbpf_open.c @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. + */ +static const char *__doc__ = + "Libbpf test program for loading BPF ELF object files"; + +#include +#include +#include +#include +#include +#include + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"debug", no_argument, NULL, 'D' }, + {"quiet", no_argument, NULL, 'q' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n\n", __doc__); + printf(" Usage: %s (options-see-below) BPF_FILE\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +#define DEFINE_PRINT_FN(name, enabled) \ +static int libbpf_##name(const char *fmt, ...) \ +{ \ + va_list args; \ + int ret; \ + \ + va_start(args, fmt); \ + if (enabled) { \ + fprintf(stderr, "[" #name "] "); \ + ret = vfprintf(stderr, fmt, args); \ + } \ + va_end(args); \ + return ret; \ +} +DEFINE_PRINT_FN(warning, 1) +DEFINE_PRINT_FN(info, 1) +DEFINE_PRINT_FN(debug, 1) + +#define EXIT_FAIL_LIBBPF EXIT_FAILURE +#define EXIT_FAIL_OPTION 2 + +int test_walk_progs(struct bpf_object *obj, bool verbose) +{ + struct bpf_program *prog; + int cnt = 0; + + bpf_object__for_each_program(prog, obj) { + cnt++; + if (verbose) + printf("Prog (count:%d) section_name: %s\n", cnt, + bpf_program__title(prog, false)); + } + return 0; +} + +int test_walk_maps(struct bpf_object *obj, bool verbose) +{ + struct bpf_map *map; + int cnt = 0; + + bpf_map__for_each(map, obj) { + cnt++; + if (verbose) + printf("Map (count:%d) name: %s\n", cnt, + bpf_map__name(map)); + } + return 0; +} + +int test_open_file(char *filename, bool verbose) +{ + struct bpf_object *bpfobj = NULL; + long err; + + if (verbose) + printf("Open BPF ELF-file with libbpf: %s\n", filename); + + /* Load BPF ELF object file and check for errors */ + bpfobj = bpf_object__open(filename); + err = libbpf_get_error(bpfobj); + if (err) { + char err_buf[128]; + libbpf_strerror(err, err_buf, sizeof(err_buf)); + if (verbose) + printf("Unable to load eBPF objects in file '%s': %s\n", + filename, err_buf); + return EXIT_FAIL_LIBBPF; + } + test_walk_progs(bpfobj, verbose); + test_walk_maps(bpfobj, verbose); + + if (verbose) + printf("Close BPF ELF-file with libbpf: %s\n", + bpf_object__name(bpfobj)); + bpf_object__close(bpfobj); + + return 0; +} + +int main(int argc, char **argv) +{ + char filename[1024] = { 0 }; + bool verbose = 1; + int longindex = 0; + int opt; + + libbpf_set_print(libbpf_warning, libbpf_info, NULL); + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hDq", + long_options, &longindex)) != -1) { + switch (opt) { + case 'D': + libbpf_set_print(libbpf_warning, libbpf_info, + libbpf_debug); + break; + case 'q': /* Use in scripting mode */ + verbose = 0; + break; + case 'h': + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + if (optind >= argc) { + usage(argv); + printf("ERROR: Expected BPF_FILE argument after options\n"); + return EXIT_FAIL_OPTION; + } + snprintf(filename, sizeof(filename), "%s", argv[optind]); + + return test_open_file(filename, verbose); +} From f09b2e382e9a7053e3ae6f2fb6535efd5760cf5d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 8 Feb 2018 12:48:27 +0100 Subject: [PATCH 70/82] selftests/bpf: add selftest that use test_libbpf_open This script test_libbpf.sh will be part of the 'make run_tests' invocation, but can also be invoked manually in this directory, and a verbose mode can be enabled via setting the environment variable $VERBOSE like: $ VERBOSE=yes ./test_libbpf.sh The script contains some tests that are commented out, as they currently fail. They are reminders about what we need to improve for the libbpf loader library. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 14 ++++++- tools/testing/selftests/bpf/test_libbpf.sh | 49 ++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/bpf/test_libbpf.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3f48da0866ba..5c43c187f27c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -13,6 +13,7 @@ endif CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include LDLIBS += -lcap -lelf -lrt -lpthread +# Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user @@ -20,17 +21,26 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ - sample_map_ret0.o test_tcpbpf_kern.o test_libbpf_open + sample_map_ret0.o test_tcpbpf_kern.o -TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ +# Order correspond to 'make run_tests' order +TEST_PROGS := test_kmod.sh \ + test_libbpf.sh \ + test_xdp_redirect.sh \ + test_xdp_meta.sh \ test_offload.py +# Compile but not part of 'make run_tests' +TEST_GEN_PROGS_EXTENDED = test_libbpf_open + include ../lib.mk BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c $(TEST_GEN_PROGS): $(BPFOBJ) +$(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a + .PHONY: force # force a rebuild of BPFOBJ when its dependencies are updated diff --git a/tools/testing/selftests/bpf/test_libbpf.sh b/tools/testing/selftests/bpf/test_libbpf.sh new file mode 100755 index 000000000000..d97dc914cd49 --- /dev/null +++ b/tools/testing/selftests/bpf/test_libbpf.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +export TESTNAME=test_libbpf + +# Determine selftest success via shell exit code +exit_handler() +{ + if (( $? == 0 )); then + echo "selftests: $TESTNAME [PASS]"; + else + echo "$TESTNAME: failed at file $LAST_LOADED" 1>&2 + echo "selftests: $TESTNAME [FAILED]"; + fi +} + +libbpf_open_file() +{ + LAST_LOADED=$1 + if [ -n "$VERBOSE" ]; then + ./test_libbpf_open $1 + else + ./test_libbpf_open --quiet $1 + fi +} + +# Exit script immediately (well catched by trap handler) if any +# program/thing exits with a non-zero status. +set -e + +# (Use 'trap -l' to list meaning of numbers) +trap exit_handler 0 2 3 6 9 + +libbpf_open_file test_l4lb.o + +# TODO: fix libbpf to load noinline functions +# [warning] libbpf: incorrect bpf_call opcode +#libbpf_open_file test_l4lb_noinline.o + +# TODO: fix test_xdp_meta.c to load with libbpf +# [warning] libbpf: test_xdp_meta.o doesn't provide kernel version +#libbpf_open_file test_xdp_meta.o + +# TODO: fix libbpf to handle .eh_frame +# [warning] libbpf: relocation failed: no section(10) +#libbpf_open_file ../../../../samples/bpf/tracex3_kern.o + +# Success +exit 0 From e3d91b0ca523d53158f435a3e13df7f0cb360ea2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 8 Feb 2018 12:48:32 +0100 Subject: [PATCH 71/82] tools/libbpf: handle issues with bpf ELF objects containing .eh_frames V3: More generic skipping of relo-section (suggested by Daniel) If clang >= 4.0.1 is missing the option '-target bpf', it will cause llc/llvm to create two ELF sections for "Exception Frames", with section names '.eh_frame' and '.rel.eh_frame'. The BPF ELF loader library libbpf fails when loading files with these sections. The other in-kernel BPF ELF loader in samples/bpf/bpf_load.c, handle this gracefully. And iproute2 loader also seems to work with these "eh" sections. The issue in libbpf is caused by bpf_object__elf_collect() skipping some sections, and later when performing relocation it will be pointing to a skipped section, as these sections cannot be found by bpf_object__find_prog_by_idx() in bpf_object__collect_reloc(). This is a general issue that also occurs for other sections, like debug sections which are also skipped and can have relo section. As suggested by Daniel. To avoid keeping state about all skipped sections, instead perform a direct qlookup in the ELF object. Lookup the section that the relo-section points to and check if it contains executable machine instructions (denoted by the sh_flags SHF_EXECINSTR). Use this check to also skip irrelevant relo-sections. Note, for samples/bpf/ the '-target bpf' parameter to clang cannot be used due to incompatibility with asm embedded headers, that some of the samples include. This is explained in more details by Yonghong Song in bpf_devel_QA. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 0c794f7fc050..97073d649c1a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -742,6 +742,24 @@ bpf_object__init_maps(struct bpf_object *obj) return 0; } +static bool section_have_execinstr(struct bpf_object *obj, int idx) +{ + Elf_Scn *scn; + GElf_Shdr sh; + + scn = elf_getscn(obj->efile.elf, idx); + if (!scn) + return false; + + if (gelf_getshdr(scn, &sh) != &sh) + return false; + + if (sh.sh_flags & SHF_EXECINSTR) + return true; + + return false; +} + static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; @@ -825,6 +843,14 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } else if (sh.sh_type == SHT_REL) { void *reloc = obj->efile.reloc; int nr_reloc = obj->efile.nr_reloc + 1; + int sec = sh.sh_info; /* points to other section */ + + /* Only do relo for section with exec instructions */ + if (!section_have_execinstr(obj, sec)) { + pr_debug("skip relo %s(%d) for section(%d)\n", + name, idx, sec); + continue; + } reloc = realloc(reloc, sizeof(*obj->efile.reloc) * nr_reloc); From faefaa97215a0c05105d7ae180fe1a3b5979ad1f Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 9 Feb 2018 11:41:09 -0600 Subject: [PATCH 72/82] ibmvnic: Reset long term map ID counter When allocating RX or TX buffer pools, the driver needs to provide a unique mapping ID to firmware for each pool. This value is assigned using a counter which is incremented after a new pool is created. The ID can be an integer ranging from 1-255. When migrating to a device that requests a different number of queues, this value was not being reset properly. As a result, after enough migrations, the counter exceeded the upper bound and pool creation failed. This is fixed by resetting the counter to one in this case. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index dd4a2946e1da..ce127a72cda2 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1644,6 +1644,7 @@ static int do_reset(struct ibmvnic_adapter *adapter, return rc; } else if (adapter->req_rx_queues != old_num_rx_queues || adapter->req_tx_queues != old_num_tx_queues) { + adapter->map_id = 1; release_rx_pools(adapter); release_tx_pools(adapter); init_rx_pools(netdev); From 1b84ca187510f60f00f4e15255043ce19bb30410 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 9 Feb 2018 17:22:45 +0100 Subject: [PATCH 73/82] net: stmmac: discard disabled flags in interrupt status register The interrupt status register in both dwmac1000 and dwmac4 ignores interrupt enable (for dwmac4) / interrupt mask (for dwmac1000). Therefore, if we want to check only the bits that can actually trigger an irq, we have to filter the interrupt status register manually. Commit 0a764db10337 ("stmmac: Discard masked flags in interrupt status register") fixed this for dwmac1000. Fix the same issue for dwmac4. Just like commit 0a764db10337 ("stmmac: Discard masked flags in interrupt status register"), this makes sure that we do not get spurious link up/link down prints. Signed-off-by: Niklas Cassel Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index ed222b20fcf1..1e0a7668b752 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -572,10 +572,12 @@ static int dwmac4_irq_status(struct mac_device_info *hw, struct stmmac_extra_stats *x) { void __iomem *ioaddr = hw->pcsr; - u32 intr_status; + u32 intr_status = readl(ioaddr + GMAC_INT_STATUS); + u32 intr_enable = readl(ioaddr + GMAC_INT_EN); int ret = 0; - intr_status = readl(ioaddr + GMAC_INT_STATUS); + /* Discard disabled bits */ + intr_status &= intr_enable; /* Not used events (e.g. MMC interrupts) are not handled. */ if ((intr_status & mmc_tx_irq)) From e879b7ab3739d8f9990961fc7df2f63861bd780b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 9 Feb 2018 17:22:46 +0100 Subject: [PATCH 74/82] net: stmmac: rename GMAC_INT_DEFAULT_MASK for dwmac4 GMAC_INT_DEFAULT_MASK is written to the interrupt enable register. In previous versions of the IP (e.g. dwmac1000), this register was instead an interrupt mask register. To improve clarity and reflect reality, rename GMAC_INT_DEFAULT_MASK to GMAC_INT_DEFAULT_ENABLE. Signed-off-by: Niklas Cassel Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 789dad8a07b5..7761a26ec9c5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -98,7 +98,7 @@ #define GMAC_PCS_IRQ_DEFAULT (GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK | \ GMAC_INT_PCS_ANE) -#define GMAC_INT_DEFAULT_MASK (GMAC_INT_PMT_EN | GMAC_INT_LPI_EN) +#define GMAC_INT_DEFAULT_ENABLE (GMAC_INT_PMT_EN | GMAC_INT_LPI_EN) enum dwmac4_irq_status { time_stamp_irq = 0x00001000, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 1e0a7668b752..6badc63d8e6d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -61,8 +61,8 @@ static void dwmac4_core_init(struct mac_device_info *hw, writel(value, ioaddr + GMAC_CONFIG); - /* Mask GMAC interrupts */ - value = GMAC_INT_DEFAULT_MASK; + /* Enable GMAC interrupts */ + value = GMAC_INT_DEFAULT_ENABLE; if (hw->pmt) value |= GMAC_INT_PMT_EN; if (hw->pcs) From 1029117127540fef4edcf4f0887dc3e1f7d5adb2 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 9 Feb 2018 17:22:47 +0100 Subject: [PATCH 75/82] net: stmmac: remove redundant enable of PMT irq For dwmac4, GMAC_INT_DEFAULT_ENABLE already includes GMAC_INT_PMT_EN, so it is redundant to check if hw->pmt is set, and if so, setting the bit again. For dwmac1000, GMAC_INT_DEFAULT_MASK does not include GMAC_INT_DISABLE_PMT, so it is redundant to check if hw->pmt is set, and if so, clearing an already cleared bit. Improve code readability by removing this redundant code. Signed-off-by: Niklas Cassel Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 -- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 540d21786a43..ef10baf14186 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -74,8 +74,6 @@ static void dwmac1000_core_init(struct mac_device_info *hw, /* Mask GMAC interrupts */ value = GMAC_INT_DEFAULT_MASK; - if (hw->pmt) - value &= ~GMAC_INT_DISABLE_PMT; if (hw->pcs) value &= ~GMAC_INT_DISABLE_PCS; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 6badc63d8e6d..63795ecafc8d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -63,8 +63,7 @@ static void dwmac4_core_init(struct mac_device_info *hw, /* Enable GMAC interrupts */ value = GMAC_INT_DEFAULT_ENABLE; - if (hw->pmt) - value |= GMAC_INT_PMT_EN; + if (hw->pcs) value |= GMAC_PCS_IRQ_DEFAULT; From 6e6e41c3112276288ccaf80c70916779b84bb276 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 9 Feb 2018 17:45:49 +0800 Subject: [PATCH 76/82] ptr_ring: fail early if queue occupies more than KMALLOC_MAX_SIZE To avoid slab to warn about exceeded size, fail early if queue occupies more than KMALLOC_MAX_SIZE. Reported-by: syzbot+e4d4f9ddd4295539735d@syzkaller.appspotmail.com Fixes: 2e0ab8ca83c12 ("ptr_ring: array based FIFO for pointers") Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 1883d6137e9b..6051a5fe6913 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -466,6 +466,8 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) { + if (size * sizeof(void *) > KMALLOC_MAX_SIZE) + return NULL; return kcalloc(size, sizeof(void *), gfp); } From 0bf7800f1799b5b1fd7d4f024e9ece53ac489011 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 9 Feb 2018 17:45:50 +0800 Subject: [PATCH 77/82] ptr_ring: try vmalloc() when kmalloc() fails This patch switch to use kvmalloc_array() for using a vmalloc() fallback to help in case kmalloc() fails. Reported-by: syzbot+e4d4f9ddd4295539735d@syzkaller.appspotmail.com Fixes: 2e0ab8ca83c12 ("ptr_ring: array based FIFO for pointers") Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6051a5fe6913..b884b7794187 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -464,11 +464,14 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, __PTR_RING_PEEK_CALL_v; \ }) +/* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See + * documentation for vmalloc for which of them are legal. + */ static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) { if (size * sizeof(void *) > KMALLOC_MAX_SIZE) return NULL; - return kcalloc(size, sizeof(void *), gfp); + return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) @@ -603,7 +606,7 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); - kfree(old); + kvfree(old); return 0; } @@ -643,7 +646,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, } for (i = 0; i < nrings; ++i) - kfree(queues[i]); + kvfree(queues[i]); kfree(queues); @@ -651,7 +654,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, nomem: while (--i >= 0) - kfree(queues[i]); + kvfree(queues[i]); kfree(queues); @@ -666,7 +669,7 @@ static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); - kfree(r->queue); + kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */ From 89271c65edd599207dd982007900506283c90ae3 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 9 Feb 2018 11:03:49 +0100 Subject: [PATCH 78/82] s390/qeth: fix underestimated count of buffer elements For a memory range/skb where the last byte falls onto a page boundary (ie. 'end' is of the form xxx...xxx001), the PFN_UP() part of the calculation currently doesn't round up to the next PFN due to an off-by-one error. Thus qeth believes that the skb occupies one page less than it actually does, and may select a IO buffer that doesn't have enough spare buffer elements to fit all of the skb's data. HW detects this as a malformed buffer descriptor, and raises an exception which then triggers device recovery. Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count") Signed-off-by: Ursula Braun Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index db42107bf2f5..c33fbc4c2e91 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -846,7 +846,7 @@ struct qeth_trap_id { */ static inline int qeth_get_elements_for_range(addr_t start, addr_t end) { - return PFN_UP(end - 1) - PFN_DOWN(start); + return PFN_UP(end) - PFN_DOWN(start); } static inline int qeth_get_micros(void) From 1c5b2216fbb973a9410e0b06389740b5c1289171 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Fri, 9 Feb 2018 11:03:50 +0100 Subject: [PATCH 79/82] s390/qeth: fix SETIP command handling send_control_data() applies some special handling to SETIP v4 IPA commands. But current code parses *all* command types for the SETIP command code. Limit the command code check to IPA commands. Fixes: 5b54e16f1a54 ("qeth: do not spin for SETIP ip assist command") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core.h | 5 +++++ drivers/s390/net/qeth_core_main.c | 14 ++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index c33fbc4c2e91..959c65cf75d9 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -591,6 +591,11 @@ struct qeth_cmd_buffer { void (*callback) (struct qeth_channel *, struct qeth_cmd_buffer *); }; +static inline struct qeth_ipa_cmd *__ipa_cmd(struct qeth_cmd_buffer *iob) +{ + return (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE); +} + /** * definition of a qeth channel, used for read and write */ diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 6abd3bc285e4..ca72f3311004 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -2120,7 +2120,7 @@ int qeth_send_control_data(struct qeth_card *card, int len, unsigned long flags; struct qeth_reply *reply = NULL; unsigned long timeout, event_timeout; - struct qeth_ipa_cmd *cmd; + struct qeth_ipa_cmd *cmd = NULL; QETH_CARD_TEXT(card, 2, "sendctl"); @@ -2146,10 +2146,13 @@ int qeth_send_control_data(struct qeth_card *card, int len, while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ; qeth_prepare_control_data(card, len, iob); - if (IS_IPA(iob->data)) + if (IS_IPA(iob->data)) { + cmd = __ipa_cmd(iob); event_timeout = QETH_IPA_TIMEOUT; - else + } else { event_timeout = QETH_TIMEOUT; + } + timeout = jiffies + event_timeout; QETH_CARD_TEXT(card, 6, "noirqpnd"); @@ -2174,9 +2177,8 @@ int qeth_send_control_data(struct qeth_card *card, int len, /* we have only one long running ipassist, since we can ensure process context of this command we can sleep */ - cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE); - if ((cmd->hdr.command == IPA_CMD_SETIP) && - (cmd->hdr.prot_version == QETH_PROT_IPV4)) { + if (cmd && cmd->hdr.command == IPA_CMD_SETIP && + cmd->hdr.prot_version == QETH_PROT_IPV4) { if (!wait_event_timeout(reply->wait_q, atomic_read(&reply->received), event_timeout)) goto time_err; From 07f2c7ab6f8d0a7e7c5764c4e6cc9c52951b9d9c Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 9 Feb 2018 17:35:23 +0300 Subject: [PATCH 80/82] sctp: verify size of a new chunk in _sctp_make_chunk() When SCTP makes INIT or INIT_ACK packet the total chunk length can exceed SCTP_MAX_CHUNK_LEN which leads to kernel panic when transmitting these packets, e.g. the crash on sending INIT_ACK: [ 597.804948] skbuff: skb_over_panic: text:00000000ffae06e4 len:120168 put:120156 head:000000007aa47635 data:00000000d991c2de tail:0x1d640 end:0xfec0 dev: ... [ 597.976970] ------------[ cut here ]------------ [ 598.033408] kernel BUG at net/core/skbuff.c:104! [ 600.314841] Call Trace: [ 600.345829] [ 600.371639] ? sctp_packet_transmit+0x2095/0x26d0 [sctp] [ 600.436934] skb_put+0x16c/0x200 [ 600.477295] sctp_packet_transmit+0x2095/0x26d0 [sctp] [ 600.540630] ? sctp_packet_config+0x890/0x890 [sctp] [ 600.601781] ? __sctp_packet_append_chunk+0x3b4/0xd00 [sctp] [ 600.671356] ? sctp_cmp_addr_exact+0x3f/0x90 [sctp] [ 600.731482] sctp_outq_flush+0x663/0x30d0 [sctp] [ 600.788565] ? sctp_make_init+0xbf0/0xbf0 [sctp] [ 600.845555] ? sctp_check_transmitted+0x18f0/0x18f0 [sctp] [ 600.912945] ? sctp_outq_tail+0x631/0x9d0 [sctp] [ 600.969936] sctp_cmd_interpreter.isra.22+0x3be1/0x5cb0 [sctp] [ 601.041593] ? sctp_sf_do_5_1B_init+0x85f/0xc30 [sctp] [ 601.104837] ? sctp_generate_t1_cookie_event+0x20/0x20 [sctp] [ 601.175436] ? sctp_eat_data+0x1710/0x1710 [sctp] [ 601.233575] sctp_do_sm+0x182/0x560 [sctp] [ 601.284328] ? sctp_has_association+0x70/0x70 [sctp] [ 601.345586] ? sctp_rcv+0xef4/0x32f0 [sctp] [ 601.397478] ? sctp6_rcv+0xa/0x20 [sctp] ... Here the chunk size for INIT_ACK packet becomes too big, mostly because of the state cookie (INIT packet has large size with many address parameters), plus additional server parameters. Later this chunk causes the panic in skb_put_data(): skb_packet_transmit() sctp_packet_pack() skb_put_data(nskb, chunk->skb->data, chunk->skb->len); 'nskb' (head skb) was previously allocated with packet->size from u16 'chunk->chunk_hdr->length'. As suggested by Marcelo we should check the chunk's length in _sctp_make_chunk() before trying to allocate skb for it and discard a chunk if its size bigger than SCTP_MAX_CHUNK_LEN. Signed-off-by: Alexey Kodanev Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 793b05ec692b..d01475f5f710 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1380,9 +1380,14 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; + int chunklen; + + chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); + if (chunklen > SCTP_MAX_CHUNK_LEN) + goto nodata; /* No need to allocate LL here, as this is only a chunk. */ - skb = alloc_skb(SCTP_PAD4(sizeof(*chunk_hdr) + paylen), gfp); + skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; From 941ff6f11c020913f5cddf543a9ec63475d7c082 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Feb 2018 14:49:44 +0100 Subject: [PATCH 81/82] bpf: fix rlimit in reuseport net selftest Fix two issues in the reuseport_bpf selftests that were reported by Linaro CI: [...] + ./reuseport_bpf ---- IPv4 UDP ---- Testing EBPF mod 10... Reprograming, testing mod 5... ./reuseport_bpf: ebpf error. log: 0: (bf) r6 = r1 1: (20) r0 = *(u32 *)skb[0] 2: (97) r0 %= 10 3: (95) exit processed 4 insns : Operation not permitted + echo FAIL [...] ---- IPv4 TCP ---- Testing EBPF mod 10... ./reuseport_bpf: failed to bind send socket: Address already in use + echo FAIL [...] For the former adjust rlimit since this was the cause of failure for loading the BPF prog, and for the latter add SO_REUSEADDR. Reported-by: Naresh Kamboju Link: https://bugs.linaro.org/show_bug.cgi?id=3502 Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/testing/selftests/net/reuseport_bpf.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c index 4a8217448f20..cad14cd0ea92 100644 --- a/tools/testing/selftests/net/reuseport_bpf.c +++ b/tools/testing/selftests/net/reuseport_bpf.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #ifndef ARRAY_SIZE @@ -190,11 +191,14 @@ static void send_from(struct test_params p, uint16_t sport, char *buf, struct sockaddr * const saddr = new_any_sockaddr(p.send_family, sport); struct sockaddr * const daddr = new_loopback_sockaddr(p.send_family, p.recv_port); - const int fd = socket(p.send_family, p.protocol, 0); + const int fd = socket(p.send_family, p.protocol, 0), one = 1; if (fd < 0) error(1, errno, "failed to create send socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) + error(1, errno, "failed to set reuseaddr"); + if (bind(fd, saddr, sockaddr_size())) error(1, errno, "failed to bind send socket"); @@ -433,6 +437,21 @@ void enable_fastopen(void) } } +static struct rlimit rlim_old, rlim_new; + +static __attribute__((constructor)) void main_ctor(void) +{ + getrlimit(RLIMIT_MEMLOCK, &rlim_old); + rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20); + rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20); + setrlimit(RLIMIT_MEMLOCK, &rlim_new); +} + +static __attribute__((destructor)) void main_dtor(void) +{ + setrlimit(RLIMIT_MEMLOCK, &rlim_old); +} + int main(void) { fprintf(stderr, "---- IPv4 UDP ----\n"); From 2fa56a494484f19e06bf4f3464b2155a92beafac Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 9 Feb 2018 13:19:46 -0600 Subject: [PATCH 82/82] ibmvnic: Remove skb->protocol checks in ibmvnic_xmit Having these checks in ibmvnic_xmit causes problems with VLAN tagging and balance-alb/tlb bonding modes. The restriction they imposed can be removed. Signed-off-by: John Allen Signed-off-by: Nathan Fontenot Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index ce127a72cda2..27447260215d 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1419,10 +1419,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) hdrs += 2; } /* determine if l2/3/4 headers are sent to firmware */ - if ((*hdrs >> 7) & 1 && - (skb->protocol == htons(ETH_P_IP) || - skb->protocol == htons(ETH_P_IPV6) || - skb->protocol == htons(ETH_P_ARP))) { + if ((*hdrs >> 7) & 1) { build_hdr_descs_arr(tx_buff, &num_entries, *hdrs); tx_crq.v1.n_crq_elem = num_entries; tx_buff->indir_arr[0] = tx_crq;