From a9e6d44ddeccd3522670e641f1ed9b068e746ff7 Mon Sep 17 00:00:00 2001
From: Sven Joachim <svenjoac@gmx.de>
Date: Fri, 26 Jan 2018 10:38:01 +0100
Subject: [PATCH 01/82] ssb: Do not disable PCI host on non-Mips

After upgrading an old laptop to 4.15-rc9, I found that the eth0 and
wlan0 interfaces had disappeared.  It turns out that the b43 and b44
drivers require SSB_PCIHOST_POSSIBLE which depends on
PCI_DRIVERS_LEGACY, a config option that only exists on Mips.

Fixes: 58eae1416b80 ("ssb: Disable PCI host for PCI_DRIVERS_GENERIC")
Cc: stable@vger.org
Signed-off-by: Sven Joachim <svenjoac@gmx.de>
Reviewed-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/ssb/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig
index 71c73766ee22..65af12c3bdb2 100644
--- a/drivers/ssb/Kconfig
+++ b/drivers/ssb/Kconfig
@@ -32,7 +32,7 @@ config SSB_BLOCKIO
 
 config SSB_PCIHOST_POSSIBLE
 	bool
-	depends on SSB && (PCI = y || PCI = SSB) && PCI_DRIVERS_LEGACY
+	depends on SSB && (PCI = y || PCI = SSB) && (PCI_DRIVERS_LEGACY || !MIPS)
 	default y
 
 config SSB_PCIHOST

From d71ef28636e435079028c1ed255fa92d8ff6ed76 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:03 +0100
Subject: [PATCH 02/82] mt76: implement AP_LINK_PS

With software A-MPDU reordering in place, frames that notify mac80211 of
powersave changes are reordered as well, which can cause connection
stalls. Fix this by implementing powersave state processing in the
driver.

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 52 ++++++++++++++++++-
 drivers/net/wireless/mediatek/mt76/mt76.h     | 10 ++++
 drivers/net/wireless/mediatek/mt76/mt76x2.h   |  2 +
 .../net/wireless/mediatek/mt76/mt76x2_init.c  |  1 +
 .../net/wireless/mediatek/mt76/mt76x2_main.c  | 28 +++++-----
 5 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 5fcb2deb89a2..85f8d324ebf8 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -276,6 +276,7 @@ int mt76_register_device(struct mt76_dev *dev, bool vht,
 	ieee80211_hw_set(hw, TX_AMSDU);
 	ieee80211_hw_set(hw, TX_FRAG_LIST);
 	ieee80211_hw_set(hw, MFP_CAPABLE);
+	ieee80211_hw_set(hw, AP_LINK_PS);
 
 	wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
 
@@ -470,6 +471,53 @@ mt76_check_ccmp_pn(struct sk_buff *skb)
 	return 0;
 }
 
+static void
+mt76_check_ps(struct mt76_dev *dev, struct sk_buff *skb)
+{
+	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_sta *sta;
+	struct mt76_wcid *wcid = status->wcid;
+	bool ps;
+
+	if (!wcid || !wcid->sta)
+		return;
+
+	sta = container_of((void *) wcid, struct ieee80211_sta, drv_priv);
+
+	if (!test_bit(MT_WCID_FLAG_CHECK_PS, &wcid->flags))
+		return;
+
+	if (ieee80211_is_pspoll(hdr->frame_control)) {
+		ieee80211_sta_pspoll(sta);
+		return;
+	}
+
+	if (ieee80211_has_morefrags(hdr->frame_control) ||
+		!(ieee80211_is_mgmt(hdr->frame_control) ||
+		  ieee80211_is_data(hdr->frame_control)))
+		return;
+
+	ps = ieee80211_has_pm(hdr->frame_control);
+
+	if (ps && (ieee80211_is_data_qos(hdr->frame_control) ||
+		   ieee80211_is_qos_nullfunc(hdr->frame_control)))
+		ieee80211_sta_uapsd_trigger(sta, status->tid);
+
+	if (!!test_bit(MT_WCID_FLAG_PS, &wcid->flags) == ps)
+		return;
+
+	if (ps) {
+		set_bit(MT_WCID_FLAG_PS, &wcid->flags);
+		mt76_stop_tx_queues(dev, sta, true);
+	} else {
+		clear_bit(MT_WCID_FLAG_PS, &wcid->flags);
+	}
+
+	ieee80211_sta_ps_transition(sta, ps);
+	dev->drv->sta_ps(dev, sta, ps);
+}
+
 void mt76_rx_complete(struct mt76_dev *dev, struct sk_buff_head *frames,
 		      int queue)
 {
@@ -498,8 +546,10 @@ void mt76_rx_poll_complete(struct mt76_dev *dev, enum mt76_rxq_id q)
 
 	__skb_queue_head_init(&frames);
 
-	while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL)
+	while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL) {
+		mt76_check_ps(dev, skb);
 		mt76_rx_aggr_reorder(skb, &frames);
+	}
 
 	mt76_rx_complete(dev, &frames, q);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 129015c9d116..d2ce15093edd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -121,11 +121,18 @@ struct mt76_queue_ops {
 	void (*kick)(struct mt76_dev *dev, struct mt76_queue *q);
 };
 
+enum mt76_wcid_flags {
+	MT_WCID_FLAG_CHECK_PS,
+	MT_WCID_FLAG_PS,
+};
+
 struct mt76_wcid {
 	struct mt76_rx_tid __rcu *aggr[IEEE80211_NUM_TIDS];
 
 	struct work_struct aggr_work;
 
+	unsigned long flags;
+
 	u8 idx;
 	u8 hw_key_idx;
 
@@ -206,6 +213,9 @@ struct mt76_driver_ops {
 		       struct sk_buff *skb);
 
 	void (*rx_poll_complete)(struct mt76_dev *dev, enum mt76_rxq_id q);
+
+	void (*sta_ps)(struct mt76_dev *dev, struct ieee80211_sta *sta,
+		       bool ps);
 };
 
 struct mt76_channel_state {
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2.h b/drivers/net/wireless/mediatek/mt76/mt76x2.h
index 17df17afd9bf..e62131b88102 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2.h
@@ -218,6 +218,8 @@ void mt76x2_rx_poll_complete(struct mt76_dev *mdev, enum mt76_rxq_id q);
 void mt76x2_queue_rx_skb(struct mt76_dev *mdev, enum mt76_rxq_id q,
 			 struct sk_buff *skb);
 
+void mt76x2_sta_ps(struct mt76_dev *dev, struct ieee80211_sta *sta, bool ps);
+
 void mt76x2_update_channel(struct mt76_dev *mdev);
 
 s8 mt76x2_tx_get_max_txpwr_adj(struct mt76x2_dev *dev,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index 1b00ae4465a2..9dbf94947324 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -630,6 +630,7 @@ struct mt76x2_dev *mt76x2_alloc_device(struct device *pdev)
 		.tx_complete_skb = mt76x2_tx_complete_skb,
 		.rx_skb = mt76x2_queue_rx_skb,
 		.rx_poll_complete = mt76x2_rx_poll_complete,
+		.sta_ps = mt76x2_sta_ps,
 	};
 	struct ieee80211_hw *hw;
 	struct mt76x2_dev *dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
index bf26284b9989..205043b470b2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
@@ -282,6 +282,9 @@ mt76x2_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	for (i = 0; i < ARRAY_SIZE(sta->txq); i++)
 		mt76x2_txq_init(dev, sta->txq[i]);
 
+	if (vif->type == NL80211_IFTYPE_AP)
+		set_bit(MT_WCID_FLAG_CHECK_PS, &msta->wcid.flags);
+
 	rcu_assign_pointer(dev->wcid[idx], &msta->wcid);
 
 out:
@@ -311,23 +314,14 @@ mt76x2_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	return 0;
 }
 
-static void
-mt76x2_sta_notify(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		  enum sta_notify_cmd cmd, struct ieee80211_sta *sta)
+void
+mt76x2_sta_ps(struct mt76_dev *mdev, struct ieee80211_sta *sta, bool ps)
 {
 	struct mt76x2_sta *msta = (struct mt76x2_sta *) sta->drv_priv;
-	struct mt76x2_dev *dev = hw->priv;
+	struct mt76x2_dev *dev = container_of(mdev, struct mt76x2_dev, mt76);
 	int idx = msta->wcid.idx;
 
-	switch (cmd) {
-	case STA_NOTIFY_SLEEP:
-		mt76x2_mac_wcid_set_drop(dev, idx, true);
-		mt76_stop_tx_queues(&dev->mt76, sta, true);
-		break;
-	case STA_NOTIFY_AWAKE:
-		mt76x2_mac_wcid_set_drop(dev, idx, false);
-		break;
-	}
+	mt76x2_mac_wcid_set_drop(dev, idx, ps);
 }
 
 static int
@@ -549,6 +543,12 @@ static void mt76x2_set_coverage_class(struct ieee80211_hw *hw,
 	mutex_unlock(&dev->mutex);
 }
 
+static int
+mt76x2_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set)
+{
+	return 0;
+}
+
 const struct ieee80211_ops mt76x2_ops = {
 	.tx = mt76x2_tx,
 	.start = mt76x2_start,
@@ -560,7 +560,6 @@ const struct ieee80211_ops mt76x2_ops = {
 	.bss_info_changed = mt76x2_bss_info_changed,
 	.sta_add = mt76x2_sta_add,
 	.sta_remove = mt76x2_sta_remove,
-	.sta_notify = mt76x2_sta_notify,
 	.set_key = mt76x2_set_key,
 	.conf_tx = mt76x2_conf_tx,
 	.sw_scan_start = mt76x2_sw_scan,
@@ -573,5 +572,6 @@ const struct ieee80211_ops mt76x2_ops = {
 	.release_buffered_frames = mt76_release_buffered_frames,
 	.set_coverage_class = mt76x2_set_coverage_class,
 	.get_survey = mt76_get_survey,
+	.set_tim = mt76x2_set_tim,
 };
 

From 17cf68b702a60aee61432d59098b1ba6ceab2f98 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:04 +0100
Subject: [PATCH 03/82] mt76: implement processing of BlockAckReq frames

Avoids timeouts on reordered A-MPDU rx frames

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 34 ++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index 8027bb7c03c2..e9784b50e2af 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -113,6 +113,33 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 	local_bh_enable();
 }
 
+static void
+mt76_rx_aggr_check_ctl(struct sk_buff *skb, struct sk_buff_head *frames)
+{
+	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
+	struct ieee80211_bar *bar = (struct ieee80211_bar *) skb->data;
+	struct mt76_wcid *wcid = status->wcid;
+	struct mt76_rx_tid *tid;
+	u16 seqno;
+
+	if (!ieee80211_is_ctl(bar->frame_control))
+		return;
+
+	if (!ieee80211_is_back_req(bar->frame_control))
+		return;
+
+	status->tid = le16_to_cpu(bar->control) >> 12;
+	seqno = le16_to_cpu(bar->start_seq_num) >> 4;
+	tid = rcu_dereference(wcid->aggr[status->tid]);
+	if (!tid)
+		return;
+
+	spin_lock_bh(&tid->lock);
+	mt76_rx_aggr_release_frames(tid, frames, seqno);
+	mt76_rx_aggr_release_head(tid, frames);
+	spin_unlock_bh(&tid->lock);
+}
+
 void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 {
 	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
@@ -126,9 +153,14 @@ void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 	__skb_queue_tail(frames, skb);
 
 	sta = wcid_to_sta(wcid);
-	if (!sta || !status->aggr)
+	if (!sta)
 		return;
 
+	if (!status->aggr) {
+		mt76_rx_aggr_check_ctl(skb, frames);
+		return;
+	}
+
 	tid = rcu_dereference(wcid->aggr[status->tid]);
 	if (!tid)
 		return;

From fb208dc73ff1667191ba26d87610bba983ea1535 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:05 +0100
Subject: [PATCH 04/82] mt76: avoid re-queueing A-MPDU rx reorder work if no
 frames are pending

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index e9784b50e2af..fcb208d1f276 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -98,6 +98,7 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 					       reorder_work.work);
 	struct mt76_dev *dev = tid->dev;
 	struct sk_buff_head frames;
+	int nframes;
 
 	__skb_queue_head_init(&frames);
 
@@ -105,9 +106,12 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 
 	spin_lock(&tid->lock);
 	mt76_rx_aggr_check_release(tid, &frames);
+	nframes = tid->nframes;
 	spin_unlock(&tid->lock);
 
-	ieee80211_queue_delayed_work(tid->dev->hw, &tid->reorder_work, REORDER_TIMEOUT);
+	if (nframes)
+		ieee80211_queue_delayed_work(tid->dev->hw, &tid->reorder_work,
+					     REORDER_TIMEOUT);
 	mt76_rx_complete(dev, &frames, -1);
 
 	local_bh_enable();

From cbbde7e8d98542dc1777ac42ec2a08875aea26ee Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:06 +0100
Subject: [PATCH 05/82] mt76: do not set status->aggr for NULL data frames

Avoids data connection stalls when the client toggles powersave mode

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_mac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
index 6c30b5eaa9ca..7ea3d841918e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
@@ -341,7 +341,7 @@ int mt76x2_mac_process_rx(struct mt76x2_dev *dev, struct sk_buff *skb,
 
 	mt76x2_remove_hdr_pad(skb, pad_len);
 
-	if (rxinfo & MT_RXINFO_BA)
+	if ((rxinfo & MT_RXINFO_BA) && !(rxinfo & MT_RXINFO_NULL))
 		status->aggr = true;
 
 	if (WARN_ON_ONCE(len > skb->len))

From 0537250fdc6c876ed4cbbe874c739aebef493ee2 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@kernel.org>
Date: Tue, 30 Jan 2018 11:30:11 -0800
Subject: [PATCH 06/82] netfilter: x_tables: make allocation less aggressive

syzbot has noticed that xt_alloc_table_info can allocate a lot of memory.
This is an admin only interface but an admin in a namespace is sufficient
as well.  eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in
xt_alloc_table_info()") has changed the opencoded kmalloc->vmalloc
fallback into kvmalloc.  It has dropped __GFP_NORETRY on the way because
vmalloc has simply never fully supported __GFP_NORETRY semantic.  This is
still the case because e.g.  page tables backing the vmalloc area are
hardcoded GFP_KERNEL.

Revert back to __GFP_NORETRY as a poors man defence against excessively
large allocation request here.  We will not rule out the OOM killer
completely but __GFP_NORETRY should at least stop the large request in
most cases.

[akpm@linux-foundation.org: coding-style fixes]
Fixes: eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_tableLink: http://lkml.kernel.org/r/20180130140104.GE21609@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8fa4d37141a7..2f685ee1f9c8 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1008,7 +1008,12 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	if ((size >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
 
-	info = kvmalloc(sz, GFP_KERNEL);
+	/* __GFP_NORETRY is not fully supported by kvmalloc but it should
+	 * work reasonably well if sz is too large and bail out rather
+	 * than shoot all processes down before realizing there is nothing
+	 * more to reclaim.
+	 */
+	info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY);
 	if (!info)
 		return NULL;
 

From ea23d5e3bf340e413b8e05c13da233c99c64142b Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Wed, 31 Jan 2018 04:50:01 -0700
Subject: [PATCH 07/82] netfilter: ipv6: nf_defrag: Kill frag queue on RFC2460
 failure

Failures were seen in ICMPv6 fragmentation timeout tests if they were
run after the RFC2460 failure tests. Kernel was not sending out the
ICMPv6 fragment reassembly time exceeded packet after the fragmentation
reassembly timeout of 1 minute had elapsed.

This happened because the frag queue was not released if an error in
IPv6 fragmentation header was detected by RFC2460.

Fixes: 83f1999caeb1 ("netfilter: ipv6: nf_defrag: Pass on packets to stack per RFC2460")
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index ce53dcfda88a..b84ce3e6d728 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -264,6 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
+			inet_frag_kill(&fq->q, &nf_frags);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {

From 6be3bcd75afb673a37a82e18ba46d50430f172c1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 31 Jan 2018 18:13:39 +0100
Subject: [PATCH 08/82] netfilter: flowtable infrastructure depends on
 NETFILTER_INGRESS

config NF_FLOW_TABLE depends on NETFILTER_INGRESS. If users forget to
enable this toggle, flowtable registration fails with EOPNOTSUPP.

Moreover, turn 'select NF_FLOW_TABLE' in every flowtable family flavour
into dependency instead, otherwise this new dependency on
NETFILTER_INGRESS causes a warning. This also allows us to remove the
explicit dependency between family flowtables <-> NF_TABLES and
NF_CONNTRACK, given they depend on the NF_FLOW_TABLE core that already
expresses the general dependencies for this new infrastructure.

Moreover, NF_FLOW_TABLE_INET depends on NF_FLOW_TABLE_IPV4 and
NF_FLOWTABLE_IPV6, which already depends on NF_FLOW_TABLE. So we can get
rid of direct dependency with NF_FLOW_TABLE.

In general, let's avoid 'select', it just makes things more complicated.

Reported-by: John Crispin <john@phrozen.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig | 3 +--
 net/ipv6/netfilter/Kconfig | 3 +--
 net/netfilter/Kconfig      | 8 +++++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 5f52236780b4..dfe6fa4ea554 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -80,8 +80,7 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_IPV4
 	tristate "Netfilter flow table IPv4 module"
-	depends on NF_CONNTRACK && NF_TABLES
-	select NF_FLOW_TABLE
+	depends on NF_FLOW_TABLE
 	help
 	  This option adds the flow table IPv4 support.
 
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 4a634b7a2c80..d395d1590699 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -73,8 +73,7 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_IPV6
 	tristate "Netfilter flow table IPv6 module"
-	depends on NF_CONNTRACK && NF_TABLES
-	select NF_FLOW_TABLE
+	depends on NF_FLOW_TABLE
 	help
 	  This option adds the flow table IPv6 support.
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9019fa98003d..d3220b43c832 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -666,8 +666,8 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_INET
 	tristate "Netfilter flow table mixed IPv4/IPv6 module"
-	depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6
-	select NF_FLOW_TABLE
+	depends on NF_FLOW_TABLE_IPV4
+	depends on NF_FLOW_TABLE_IPV6
 	help
           This option adds the flow table mixed IPv4/IPv6 support.
 
@@ -675,7 +675,9 @@ config NF_FLOW_TABLE_INET
 
 config NF_FLOW_TABLE
 	tristate "Netfilter flow table module"
-	depends on NF_CONNTRACK && NF_TABLES
+	depends on NETFILTER_INGRESS
+	depends on NF_CONNTRACK
+	depends on NF_TABLES
 	help
 	  This option adds the flow table core infrastructure.
 

From ba7cd5d95f25cc6005f687dabdb4e7a6063adda9 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 31 Jan 2018 15:02:47 -0800
Subject: [PATCH 09/82] netfilter: xt_cgroup: initialize info->priv in
 cgroup_mt_check_v1()

xt_cgroup_info_v1->priv is an internal pointer only used for kernel,
we should not trust what user-space provides.

Reported-by: <syzbot+4fbcfcc0d2e6592bd641@syzkaller.appspotmail.com>
Fixes: c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 1db1ce59079f..891f4e7e8ea7 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -52,6 +52,7 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	}
 
+	info->priv = NULL;
 	if (info->has_path) {
 		cgrp = cgroup_get_from_path(info->path);
 		if (IS_ERR(cgrp)) {

From c7f0030b5b67866c588845abde7bf011de25b98a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 1 Feb 2018 18:49:00 +0100
Subject: [PATCH 10/82] netfilter: nft_flow_offload: wait for garbage collector
 to run after cleanup

If netdevice goes down, then flowtable entries are scheduled to be
removed. Wait for garbage collector to have a chance to run so it can
delete them from the hashtable.

The flush call might sleep, so hold the nfnl mutex from
nft_flow_table_iterate() instead of rcu read side lock. The use of the
nfnl mutex is also implicitly fixing races between updates via nfnetlink
and netdevice event.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c    | 8 ++++----
 net/netfilter/nft_flow_offload.c | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0791813a1e7d..07dd1fac78a8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5006,13 +5006,13 @@ void nft_flow_table_iterate(struct net *net,
 	struct nft_flowtable *flowtable;
 	const struct nft_table *table;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_for_each_entry(table, &net->nft.tables, list) {
+		list_for_each_entry(flowtable, &table->flowtables, list) {
 			iter(&flowtable->data, data);
 		}
 	}
-	rcu_read_unlock();
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
 EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
 
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 4503b8dcf9c0..1739ff8ca21f 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -208,6 +208,7 @@ static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
 					     void *data)
 {
 	nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
+	flush_delayed_work(&flowtable->gc_work);
 }
 
 static int flow_offload_netdev_event(struct notifier_block *this,

From 992cfc7c5d105094da7c21c9c74d97ac26bb1e56 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 1 Feb 2018 18:49:01 +0100
Subject: [PATCH 11/82] netfilter: nft_flow_offload: no need to flush entries
 on module removal

nft_flow_offload module removal does not require to flush existing
flowtables, it is valid to remove this module while keeping flowtables
around.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_flow_offload.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 1739ff8ca21f..e5c45c7ac02a 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -247,14 +247,8 @@ register_expr:
 
 static void __exit nft_flow_offload_module_exit(void)
 {
-	struct net *net;
-
 	nft_unregister_expr(&nft_flow_offload_type);
 	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
-	rtnl_lock();
-	for_each_net(net)
-		nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL);
-	rtnl_unlock();
 }
 
 module_init(nft_flow_offload_module_init);

From 09584b406742413ac4c8d7e030374d4daa045b69 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 2 Feb 2018 22:37:15 -0800
Subject: [PATCH 12/82] bpf: fix selftests/bpf test_kmod.sh failure when
 CONFIG_BPF_JIT_ALWAYS_ON=y

With CONFIG_BPF_JIT_ALWAYS_ON is defined in the config file,
tools/testing/selftests/bpf/test_kmod.sh failed like below:
  [root@localhost bpf]# ./test_kmod.sh
  sysctl: setting key "net.core.bpf_jit_enable": Invalid argument
  [ JIT enabled:0 hardened:0 ]
  [  132.175681] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  132.458834] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [ JIT enabled:1 hardened:0 ]
  [  133.456025] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  133.730935] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [ JIT enabled:1 hardened:1 ]
  [  134.769730] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  135.050864] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [ JIT enabled:1 hardened:2 ]
  [  136.442882] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  136.821810] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [root@localhost bpf]#

The test_kmod.sh load/remove test_bpf.ko multiple times with different
settings for sysctl net.core.bpf_jit_{enable,harden}. The failed test #297
of test_bpf.ko is designed such that JIT always fails.

Commit 290af86629b2 (bpf: introduce BPF_JIT_ALWAYS_ON config)
introduced the following tightening logic:
    ...
        if (!bpf_prog_is_dev_bound(fp->aux)) {
                fp = bpf_int_jit_compile(fp);
    #ifdef CONFIG_BPF_JIT_ALWAYS_ON
                if (!fp->jited) {
                        *err = -ENOTSUPP;
                        return fp;
                }
    #endif
    ...
With this logic, Test #297 always gets return value -ENOTSUPP
when CONFIG_BPF_JIT_ALWAYS_ON is defined, causing the test failure.

This patch fixed the failure by marking Test #297 as expected failure
when CONFIG_BPF_JIT_ALWAYS_ON is defined.

Fixes: 290af86629b2 (bpf: introduce BPF_JIT_ALWAYS_ON config)
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 lib/test_bpf.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 4cd9ea9b3449..b4e22345963f 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -83,6 +83,7 @@ struct bpf_test {
 		__u32 result;
 	} test[MAX_SUBTESTS];
 	int (*fill_helper)(struct bpf_test *self);
+	int expected_errcode; /* used when FLAG_EXPECTED_FAIL is set in the aux */
 	__u8 frag_data[MAX_DATA];
 	int stack_depth; /* for eBPF only, since tests don't call verifier */
 };
@@ -2026,7 +2027,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: div_k_0",
@@ -2036,7 +2039,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: unknown insn",
@@ -2047,7 +2052,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: out of range spill/fill",
@@ -2057,7 +2064,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"JUMPS + HOLES",
@@ -2149,6 +2158,8 @@ static struct bpf_test tests[] = {
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
 		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: LDX + RET X",
@@ -2159,6 +2170,8 @@ static struct bpf_test tests[] = {
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
 		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{	/* Mainly checking JIT here. */
 		"M[]: alt STX + LDX",
@@ -2333,6 +2346,8 @@ static struct bpf_test tests[] = {
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
 		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{	/* Passes checker but fails during runtime. */
 		"LD [SKF_AD_OFF-1]",
@@ -5395,6 +5410,7 @@ static struct bpf_test tests[] = {
 		{ },
 		{ },
 		.fill_helper = bpf_fill_maxinsns4,
+		.expected_errcode = -EINVAL,
 	},
 	{	/* Mainly checking JIT here. */
 		"BPF_MAXINSNS: Very long jump",
@@ -5450,10 +5466,15 @@ static struct bpf_test tests[] = {
 	{
 		"BPF_MAXINSNS: Jump, gap, jump, ...",
 		{ },
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
+#else
 		CLASSIC | FLAG_NO_DATA,
+#endif
 		{ },
 		{ { 0, 0xababcbac } },
 		.fill_helper = bpf_fill_maxinsns11,
+		.expected_errcode = -ENOTSUPP,
 	},
 	{
 		"BPF_MAXINSNS: ld_abs+get_processor_id",
@@ -6344,7 +6365,7 @@ static struct bpf_prog *generate_filter(int which, int *err)
 
 		*err = bpf_prog_create(&fp, &fprog);
 		if (tests[which].aux & FLAG_EXPECTED_FAIL) {
-			if (*err == -EINVAL) {
+			if (*err == tests[which].expected_errcode) {
 				pr_cont("PASS\n");
 				/* Verifier rejected filter as expected. */
 				*err = 0;

From 7b4eb53d95c30bdc55c44647c6968f24227fd1ba Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 5 Feb 2018 16:25:24 -0800
Subject: [PATCH 13/82] tools/bpf: fix batch-mode test failure of
 test_xdp_redirect.sh

The tests at tools/testing/selftests/bpf can run in patch mode, e.g.,
    make -C tools/testing/selftests/bpf run_tests

With the batch mode, I experimented intermittent test failure of
test_xdp_redirect.sh.
    ....
    selftests: test_xdp_redirect [PASS]
    selftests: test_xdp_redirect.sh [PASS]
    RTNETLINK answers: File exists
    selftests: test_xdp_meta [FAILED]
    selftests: test_xdp_meta.sh [FAIL]
    ....

The following illustrates what caused the failure:
     (1). test_xdp_redirect creates veth pairs (veth1,veth11) and
          (veth2,veth22), and assign veth11 and veth22 to namespace
          ns1 and ns2 respectively.
     (2). at the end of test_xdp_redirect test, ns1 and ns2 are
          deleted. During this process, the deletion of actual
          namespace resources, including deletion of veth1{1} and veth2{2},
          is put into a workqueue to be processed asynchronously.
     (3). test_xdp_meta tries to create veth pair (veth1, veth2).
          The previous veth deletions in step (2) have not finished yet,
          and veth1 or veth2 may be still valid in the kernel, thus
          causing the failure.

The fix is to explicitly delete the veth pair before test_xdp_redirect
exits. Only one end of veth needs deletion as the kernel will delete
the other end automatically. Also test_xdp_meta is also fixed in
similar manner to avoid future potential issues.

Fixes: 996139e801fd ("selftests: bpf: add a test for XDP redirect")
Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_xdp_meta.sh     | 1 +
 tools/testing/selftests/bpf/test_xdp_redirect.sh | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh
index 307aa856cee3..637fcf4fe4e3 100755
--- a/tools/testing/selftests/bpf/test_xdp_meta.sh
+++ b/tools/testing/selftests/bpf/test_xdp_meta.sh
@@ -9,6 +9,7 @@ cleanup()
 	fi
 
 	set +e
+	ip link del veth1 2> /dev/null
 	ip netns del ns1 2> /dev/null
 	ip netns del ns2 2> /dev/null
 }
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh
index 344a3656dea6..c4b17e08d431 100755
--- a/tools/testing/selftests/bpf/test_xdp_redirect.sh
+++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh
@@ -19,6 +19,8 @@ cleanup()
 	fi
 
 	set +e
+	ip link del veth1 2> /dev/null
+	ip link del veth2 2> /dev/null
 	ip netns del ns1 2> /dev/null
 	ip netns del ns2 2> /dev/null
 }

From b11a632c442eef34a0afeba61fab923241f317e9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 5 Feb 2018 10:17:43 -0800
Subject: [PATCH 14/82] net: add a UID to use for ULP socket assignment

Create a UID field and enum that can be used to assign ULPs to
sockets. This saves a set of string comparisons if the ULP id
is known.

For sockmap, which is added in the next patches, a ULP is used to
hook into TCP sockets close state. In this case the ULP being added
is done at map insert time and the ULP is known and done on the kernel
side. In this case the named lookup is not needed. Because we don't
want to expose psock internals to user space socket options a user
visible flag is also added. For TLS this is set for BPF it will be
cleared.

Alos remove pr_notice, user gets an error code back and should check
that rather than rely on logs.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h  |  6 +++++
 net/ipv4/tcp_ulp.c | 59 ++++++++++++++++++++++++++++++++++++++++++----
 net/tls/tls_main.c |  2 ++
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 58278669cc55..a58292d31e12 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1983,6 +1983,10 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 #define TCP_ULP_MAX		128
 #define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
 
+enum {
+	TCP_ULP_TLS,
+};
+
 struct tcp_ulp_ops {
 	struct list_head	list;
 
@@ -1991,7 +1995,9 @@ struct tcp_ulp_ops {
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 
+	int		uid;
 	char		name[TCP_ULP_NAME_MAX];
+	bool		user_visible;
 	struct module	*owner;
 };
 int tcp_register_ulp(struct tcp_ulp_ops *type);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 6bb9e14c710a..622caa4039e0 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -29,6 +29,18 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
 	return NULL;
 }
 
+static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
+{
+	struct tcp_ulp_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+		if (e->uid == ulp)
+			return e;
+	}
+
+	return NULL;
+}
+
 static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 {
 	const struct tcp_ulp_ops *ulp = NULL;
@@ -51,6 +63,18 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 	return ulp;
 }
 
+static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
+{
+	const struct tcp_ulp_ops *ulp;
+
+	rcu_read_lock();
+	ulp = tcp_ulp_find_id(uid);
+	if (!ulp || !try_module_get(ulp->owner))
+		ulp = NULL;
+	rcu_read_unlock();
+	return ulp;
+}
+
 /* Attach new upper layer protocol to the list
  * of available protocols.
  */
@@ -59,13 +83,10 @@ int tcp_register_ulp(struct tcp_ulp_ops *ulp)
 	int ret = 0;
 
 	spin_lock(&tcp_ulp_list_lock);
-	if (tcp_ulp_find(ulp->name)) {
-		pr_notice("%s already registered or non-unique name\n",
-			  ulp->name);
+	if (tcp_ulp_find(ulp->name))
 		ret = -EEXIST;
-	} else {
+	else
 		list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
-	}
 	spin_unlock(&tcp_ulp_list_lock);
 
 	return ret;
@@ -124,6 +145,34 @@ int tcp_set_ulp(struct sock *sk, const char *name)
 	if (!ulp_ops)
 		return -ENOENT;
 
+	if (!ulp_ops->user_visible) {
+		module_put(ulp_ops->owner);
+		return -ENOENT;
+	}
+
+	err = ulp_ops->init(sk);
+	if (err) {
+		module_put(ulp_ops->owner);
+		return err;
+	}
+
+	icsk->icsk_ulp_ops = ulp_ops;
+	return 0;
+}
+
+int tcp_set_ulp_id(struct sock *sk, int ulp)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_ulp_ops *ulp_ops;
+	int err;
+
+	if (icsk->icsk_ulp_ops)
+		return -EEXIST;
+
+	ulp_ops = __tcp_ulp_lookup(ulp);
+	if (!ulp_ops)
+		return -ENOENT;
+
 	err = ulp_ops->init(sk);
 	if (err) {
 		module_put(ulp_ops->owner);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 736719c8314e..b0d5fcea47e7 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -484,6 +484,8 @@ out:
 
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
+	.uid			= TCP_ULP_TLS,
+	.user_visible		= true,
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
 };

From 1aa12bdf1bfb95db7e75bfecf0e39a65f4e8fbf8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 5 Feb 2018 10:17:49 -0800
Subject: [PATCH 15/82] bpf: sockmap, add sock close() hook to remove socks

The selftests test_maps program was leaving dangling BPF sockmap
programs around because not all psock elements were removed from
the map. The elements in turn hold a reference on the BPF program
they are attached to causing BPF programs to stay open even after
test_maps has completed.

The original intent was that sk_state_change() would be called
when TCP socks went through TCP_CLOSE state. However, because
socks may be in SOCK_DEAD state or the sock may be a listening
socket the event is not always triggered.

To resolve this use the ULP infrastructure and register our own
proto close() handler. This fixes the above case.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Reported-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h    |   2 +
 kernel/bpf/sockmap.c | 168 ++++++++++++++++++++++++++-----------------
 2 files changed, 103 insertions(+), 67 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a58292d31e12..e3fc667f9ac2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1985,6 +1985,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
 enum {
 	TCP_ULP_TLS,
+	TCP_ULP_BPF,
 };
 
 struct tcp_ulp_ops {
@@ -2003,6 +2004,7 @@ struct tcp_ulp_ops {
 int tcp_register_ulp(struct tcp_ulp_ops *type);
 void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
+int tcp_set_ulp_id(struct sock *sk, const int ulp);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0314d1783d77..bd4a6d9c6709 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -86,9 +86,10 @@ struct smap_psock {
 	struct work_struct tx_work;
 	struct work_struct gc_work;
 
+	struct proto *sk_proto;
+	void (*save_close)(struct sock *sk, long timeout);
 	void (*save_data_ready)(struct sock *sk);
 	void (*save_write_space)(struct sock *sk);
-	void (*save_state_change)(struct sock *sk);
 };
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+static struct proto tcp_bpf_proto;
+static int bpf_tcp_init(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	if (unlikely(psock->sk_proto)) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
+
+	psock->save_close = sk->sk_prot->close;
+	psock->sk_proto = sk->sk_prot;
+	sk->sk_prot = &tcp_bpf_proto;
+	rcu_read_unlock();
+	return 0;
+}
+
+static void bpf_tcp_release(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+
+	if (likely(psock)) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
+	rcu_read_unlock();
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+	void (*close_fun)(struct sock *sk, long timeout);
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+	struct sock *osk;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return sk->sk_prot->close(sk, timeout);
+	}
+
+	/* The psock may be destroyed anytime after exiting the RCU critial
+	 * section so by the time we use close_fun the psock may no longer
+	 * be valid. However, bpf_tcp_close is called with the sock lock
+	 * held so the close hook and sk are still valid.
+	 */
+	close_fun = psock->save_close;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		osk = cmpxchg(e->entry, sk, NULL);
+		if (osk == sk) {
+			list_del(&e->list);
+			smap_release_sock(psock, sk);
+		}
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	rcu_read_unlock();
+	close_fun(sk, timeout);
+}
+
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,
 	__SK_REDIRECT,
 };
 
+static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
+	.name		= "bpf_tcp",
+	.uid		= TCP_ULP_BPF,
+	.user_visible	= false,
+	.owner		= NULL,
+	.init		= bpf_tcp_init,
+	.release	= bpf_tcp_release,
+};
+
+static int bpf_tcp_ulp_register(void)
+{
+	tcp_bpf_proto = tcp_prot;
+	tcp_bpf_proto.close = bpf_tcp_close;
+	return tcp_register_ulp(&bpf_tcp_ulp_ops);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
 	sk->sk_error_report(sk);
 }
 
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
-/* Called with lock_sock(sk) held */
-static void smap_state_change(struct sock *sk)
-{
-	struct smap_psock_map_entry *e, *tmp;
-	struct smap_psock *psock;
-	struct socket_wq *wq;
-	struct sock *osk;
-
-	rcu_read_lock();
-
-	/* Allowing transitions into an established syn_recv states allows
-	 * for early binding sockets to a smap object before the connection
-	 * is established.
-	 */
-	switch (sk->sk_state) {
-	case TCP_SYN_SENT:
-	case TCP_SYN_RECV:
-	case TCP_ESTABLISHED:
-		break;
-	case TCP_CLOSE_WAIT:
-	case TCP_CLOSING:
-	case TCP_LAST_ACK:
-	case TCP_FIN_WAIT1:
-	case TCP_FIN_WAIT2:
-	case TCP_LISTEN:
-		break;
-	case TCP_CLOSE:
-		/* Only release if the map entry is in fact the sock in
-		 * question. There is a case where the operator deletes
-		 * the sock from the map, but the TCP sock is closed before
-		 * the psock is detached. Use cmpxchg to verify correct
-		 * sock is removed.
-		 */
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		write_lock_bh(&sk->sk_callback_lock);
-		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-			osk = cmpxchg(e->entry, sk, NULL);
-			if (osk == sk) {
-				list_del(&e->list);
-				smap_release_sock(psock, sk);
-			}
-		}
-		write_unlock_bh(&sk->sk_callback_lock);
-		break;
-	default:
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		smap_report_sk_error(psock, EPIPE);
-		break;
-	}
-
-	wq = rcu_dereference(sk->sk_wq);
-	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_all(&wq->wait);
-	rcu_read_unlock();
-}
-
 static void smap_read_sock_strparser(struct strparser *strp,
 				     struct sk_buff *skb)
 {
@@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	sk->sk_data_ready = psock->save_data_ready;
 	sk->sk_write_space = psock->save_write_space;
-	sk->sk_state_change = psock->save_state_change;
 	psock->save_data_ready = NULL;
 	psock->save_write_space = NULL;
-	psock->save_state_change = NULL;
 	strp_stop(&psock->strp);
 	psock->strp_enabled = false;
 }
@@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 	if (psock->refcnt)
 		return;
 
+	tcp_cleanup_ulp(sock);
 	smap_stop_sock(psock, sock);
 	clear_bit(SMAP_TX_RUNNING, &psock->state);
 	rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	psock->save_data_ready = sk->sk_data_ready;
 	psock->save_write_space = sk->sk_write_space;
-	psock->save_state_change = sk->sk_state_change;
 	sk->sk_data_ready = smap_data_ready;
 	sk->sk_write_space = smap_write_space;
-	sk->sk_state_change = smap_state_change;
 	psock->strp_enabled = true;
 }
 
@@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
 	stab = kzalloc(sizeof(*stab), GFP_USER);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -754,6 +784,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 
+		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+		if (err)
+			goto out_progs;
+
 		set_bit(SMAP_TX_RUNNING, &psock->state);
 	}
 

From 3d9e952697de89b53227f06d4241f275eb99cfc4 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 5 Feb 2018 10:17:54 -0800
Subject: [PATCH 16/82] bpf: sockmap, fix leaking maps with attached but not
 detached progs

When a program is attached to a map we increment the program refcnt
to ensure that the program is not removed while it is potentially
being referenced from sockmap side. However, if this same program
also references the map (this is a reasonably common pattern in
my programs) then the verifier will also increment the maps refcnt
from the verifier. This is to ensure the map doesn't get garbage
collected while the program has a reference to it.

So we are left in a state where the map holds the refcnt on the
program stopping it from being removed and releasing the map refcnt.
And vice versa the program holds a refcnt on the map stopping it
from releasing the refcnt on the prog.

All this is fine as long as users detach the program while the
map fd is still around. But, if the user omits this detach command
we are left with a dangling map we can no longer release.

To resolve this when the map fd is released decrement the program
references and remove any reference from the map to the program.
This fixes the issue with possibly dangling map and creates a
user side API constraint. That is, the map fd must be held open
for programs to be attached to a map.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index bd4a6d9c6709..48c33417d13c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -620,11 +620,6 @@ static void sock_map_free(struct bpf_map *map)
 	}
 	rcu_read_unlock();
 
-	if (stab->bpf_verdict)
-		bpf_prog_put(stab->bpf_verdict);
-	if (stab->bpf_parse)
-		bpf_prog_put(stab->bpf_parse);
-
 	sock_map_remove_complete(stab);
 }
 
@@ -900,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
+static void sock_map_release(struct bpf_map *map, struct file *map_file)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_prog *orig;
+
+	orig = xchg(&stab->bpf_parse, NULL);
+	if (orig)
+		bpf_prog_put(orig);
+	orig = xchg(&stab->bpf_verdict, NULL);
+	if (orig)
+		bpf_prog_put(orig);
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -907,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
+	.map_release = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,

From 7dc68e98757a8eccf8ca7a53a29b896f1eef1f76 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 5 Feb 2018 14:41:45 -0800
Subject: [PATCH 17/82] netfilter: xt_RATEEST: acquire xt_rateest_mutex for
 hash insert

rateest_hash is supposed to be protected by xt_rateest_mutex,
and, as suggested by Eric, lookup and insert should be atomic,
so we should acquire the xt_rateest_mutex once for both.

So introduce a non-locking helper for internal use and keep the
locking one for external.

Reported-by: <syzbot+5cb189720978275e4c75@syzkaller.appspotmail.com>
Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_RATEEST.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 498b54fd04d7..141c295191f6 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -39,23 +39,31 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
 	hlist_add_head(&est->list, &rateest_hash[h]);
 }
 
-struct xt_rateest *xt_rateest_lookup(const char *name)
+static struct xt_rateest *__xt_rateest_lookup(const char *name)
 {
 	struct xt_rateest *est;
 	unsigned int h;
 
 	h = xt_rateest_hash(name);
-	mutex_lock(&xt_rateest_mutex);
 	hlist_for_each_entry(est, &rateest_hash[h], list) {
 		if (strcmp(est->name, name) == 0) {
 			est->refcnt++;
-			mutex_unlock(&xt_rateest_mutex);
 			return est;
 		}
 	}
-	mutex_unlock(&xt_rateest_mutex);
+
 	return NULL;
 }
+
+struct xt_rateest *xt_rateest_lookup(const char *name)
+{
+	struct xt_rateest *est;
+
+	mutex_lock(&xt_rateest_mutex);
+	est = __xt_rateest_lookup(name);
+	mutex_unlock(&xt_rateest_mutex);
+	return est;
+}
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
 void xt_rateest_put(struct xt_rateest *est)
@@ -100,8 +108,10 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 
 	net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
-	est = xt_rateest_lookup(info->name);
+	mutex_lock(&xt_rateest_mutex);
+	est = __xt_rateest_lookup(info->name);
 	if (est) {
+		mutex_unlock(&xt_rateest_mutex);
 		/*
 		 * If estimator parameters are specified, they must match the
 		 * existing estimator.
@@ -139,11 +149,13 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 
 	info->est = est;
 	xt_rateest_hash_insert(est);
+	mutex_unlock(&xt_rateest_mutex);
 	return 0;
 
 err2:
 	kfree(est);
 err1:
+	mutex_unlock(&xt_rateest_mutex);
 	return ret;
 }
 

From c0ea1bcb39352b57ac5c4b6da8acd65bddeee2c5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Feb 2018 13:22:44 +0100
Subject: [PATCH 18/82] netfilter: nft_flow_offload: move flowtable cleanup
 routines to nf_flow_table

Move the flowtable cleanup routines to nf_flow_table and expose the
nf_flow_table_cleanup() helper function.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  3 +++
 net/netfilter/nf_flow_table.c         | 24 ++++++++++++++++++++++++
 net/netfilter/nft_flow_offload.c      | 19 +------------------
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index b22b22082733..ed49cd169ecf 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -95,6 +95,9 @@ struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_t
 int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 			  void (*iter)(struct flow_offload *flow, void *data),
 			  void *data);
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
+
 void nf_flow_offload_work_gc(struct work_struct *work);
 extern const struct rhashtable_params nf_flow_offload_rhash_params;
 
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index 2f5099cb85b8..04c08f6b9015 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,7 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
+#include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -425,5 +426,28 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
 
+static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+{
+	struct net_device *dev = data;
+
+	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+		return;
+
+	flow_offload_dead(flow);
+}
+
+static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
+					  void *data)
+{
+	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+	flush_delayed_work(&flowtable->gc_work);
+}
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+{
+	nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index e5c45c7ac02a..b65829b2be22 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -194,23 +194,6 @@ static struct nft_expr_type nft_flow_offload_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data)
-{
-	struct net_device *dev = data;
-
-	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
-		return;
-
-	flow_offload_dead(flow);
-}
-
-static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
-					     void *data)
-{
-	nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
-	flush_delayed_work(&flowtable->gc_work);
-}
-
 static int flow_offload_netdev_event(struct notifier_block *this,
 				     unsigned long event, void *ptr)
 {
@@ -219,7 +202,7 @@ static int flow_offload_netdev_event(struct notifier_block *this,
 	if (event != NETDEV_DOWN)
 		return NOTIFY_DONE;
 
-	nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev);
+	nf_flow_table_cleanup(dev_net(dev), dev);
 
 	return NOTIFY_DONE;
 }

From b408c5b04f82fe4e20bceb8e4f219453d4f21f02 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Feb 2018 13:22:47 +0100
Subject: [PATCH 19/82] netfilter: nf_tables: fix flowtable free

Every flow_offload entry is added into the table twice. Because of this,
rhashtable_free_and_destroy can't be used, since it would call kfree for
each flow_offload object twice.

This patch cleans up the flowtable via nf_flow_table_iterate() to
schedule removal of entries by setting on the dying bit, then there is
an explicitly invocation of the garbage collector to release resources.

Based on patch from Felix Fietkau.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |  2 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c |  1 +
 net/ipv6/netfilter/nf_flow_table_ipv6.c |  1 +
 net/netfilter/nf_flow_table.c           | 25 +++++++++++++++++++------
 net/netfilter/nf_flow_table_inet.c      |  1 +
 net/netfilter/nf_tables_api.c           |  9 ++-------
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ed49cd169ecf..020ae903066f 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,6 +14,7 @@ struct nf_flowtable_type {
 	struct list_head		list;
 	int				family;
 	void				(*gc)(struct work_struct *work);
+	void				(*free)(struct nf_flowtable *ft);
 	const struct rhashtable_params	*params;
 	nf_hookfn			*hook;
 	struct module			*owner;
@@ -98,6 +99,7 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 
 void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 
+void nf_flow_table_free(struct nf_flowtable *flow_table);
 void nf_flow_offload_work_gc(struct work_struct *work);
 extern const struct rhashtable_params nf_flow_offload_rhash_params;
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index b2d01eb25f2c..25d2975da156 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -260,6 +260,7 @@ static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.params		= &nf_flow_offload_rhash_params,
 	.gc		= nf_flow_offload_work_gc,
+	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index fff21602875a..d346705d6ee6 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -253,6 +253,7 @@ static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.params		= &nf_flow_offload_rhash_params,
 	.gc		= nf_flow_offload_work_gc,
+	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index 04c08f6b9015..c17f1af42daa 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -232,19 +232,16 @@ static inline bool nf_flow_is_dying(const struct flow_offload *flow)
 	return flow->flags & FLOW_OFFLOAD_DYING;
 }
 
-void nf_flow_offload_work_gc(struct work_struct *work)
+static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 {
 	struct flow_offload_tuple_rhash *tuplehash;
-	struct nf_flowtable *flow_table;
 	struct rhashtable_iter hti;
 	struct flow_offload *flow;
 	int err;
 
-	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
-
 	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
 	if (err)
-		goto schedule;
+		return 0;
 
 	rhashtable_walk_start(&hti);
 
@@ -270,7 +267,16 @@ void nf_flow_offload_work_gc(struct work_struct *work)
 out:
 	rhashtable_walk_stop(&hti);
 	rhashtable_walk_exit(&hti);
-schedule:
+
+	return 1;
+}
+
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+	struct nf_flowtable *flow_table;
+
+	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+	nf_flow_offload_gc_step(flow_table);
 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
@@ -449,5 +455,12 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
 
+void nf_flow_table_free(struct nf_flowtable *flow_table)
+{
+	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+	WARN_ON(!nf_flow_offload_gc_step(flow_table));
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_free);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 281209aeba8f..375a1881d93d 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -24,6 +24,7 @@ static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
 	.params		= &nf_flow_offload_rhash_params,
 	.gc		= nf_flow_offload_work_gc,
+	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 07dd1fac78a8..8b9fe30de0cd 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5399,17 +5399,12 @@ err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
-static void nft_flowtable_destroy(void *ptr, void *arg)
-{
-	kfree(ptr);
-}
-
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
 	cancel_delayed_work_sync(&flowtable->data.gc_work);
 	kfree(flowtable->name);
-	rhashtable_free_and_destroy(&flowtable->data.rhashtable,
-				    nft_flowtable_destroy, NULL);
+	flowtable->data.type->free(&flowtable->data);
+	rhashtable_destroy(&flowtable->data.rhashtable);
 	module_put(flowtable->data.type->owner);
 }
 

From d8ed9600581d40d818ae417b3086a333841b0559 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Wed, 7 Feb 2018 11:50:41 +0900
Subject: [PATCH 20/82] netfilter: remove useless prototype

prototype nf_ct_nat_offset is not used anymore.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 include/net/netfilter/nf_conntrack.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index f5223bf2c420..062dc19b5840 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -213,11 +213,6 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
 	return nf_ct_delete(ct, 0, 0);
 }
 
-/* These are for NAT.  Icky. */
-extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
-			       enum ip_conntrack_dir dir,
-			       u32 seq);
-
 /* Set all unconfirmed conntrack as dying */
 void nf_ct_unconfirmed_destroy(struct net *);
 

From 0ff90b6c20340e57616a51ae1a1bf18156d6638a Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 7 Feb 2018 09:49:02 +0100
Subject: [PATCH 21/82] netfilter: nf_flow_offload: fix use-after-free and a
 resource leak

flow_offload_del frees the flow, so all associated resource must be
freed before.

Since the ct entry in struct flow_offload_entry was allocated by
flow_offload_alloc, it should be freed by flow_offload_free to take care
of the error handling path when flow_offload_add fails.

While at it, make flow_offload_del static, since it should never be
called directly, only from the gc step

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  1 -
 net/netfilter/nf_flow_table.c         | 27 +++++++--------------------
 2 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 020ae903066f..833752dd0c58 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -90,7 +90,6 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
 void flow_offload_free(struct flow_offload *flow);
 
 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
-void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
 struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
 						     struct flow_offload_tuple *tuple);
 int nf_flow_table_iterate(struct nf_flowtable *flow_table,
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index c17f1af42daa..ec410cae9307 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -125,7 +125,9 @@ void flow_offload_free(struct flow_offload *flow)
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
 	e = container_of(flow, struct flow_offload_entry, flow);
-	kfree(e);
+	nf_ct_delete(e->ct, 0, 0);
+	nf_ct_put(e->ct);
+	kfree_rcu(e, rcu_head);
 }
 EXPORT_SYMBOL_GPL(flow_offload_free);
 
@@ -149,11 +151,9 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_add);
 
-void flow_offload_del(struct nf_flowtable *flow_table,
-		      struct flow_offload *flow)
+static void flow_offload_del(struct nf_flowtable *flow_table,
+			     struct flow_offload *flow)
 {
-	struct flow_offload_entry *e;
-
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
 			       *flow_table->type->params);
@@ -161,10 +161,8 @@ void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       *flow_table->type->params);
 
-	e = container_of(flow, struct flow_offload_entry, flow);
-	kfree_rcu(e, rcu_head);
+	flow_offload_free(flow);
 }
-EXPORT_SYMBOL_GPL(flow_offload_del);
 
 struct flow_offload_tuple_rhash *
 flow_offload_lookup(struct nf_flowtable *flow_table,
@@ -175,15 +173,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 }
 EXPORT_SYMBOL_GPL(flow_offload_lookup);
 
-static void nf_flow_release_ct(const struct flow_offload *flow)
-{
-	struct flow_offload_entry *e;
-
-	e = container_of(flow, struct flow_offload_entry, flow);
-	nf_ct_delete(e->ct, 0, 0);
-	nf_ct_put(e->ct);
-}
-
 int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 			  void (*iter)(struct flow_offload *flow, void *data),
 			  void *data)
@@ -259,10 +248,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
 
 		if (nf_flow_has_expired(flow) ||
-		    nf_flow_is_dying(flow)) {
+		    nf_flow_is_dying(flow))
 			flow_offload_del(flow_table, flow);
-			nf_flow_release_ct(flow);
-		}
 	}
 out:
 	rhashtable_walk_stop(&hti);

From c713fb071edc0efc01a955f65a006b0e1795d2eb Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Mon, 5 Feb 2018 12:38:11 -0600
Subject: [PATCH 22/82] rtlwifi: rtl8821ae: Fix connection lost problem
 correctly

There has been a coding error in rtl8821ae since it was first introduced,
namely that an 8-bit register was read using a 16-bit read in
_rtl8821ae_dbi_read(). This error was fixed with commit 40b368af4b75
("rtlwifi: Fix alignment issues"); however, this change led to
instability in the connection. To restore stability, this change
was reverted in commit b8b8b16352cd ("rtlwifi: rtl8821ae: Fix connection
lost problem").

Unfortunately, the unaligned access causes machine checks in ARM
architecture, and we were finally forced to find the actual cause of the
problem on x86 platforms. Following a suggestion from Pkshih
<pkshih@realtek.com>, it was found that increasing the ASPM L1
latency from 0 to 7 fixed the instability. This parameter was varied to
see if a smaller value would work; however, it appears that 7 is the
safest value. A new symbol is defined for this quantity, thus it can be
easily changed if necessary.

Fixes: b8b8b16352cd ("rtlwifi: rtl8821ae: Fix connection lost problem")
Cc: Stable <stable@vger.kernel.org> # 4.14+
Fix-suggested-by: Pkshih <pkshih@realtek.com>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Tested-by: James Cameron <quozl@laptop.org>  # x86_64 OLPC NL3
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c | 5 +++--
 drivers/net/wireless/realtek/rtlwifi/wifi.h         | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
index f20e77b4bb65..317c1b3101da 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
@@ -1123,7 +1123,7 @@ static u8 _rtl8821ae_dbi_read(struct rtl_priv *rtlpriv, u16 addr)
 	}
 	if (0 == tmp) {
 		read_addr = REG_DBI_RDATA + addr % 4;
-		ret = rtl_read_word(rtlpriv, read_addr);
+		ret = rtl_read_byte(rtlpriv, read_addr);
 	}
 	return ret;
 }
@@ -1165,7 +1165,8 @@ static void _rtl8821ae_enable_aspm_back_door(struct ieee80211_hw *hw)
 	}
 
 	tmp = _rtl8821ae_dbi_read(rtlpriv, 0x70f);
-	_rtl8821ae_dbi_write(rtlpriv, 0x70f, tmp | BIT(7));
+	_rtl8821ae_dbi_write(rtlpriv, 0x70f, tmp | BIT(7) |
+			     ASPM_L1_LATENCY << 3);
 
 	tmp = _rtl8821ae_dbi_read(rtlpriv, 0x719);
 	_rtl8821ae_dbi_write(rtlpriv, 0x719, tmp | BIT(3) | BIT(4));
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index a7aacbc3984e..46dcb7fef195 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -99,6 +99,7 @@
 #define RTL_USB_MAX_RX_COUNT			100
 #define QBSS_LOAD_SIZE				5
 #define MAX_WMMELE_LENGTH			64
+#define ASPM_L1_LATENCY				7
 
 #define TOTAL_CAM_ENTRY				32
 

From 0a7fe718239e2f8d9c067070f59f53b509bb17dc Mon Sep 17 00:00:00 2001
From: Yu Wang <yyuwang@codeaurora.org>
Date: Tue, 30 Jan 2018 14:06:01 +0200
Subject: [PATCH 23/82] ath10k: correct the length of DRAM dump for QCA6174
 hw3.x/QCA9377 hw1.1

The length of DRAM dump for QCA6174 hw3.0/hw3.2 and QCA9377 hw1.1
are less than the actual value, some coredump contents are missed.
To fix it, change the length from 0x90000 to 0xa8000.

Fixes: 703f261dd77f ("ath10k: add memory dump support for QCA6174/QCA9377")
Signed-off-by: Yu Wang <yyuwang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/coredump.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c
index 4dde126dab17..7173b3743b43 100644
--- a/drivers/net/wireless/ath/ath10k/coredump.c
+++ b/drivers/net/wireless/ath/ath10k/coredump.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -616,7 +617,7 @@ static const struct ath10k_mem_region qca6174_hw30_mem_regions[] = {
 	{
 		.type = ATH10K_MEM_REGION_TYPE_DRAM,
 		.start = 0x400000,
-		.len = 0x90000,
+		.len = 0xa8000,
 		.name = "DRAM",
 		.section_table = {
 			.sections = NULL,

From d5cc61119343b6f1b8716cfe591d4989ea0c5c28 Mon Sep 17 00:00:00 2001
From: Tobias Schramm <tobleminer@gmail.com>
Date: Tue, 30 Jan 2018 14:06:02 +0200
Subject: [PATCH 24/82] PCI: Add Ubiquiti Networks vendor ID

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Tobias Schramm <tobleminer@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ab20dc5db423..35871adfdc86 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -149,6 +149,8 @@
 #define PCI_VENDOR_ID_DYNALINK		0x0675
 #define PCI_DEVICE_ID_DYNALINK_IS64PH	0x1702
 
+#define PCI_VENDOR_ID_UBIQUITI		0x0777
+
 #define PCI_VENDOR_ID_BERKOM			0x0871
 #define PCI_DEVICE_ID_BERKOM_A1T		0xffa1
 #define PCI_DEVICE_ID_BERKOM_T_CONCEPT		0xffa2

From 34f1cb339cae5c0b6b75094e2d5c79d19be424ed Mon Sep 17 00:00:00 2001
From: Tobias Schramm <tobleminer@gmail.com>
Date: Tue, 30 Jan 2018 14:06:05 +0200
Subject: [PATCH 25/82] ath10k: add support for Ubiquiti rebranded QCA988X v2

Some modern Ubiquiti devices contain a rebranded QCA988X rev2 with
a custom Ubiquiti vendor and device id. This patch adds support for
those devices, treating them as a QCA988X v2.

Signed-off-by: Tobias Schramm <tobleminer@gmail.com>
[kvalo@codeaurora.org: rebase, add missing fields in hw_params, fix a long line in pci.c:61]
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 29 ++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/hw.h   |  1 +
 drivers/net/wireless/ath/ath10k/pci.c  |  6 ++++++
 3 files changed, 36 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index b0fdc1023619..6fb282f76804 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -90,6 +90,35 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 	},
+	{
+		.id = QCA988X_HW_2_0_VERSION,
+		.dev_id = QCA988X_2_0_DEVICE_ID_UBNT,
+		.name = "qca988x hw2.0 ubiquiti",
+		.patch_load_addr = QCA988X_HW_2_0_PATCH_LOAD_ADDR,
+		.uart_pin = 7,
+		.cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_ALL,
+		.otp_exe_param = 0,
+		.channel_counters_freq_hz = 88000,
+		.max_probe_resp_desc_thres = 0,
+		.cal_data_len = 2116,
+		.fw = {
+			.dir = QCA988X_HW_2_0_FW_DIR,
+			.board = QCA988X_HW_2_0_BOARD_DATA_FILE,
+			.board_size = QCA988X_BOARD_DATA_SZ,
+			.board_ext_size = QCA988X_BOARD_EXT_DATA_SZ,
+		},
+		.hw_ops = &qca988x_ops,
+		.decap_align_bytes = 4,
+		.spectral_bin_discard = 0,
+		.vht160_mcs_rx_highest = 0,
+		.vht160_mcs_tx_highest = 0,
+		.n_cipher_suites = 8,
+		.num_peers = TARGET_TLV_NUM_PEERS,
+		.ast_skid_limit = 0x10,
+		.num_wds_entries = 0x20,
+		.target_64bit = false,
+		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+	},
 	{
 		.id = QCA9887_HW_1_0_VERSION,
 		.dev_id = QCA9887_1_0_DEVICE_ID,
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index 6203bc65799b..413b1b4321f7 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -22,6 +22,7 @@
 
 #define ATH10K_FW_DIR			"ath10k"
 
+#define QCA988X_2_0_DEVICE_ID_UBNT   (0x11ac)
 #define QCA988X_2_0_DEVICE_ID   (0x003c)
 #define QCA6164_2_1_DEVICE_ID   (0x0041)
 #define QCA6174_2_1_DEVICE_ID   (0x003e)
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index 355db6a0fcf3..1b266cd0c2ec 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -58,6 +58,9 @@ MODULE_PARM_DESC(reset_mode, "0: auto, 1: warm only (default: 0)");
 #define ATH10K_DIAG_TRANSFER_LIMIT	0x5000
 
 static const struct pci_device_id ath10k_pci_id_table[] = {
+	/* PCI-E QCA988X V2 (Ubiquiti branded) */
+	{ PCI_VDEVICE(UBIQUITI, QCA988X_2_0_DEVICE_ID_UBNT) },
+
 	{ PCI_VDEVICE(ATHEROS, QCA988X_2_0_DEVICE_ID) }, /* PCI-E QCA988X V2 */
 	{ PCI_VDEVICE(ATHEROS, QCA6164_2_1_DEVICE_ID) }, /* PCI-E QCA6164 V2.1 */
 	{ PCI_VDEVICE(ATHEROS, QCA6174_2_1_DEVICE_ID) }, /* PCI-E QCA6174 V2.1 */
@@ -74,6 +77,7 @@ static const struct ath10k_pci_supp_chip ath10k_pci_supp_chips[] = {
 	 * hacks. ath10k doesn't have them and these devices crash horribly
 	 * because of that.
 	 */
+	{ QCA988X_2_0_DEVICE_ID_UBNT, QCA988X_HW_2_0_CHIP_ID_REV },
 	{ QCA988X_2_0_DEVICE_ID, QCA988X_HW_2_0_CHIP_ID_REV },
 
 	{ QCA6164_2_1_DEVICE_ID, QCA6174_HW_2_1_CHIP_ID_REV },
@@ -2193,6 +2197,7 @@ static int ath10k_pci_get_num_banks(struct ath10k *ar)
 	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
 
 	switch (ar_pci->pdev->device) {
+	case QCA988X_2_0_DEVICE_ID_UBNT:
 	case QCA988X_2_0_DEVICE_ID:
 	case QCA99X0_2_0_DEVICE_ID:
 	case QCA9888_2_0_DEVICE_ID:
@@ -3424,6 +3429,7 @@ static int ath10k_pci_probe(struct pci_dev *pdev,
 	u32 (*targ_cpu_to_ce_addr)(struct ath10k *ar, u32 addr);
 
 	switch (pci_dev->device) {
+	case QCA988X_2_0_DEVICE_ID_UBNT:
 	case QCA988X_2_0_DEVICE_ID:
 		hw_rev = ATH10K_HW_QCA988X;
 		pci_ps = false;

From b9607de6cf22a5cd268b9206177a9baafb6e8ac8 Mon Sep 17 00:00:00 2001
From: Wojciech Dubowik <Wojciech.Dubowik@neratec.com>
Date: Tue, 30 Jan 2018 14:06:07 +0200
Subject: [PATCH 26/82] ath9k: Fix get channel default noise floor

Commit 8da58553cc63 ("ath9k: Use calibrated noise floor value
when available") introduced regression in ath9k_hw_getchan_noise
where per chain nominal noise floor has been taken instead default
for channel.
Revert to original default channel noise floor.

Fixes: 8da58553cc63 ("ath9k: Use calibrated noise floor value when available")
Reported-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Wojciech Dubowik <Wojciech.Dubowik@neratec.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath9k/calib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/calib.c b/drivers/net/wireless/ath/ath9k/calib.c
index 3d9447e21025..695c779ae8cf 100644
--- a/drivers/net/wireless/ath/ath9k/calib.c
+++ b/drivers/net/wireless/ath/ath9k/calib.c
@@ -72,7 +72,7 @@ static s16 ath9k_hw_get_default_nf(struct ath_hw *ah,
 s16 ath9k_hw_getchan_noise(struct ath_hw *ah, struct ath9k_channel *chan,
 			   s16 nf)
 {
-	s8 noise = ath9k_hw_get_default_nf(ah, chan, 0);
+	s8 noise = ATH_DEFAULT_NOISE_FLOOR;
 
 	if (nf) {
 		s8 delta = nf - ATH9K_NF_CAL_NOISE_THRESH -

From 50e79e25250bf928369996277e85b00536b380c7 Mon Sep 17 00:00:00 2001
From: Yu Wang <yyuwang@codeaurora.org>
Date: Tue, 30 Jan 2018 14:06:08 +0200
Subject: [PATCH 27/82] ath10k: fix kernel panic issue during pci probe

If device gone during chip reset, ar->normal_mode_fw.board is not
initialized, but ath10k_debug_print_hwfw_info() will try to access its
member, which will cause 'kernel NULL pointer' issue. This was found
using a faulty device (pci link went down sometimes) in a random
insmod/rmmod/other-op test.
To fix it, check ar->normal_mode_fw.board before accessing the member.

pci 0000:02:00.0: BAR 0: assigned [mem 0xf7400000-0xf75fffff 64bit]
ath10k_pci 0000:02:00.0: enabling device (0000 -> 0002)
ath10k_pci 0000:02:00.0: pci irq msi oper_irq_mode 2 irq_mode 0 reset_mode 0
ath10k_pci 0000:02:00.0: failed to read device register, device is gone
ath10k_pci 0000:02:00.0: failed to wait for target init: -5
ath10k_pci 0000:02:00.0: failed to warm reset: -5
ath10k_pci 0000:02:00.0: firmware crashed during chip reset
ath10k_pci 0000:02:00.0: firmware crashed! (uuid 5d018951-b8e1-404a-8fde-923078b4423a)
ath10k_pci 0000:02:00.0: (null) target 0x00000000 chip_id 0x00340aff sub 0000:0000
ath10k_pci 0000:02:00.0: kconfig debug 1 debugfs 1 tracing 1 dfs 1 testmode 1
ath10k_pci 0000:02:00.0: firmware ver  api 0 features  crc32 00000000
...
BUG: unable to handle kernel NULL pointer dereference at 00000004
...
Call Trace:
 [<fb4e7882>] ath10k_print_driver_info+0x12/0x20 [ath10k_core]
 [<fb62b7dd>] ath10k_pci_fw_crashed_dump+0x6d/0x4d0 [ath10k_pci]
 [<fb629f07>] ? ath10k_pci_sleep.part.19+0x57/0xc0 [ath10k_pci]
 [<fb62c8ee>] ath10k_pci_hif_power_up+0x14e/0x1b0 [ath10k_pci]
 [<c10477fb>] ? do_page_fault+0xb/0x10
 [<fb4eb934>] ath10k_core_register_work+0x24/0x840 [ath10k_core]
 [<c18a00d8>] ? netlbl_unlhsh_remove+0x178/0x410
 [<c10477f0>] ? __do_page_fault+0x480/0x480
 [<c1068e44>] process_one_work+0x114/0x3e0
 [<c1069d07>] worker_thread+0x37/0x4a0
 [<c106e294>] kthread+0xa4/0xc0
 [<c1069cd0>] ? create_worker+0x180/0x180
 [<c106e1f0>] ? kthread_park+0x50/0x50
 [<c18ab4f7>] ret_from_fork+0x1b/0x28
 Code: 78 80 b8 50 09 00 00 00 75 5d 8d 75 94 c7 44 24 08 aa d7 52 fb c7 44 24 04 64 00 00 00
 89 34 24 e8 82 52 e2 c5 8b 83 dc 08 00 00 <8b> 50 04 8b 08 31 c0 e8 20 57 e3 c5 89 44 24 10 8b 83 58 09 00
 EIP: [<fb4e7754>]-
 ath10k_debug_print_board_info+0x34/0xb0 [ath10k_core]
 SS:ESP 0068:f4921d90
 CR2: 0000000000000004

Signed-off-by: Yu Wang <yyuwang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/debug.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c
index 6d836a26272f..554cd7856cb6 100644
--- a/drivers/net/wireless/ath/ath10k/debug.c
+++ b/drivers/net/wireless/ath/ath10k/debug.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -81,6 +82,8 @@ void ath10k_debug_print_hwfw_info(struct ath10k *ar)
 void ath10k_debug_print_board_info(struct ath10k *ar)
 {
 	char boardinfo[100];
+	const struct firmware *board;
+	u32 crc;
 
 	if (ar->id.bmi_ids_valid)
 		scnprintf(boardinfo, sizeof(boardinfo), "%d:%d",
@@ -88,11 +91,16 @@ void ath10k_debug_print_board_info(struct ath10k *ar)
 	else
 		scnprintf(boardinfo, sizeof(boardinfo), "N/A");
 
+	board = ar->normal_mode_fw.board;
+	if (!IS_ERR_OR_NULL(board))
+		crc = crc32_le(0, board->data, board->size);
+	else
+		crc = 0;
+
 	ath10k_info(ar, "board_file api %d bmi_id %s crc32 %08x",
 		    ar->bd_api,
 		    boardinfo,
-		    crc32_le(0, ar->normal_mode_fw.board->data,
-			     ar->normal_mode_fw.board->size));
+		    crc);
 }
 
 void ath10k_debug_print_boot_info(struct ath10k *ar)

From 4e12d654ba068df06c5e4c8322d7dcced41e48ee Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <linux@rempel-privat.de>
Date: Tue, 30 Jan 2018 14:06:11 +0200
Subject: [PATCH 28/82] ath9k_htc: add Altai WA1011N-GU

as reported in:
https://github.com/qca/open-ath9k-htc-firmware/pull/71#issuecomment-361100070

Signed-off-by: Oleksij Rempel <linux@rempel-privat.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath9k/hif_usb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c
index 56676eaff24c..cb0eef13af1c 100644
--- a/drivers/net/wireless/ath/ath9k/hif_usb.c
+++ b/drivers/net/wireless/ath/ath9k/hif_usb.c
@@ -24,6 +24,7 @@ static const struct usb_device_id ath9k_hif_usb_ids[] = {
 	{ USB_DEVICE(0x0cf3, 0x9271) }, /* Atheros */
 	{ USB_DEVICE(0x0cf3, 0x1006) }, /* Atheros */
 	{ USB_DEVICE(0x0846, 0x9030) }, /* Netgear N150 */
+	{ USB_DEVICE(0x07b8, 0x9271) }, /* Altai WA1011N-GU */
 	{ USB_DEVICE(0x07D1, 0x3A10) }, /* Dlink Wireless 150 */
 	{ USB_DEVICE(0x13D3, 0x3327) }, /* Azurewave */
 	{ USB_DEVICE(0x13D3, 0x3328) }, /* Azurewave */

From 035d808f7cfd578dc210b584beb17e365f34d39a Mon Sep 17 00:00:00 2001
From: Naresh Kamboju <naresh.kamboju@linaro.org>
Date: Wed, 7 Feb 2018 23:45:34 +0530
Subject: [PATCH 29/82] selftests: bpf: test_kmod.sh: check the module path
 before insmod

test_kmod.sh reported false failure when module not present.
Check test_bpf.ko is present in the path before loading it.

Two cases to be addressed here,
In the development process of test_bpf.c unit testing will be done by
developers by using "insmod $SRC_TREE/lib/test_bpf.ko"

On the other hand testers run full tests by installing modules on device
under test (DUT) and followed by modprobe to insert the modules accordingly.

Signed-off-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_kmod.sh | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh
index ed4774d8d6ed..35669ccd4d23 100755
--- a/tools/testing/selftests/bpf/test_kmod.sh
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -10,9 +10,21 @@ test_run()
 
 	echo "[ JIT enabled:$1 hardened:$2 ]"
 	dmesg -C
-	insmod $SRC_TREE/lib/test_bpf.ko 2> /dev/null
-	if [ $? -ne 0 ]; then
-		rc=1
+	if [ -f ${SRC_TREE}/lib/test_bpf.ko ]; then
+		insmod ${SRC_TREE}/lib/test_bpf.ko 2> /dev/null
+		if [ $? -ne 0 ]; then
+			rc=1
+		fi
+	else
+		# Use modprobe dry run to check for missing test_bpf module
+		if ! /sbin/modprobe -q -n test_bpf; then
+			echo "test_bpf: [SKIP]"
+		elif /sbin/modprobe -q test_bpf; then
+			echo "test_bpf: ok"
+		else
+			echo "test_bpf: [FAIL]"
+			rc=1
+		fi
 	fi
 	rmmod  test_bpf 2> /dev/null
 	dmesg | grep FAIL

From e729452ec33304260d4741e9ce67784a457ee1e6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Tue, 6 Feb 2018 21:17:17 +0100
Subject: [PATCH 30/82] cxgb4: Fix error handling path in 'init_one()'

Commit baf5086840ab1 ("cxgb4: restructure VF mgmt code") has reordered
some code but an error handling label has not been updated accordingly.
So fix it and free 'adapter' if 't4_wait_dev_ready()' fails.

Fixes: baf5086840ab1 ("cxgb4: restructure VF mgmt code")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 1ca2a39ed0f8..56bc626ef006 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5166,7 +5166,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->regs = regs;
 	err = t4_wait_dev_ready(regs);
 	if (err < 0)
-		goto out_unmap_bar0;
+		goto out_free_adapter;
 
 	/* We control everything through one PF */
 	whoami = readl(regs + PL_WHOAMI_A);

From 17e9e23b130e4e269fa53c2370325249f3ba75dd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 6 Feb 2018 16:44:12 +0000
Subject: [PATCH 31/82] rxrpc: Fix received abort handling

AF_RXRPC is incorrectly sending back to the server any abort it receives
for a client connection.  This is due to the final-ACK offload to the
connection event processor patch.  The abort code is copied into the
last-call information on the connection channel and then the event
processor is set.

Instead, the following should be done:

 (1) In the case of a final-ACK for a successful call, the ACK should be
     scheduled as before.

 (2) In the case of a locally generated ABORT, the ABORT details should be
     cached for sending in response to further packets related to that
     call and no further action scheduled at call disconnect time.

 (3) In the case of an ACK received from the peer, the call should be
     considered dead, no ABORT should be transmitted at this time.  In
     response to further non-ABORT packets from the peer relating to this
     call, an RX_USER_ABORT ABORT should be transmitted.

 (4) In the case of a call killed due to network error, an RX_USER_ABORT
     ABORT should be cached for transmission in response to further
     packets, but no ABORT should be sent at this time.

Fixes: 3136ef49a14c ("rxrpc: Delay terminal ACK transmission on a client call")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/conn_client.c |  3 ++-
 net/rxrpc/conn_object.c | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 7f74ca3059f8..064175068059 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -834,7 +834,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 	 * can be skipped if we find a follow-on call.  The first DATA packet
 	 * of the follow on call will implicitly ACK this call.
 	 */
-	if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
+	if (call->completion == RXRPC_CALL_SUCCEEDED &&
+	    test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
 		unsigned long final_ack_at = jiffies + 2;
 
 		WRITE_ONCE(chan->final_ack_at, final_ack_at);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index c628351eb900..ccbac190add1 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -177,13 +177,21 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
 		 * through the channel, whilst disposing of the actual call record.
 		 */
 		trace_rxrpc_disconnect_call(call);
-		if (call->abort_code) {
-			chan->last_abort = call->abort_code;
-			chan->last_type = RXRPC_PACKET_TYPE_ABORT;
-		} else {
+		switch (call->completion) {
+		case RXRPC_CALL_SUCCEEDED:
 			chan->last_seq = call->rx_hard_ack;
 			chan->last_type = RXRPC_PACKET_TYPE_ACK;
+			break;
+		case RXRPC_CALL_LOCALLY_ABORTED:
+			chan->last_abort = call->abort_code;
+			chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+			break;
+		default:
+			chan->last_abort = RX_USER_ABORT;
+			chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+			break;
 		}
+
 		/* Sync with rxrpc_conn_retransmit(). */
 		smp_wmb();
 		chan->last_call = chan->call_id;

From c861ef83d771362ed0475cd510eb56cf4126ef34 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Tue, 6 Feb 2018 11:34:23 -0800
Subject: [PATCH 32/82] sun: Add SPDX license tags to Sun network drivers

Add the appropriate SPDX license tags to the Sun network drivers
as outlined in Documentation/process/license-rules.rst.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sun/Kconfig          | 1 +
 drivers/net/ethernet/sun/cassini.c        | 1 +
 drivers/net/ethernet/sun/cassini.h        | 1 +
 drivers/net/ethernet/sun/ldmvsw.c         | 1 +
 drivers/net/ethernet/sun/niu.c            | 1 +
 drivers/net/ethernet/sun/sunbmac.c        | 1 +
 drivers/net/ethernet/sun/sungem.c         | 1 +
 drivers/net/ethernet/sun/sunhme.c         | 1 +
 drivers/net/ethernet/sun/sunqe.c          | 1 +
 drivers/net/ethernet/sun/sunvnet.c        | 1 +
 drivers/net/ethernet/sun/sunvnet_common.c | 1 +
 11 files changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig
index b2caf5132bd2..7b982e02ea3a 100644
--- a/drivers/net/ethernet/sun/Kconfig
+++ b/drivers/net/ethernet/sun/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Sun network device configuration
 #
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index 113bd57e2ea0..9020b084b953 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* cassini.c: Sun Microsystems Cassini(+) ethernet driver.
  *
  * Copyright (C) 2004 Sun Microsystems Inc.
diff --git a/drivers/net/ethernet/sun/cassini.h b/drivers/net/ethernet/sun/cassini.h
index 882ce168a799..13f3860496a8 100644
--- a/drivers/net/ethernet/sun/cassini.h
+++ b/drivers/net/ethernet/sun/cassini.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* $Id: cassini.h,v 1.16 2004/08/17 21:15:16 zaumen Exp $
  * cassini.h: Definitions for Sun Microsystems Cassini(+) ethernet driver.
  *
diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c
index 5ea037672e6f..a5dd627fe2f9 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* ldmvsw.c: Sun4v LDOM Virtual Switch Driver.
  *
  * Copyright (C) 2016-2017 Oracle. All rights reserved.
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 06001bacbe0f..8dd545fed30d 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* niu.c: Neptune ethernet driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller (davem@davemloft.net)
diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c
index 0b1f41f6bceb..f047b2797156 100644
--- a/drivers/net/ethernet/sun/sunbmac.c
+++ b/drivers/net/ethernet/sun/sunbmac.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunbmac.c: Driver for Sparc BigMAC 100baseT ethernet adapters.
  *
  * Copyright (C) 1997, 1998, 1999, 2003, 2008 David S. Miller (davem@davemloft.net)
diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index a7afcee3c5ae..7a16d40a72d1 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* $Id: sungem.c,v 1.44.2.22 2002/03/13 01:18:12 davem Exp $
  * sungem.c: Sun GEM ethernet driver.
  *
diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c
index 0431f1e5f511..06da2f59fcbf 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunhme.c: Sparc HME/BigMac 10/100baseT half/full duplex auto switching,
  *           auto carrier detecting ethernet driver.  Also known as the
  *           "Happy Meal Ethernet" found on SunSwift SBUS cards.
diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c
index a6bcdcdd947e..7fe0d5e33922 100644
--- a/drivers/net/ethernet/sun/sunqe.c
+++ b/drivers/net/ethernet/sun/sunqe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunqe.c: Sparc QuadEthernet 10baseT SBUS card driver.
  *          Once again I am out to prove that every ethernet
  *          controller out there can be most efficiently programmed
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 27fb22638885..63d3d6b215f3 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunvnet.c: Sun LDOM Virtual Network Driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c
index 8aa3ce46bb81..d8f4c3f28150 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunvnet.c: Sun LDOM Virtual Network Driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>

From 58e354c01b1906b603dcdb28ec35250b09eac625 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 6 Feb 2018 12:14:12 -0800
Subject: [PATCH 33/82] net/ipv6: Handle reject routes with onlink flag

Verification of nexthops with onlink flag need to handle unreachable
routes. The lookup is only intended to validate the gateway address
is not a local address and if the gateway resolves the egress device
must match the given device. Hence, hitting any default reject route
is ok.

Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb2d251c0500..69c43d289c69 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2488,7 +2488,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
 	err = 0;
 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
 	if (grt) {
-		if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
+		if (!grt->dst.error &&
+		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
 			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
 			err = -EINVAL;
 		}

From 44750f8483a1bc4bda58ec4f7b7af5b053833d49 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 6 Feb 2018 13:17:06 -0800
Subject: [PATCH 34/82] net/ipv6: onlink nexthop checks should default to main
 table

Because of differences in how ipv4 and ipv6 handle fib lookups,
verification of nexthops with onlink flag need to default to the main
table rather than the local table used by IPv4. As it stands an
address within a connected route on device 1 can be used with
onlink on device 2. Updating the table properly rejects the route
due to the egress device mismatch.

Update the extack message as well to show it could be a device
mismatch for the nexthop spec.

Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 69c43d289c69..9dcfadddd800 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2479,7 +2479,7 @@ static int ip6_route_check_nh_onlink(struct net *net,
 				     struct net_device *dev,
 				     struct netlink_ext_ack *extack)
 {
-	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
 	struct rt6_info *grt;
@@ -2490,7 +2490,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
 	if (grt) {
 		if (!grt->dst.error &&
 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
-			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop has invalid gateway or device mismatch");
 			err = -EINVAL;
 		}
 

From bc6d33c8d93f5999920e97a8c6330b8910053d4f Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Tue, 6 Feb 2018 13:15:20 -0800
Subject: [PATCH 35/82] i40e: Fix the number of queues available to be mapped
 for use

Fix the number of queues per enabled TC and report available queues
to the kernel without having to limit them to the max RSS limit so
they are available to be mapped for XPS. This allows a queue per
processing thread available for handling traffic for the given
traffic class.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 27 +++++++++++----------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f95ce9b5e4fb..e31adbc75f9c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1785,7 +1785,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 	struct i40e_pf *pf = vsi->back;
 	u16 sections = 0;
 	u8 netdev_tc = 0;
-	u16 numtc = 0;
+	u16 numtc = 1;
 	u16 qcount;
 	u8 offset;
 	u16 qmap;
@@ -1795,9 +1795,11 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 	sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID;
 	offset = 0;
 
+	/* Number of queues per enabled TC */
+	num_tc_qps = vsi->alloc_queue_pairs;
 	if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) {
 		/* Find numtc from enabled TC bitmap */
-		for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+		for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
 			if (enabled_tc & BIT(i)) /* TC is enabled */
 				numtc++;
 		}
@@ -1805,18 +1807,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 			dev_warn(&pf->pdev->dev, "DCB is enabled but no TC enabled, forcing TC0\n");
 			numtc = 1;
 		}
-	} else {
-		/* At least TC0 is enabled in non-DCB, non-MQPRIO case */
-		numtc = 1;
+		num_tc_qps = num_tc_qps / numtc;
+		num_tc_qps = min_t(int, num_tc_qps,
+				   i40e_pf_get_max_q_per_tc(pf));
 	}
 
 	vsi->tc_config.numtc = numtc;
 	vsi->tc_config.enabled_tc = enabled_tc ? enabled_tc : 1;
-	/* Number of queues per enabled TC */
-	qcount = vsi->alloc_queue_pairs;
-
-	num_tc_qps = qcount / numtc;
-	num_tc_qps = min_t(int, num_tc_qps, i40e_pf_get_max_q_per_tc(pf));
 
 	/* Do not allow use more TC queue pairs than MSI-X vectors exist */
 	if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -1831,9 +1828,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 
 			switch (vsi->type) {
 			case I40E_VSI_MAIN:
-				qcount = min_t(int, pf->alloc_rss_size,
-					       num_tc_qps);
-				break;
+				if (!(pf->flags & (I40E_FLAG_FD_SB_ENABLED |
+				    I40E_FLAG_FD_ATR_ENABLED)) ||
+				    vsi->tc_config.enabled_tc != 1) {
+					qcount = min_t(int, pf->alloc_rss_size,
+						       num_tc_qps);
+					break;
+				}
 			case I40E_VSI_FDIR:
 			case I40E_VSI_SRIOV:
 			case I40E_VSI_VMDQ2:

From 3468656fd7599b0cb1092bb1ee717d1a984e93ee Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Tue, 6 Feb 2018 16:21:49 -0600
Subject: [PATCH 36/82] ibmvnic: Fix rx queue cleanup for non-fatal resets

At some point, a check was added to exit the polling routine during resets.
This makes sense for most reset conditions, but for a non-fatal error, we
expect the polling routine to continue running to properly clean up the rx
queues. This patch checks if we are performing a non-fatal reset and if we
are, continues normal polling operation.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index afaf29b201dc..8dc8fc503750 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1831,7 +1831,8 @@ restart_poll:
 		u16 offset;
 		u8 flags = 0;
 
-		if (unlikely(adapter->resetting)) {
+		if (unlikely(adapter->resetting &&
+			     adapter->reset_reason != VNIC_RESET_NON_FATAL)) {
 			enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]);
 			napi_complete_done(napi, frames_processed);
 			return frames_processed;

From b0992eca00c490c0923044b7d8b853c212b3cacc Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Tue, 6 Feb 2018 17:25:23 -0600
Subject: [PATCH 37/82] ibmvnic: Ensure that buffers are NULL after free

This change will guard against a double free in the case that the
buffers were previously freed at some other time, such as during
a device reset. It resolves a kernel oops that occurred when changing
the VNIC device's MTU.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 8dc8fc503750..8dabc9d9dfa6 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -354,6 +354,8 @@ static void release_stats_buffers(struct ibmvnic_adapter *adapter)
 {
 	kfree(adapter->tx_stats_buffers);
 	kfree(adapter->rx_stats_buffers);
+	adapter->tx_stats_buffers = NULL;
+	adapter->rx_stats_buffers = NULL;
 }
 
 static int init_stats_buffers(struct ibmvnic_adapter *adapter)
@@ -599,6 +601,8 @@ static void release_vpd_data(struct ibmvnic_adapter *adapter)
 
 	kfree(adapter->vpd->buff);
 	kfree(adapter->vpd);
+
+	adapter->vpd = NULL;
 }
 
 static void release_tx_pools(struct ibmvnic_adapter *adapter)
@@ -909,6 +913,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
 		dev_err(dev, "Could not map VPD buffer\n");
 		kfree(adapter->vpd->buff);
+		adapter->vpd->buff = NULL;
 		return -ENOMEM;
 	}
 

From 62f94c2101f35cd45775df00ba09bde77580e26a Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Tue, 6 Feb 2018 19:17:06 -0600
Subject: [PATCH 38/82] net: ethernet: ti: cpsw: fix net watchdog timeout

It was discovered that simple program which indefinitely sends 200b UDP
packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers network
watchdog timeout in TI CPSW driver (<6 hours run). The network watchdog
timeout is triggered due to race between cpsw_ndo_start_xmit() and
cpsw_tx_handler() [NAPI]

cpsw_ndo_start_xmit()
	if (unlikely(!cpdma_check_free_tx_desc(txch))) {
		txq = netdev_get_tx_queue(ndev, q_idx);
		netif_tx_stop_queue(txq);

^^ as per [1] barier has to be used after set_bit() otherwise new value
might not be visible to other cpus
	}

cpsw_tx_handler()
	if (unlikely(netif_tx_queue_stopped(txq)))
		netif_tx_wake_queue(txq);

and when it happens ndev TX queue became disabled forever while driver's HW
TX queue is empty.

Fix this, by adding smp_mb__after_atomic() after netif_tx_stop_queue()
calls and double check for free TX descriptors after stopping ndev TX queue
- if there are free TX descriptors wake up ndev TX queue.

[1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Reviewed-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 3c85a0885f9b..1b1b78fdc138 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1636,6 +1636,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
 		q_idx = q_idx % cpsw->tx_ch_num;
 
 	txch = cpsw->txv[q_idx].ch;
+	txq = netdev_get_tx_queue(ndev, q_idx);
 	ret = cpsw_tx_packet_submit(priv, skb, txch);
 	if (unlikely(ret != 0)) {
 		cpsw_err(priv, tx_err, "desc submit failed\n");
@@ -1646,15 +1647,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
 	 * tell the kernel to stop sending us tx frames.
 	 */
 	if (unlikely(!cpdma_check_free_tx_desc(txch))) {
-		txq = netdev_get_tx_queue(ndev, q_idx);
 		netif_tx_stop_queue(txq);
+
+		/* Barrier, so that stop_queue visible to other cpus */
+		smp_mb__after_atomic();
+
+		if (cpdma_check_free_tx_desc(txch))
+			netif_tx_wake_queue(txq);
 	}
 
 	return NETDEV_TX_OK;
 fail:
 	ndev->stats.tx_dropped++;
-	txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
 	netif_tx_stop_queue(txq);
+
+	/* Barrier, so that stop_queue visible to other cpus */
+	smp_mb__after_atomic();
+
+	if (cpdma_check_free_tx_desc(txch))
+		netif_tx_wake_queue(txq);
+
 	return NETDEV_TX_BUSY;
 }
 

From 043e337f555e610ad8237fd23522d97c968d72b9 Mon Sep 17 00:00:00 2001
From: "Md. Islam" <mislam4@kent.edu>
Date: Tue, 6 Feb 2018 23:14:18 -0500
Subject: [PATCH 39/82] sch_netem: Bug fixing in calculating Netem interval

In Kernel 4.15.0+, Netem does not work properly.

Netem setup:

tc qdisc add dev h1-eth0 root handle 1: netem delay 10ms 2ms

Result:

PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data.
64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=22.8 ms
64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=10.9 ms
64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=10.9 ms
64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=11.4 ms
64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=4303 ms
64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.2 ms
64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.3 ms
64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=4304 ms
64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=4303 ms

Patch:

(rnd % (2 * sigma)) - sigma was overflowing s32. After applying the
patch, I found following output which is desirable.

PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data.
64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=21.1 ms
64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=8.46 ms
64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=9.00 ms
64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=8.36 ms
64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=8.11 ms
64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=10.0 ms
64 bytes from 172.16.101.2: icmp_seq=9 ttl=64 time=11.3 ms
64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.5 ms
64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.2 ms

Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7bbc13b8ca47..7c179addebcd 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -327,7 +327,7 @@ static s64 tabledist(s64 mu, s32 sigma,
 
 	/* default uniform distribution */
 	if (dist == NULL)
-		return (rnd % (2 * sigma)) - sigma + mu;
+		return ((rnd % (2 * sigma)) + mu) - sigma;
 
 	t = dist->table[rnd % dist->size];
 	x = (sigma % NETEM_DIST_SCALE) * t;

From 5c487bb9adddbc1d23433e09d2548759375c2b52 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Tue, 6 Feb 2018 20:50:23 -0800
Subject: [PATCH 40/82] tcp: tracepoint: only call trace_tcp_send_reset with
 full socket

tracepoint tcp_send_reset requires a full socket to work. However, it
may be called when in TCP_TIME_WAIT:

        case TCP_TW_RST:
                tcp_v6_send_reset(sk, skb);
                inet_twsk_deschedule_put(inet_twsk(sk));
                goto discard_it;

To avoid this problem, this patch checks the socket with sk_fullsock()
before calling trace_tcp_send_reset().

Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset")
Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 3 ++-
 net/ipv6/tcp_ipv6.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95738aa0d8a6..f8ad397e285e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	 */
 	if (sk) {
 		arg.bound_dev_if = sk->sk_bound_dev_if;
-		trace_tcp_send_reset(sk, skb);
+		if (sk_fullsock(sk))
+			trace_tcp_send_reset(sk, skb);
 	}
 
 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a1ab29e2ab3b..412139f4eccd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -942,7 +942,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
-		trace_tcp_send_reset(sk, skb);
+		if (sk_fullsock(sk))
+			trace_tcp_send_reset(sk, skb);
 	}
 
 	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);

From b7d99235473ad3a550f8eb05bd4469edadf1c8e6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:27:12 -0800
Subject: [PATCH 41/82] nfp: bpf: fix immed relocation for larger offsets

Immed relocation is missing a shift which means for larger
offsets the lower and higher part of the address would be
ORed together.

Fixes: ce4ebfd859c3 ("nfp: bpf: add helpers for updating immediate instructions")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_asm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index 3f6952b66a49..1e597600c693 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -107,7 +107,7 @@ u16 immed_get_value(u64 instr)
 	if (!unreg_is_imm(reg))
 		reg = FIELD_GET(OP_IMMED_B_SRC, instr);
 
-	return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr);
+	return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr) << 8;
 }
 
 void immed_set_value(u64 *instr, u16 immed)

From 0badd331491097cc3702d05a6dd0264a434712b2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:13 -0800
Subject: [PATCH 42/82] libbpf: complete list of strings for guessing program
 type

It seems that the type guessing feature for libbpf, based on the name of
the ELF section the program is located in, was inspired from
samples/bpf/prog_load.c, which was not used by any sample for loading
programs of certain types such as TC actions and classifiers, or
LWT-related types. As a consequence, libbpf is not able to guess the
type of such programs and to load them automatically if type is not
provided to the `bpf_load_prog()` function.

Add ELF section names associated to those eBPF program types so that
they can be loaded with e.g. bpftool as well.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 71ddc481f349..c64840365433 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1816,12 +1816,17 @@ static const struct {
 	BPF_PROG_SEC("socket",		BPF_PROG_TYPE_SOCKET_FILTER),
 	BPF_PROG_SEC("kprobe/",		BPF_PROG_TYPE_KPROBE),
 	BPF_PROG_SEC("kretprobe/",	BPF_PROG_TYPE_KPROBE),
+	BPF_PROG_SEC("classifier",	BPF_PROG_TYPE_SCHED_CLS),
+	BPF_PROG_SEC("action",		BPF_PROG_TYPE_SCHED_ACT),
 	BPF_PROG_SEC("tracepoint/",	BPF_PROG_TYPE_TRACEPOINT),
 	BPF_PROG_SEC("xdp",		BPF_PROG_TYPE_XDP),
 	BPF_PROG_SEC("perf_event",	BPF_PROG_TYPE_PERF_EVENT),
 	BPF_PROG_SEC("cgroup/skb",	BPF_PROG_TYPE_CGROUP_SKB),
 	BPF_PROG_SEC("cgroup/sock",	BPF_PROG_TYPE_CGROUP_SOCK),
 	BPF_PROG_SEC("cgroup/dev",	BPF_PROG_TYPE_CGROUP_DEVICE),
+	BPF_PROG_SEC("lwt_in",		BPF_PROG_TYPE_LWT_IN),
+	BPF_PROG_SEC("lwt_out",		BPF_PROG_TYPE_LWT_OUT),
+	BPF_PROG_SEC("lwt_xmit",	BPF_PROG_TYPE_LWT_XMIT),
 	BPF_PROG_SEC("sockops",		BPF_PROG_TYPE_SOCK_OPS),
 	BPF_PROG_SEC("sk_skb",		BPF_PROG_TYPE_SK_SKB),
 };

From 92426820f7616caebdf6ba665f749102a670b3d4 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:14 -0800
Subject: [PATCH 43/82] tools: bpftool: exit doc Makefile early if rst2man is
 not available

If rst2man is not available on the system, running `make doc` from the
bpftool directory fails with an error message. However, it creates empty
manual pages (.8 files in this case). A subsequent call to `make
doc-install` would then succeed and install those empty man pages on the
system.

To prevent this, raise a Makefile error and exit immediately if rst2man
is not available before generating the pages from the rst documentation.

Fixes: ff69c21a85a4 ("tools: bpftool: add documentation")
Reported-by: Jason van Aaardt <jason.vanaardt@netronome.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
index c462a928e03d..a9d47c1558bb 100644
--- a/tools/bpf/bpftool/Documentation/Makefile
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -23,7 +23,12 @@ DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8))
 man: man8
 man8: $(DOC_MAN8)
 
+RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
+
 $(OUTPUT)%.8: %.rst
+ifndef RST2MAN_DEP
+	$(error "rst2man not found, but required to generate man pages")
+endif
 	$(QUIET_GEN)rst2man $< > $@
 
 clean:

From 2148481d5d3111e5e9aa5adc1905fa3539e548c8 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:15 -0800
Subject: [PATCH 44/82] tools: bpftool: make syntax for program map update
 explicit in man page

Specify in the documentation that when using bpftool to update a map of
type BPF_MAP_TYPE_PROG_ARRAY, the syntax for the program used as a value
should use the "id|tag|pinned" keywords convention, as used with
"bpftool prog" commands.

Fixes: ff69c21a85a4 ("tools: bpftool: add documentation")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 0ab32b312aec..457e868bd32f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -31,7 +31,8 @@ MAP COMMANDS
 |	**bpftool** **map help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
-|	*VALUE* := { *BYTES* | *MAP* | *PROGRAM* }
+|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+|	*VALUE* := { *BYTES* | *MAP* | *PROG* }
 |	*UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
 
 DESCRIPTION

From 55f3538c4923e9dfca132e99ebec370e8094afda Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:16 -0800
Subject: [PATCH 45/82] tools: bpftool: add bash completion for `bpftool prog
 load`

Add bash completion for bpftool command `prog load`. Completion for this
command is easy, as it only takes existing file paths as arguments.

Fixes: 49a086c201a9 ("bpftool: implement prog load command")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/bash-completion/bpftool | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 0137866bb8f6..17d246418348 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -230,10 +230,14 @@ _bpftool()
                     fi
                     return 0
                     ;;
+                load)
+                    _filedir
+                    return 0
+                    ;;
                 *)
                     [[ $prev == $object ]] && \
-                        COMPREPLY=( $( compgen -W 'dump help pin show list' -- \
-                            "$cur" ) )
+                        COMPREPLY=( $( compgen -W 'dump help pin load \
+                            show list' -- "$cur" ) )
                     ;;
             esac
             ;;

From a827a164b98be2acba6a2e7559643ac9a5745086 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:17 -0800
Subject: [PATCH 46/82] tools: bpftool: add bash completion for cgroup commands

Add bash completion for "bpftool cgroup" command family. While at it,
also fix the formatting of some keywords in the man page for cgroups.

Fixes: 5ccda64d38cc ("bpftool: implement cgroup bpf operations")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../bpftool/Documentation/bpftool-cgroup.rst  |  4 +-
 tools/bpf/bpftool/bash-completion/bpftool     | 64 +++++++++++++++++--
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 2fe2a1bdbe3e..0e4e923235b6 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -26,8 +26,8 @@ MAP COMMANDS
 |	**bpftool** **cgroup help**
 |
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
-|	*ATTACH_TYPE* := { *ingress* | *egress* | *sock_create* | *sock_ops* | *device* }
-|	*ATTACH_FLAGS* := { *multi* | *override* }
+|	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** }
+|	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 17d246418348..08719c54a614 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -52,16 +52,24 @@ _bpftool_once_attr()
     done
 }
 
-# Takes a list of words in argument; adds them all to COMPREPLY if none of them
-# is already present on the command line. Returns no value.
-_bpftool_one_of_list()
+# Takes a list of words as argument; if any of those words is present on the
+# command line, return 0. Otherwise, return 1.
+_bpftool_search_list()
 {
     local w idx
     for w in $*; do
         for (( idx=3; idx < ${#words[@]}-1; idx++ )); do
-            [[ $w == ${words[idx]} ]] && return 1
+            [[ $w == ${words[idx]} ]] && return 0
         done
     done
+    return 1
+}
+
+# Takes a list of words in argument; adds them all to COMPREPLY if none of them
+# is already present on the command line. Returns no value.
+_bpftool_one_of_list()
+{
+    _bpftool_search_list $* && return 1
     COMPREPLY+=( $( compgen -W "$*" -- "$cur" ) )
 }
 
@@ -351,6 +359,54 @@ _bpftool()
                     ;;
             esac
             ;;
+        cgroup)
+            case $command in
+                show|list)
+                    _filedir
+                    return 0
+                    ;;
+                attach|detach)
+                    local ATTACH_TYPES='ingress egress sock_create sock_ops \
+                        device'
+                    local ATTACH_FLAGS='multi override'
+                    local PROG_TYPE='id pinned tag'
+                    case $prev in
+                        $command)
+                            _filedir
+                            return 0
+                            ;;
+                        ingress|egress|sock_create|sock_ops|device)
+                            COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
+                                "$cur" ) )
+                            return 0
+                            ;;
+                        id)
+                            _bpftool_get_prog_ids
+                            return 0
+                            ;;
+                        *)
+                            if ! _bpftool_search_list "$ATTACH_TYPES"; then
+                                COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- \
+                                    "$cur" ) )
+                            elif [[ "$command" == "attach" ]]; then
+                                # We have an attach type on the command line,
+                                # but it is not the previous word, or
+                                # "id|pinned|tag" (we already checked for
+                                # that). This should only leave the case when
+                                # we need attach flags for "attach" commamnd.
+                                _bpftool_one_of_list "$ATTACH_FLAGS"
+                            fi
+                            return 0
+                            ;;
+                    esac
+                    ;;
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help attach detach \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool

From 9ce8b24aa96e983a540bb8244298a10322881ef4 Mon Sep 17 00:00:00 2001
From: Ryan Hsu <ryanhsu@codeaurora.org>
Date: Wed, 7 Feb 2018 15:51:23 +0200
Subject: [PATCH 47/82] Revert "ath10k: add sanity check to ie_len before
 parsing fw/board ie"

This reverts commit 9ed4f91628737c820af6a1815b65bc06bd31518f.

The commit introduced a regression that over read the ie with
the padding.

- the expected IE information

ath10k_pci 0000:03:00.0: found firmware features ie (1 B)
ath10k_pci 0000:03:00.0: Enabling feature bit: 6
ath10k_pci 0000:03:00.0: Enabling feature bit: 7
ath10k_pci 0000:03:00.0: features
ath10k_pci 0000:03:00.0: 00000000: c0 00 00 00 00 00 00 00

- the wrong IE with padding is read (0x77)

ath10k_pci 0000:03:00.0: found firmware features ie (4 B)
ath10k_pci 0000:03:00.0: Enabling feature bit: 6
ath10k_pci 0000:03:00.0: Enabling feature bit: 7
ath10k_pci 0000:03:00.0: Enabling feature bit: 8
ath10k_pci 0000:03:00.0: Enabling feature bit: 9
ath10k_pci 0000:03:00.0: Enabling feature bit: 10
ath10k_pci 0000:03:00.0: Enabling feature bit: 12
ath10k_pci 0000:03:00.0: Enabling feature bit: 13
ath10k_pci 0000:03:00.0: Enabling feature bit: 14
ath10k_pci 0000:03:00.0: Enabling feature bit: 16
ath10k_pci 0000:03:00.0: Enabling feature bit: 17
ath10k_pci 0000:03:00.0: Enabling feature bit: 18
ath10k_pci 0000:03:00.0: features
ath10k_pci 0000:03:00.0: 00000000: c0 77 07 00 00 00 00 00

Tested-by: Mike Lothian <mike@fireburn.co.uk>
Signed-off-by: Ryan Hsu <ryanhsu@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 6fb282f76804..f3ec13b80b20 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -1305,10 +1305,7 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar,
 		len -= sizeof(*hdr);
 		data = hdr->data;
 
-		/* jump over the padding */
-		ie_len = ALIGN(ie_len, 4);
-
-		if (len < ie_len) {
+		if (len < ALIGN(ie_len, 4)) {
 			ath10k_err(ar, "invalid length for board ie_id %d ie_len %zu len %zu\n",
 				   ie_id, ie_len, len);
 			ret = -EINVAL;
@@ -1347,6 +1344,9 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar,
 			goto out;
 		}
 
+		/* jump over the padding */
+		ie_len = ALIGN(ie_len, 4);
+
 		len -= ie_len;
 		data += ie_len;
 	}
@@ -1477,9 +1477,6 @@ int ath10k_core_fetch_firmware_api_n(struct ath10k *ar, const char *name,
 		len -= sizeof(*hdr);
 		data += sizeof(*hdr);
 
-		/* jump over the padding */
-		ie_len = ALIGN(ie_len, 4);
-
 		if (len < ie_len) {
 			ath10k_err(ar, "invalid length for FW IE %d (%zu < %zu)\n",
 				   ie_id, len, ie_len);
@@ -1585,6 +1582,9 @@ int ath10k_core_fetch_firmware_api_n(struct ath10k *ar, const char *name,
 			break;
 		}
 
+		/* jump over the padding */
+		ie_len = ALIGN(ie_len, 4);
+
 		len -= ie_len;
 		data += ie_len;
 	}

From e3ac6c0737e2b17bf11210e3fd66565e9b818b87 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:22 -0800
Subject: [PATCH 48/82] nfp: bpf: require ETH table

Upcoming changes will require all netdevs supporting TC offloads
to have a full struct nfp_port.  Require those for BPF offload.
The operation without management FW reporting information about
Ethernet ports is something we only support for very old and very
basic NIC firmwares anyway.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Tested-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 322027792fe8..61898dda11cf 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -35,6 +35,7 @@
 
 #include "../nfpcore/nfp_cpp.h"
 #include "../nfpcore/nfp_nffw.h"
+#include "../nfpcore/nfp_nsp.h"
 #include "../nfp_app.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
@@ -87,9 +88,20 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, struct nfp_net *nn)
 static int
 nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 {
+	struct nfp_pf *pf = app->pf;
 	struct nfp_bpf_vnic *bv;
 	int err;
 
+	if (!pf->eth_tbl) {
+		nfp_err(pf->cpp, "No ETH table\n");
+		return -EINVAL;
+	}
+	if (pf->max_data_vnics != pf->eth_tbl->count) {
+		nfp_err(pf->cpp, "ETH entries don't match vNICs (%d vs %d)\n",
+			pf->max_data_vnics, pf->eth_tbl->count);
+		return -EINVAL;
+	}
+
 	bv = kzalloc(sizeof(*bv), GFP_KERNEL);
 	if (!bv)
 		return -ENOMEM;

From 0b9de4ca853b6ba2c92ff0b4602281001b166639 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:23 -0800
Subject: [PATCH 49/82] nfp: don't advertise hw-tc-offload on non-port netdevs

nfp_port is a structure which represents an ASIC port, both
PCIe vNIC (on a PF or a VF) or the external MAC port.  vNIC
netdev (struct nfp_net) and pure representor netdev (struct
nfp_repr) both have a pointer to this structure.  nfp_reprs
always have a port associated.  nfp_nets, however, only represent
a device port in legacy mode, where they are considered the
MAC port. In switchdev mode they are just the CPU's side of
the PCIe link.

By definition TC offloads only apply to device ports.  Don't
set the flag on vNICs without a port (i.e. in switchdev mode).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Tested-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index c0fd351c86b1..fe77ea8b656c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3734,7 +3734,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn)
 
 	netdev->features = netdev->hw_features;
 
-	if (nfp_app_has_tc(nn->app))
+	if (nfp_app_has_tc(nn->app) && nn->port)
 		netdev->hw_features |= NETIF_F_HW_TC;
 
 	/* Advertise but disable TSO by default. */

From d692403e5cf8008f31f5664a6f3ce3e65d54f458 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:24 -0800
Subject: [PATCH 50/82] nfp: forbid disabling hw-tc-offload on representors
 while offload active

All netdevs which can accept TC offloads must implement
.ndo_set_features().  nfp_reprs currently do not do that, which
means hw-tc-offload can be turned on and off even when offloads
are active.

Whether the offloads are active is really a question to nfp_ports,
so remove the per-app tc_busy callback indirection thing, and
simply count the number of offloaded items in nfp_port structure.

Fixes: 8a2768732a4d ("nfp: provide infrastructure for offloading flower based TC filters")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Tested-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c  |  9 +--------
 .../ethernet/netronome/nfp/flower/offload.c    |  4 ++++
 drivers/net/ethernet/netronome/nfp/nfp_app.h   |  9 ---------
 .../ethernet/netronome/nfp/nfp_net_common.c    |  7 +++----
 .../net/ethernet/netronome/nfp/nfp_net_repr.c  |  1 +
 drivers/net/ethernet/netronome/nfp/nfp_port.c  | 18 ++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_port.h  |  6 ++++++
 7 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 61898dda11cf..34e98aa6b956 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -182,6 +182,7 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 		return err;
 
 	bv->tc_prog = cls_bpf->prog;
+	nn->port->tc_offload_cnt = !!bv->tc_prog;
 	return 0;
 }
 
@@ -219,13 +220,6 @@ static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev,
 	}
 }
 
-static bool nfp_bpf_tc_busy(struct nfp_app *app, struct nfp_net *nn)
-{
-	struct nfp_bpf_vnic *bv = nn->app_priv;
-
-	return !!bv->tc_prog;
-}
-
 static int
 nfp_bpf_change_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu)
 {
@@ -429,7 +423,6 @@ const struct nfp_app_type app_bpf = {
 	.ctrl_msg_rx	= nfp_bpf_ctrl_msg_rx,
 
 	.setup_tc	= nfp_bpf_setup_tc,
-	.tc_busy	= nfp_bpf_tc_busy,
 	.bpf		= nfp_ndo_bpf,
 	.xdp_offload	= nfp_bpf_xdp_offload,
 };
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 08c4c6dc5f7f..eb5c13dea8f5 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -349,6 +349,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 		       struct tc_cls_flower_offload *flow, bool egress)
 {
 	enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE;
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
 	struct nfp_flower_priv *priv = app->priv;
 	struct nfp_fl_payload *flow_pay;
 	struct nfp_fl_key_ls *key_layer;
@@ -390,6 +391,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	INIT_HLIST_NODE(&flow_pay->link);
 	flow_pay->tc_flower_cookie = flow->cookie;
 	hash_add_rcu(priv->flow_table, &flow_pay->link, flow->cookie);
+	port->tc_offload_cnt++;
 
 	/* Deallocate flow payload when flower rule has been destroyed. */
 	kfree(key_layer);
@@ -421,6 +423,7 @@ static int
 nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
 		       struct tc_cls_flower_offload *flow)
 {
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
 	struct nfp_fl_payload *nfp_flow;
 	int err;
 
@@ -442,6 +445,7 @@ nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
 
 err_free_flow:
 	hash_del_rcu(&nfp_flow->link);
+	port->tc_offload_cnt--;
 	kfree(nfp_flow->action_data);
 	kfree(nfp_flow->mask_data);
 	kfree(nfp_flow->unmasked_data);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 437964afa8ee..20546ae67909 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -92,7 +92,6 @@ extern const struct nfp_app_type app_flower;
  * @stop:	stop application logic
  * @ctrl_msg_rx:    control message handler
  * @setup_tc:	setup TC ndo
- * @tc_busy:	TC HW offload busy (rules loaded)
  * @bpf:	BPF ndo offload-related calls
  * @xdp_offload:    offload an XDP program
  * @eswitch_mode_get:    get SR-IOV eswitch mode
@@ -135,7 +134,6 @@ struct nfp_app_type {
 
 	int (*setup_tc)(struct nfp_app *app, struct net_device *netdev,
 			enum tc_setup_type type, void *type_data);
-	bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn);
 	int (*bpf)(struct nfp_app *app, struct nfp_net *nn,
 		   struct netdev_bpf *xdp);
 	int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn,
@@ -301,13 +299,6 @@ static inline bool nfp_app_has_tc(struct nfp_app *app)
 	return app && app->type->setup_tc;
 }
 
-static inline bool nfp_app_tc_busy(struct nfp_app *app, struct nfp_net *nn)
-{
-	if (!app || !app->type->tc_busy)
-		return false;
-	return app->type->tc_busy(app, nn);
-}
-
 static inline int nfp_app_setup_tc(struct nfp_app *app,
 				   struct net_device *netdev,
 				   enum tc_setup_type type, void *type_data)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index fe77ea8b656c..19e989239af7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3210,10 +3210,9 @@ static int nfp_net_set_features(struct net_device *netdev,
 			new_ctrl &= ~NFP_NET_CFG_CTRL_GATHER;
 	}
 
-	if (changed & NETIF_F_HW_TC && nfp_app_tc_busy(nn->app, nn)) {
-		nn_err(nn, "Cannot disable HW TC offload while in use\n");
-		return -EBUSY;
-	}
+	err = nfp_port_set_features(netdev, features);
+	if (err)
+		return err;
 
 	nn_dbg(nn, "Feature change 0x%llx -> 0x%llx (changed=0x%llx)\n",
 	       netdev->features, features, changed);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index f67da6bde9da..619570524d2a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -265,6 +265,7 @@ const struct net_device_ops nfp_repr_netdev_ops = {
 	.ndo_set_vf_spoofchk	= nfp_app_set_vf_spoofchk,
 	.ndo_get_vf_config	= nfp_app_get_vf_config,
 	.ndo_set_vf_link_state	= nfp_app_set_vf_link_state,
+	.ndo_set_features	= nfp_port_set_features,
 };
 
 static void nfp_repr_clean(struct nfp_repr *repr)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c
index 34a6e035fe9a..7bd8be5c833b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/lockdep.h>
+#include <linux/netdevice.h>
 #include <net/switchdev.h>
 
 #include "nfpcore/nfp_cpp.h"
@@ -100,6 +101,23 @@ int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 	return nfp_app_setup_tc(port->app, netdev, type, type_data);
 }
 
+int nfp_port_set_features(struct net_device *netdev, netdev_features_t features)
+{
+	struct nfp_port *port;
+
+	port = nfp_port_from_netdev(netdev);
+	if (!port)
+		return 0;
+
+	if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) &&
+	    port->tc_offload_cnt) {
+		netdev_err(netdev, "Cannot disable HW TC offload while offloads active\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
 struct nfp_port *
 nfp_port_from_id(struct nfp_pf *pf, enum nfp_port_type type, unsigned int id)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index 21bd4aa32646..fa7e669a969c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -72,6 +72,8 @@ enum nfp_port_flags {
  * @netdev:	backpointer to associated netdev
  * @type:	what port type does the entity represent
  * @flags:	port flags
+ * @tc_offload_cnt:	number of active TC offloads, how offloads are counted
+ *			is not defined, use as a boolean
  * @app:	backpointer to the app structure
  * @dl_port:	devlink port structure
  * @eth_id:	for %NFP_PORT_PHYS_PORT port ID in NFP enumeration scheme
@@ -87,6 +89,7 @@ struct nfp_port {
 	enum nfp_port_type type;
 
 	unsigned long flags;
+	unsigned long tc_offload_cnt;
 
 	struct nfp_app *app;
 
@@ -121,6 +124,9 @@ static inline bool nfp_port_is_vnic(const struct nfp_port *port)
 	return port->type == NFP_PORT_PF_PORT || port->type == NFP_PORT_VF_PORT;
 }
 
+int
+nfp_port_set_features(struct net_device *netdev, netdev_features_t features);
+
 struct nfp_port *nfp_port_from_netdev(struct net_device *netdev);
 struct nfp_port *
 nfp_port_from_id(struct nfp_pf *pf, enum nfp_port_type type, unsigned int id);

From 0d592e52fbbda26155b686dd93878cf774a3ab0e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:25 -0800
Subject: [PATCH 51/82] nfp: limit the number of TSO segments

Most FWs limit the number of TSO segments a frame can produce
to 64.  This is for fairness and efficiency (of FW datapath)
reasons.  If a frame with larger number of segments is submitted
the FW will drop it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++
 drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h   | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 19e989239af7..a05be0ab2713 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3750,6 +3750,8 @@ static void nfp_net_netdev_init(struct nfp_net *nn)
 	netdev->min_mtu = ETH_MIN_MTU;
 	netdev->max_mtu = nn->max_mtu;
 
+	netdev->gso_max_segs = NFP_NET_LSO_MAX_SEGS;
+
 	netif_carrier_off(netdev);
 
 	nfp_net_set_ethtool_ops(netdev);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
index eeecef2caac6..4499a7333078 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
@@ -59,9 +59,12 @@
 #define NFP_NET_RX_OFFSET               32
 
 /**
- * Maximum header size supported for LSO frames
+ * LSO parameters
+ * %NFP_NET_LSO_MAX_HDR_SZ:	Maximum header size supported for LSO frames
+ * %NFP_NET_LSO_MAX_SEGS:	Maximum number of segments LSO frame can produce
  */
 #define NFP_NET_LSO_MAX_HDR_SZ		255
+#define NFP_NET_LSO_MAX_SEGS		64
 
 /**
  * Prepend field types

From 1a5e8e35000577cb9100d22daa8b5ebcfa2be9b2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:26 -0800
Subject: [PATCH 52/82] nfp: populate MODULE_VERSION

DKMS and similar out-of-tree module replacement services use
module version to make sure the out-of-tree software is not
older than the module shipped with the kernel.  We use the
kernel version in ethtool -i output, put it into MODULE_VERSION
as well.

Reported-by: Jan Gutter <jan.gutter@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index cc570bb6563c..ab301d56430b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -649,3 +649,4 @@ MODULE_FIRMWARE("netronome/nic_AMDA0099-0001_2x25.nffw");
 MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("The Netronome Flow Processor (NFP) driver.");
+MODULE_VERSION(UTS_RELEASE);

From 8c2f826dc36314059ac146c78d3bf8056b626446 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Feb 2018 15:59:07 +0000
Subject: [PATCH 53/82] rxrpc: Don't put crypto buffers on the stack

Don't put buffers of data to be handed to crypto on the stack as this may
cause an assertion failure in the kernel (see below).  Fix this by using an
kmalloc'd buffer instead.

kernel BUG at ./include/linux/scatterlist.h:147!
...
RIP: 0010:rxkad_encrypt_response.isra.6+0x191/0x1b0 [rxrpc]
RSP: 0018:ffffbe2fc06cfca8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff989277d59900 RCX: 0000000000000028
RDX: 0000259dc06cfd88 RSI: 0000000000000025 RDI: ffffbe30406cfd88
RBP: ffffbe2fc06cfd60 R08: ffffbe2fc06cfd08 R09: ffffbe2fc06cfd08
R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff7c5f80d9f95
R13: ffffbe2fc06cfd88 R14: ffff98927a3f7aa0 R15: ffffbe2fc06cfd08
FS:  0000000000000000(0000) GS:ffff98927fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b1ff28f0f8 CR3: 000000001b412003 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 rxkad_respond_to_challenge+0x297/0x330 [rxrpc]
 rxrpc_process_connection+0xd1/0x690 [rxrpc]
 ? process_one_work+0x1c3/0x680
 ? __lock_is_held+0x59/0xa0
 process_one_work+0x249/0x680
 worker_thread+0x3a/0x390
 ? process_one_work+0x680/0x680
 kthread+0x121/0x140
 ? kthread_create_worker_on_cpu+0x70/0x70
 ret_from_fork+0x3a/0x50

Reported-by: Jonathan Billings <jsbillings@jsbillings.org>
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jonathan Billings <jsbillings@jsbillings.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/conn_event.c |  1 +
 net/rxrpc/rxkad.c      | 90 +++++++++++++++++++++++-------------------
 2 files changed, 51 insertions(+), 40 deletions(-)

diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 4ca11be6be3c..b1dfae107431 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -460,6 +460,7 @@ void rxrpc_process_connection(struct work_struct *work)
 		case -EKEYEXPIRED:
 		case -EKEYREJECTED:
 			goto protocol_error;
+		case -ENOMEM:
 		case -EAGAIN:
 			goto requeue_and_leave;
 		case -ECONNABORTED:
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index c38b3a1de56c..77cb23c7bd0a 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -773,8 +773,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 {
 	const struct rxrpc_key_token *token;
 	struct rxkad_challenge challenge;
-	struct rxkad_response resp
-		__attribute__((aligned(8))); /* must be aligned for crypto */
+	struct rxkad_response *resp;
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	const char *eproto;
 	u32 version, nonce, min_level, abort_code;
@@ -818,26 +817,29 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	token = conn->params.key->payload.data[0];
 
 	/* build the response packet */
-	memset(&resp, 0, sizeof(resp));
+	resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
+	if (!resp)
+		return -ENOMEM;
 
-	resp.version			= htonl(RXKAD_VERSION);
-	resp.encrypted.epoch		= htonl(conn->proto.epoch);
-	resp.encrypted.cid		= htonl(conn->proto.cid);
-	resp.encrypted.securityIndex	= htonl(conn->security_ix);
-	resp.encrypted.inc_nonce	= htonl(nonce + 1);
-	resp.encrypted.level		= htonl(conn->params.security_level);
-	resp.kvno			= htonl(token->kad->kvno);
-	resp.ticket_len			= htonl(token->kad->ticket_len);
-
-	resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
-	resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
-	resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
-	resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
+	resp->version			= htonl(RXKAD_VERSION);
+	resp->encrypted.epoch		= htonl(conn->proto.epoch);
+	resp->encrypted.cid		= htonl(conn->proto.cid);
+	resp->encrypted.securityIndex	= htonl(conn->security_ix);
+	resp->encrypted.inc_nonce	= htonl(nonce + 1);
+	resp->encrypted.level		= htonl(conn->params.security_level);
+	resp->kvno			= htonl(token->kad->kvno);
+	resp->ticket_len		= htonl(token->kad->ticket_len);
+	resp->encrypted.call_id[0]	= htonl(conn->channels[0].call_counter);
+	resp->encrypted.call_id[1]	= htonl(conn->channels[1].call_counter);
+	resp->encrypted.call_id[2]	= htonl(conn->channels[2].call_counter);
+	resp->encrypted.call_id[3]	= htonl(conn->channels[3].call_counter);
 
 	/* calculate the response checksum and then do the encryption */
-	rxkad_calc_response_checksum(&resp);
-	rxkad_encrypt_response(conn, &resp, token->kad);
-	return rxkad_send_response(conn, &sp->hdr, &resp, token->kad);
+	rxkad_calc_response_checksum(resp);
+	rxkad_encrypt_response(conn, resp, token->kad);
+	ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad);
+	kfree(resp);
+	return ret;
 
 protocol_error:
 	trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto);
@@ -1048,8 +1050,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 				 struct sk_buff *skb,
 				 u32 *_abort_code)
 {
-	struct rxkad_response response
-		__attribute__((aligned(8))); /* must be aligned for crypto */
+	struct rxkad_response *response;
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_crypt session_key;
 	const char *eproto;
@@ -1061,17 +1062,22 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 
 	_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
 
+	ret = -ENOMEM;
+	response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
+	if (!response)
+		goto temporary_error;
+
 	eproto = tracepoint_string("rxkad_rsp_short");
 	abort_code = RXKADPACKETSHORT;
 	if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
-			  &response, sizeof(response)) < 0)
+			  response, sizeof(*response)) < 0)
 		goto protocol_error;
-	if (!pskb_pull(skb, sizeof(response)))
+	if (!pskb_pull(skb, sizeof(*response)))
 		BUG();
 
-	version = ntohl(response.version);
-	ticket_len = ntohl(response.ticket_len);
-	kvno = ntohl(response.kvno);
+	version = ntohl(response->version);
+	ticket_len = ntohl(response->ticket_len);
+	kvno = ntohl(response->kvno);
 	_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
 	       sp->hdr.serial, version, kvno, ticket_len);
 
@@ -1105,31 +1111,31 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key,
 				   &expiry, _abort_code);
 	if (ret < 0)
-		goto temporary_error_free;
+		goto temporary_error_free_resp;
 
 	/* use the session key from inside the ticket to decrypt the
 	 * response */
-	rxkad_decrypt_response(conn, &response, &session_key);
+	rxkad_decrypt_response(conn, response, &session_key);
 
 	eproto = tracepoint_string("rxkad_rsp_param");
 	abort_code = RXKADSEALEDINCON;
-	if (ntohl(response.encrypted.epoch) != conn->proto.epoch)
+	if (ntohl(response->encrypted.epoch) != conn->proto.epoch)
 		goto protocol_error_free;
-	if (ntohl(response.encrypted.cid) != conn->proto.cid)
+	if (ntohl(response->encrypted.cid) != conn->proto.cid)
 		goto protocol_error_free;
-	if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
+	if (ntohl(response->encrypted.securityIndex) != conn->security_ix)
 		goto protocol_error_free;
-	csum = response.encrypted.checksum;
-	response.encrypted.checksum = 0;
-	rxkad_calc_response_checksum(&response);
+	csum = response->encrypted.checksum;
+	response->encrypted.checksum = 0;
+	rxkad_calc_response_checksum(response);
 	eproto = tracepoint_string("rxkad_rsp_csum");
-	if (response.encrypted.checksum != csum)
+	if (response->encrypted.checksum != csum)
 		goto protocol_error_free;
 
 	spin_lock(&conn->channel_lock);
 	for (i = 0; i < RXRPC_MAXCALLS; i++) {
 		struct rxrpc_call *call;
-		u32 call_id = ntohl(response.encrypted.call_id[i]);
+		u32 call_id = ntohl(response->encrypted.call_id[i]);
 
 		eproto = tracepoint_string("rxkad_rsp_callid");
 		if (call_id > INT_MAX)
@@ -1153,12 +1159,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 
 	eproto = tracepoint_string("rxkad_rsp_seq");
 	abort_code = RXKADOUTOFSEQUENCE;
-	if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1)
+	if (ntohl(response->encrypted.inc_nonce) != conn->security_nonce + 1)
 		goto protocol_error_free;
 
 	eproto = tracepoint_string("rxkad_rsp_level");
 	abort_code = RXKADLEVELFAIL;
-	level = ntohl(response.encrypted.level);
+	level = ntohl(response->encrypted.level);
 	if (level > RXRPC_SECURITY_ENCRYPT)
 		goto protocol_error_free;
 	conn->params.security_level = level;
@@ -1168,9 +1174,10 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	 * as for a client connection */
 	ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno);
 	if (ret < 0)
-		goto temporary_error_free;
+		goto temporary_error_free_ticket;
 
 	kfree(ticket);
+	kfree(response);
 	_leave(" = 0");
 	return 0;
 
@@ -1179,12 +1186,15 @@ protocol_error_unlock:
 protocol_error_free:
 	kfree(ticket);
 protocol_error:
+	kfree(response);
 	trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto);
 	*_abort_code = abort_code;
 	return -EPROTO;
 
-temporary_error_free:
+temporary_error_free_ticket:
 	kfree(ticket);
+temporary_error_free_resp:
+	kfree(response);
 temporary_error:
 	/* Ignore the response packet if we got a temporary error such as
 	 * ENOMEM.  We just want to send the challenge again.  Note that we

From cb9f7a9a5c96a773bbc9c70660dc600cfff82f82 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 6 Feb 2018 14:48:32 +0100
Subject: [PATCH 54/82] netlink: ensure to loop over all netns in
 genlmsg_multicast_allns()

Nowadays, nlmsg_multicast() returns only 0 or -ESRCH but this was not the
case when commit 134e63756d5f was pushed.
However, there was no reason to stop the loop if a netns does not have
listeners.
Returns -ESRCH only if there was no listeners in all netns.

To avoid having the same problem in the future, I didn't take the
assumption that nlmsg_multicast() returns only 0 or -ESRCH.

Fixes: 134e63756d5f ("genetlink: make netns aware")
CC: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/genetlink.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index d444daf1ac04..6f02499ef007 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1081,6 +1081,7 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 {
 	struct sk_buff *tmp;
 	struct net *net, *prev = NULL;
+	bool delivered = false;
 	int err;
 
 	for_each_net_rcu(net) {
@@ -1092,14 +1093,21 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 			}
 			err = nlmsg_multicast(prev->genl_sock, tmp,
 					      portid, group, flags);
-			if (err)
+			if (!err)
+				delivered = true;
+			else if (err != -ESRCH)
 				goto error;
 		}
 
 		prev = net;
 	}
 
-	return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
+	err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
+	if (!err)
+		delivered = true;
+	else if (err != -ESRCH)
+		goto error;
+	return delivered ? 0 : -ESRCH;
  error:
 	kfree_skb(skb);
 	return err;

From 762c330d670e3d4b795cf7a8d761866fdd1eef49 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 7 Feb 2018 17:14:46 +0800
Subject: [PATCH 55/82] tuntap: add missing xdp flush

When using devmap to redirect packets between interfaces,
xdp_do_flush() is usually a must to flush any batched
packets. Unfortunately this is missed in current tuntap
implementation.

Unlike most hardware driver which did XDP inside NAPI loop and call
xdp_do_flush() at then end of each round of poll. TAP did it in the
context of process e.g tun_get_user(). So fix this by count the
pending redirected packets and flush when it exceeds NAPI_POLL_WEIGHT
or MSG_MORE was cleared by sendmsg() caller.

With this fix, xdp_redirect_map works again between two TAPs.

Fixes: 761876c857cb ("tap: XDP support")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0dc66e4fbb2c..17e496b88f81 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -181,6 +181,7 @@ struct tun_file {
 	struct tun_struct *detached;
 	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
+	int xdp_pending_pkts;
 };
 
 struct tun_flow_entry {
@@ -1665,6 +1666,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		case XDP_REDIRECT:
 			get_page(alloc_frag->page);
 			alloc_frag->offset += buflen;
+			++tfile->xdp_pending_pkts;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			if (err)
 				goto err_redirect;
@@ -1986,6 +1988,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	result = tun_get_user(tun, tfile, NULL, from,
 			      file->f_flags & O_NONBLOCK, false);
 
+	if (tfile->xdp_pending_pkts) {
+		tfile->xdp_pending_pkts = 0;
+		xdp_do_flush_map();
+	}
+
 	tun_put(tun);
 	return result;
 }
@@ -2322,6 +2329,13 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
+
+	if (tfile->xdp_pending_pkts >= NAPI_POLL_WEIGHT ||
+	    !(m->msg_flags & MSG_MORE)) {
+		tfile->xdp_pending_pkts = 0;
+		xdp_do_flush_map();
+	}
+
 	tun_put(tun);
 	return ret;
 }
@@ -3153,6 +3167,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
 
 	memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
+	tfile->xdp_pending_pkts = 0;
 
 	return 0;
 }

From 4ff66cae7f10b65b028dc3bdaaad9cc2989ef6ae Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 7 Feb 2018 13:53:20 +0100
Subject: [PATCH 56/82] rtnetlink: require unique netns identifier

Since we've added support for IFLA_IF_NETNSID for RTM_{DEL,GET,SET,NEW}LINK
it is possible for userspace to send us requests with three different
properties to identify a target network namespace. This affects at least
RTM_{NEW,SET}LINK. Each of them could potentially refer to a different
network namespace which is confusing. For legacy reasons the kernel will
pick the IFLA_NET_NS_PID property first and then look for the
IFLA_NET_NS_FD property but there is no reason to extend this type of
behavior to network namespace ids. The regression potential is quite
minimal since the rtnetlink requests in question either won't allow
IFLA_IF_NETNSID requests before 4.16 is out (RTM_{NEW,SET}LINK) or don't
support IFLA_NET_NS_{PID,FD} (RTM_{DEL,GET}LINK) in the first place.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56af8e41abfc..bc290413a49d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1951,6 +1951,38 @@ static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
 	return net;
 }
 
+/* Verify that rtnetlink requests do not pass additional properties
+ * potentially referring to different network namespaces.
+ */
+static int rtnl_ensure_unique_netns(struct nlattr *tb[],
+				    struct netlink_ext_ack *extack,
+				    bool netns_id_only)
+{
+
+	if (netns_id_only) {
+		if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD])
+			return 0;
+
+		NL_SET_ERR_MSG(extack, "specified netns attribute not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+		goto invalid_attr;
+
+	if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD]))
+		goto invalid_attr;
+
+	if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID]))
+		goto invalid_attr;
+
+	return 0;
+
+invalid_attr:
+	NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified");
+	return -EINVAL;
+}
+
 static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 {
 	if (dev) {
@@ -2553,6 +2585,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
+	err = rtnl_ensure_unique_netns(tb, extack, false);
+	if (err < 0)
+		goto errout;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -2649,6 +2685,10 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	err = rtnl_ensure_unique_netns(tb, extack, true);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 
@@ -2802,6 +2842,10 @@ replay:
 	if (err < 0)
 		return err;
 
+	err = rtnl_ensure_unique_netns(tb, extack, false);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -3045,6 +3089,10 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	err = rtnl_ensure_unique_netns(tb, extack, true);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
 		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);

From 583133b35ecaea84590e92e2732d1a472747ca7d Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Wed, 7 Feb 2018 10:17:29 -0600
Subject: [PATCH 57/82] atm: he: use 64-bit arithmetic instead of 32-bit

Add suffix ULL to constants 272, 204, 136 and 68 in order to give the
compiler complete information about the proper arithmetic to use.
Notice that these constants are used in contexts that expect
expressions of type unsigned long long (64 bits, unsigned).

The following expressions are currently being evaluated using 32-bit
arithmetic:

272 * mult
204 * mult
136 * mult
68 * mult

Addresses-Coverity-ID: 201058
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/he.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index e58538c29377..29f102dcfec4 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -738,13 +738,13 @@ static int he_init_cs_block_rcm(struct he_dev *he_dev)
 #else
 		/* this is pretty, but avoids _divdu3 and is mostly correct */
 		mult = he_dev->atm_dev->link_rate / ATM_OC3_PCR;
-		if (rate_cps > (272 * mult))
+		if (rate_cps > (272ULL * mult))
 			buf = 4;
-		else if (rate_cps > (204 * mult))
+		else if (rate_cps > (204ULL * mult))
 			buf = 3;
-		else if (rate_cps > (136 * mult))
+		else if (rate_cps > (136ULL * mult))
 			buf = 2;
-		else if (rate_cps > (68 * mult))
+		else if (rate_cps > (68ULL * mult))
 			buf = 1;
 		else
 			buf = 0;

From ec95dffa408f0c24c0b358f3723c6ba262190965 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 7 Feb 2018 13:00:24 -0600
Subject: [PATCH 58/82] ibmvnic: queue reset when CRQ gets closed during reset

While handling a driver reset we get a H_CLOSED return trying
to send a CRQ event. When this occurs we need to queue up another
reset attempt. Without doing this we see instances where the driver
is left in a closed state because the reset failed and there is no
further attempts to reset the driver.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 8dabc9d9dfa6..dd4a2946e1da 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2914,8 +2914,12 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter,
 				cpu_to_be64(u64_crq[1]));
 
 	if (rc) {
-		if (rc == H_CLOSED)
+		if (rc == H_CLOSED) {
 			dev_warn(dev, "CRQ Queue closed\n");
+			if (adapter->resetting)
+				ibmvnic_reset(adapter, VNIC_RESET_FATAL);
+		}
+
 		dev_warn(dev, "Send error (rc=%d)\n", rc);
 	}
 

From e728789c52afccc1275cba1dd812f03abe16ea3c Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 7 Feb 2018 20:35:00 +0100
Subject: [PATCH 59/82] net: Extra '_get' in declaration of
 arch_get_platform_mac_address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit c7f5d105495a ("net: Add eth_platform_get_mac_address() helper."),
two declarations were added:

  int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
  unsigned char *arch_get_platform_get_mac_address(void);

An extra '_get' was introduced in arch_get_platform_get_mac_address, remove
it. Fix compile warning using W=1:

  CC      net/ethernet/eth.o
net/ethernet/eth.c:523:24: warning: no previous prototype for ‘arch_get_platform_mac_address’ [-Wmissing-prototypes]
 unsigned char * __weak arch_get_platform_mac_address(void)
                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  AR      net/ethernet/built-in.o

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 263dbcad22fc..79563840c295 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -31,7 +31,7 @@
 #ifdef __KERNEL__
 struct device;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
-unsigned char *arch_get_platform_get_mac_address(void);
+unsigned char *arch_get_platform_mac_address(void);
 u32 eth_get_headlen(void *data, unsigned int max_len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern const struct header_ops eth_header_ops;

From 79a8a642bf05cd0dced20621f6fef9d884124abd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 7 Feb 2018 17:44:38 -0800
Subject: [PATCH 60/82] net: Whitelist the skbuff_head_cache "cb" field

Most callers of put_cmsg() use a "sizeof(foo)" for the length argument.
Within put_cmsg(), a copy_to_user() call is made with a dynamic size, as a
result of the cmsg header calculations. This means that hardened usercopy
will examine the copy, even though it was technically a fixed size and
should be implicitly whitelisted. All the put_cmsg() calls being built
from values in skbuff_head_cache are coming out of the protocol-defined
"cb" field, so whitelist this field entirely instead of creating per-use
bounce buffers, for which there are concerns about performance.

Original report was:

Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'skbuff_head_cache' (offset 64, size 16)!
WARNING: CPU: 0 PID: 3663 at mm/usercopy.c:81 usercopy_warn+0xdb/0x100 mm/usercopy.c:76
...
 __check_heap_object+0x89/0xc0 mm/slab.c:4426
 check_heap_object mm/usercopy.c:236 [inline]
 __check_object_size+0x272/0x530 mm/usercopy.c:259
 check_object_size include/linux/thread_info.h:112 [inline]
 check_copy_size include/linux/thread_info.h:143 [inline]
 copy_to_user include/linux/uaccess.h:154 [inline]
 put_cmsg+0x233/0x3f0 net/core/scm.c:242
 sock_recv_errqueue+0x200/0x3e0 net/core/sock.c:2913
 packet_recvmsg+0xb2e/0x17a0 net/packet/af_packet.c:3296
 sock_recvmsg_nosec net/socket.c:803 [inline]
 sock_recvmsg+0xc9/0x110 net/socket.c:810
 ___sys_recvmsg+0x2a4/0x640 net/socket.c:2179
 __sys_recvmmsg+0x2a9/0xaf0 net/socket.c:2287
 SYSC_recvmmsg net/socket.c:2368 [inline]
 SyS_recvmmsg+0xc4/0x160 net/socket.c:2352
 entry_SYSCALL_64_fastpath+0x29/0xa0

Reported-by: syzbot+e2d6cfb305e9f3911dea@syzkaller.appspotmail.com
Fixes: 6d07d1cd300f ("usercopy: Restrict non-usercopy caches to size 0")
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c61c27c1b28..09bd89c90a71 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3894,10 +3894,12 @@ EXPORT_SYMBOL_GPL(skb_gro_receive);
 
 void __init skb_init(void)
 {
-	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+	skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
 					      sizeof(struct sk_buff),
 					      0,
 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					      offsetof(struct sk_buff, cb),
+					      sizeof_field(struct sk_buff, cb),
 					      NULL);
 	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
 						sizeof(struct sk_buff_fclones),

From ebeeb1ad9b8adcc37c2ec21a96f39e9d35199b46 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Sat, 3 Feb 2018 04:26:51 -0800
Subject: [PATCH 61/82] rds: tcp: use rds_destroy_pending() to synchronize
 netns/module teardown and rds connection/workq management

An rds_connection can get added during netns deletion between lines 528
and 529 of

  506 static void rds_tcp_kill_sock(struct net *net)
  :
  /* code to pull out all the rds_connections that should be destroyed */
  :
  528         spin_unlock_irq(&rds_tcp_conn_lock);
  529         list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
  530                 rds_conn_destroy(tc->t_cpath->cp_conn);

Such an rds_connection would miss out the rds_conn_destroy()
loop (that cancels all pending work) and (if it was scheduled
after netns deletion) could trigger the use-after-free.

A similar race-window exists for the module unload path
in rds_tcp_exit -> rds_tcp_destroy_conns

Concurrency with netns deletion (rds_tcp_kill_sock()) must be handled
by checking check_net() before enqueuing new work or adding new
connections.

Concurrency with module-unload is handled by maintaining a module
specific flag that is set at the start of the module exit function,
and must be checked before enqueuing new work or adding new connections.

This commit refactors existing RDS_DESTROY_PENDING checks added by
commit 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with
connection teardown") and consolidates all the concurrency checks
listed above into the function rds_destroy_pending().

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/cong.c        |  2 +-
 net/rds/connection.c  | 15 +++++++++------
 net/rds/ib.c          | 17 +++++++++++++++++
 net/rds/ib_cm.c       |  1 +
 net/rds/rds.h         |  7 +++++++
 net/rds/send.c        | 10 +++++-----
 net/rds/tcp.c         | 42 ++++++++++++++++++++++++++++++------------
 net/rds/tcp_connect.c |  2 +-
 net/rds/tcp_recv.c    |  2 +-
 net/rds/tcp_send.c    |  2 +-
 net/rds/threads.c     |  6 +++---
 11 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/net/rds/cong.c b/net/rds/cong.c
index 8d19fd25dce3..63da9d2f142d 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -223,7 +223,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 
 		rcu_read_lock();
 		if (!test_and_set_bit(0, &conn->c_map_queued) &&
-		    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+		    !rds_destroy_pending(cp->cp_conn)) {
 			rds_stats_inc(s_cong_update_queued);
 			/* We cannot inline the call to rds_send_xmit() here
 			 * for two reasons (both pertaining to a TCP transport):
diff --git a/net/rds/connection.c b/net/rds/connection.c
index b10c0ef36d8d..94e190febfdd 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -220,8 +220,13 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 				     is_outgoing);
 		conn->c_path[i].cp_index = i;
 	}
-	ret = trans->conn_alloc(conn, gfp);
+	rcu_read_lock();
+	if (rds_destroy_pending(conn))
+		ret = -ENETDOWN;
+	else
+		ret = trans->conn_alloc(conn, gfp);
 	if (ret) {
+		rcu_read_unlock();
 		kfree(conn->c_path);
 		kmem_cache_free(rds_conn_slab, conn);
 		conn = ERR_PTR(ret);
@@ -283,6 +288,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		}
 	}
 	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 
 out:
 	return conn;
@@ -382,13 +388,10 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
 {
 	struct rds_message *rm, *rtmp;
 
-	set_bit(RDS_DESTROY_PENDING, &cp->cp_flags);
-
 	if (!cp->cp_transport_data)
 		return;
 
 	/* make sure lingering queued work won't try to ref the conn */
-	synchronize_rcu();
 	cancel_delayed_work_sync(&cp->cp_send_w);
 	cancel_delayed_work_sync(&cp->cp_recv_w);
 
@@ -691,7 +694,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
 	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
 
 	rcu_read_lock();
-	if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (!destroy && rds_destroy_pending(cp->cp_conn)) {
 		rcu_read_unlock();
 		return;
 	}
@@ -714,7 +717,7 @@ EXPORT_SYMBOL_GPL(rds_conn_drop);
 void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
 {
 	rcu_read_lock();
-	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (rds_destroy_pending(cp->cp_conn)) {
 		rcu_read_unlock();
 		return;
 	}
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ff0c98096af1..50a88f3e7e39 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -48,6 +48,7 @@
 static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
 static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
+static atomic_t rds_ib_unloading;
 
 module_param(rds_ib_mr_1m_pool_size, int, 0444);
 MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
@@ -378,8 +379,23 @@ static void rds_ib_unregister_client(void)
 	flush_workqueue(rds_wq);
 }
 
+static void rds_ib_set_unloading(void)
+{
+	atomic_set(&rds_ib_unloading, 1);
+}
+
+static bool rds_ib_is_unloading(struct rds_connection *conn)
+{
+	struct rds_conn_path *cp = &conn->c_path[0];
+
+	return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) ||
+		atomic_read(&rds_ib_unloading) != 0);
+}
+
 void rds_ib_exit(void)
 {
+	rds_ib_set_unloading();
+	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
@@ -413,6 +429,7 @@ struct rds_transport rds_ib_transport = {
 	.flush_mrs		= rds_ib_flush_mrs,
 	.t_owner		= THIS_MODULE,
 	.t_name			= "infiniband",
+	.t_unloading		= rds_ib_is_unloading,
 	.t_type			= RDS_TRANS_IB
 };
 
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 80fb6f63e768..eea1d8611b20 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -117,6 +117,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
 			  RDS_PROTOCOL_MINOR(conn->c_version));
+		set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags);
 		rds_conn_destroy(conn);
 		return;
 	} else {
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 374ae83b60d4..7301b9b01890 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -518,6 +518,7 @@ struct rds_transport {
 	void (*sync_mr)(void *trans_private, int direction);
 	void (*free_mr)(void *trans_private, int invalidate);
 	void (*flush_mrs)(void);
+	bool (*t_unloading)(struct rds_connection *conn);
 };
 
 struct rds_sock {
@@ -862,6 +863,12 @@ static inline void rds_mr_put(struct rds_mr *mr)
 		__rds_put_mr_final(mr);
 }
 
+static inline bool rds_destroy_pending(struct rds_connection *conn)
+{
+	return !check_net(rds_conn_net(conn)) ||
+	       (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
+}
+
 /* stats.c */
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 #define rds_stats_inc_which(which, member) do {		\
diff --git a/net/rds/send.c b/net/rds/send.c
index d3e32d1f3c7d..b1b0022b8370 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -162,7 +162,7 @@ restart:
 		goto out;
 	}
 
-	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (rds_destroy_pending(cp->cp_conn)) {
 		release_in_xmit(cp);
 		ret = -ENETUNREACH; /* dont requeue send work */
 		goto out;
@@ -444,7 +444,7 @@ over_batch:
 			if (batch_count < send_batch_count)
 				goto restart;
 			rcu_read_lock();
-			if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+			if (rds_destroy_pending(cp->cp_conn))
 				ret = -ENETUNREACH;
 			else
 				queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
@@ -1162,7 +1162,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	else
 		cpath = &conn->c_path[0];
 
-	if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) {
+	if (rds_destroy_pending(conn)) {
 		ret = -EAGAIN;
 		goto out;
 	}
@@ -1209,7 +1209,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	if (ret == -ENOMEM || ret == -EAGAIN) {
 		ret = 0;
 		rcu_read_lock();
-		if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags))
+		if (rds_destroy_pending(cpath->cp_conn))
 			ret = -ENETUNREACH;
 		else
 			queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
@@ -1295,7 +1295,7 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 
 	/* schedule the send work on rds_wq */
 	rcu_read_lock();
-	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	if (!rds_destroy_pending(cp->cp_conn))
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
 	rcu_read_unlock();
 
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9920d2f84eff..44c4652721af 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -49,6 +49,7 @@ static unsigned int rds_tcp_tc_count;
 /* Track rds_tcp_connection structs so they can be cleaned up */
 static DEFINE_SPINLOCK(rds_tcp_conn_lock);
 static LIST_HEAD(rds_tcp_conn_list);
+static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);
 
 static struct kmem_cache *rds_tcp_conn_slab;
 
@@ -274,14 +275,13 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr)
 static void rds_tcp_conn_free(void *arg)
 {
 	struct rds_tcp_connection *tc = arg;
-	unsigned long flags;
 
 	rdsdebug("freeing tc %p\n", tc);
 
-	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
+	spin_lock_bh(&rds_tcp_conn_lock);
 	if (!tc->t_tcp_node_detached)
 		list_del(&tc->t_tcp_node);
-	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
+	spin_unlock_bh(&rds_tcp_conn_lock);
 
 	kmem_cache_free(rds_tcp_conn_slab, tc);
 }
@@ -296,7 +296,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
 		if (!tc) {
 			ret = -ENOMEM;
-			break;
+			goto fail;
 		}
 		mutex_init(&tc->t_conn_path_lock);
 		tc->t_sock = NULL;
@@ -306,14 +306,19 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 		conn->c_path[i].cp_transport_data = tc;
 		tc->t_cpath = &conn->c_path[i];
+		tc->t_tcp_node_detached = true;
 
-		spin_lock_irq(&rds_tcp_conn_lock);
-		tc->t_tcp_node_detached = false;
-		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
-		spin_unlock_irq(&rds_tcp_conn_lock);
 		rdsdebug("rds_conn_path [%d] tc %p\n", i,
 			 conn->c_path[i].cp_transport_data);
 	}
+	spin_lock_bh(&rds_tcp_conn_lock);
+	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+		tc = conn->c_path[i].cp_transport_data;
+		tc->t_tcp_node_detached = false;
+		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+	}
+	spin_unlock_bh(&rds_tcp_conn_lock);
+fail:
 	if (ret) {
 		for (j = 0; j < i; j++)
 			rds_tcp_conn_free(conn->c_path[j].cp_transport_data);
@@ -332,6 +337,16 @@ static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
 	return false;
 }
 
+static void rds_tcp_set_unloading(void)
+{
+	atomic_set(&rds_tcp_unloading, 1);
+}
+
+static bool rds_tcp_is_unloading(struct rds_connection *conn)
+{
+	return atomic_read(&rds_tcp_unloading) != 0;
+}
+
 static void rds_tcp_destroy_conns(void)
 {
 	struct rds_tcp_connection *tc, *_tc;
@@ -370,6 +385,7 @@ struct rds_transport rds_tcp_transport = {
 	.t_type			= RDS_TRANS_TCP,
 	.t_prefer_loopback	= 1,
 	.t_mp_capable		= 1,
+	.t_unloading		= rds_tcp_is_unloading,
 };
 
 static unsigned int rds_tcp_netid;
@@ -513,7 +529,7 @@ static void rds_tcp_kill_sock(struct net *net)
 
 	rtn->rds_tcp_listen_sock = NULL;
 	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
-	spin_lock_irq(&rds_tcp_conn_lock);
+	spin_lock_bh(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
@@ -526,7 +542,7 @@ static void rds_tcp_kill_sock(struct net *net)
 			tc->t_tcp_node_detached = true;
 		}
 	}
-	spin_unlock_irq(&rds_tcp_conn_lock);
+	spin_unlock_bh(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
 		rds_conn_destroy(tc->t_cpath->cp_conn);
 }
@@ -574,7 +590,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 {
 	struct rds_tcp_connection *tc, *_tc;
 
-	spin_lock_irq(&rds_tcp_conn_lock);
+	spin_lock_bh(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
@@ -584,7 +600,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 		/* reconnect with new parameters */
 		rds_conn_path_drop(tc->t_cpath, false);
 	}
-	spin_unlock_irq(&rds_tcp_conn_lock);
+	spin_unlock_bh(&rds_tcp_conn_lock);
 }
 
 static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
@@ -607,6 +623,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
 
 static void rds_tcp_exit(void)
 {
+	rds_tcp_set_unloading();
+	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 	unregister_pernet_subsys(&rds_tcp_net_ops);
 	if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 534c67aeb20f..d999e7075645 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
 		 cp->cp_conn, tc, sock);
 
 	if (sock) {
-		if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		if (rds_destroy_pending(cp->cp_conn))
 			rds_tcp_set_linger(sock);
 		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
 		lock_sock(sock->sk);
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index dd707b9e73e5..b9fbd2ee74ef 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -323,7 +323,7 @@ void rds_tcp_data_ready(struct sock *sk)
 
 	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
 		rcu_read_lock();
-		if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		if (!rds_destroy_pending(cp->cp_conn))
 			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
 		rcu_read_unlock();
 	}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 16f65744d984..7df869d37afd 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -204,7 +204,7 @@ void rds_tcp_write_space(struct sock *sk)
 
 	rcu_read_lock();
 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
-	    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	    !rds_destroy_pending(cp->cp_conn))
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
 	rcu_read_unlock();
 
diff --git a/net/rds/threads.c b/net/rds/threads.c
index eb76db1360b0..c52861d77a59 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -88,7 +88,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
 	cp->cp_reconnect_jiffies = 0;
 	set_bit(0, &cp->cp_conn->c_map_queued);
 	rcu_read_lock();
-	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (!rds_destroy_pending(cp->cp_conn)) {
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
 		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
 	}
@@ -138,7 +138,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 	if (cp->cp_reconnect_jiffies == 0) {
 		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
 		rcu_read_lock();
-		if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		if (!rds_destroy_pending(cp->cp_conn))
 			queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
 		rcu_read_unlock();
 		return;
@@ -149,7 +149,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
 		 conn, &conn->c_laddr, &conn->c_faddr);
 	rcu_read_lock();
-	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	if (!rds_destroy_pending(cp->cp_conn))
 		queue_delayed_work(rds_wq, &cp->cp_conn_w,
 				   rand % cp->cp_reconnect_jiffies);
 	rcu_read_unlock();

From 3968523f855050b8195134da951b87c20bd66130 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 7 Feb 2018 22:34:24 -0800
Subject: [PATCH 62/82] mpls, nospec: Sanitize array index in mpls_label_ok()

mpls_label_ok() validates that the 'platform_label' array index from a
userspace netlink message payload is valid. Under speculation the
mpls_label_ok() result may not resolve in the CPU pipeline until after
the index is used to access an array element. Sanitize the index to zero
to prevent userspace-controlled arbitrary out-of-bounds speculation, a
precursor for a speculative execution side channel vulnerability.

Cc: <stable@vger.kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mpls/af_mpls.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5dce8336d33f..e545a3c9365f 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -8,6 +8,7 @@
 #include <linux/ipv6.h>
 #include <linux/mpls.h>
 #include <linux/netconf.h>
+#include <linux/nospec.h>
 #include <linux/vmalloc.h>
 #include <linux/percpu.h>
 #include <net/ip.h>
@@ -935,24 +936,27 @@ errout:
 	return err;
 }
 
-static bool mpls_label_ok(struct net *net, unsigned int index,
+static bool mpls_label_ok(struct net *net, unsigned int *index,
 			  struct netlink_ext_ack *extack)
 {
+	bool is_ok = true;
+
 	/* Reserved labels may not be set */
-	if (index < MPLS_LABEL_FIRST_UNRESERVED) {
+	if (*index < MPLS_LABEL_FIRST_UNRESERVED) {
 		NL_SET_ERR_MSG(extack,
 			       "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher");
-		return false;
+		is_ok = false;
 	}
 
 	/* The full 20 bit range may not be supported. */
-	if (index >= net->mpls.platform_labels) {
+	if (is_ok && *index >= net->mpls.platform_labels) {
 		NL_SET_ERR_MSG(extack,
 			       "Label >= configured maximum in platform_labels");
-		return false;
+		is_ok = false;
 	}
 
-	return true;
+	*index = array_index_nospec(*index, net->mpls.platform_labels);
+	return is_ok;
 }
 
 static int mpls_route_add(struct mpls_route_config *cfg,
@@ -975,7 +979,7 @@ static int mpls_route_add(struct mpls_route_config *cfg,
 		index = find_free_label(net);
 	}
 
-	if (!mpls_label_ok(net, index, extack))
+	if (!mpls_label_ok(net, &index, extack))
 		goto errout;
 
 	/* Append makes no sense with mpls */
@@ -1052,7 +1056,7 @@ static int mpls_route_del(struct mpls_route_config *cfg,
 
 	index = cfg->rc_label;
 
-	if (!mpls_label_ok(net, index, extack))
+	if (!mpls_label_ok(net, &index, extack))
 		goto errout;
 
 	mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
@@ -1810,7 +1814,7 @@ static int rtm_to_route_config(struct sk_buff *skb,
 				goto errout;
 
 			if (!mpls_label_ok(cfg->rc_nlinfo.nl_net,
-					   cfg->rc_label, extack))
+					   &cfg->rc_label, extack))
 				goto errout;
 			break;
 		}
@@ -2137,7 +2141,7 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 			goto errout;
 		}
 
-		if (!mpls_label_ok(net, in_label, extack)) {
+		if (!mpls_label_ok(net, &in_label, extack)) {
 			err = -EINVAL;
 			goto errout;
 		}

From eb53f7af6f15285e2f6ada97285395343ce9f433 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Thu, 8 Feb 2018 16:10:39 +0100
Subject: [PATCH 63/82] net/sched: cls_u32: fix cls_u32 on filter replace

The following sequence is currently broken:

 # tc qdisc add dev foo ingress
 # tc filter replace dev foo protocol all ingress \
   u32 match u8 0 0 action mirred egress mirror dev bar1
 # tc filter replace dev foo protocol all ingress \
   handle 800::800 pref 49152 \
   u32 match u8 0 0 action mirred egress mirror dev bar2
 Error: cls_u32: Key node flags do not match passed flags.
 We have an error talking to the kernel, -1

The error comes from u32_change() when comparing new and
existing flags. The existing ones always contains one of
TCA_CLS_FLAGS_{,NOT}_IN_HW flag depending on offloading state.
These flags cannot be passed from userspace so the condition
(n->flags != flags) in u32_change() always fails.

Fix the condition so the flags TCA_CLS_FLAGS_NOT_IN_HW and
TCA_CLS_FLAGS_IN_HW are not taken into account.

Fixes: 24d3dc6d27ea ("net/sched: cls_u32: Reflect HW offload status")
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 6311a548046b..c75e68e839c7 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -955,7 +955,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -EINVAL;
 		}
 
-		if (n->flags != flags) {
+		if ((n->flags ^ flags) &
+		    ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) {
 			NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags");
 			return -EINVAL;
 		}

From 55b3280d1e471795c08dbbe17325720a843e104c Mon Sep 17 00:00:00 2001
From: Hoang Le <hoang.h.le@dektek.com.au>
Date: Thu, 8 Feb 2018 17:16:25 +0100
Subject: [PATCH 64/82] tipc: fix skb truesize/datasize ratio control

In commit d618d09a68e4 ("tipc: enforce valid ratio between skb truesize
and contents") we introduced a test for ensuring that the condition
truesize/datasize <= 4 is true for a received buffer. Unfortunately this
test has two problems.

- Because of the integer arithmetics the test
  if (skb->truesize / buf_roundup_len(skb) > 4) will miss all
  ratios [4 < ratio < 5], which was not the intention.
- The buffer returned by skb_copy() inherits skb->truesize of the
  original buffer, which doesn't help the situation at all.

In this commit, we change the ratio condition and replace skb_copy()
with a call to skb_copy_expand() to finally get this right.

Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/msg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 55d8ba92291d..4e1c6f6450bb 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -208,8 +208,8 @@ bool tipc_msg_validate(struct sk_buff **_skb)
 	int msz, hsz;
 
 	/* Ensure that flow control ratio condition is satisfied */
-	if (unlikely(skb->truesize / buf_roundup_len(skb) > 4)) {
-		skb = skb_copy(skb, GFP_ATOMIC);
+	if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) {
+		skb = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC);
 		if (!skb)
 			return false;
 		kfree_skb(*_skb);

From 88c991a91729b402bbfbf247fdba16ac21c369ab Mon Sep 17 00:00:00 2001
From: Dean Nelson <dnelson@redhat.com>
Date: Thu, 8 Feb 2018 14:21:05 -0500
Subject: [PATCH 65/82] net: thunder: change q_len's type to handle max ring
 size

The Cavium thunder nicvf driver supports rx/tx rings of up to 65536 entries per.
The number of entires are stored in the q_len member of struct q_desc_mem. The
problem is that q_len being a u16, results in 65536 becoming 0.

In getting pointers to descriptors in the rings, the driver uses q_len minus 1
as a mask after incrementing the pointer, in order to go back to the beginning
and not go past the end of the ring.

With the q_len set to 0 the mask is no longer correct and the driver does go
beyond the end of the ring, causing various ills. Usually the first thing that
shows up is a "NETDEV WATCHDOG: enP2p1s0f1 (nicvf): transmit queue 7 timed out"
warning.

This patch remedies the problem by changing q_len to a u32.

Signed-off-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index 7d1e4e2aaad0..ce1eed7a6d63 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -213,7 +213,7 @@ struct rx_tx_queue_stats {
 struct q_desc_mem {
 	dma_addr_t	dma;
 	u64		size;
-	u16		q_len;
+	u32		q_len;
 	dma_addr_t	phys_base;
 	void		*base;
 	void		*unalign_base;

From 08f5138512180a479ce6b9d23b825c9f4cd3be77 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 8 Feb 2018 21:01:48 +0100
Subject: [PATCH 66/82] net: phy: fix phy_start to consider
 PHY_IGNORE_INTERRUPT

This condition wasn't adjusted when PHY_IGNORE_INTERRUPT (-2) was added
long ago. In case of PHY_IGNORE_INTERRUPT the MAC interrupt indicates
also PHY state changes and we should do what the symbol says.

Fixes: 84a527a41f38 ("net: phylib: fix interrupts re-enablement in phy_start")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index f3313a129531..e3e29c2b028b 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -822,7 +822,7 @@ void phy_start(struct phy_device *phydev)
 		phy_resume(phydev);
 
 		/* make sure interrupts are re-enabled for the PHY */
-		if (phydev->irq != PHY_POLL) {
+		if (phy_interrupt_is_valid(phydev)) {
 			err = phy_enable_interrupts(phydev);
 			if (err < 0)
 				break;

From 8c88181ed452707f3815827225517b1964463b95 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:12 +0100
Subject: [PATCH 67/82] bpf: Sync kernel ABI header with tooling header for
 bpf_common.h

I recently fixed up a lot of commits that forgot to keep the tooling
headers in sync.  And then I forgot to do the same thing in commit
cb5f7334d479 ("bpf: add comments to BPF ld/ldx sizes"). Let correct
that before people notice ;-).

Lawrence did partly fix/sync this for bpf.h in commit d6d4f60c3a09
("bpf: add selftest for tcpbpf").

Fixes: cb5f7334d479 ("bpf: add comments to BPF ld/ldx sizes")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf_common.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/include/uapi/linux/bpf_common.h b/tools/include/uapi/linux/bpf_common.h
index 18be90725ab0..ee97668bdadb 100644
--- a/tools/include/uapi/linux/bpf_common.h
+++ b/tools/include/uapi/linux/bpf_common.h
@@ -15,9 +15,10 @@
 
 /* ld/ldx fields */
 #define BPF_SIZE(code)  ((code) & 0x18)
-#define		BPF_W		0x00
-#define		BPF_H		0x08
-#define		BPF_B		0x10
+#define		BPF_W		0x00 /* 32-bit */
+#define		BPF_H		0x08 /* 16-bit */
+#define		BPF_B		0x10 /*  8-bit */
+/* eBPF		BPF_DW		0x18    64-bit */
 #define BPF_MODE(code)  ((code) & 0xe0)
 #define		BPF_IMM		0x00
 #define		BPF_ABS		0x20

From 077c066a6c5445423147496e6c9b9a2c2d2ee762 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:17 +0100
Subject: [PATCH 68/82] tools/libbpf: improve the pr_debug statements to
 contain section numbers

While debugging a bpf ELF loading issue, I needed to correlate the
ELF section number with the failed relocation section reference.
Thus, add section numbers/index to the pr_debug.

In debug mode, also print section that were skipped.  This helped
me identify that a section (.eh_frame) was skipped, and this was
the reason the relocation section (.rel.eh_frame) could not find
that section number.

The section numbers corresponds to the readelf tools Section Headers [Nr].

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c64840365433..0c794f7fc050 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -319,8 +319,8 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx,
 
 	prog->section_name = strdup(section_name);
 	if (!prog->section_name) {
-		pr_warning("failed to alloc name for prog under section %s\n",
-			   section_name);
+		pr_warning("failed to alloc name for prog under section(%d) %s\n",
+			   idx, section_name);
 		goto errout;
 	}
 
@@ -763,29 +763,29 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 
 		idx++;
 		if (gelf_getshdr(scn, &sh) != &sh) {
-			pr_warning("failed to get section header from %s\n",
-				   obj->path);
+			pr_warning("failed to get section(%d) header from %s\n",
+				   idx, obj->path);
 			err = -LIBBPF_ERRNO__FORMAT;
 			goto out;
 		}
 
 		name = elf_strptr(elf, ep->e_shstrndx, sh.sh_name);
 		if (!name) {
-			pr_warning("failed to get section name from %s\n",
-				   obj->path);
+			pr_warning("failed to get section(%d) name from %s\n",
+				   idx, obj->path);
 			err = -LIBBPF_ERRNO__FORMAT;
 			goto out;
 		}
 
 		data = elf_getdata(scn, 0);
 		if (!data) {
-			pr_warning("failed to get section data from %s(%s)\n",
-				   name, obj->path);
+			pr_warning("failed to get section(%d) data from %s(%s)\n",
+				   idx, name, obj->path);
 			err = -LIBBPF_ERRNO__FORMAT;
 			goto out;
 		}
-		pr_debug("section %s, size %ld, link %d, flags %lx, type=%d\n",
-			 name, (unsigned long)data->d_size,
+		pr_debug("section(%d) %s, size %ld, link %d, flags %lx, type=%d\n",
+			 idx, name, (unsigned long)data->d_size,
 			 (int)sh.sh_link, (unsigned long)sh.sh_flags,
 			 (int)sh.sh_type);
 
@@ -840,6 +840,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 				obj->efile.reloc[n].shdr = sh;
 				obj->efile.reloc[n].data = data;
 			}
+		} else {
+			pr_debug("skip section(%d) %s\n", idx, name);
 		}
 		if (err)
 			goto out;
@@ -1119,8 +1121,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 
 		prog = bpf_object__find_prog_by_idx(obj, idx);
 		if (!prog) {
-			pr_warning("relocation failed: no %d section\n",
-				   idx);
+			pr_warning("relocation failed: no section(%d)\n", idx);
 			return -LIBBPF_ERRNO__RELOC;
 		}
 

From 864db336c677c288d81524c30a656804785902f9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:22 +0100
Subject: [PATCH 69/82] selftests/bpf: add test program for loading BPF ELF
 files

V2: Moved program into selftests/bpf from tools/libbpf

This program can be used on its own for testing/debugging if a
BPF ELF-object file can be loaded with libbpf (from tools/lib/bpf).

If something is wrong with the ELF object, the program have
a --debug mode that will display the ELF sections and especially
the skipped sections.  This allows for quickly identifying the
problematic ELF section number, which can be corrolated with the
readelf tool.

The program signal error via return codes, and also have
a --quiet mode, which is practical for use in scripts like
selftests/bpf.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 .../testing/selftests/bpf/test_libbpf_open.c  | 150 ++++++++++++++++++
 2 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_libbpf_open.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 566d6adc172a..3f48da0866ba 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -20,7 +20,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o test_tcpbpf_kern.o
+	sample_map_ret0.o test_tcpbpf_kern.o test_libbpf_open
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/test_libbpf_open.c b/tools/testing/selftests/bpf/test_libbpf_open.c
new file mode 100644
index 000000000000..8fcd1c076add
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_libbpf_open.c
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+static const char *__doc__ =
+	"Libbpf test program for loading BPF ELF object files";
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <bpf/libbpf.h>
+#include <getopt.h>
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"debug",	no_argument,		NULL, 'D' },
+	{"quiet",	no_argument,		NULL, 'q' },
+	{0, 0, NULL,  0 }
+};
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf("\nDOCUMENTATION:\n%s\n\n", __doc__);
+	printf(" Usage: %s (options-see-below) BPF_FILE\n", argv[0]);
+	printf(" Listing options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		printf(" short-option: -%c",
+		       long_options[i].val);
+		printf("\n");
+	}
+	printf("\n");
+}
+
+#define DEFINE_PRINT_FN(name, enabled) \
+static int libbpf_##name(const char *fmt, ...)  	\
+{							\
+        va_list args;					\
+        int ret;					\
+							\
+        va_start(args, fmt);				\
+	if (enabled) {					\
+		fprintf(stderr, "[" #name "] ");	\
+		ret = vfprintf(stderr, fmt, args);	\
+	}						\
+        va_end(args);					\
+        return ret;					\
+}
+DEFINE_PRINT_FN(warning, 1)
+DEFINE_PRINT_FN(info, 1)
+DEFINE_PRINT_FN(debug, 1)
+
+#define EXIT_FAIL_LIBBPF EXIT_FAILURE
+#define EXIT_FAIL_OPTION 2
+
+int test_walk_progs(struct bpf_object *obj, bool verbose)
+{
+	struct bpf_program *prog;
+	int cnt = 0;
+
+	bpf_object__for_each_program(prog, obj) {
+		cnt++;
+		if (verbose)
+			printf("Prog (count:%d) section_name: %s\n", cnt,
+			       bpf_program__title(prog, false));
+	}
+	return 0;
+}
+
+int test_walk_maps(struct bpf_object *obj, bool verbose)
+{
+	struct bpf_map *map;
+	int cnt = 0;
+
+	bpf_map__for_each(map, obj) {
+		cnt++;
+		if (verbose)
+			printf("Map (count:%d) name: %s\n", cnt,
+			       bpf_map__name(map));
+	}
+	return 0;
+}
+
+int test_open_file(char *filename, bool verbose)
+{
+	struct bpf_object *bpfobj = NULL;
+	long err;
+
+	if (verbose)
+		printf("Open BPF ELF-file with libbpf: %s\n", filename);
+
+	/* Load BPF ELF object file and check for errors */
+	bpfobj = bpf_object__open(filename);
+	err = libbpf_get_error(bpfobj);
+	if (err) {
+		char err_buf[128];
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		if (verbose)
+			printf("Unable to load eBPF objects in file '%s': %s\n",
+			       filename, err_buf);
+		return EXIT_FAIL_LIBBPF;
+	}
+	test_walk_progs(bpfobj, verbose);
+	test_walk_maps(bpfobj, verbose);
+
+	if (verbose)
+		printf("Close BPF ELF-file with libbpf: %s\n",
+		       bpf_object__name(bpfobj));
+	bpf_object__close(bpfobj);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	char filename[1024] = { 0 };
+	bool verbose = 1;
+	int longindex = 0;
+	int opt;
+
+	libbpf_set_print(libbpf_warning, libbpf_info, NULL);
+
+	/* Parse commands line args */
+	while ((opt = getopt_long(argc, argv, "hDq",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'D':
+			libbpf_set_print(libbpf_warning, libbpf_info,
+					 libbpf_debug);
+			break;
+		case 'q': /* Use in scripting mode */
+			verbose = 0;
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	if (optind >= argc) {
+		usage(argv);
+		printf("ERROR: Expected BPF_FILE argument after options\n");
+		return EXIT_FAIL_OPTION;
+	}
+	snprintf(filename, sizeof(filename), "%s", argv[optind]);
+
+	return test_open_file(filename, verbose);
+}

From f09b2e382e9a7053e3ae6f2fb6535efd5760cf5d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:27 +0100
Subject: [PATCH 70/82] selftests/bpf: add selftest that use test_libbpf_open

This script test_libbpf.sh will be part of the 'make run_tests'
invocation, but can also be invoked manually in this directory,
and a verbose mode can be enabled via setting the environment
variable $VERBOSE like:

 $ VERBOSE=yes ./test_libbpf.sh

The script contains some tests that are commented out, as they
currently fail.  They are reminders about what we need to improve
for the libbpf loader library.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile       | 14 ++++++-
 tools/testing/selftests/bpf/test_libbpf.sh | 49 ++++++++++++++++++++++
 2 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/test_libbpf.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3f48da0866ba..5c43c187f27c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,6 +13,7 @@ endif
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
 LDLIBS += -lcap -lelf -lrt -lpthread
 
+# Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
 
@@ -20,17 +21,26 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o test_tcpbpf_kern.o test_libbpf_open
+	sample_map_ret0.o test_tcpbpf_kern.o
 
-TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
+# Order correspond to 'make run_tests' order
+TEST_PROGS := test_kmod.sh \
+	test_libbpf.sh \
+	test_xdp_redirect.sh \
+	test_xdp_meta.sh \
 	test_offload.py
 
+# Compile but not part of 'make run_tests'
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open
+
 include ../lib.mk
 
 BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c
 
 $(TEST_GEN_PROGS): $(BPFOBJ)
 
+$(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
+
 .PHONY: force
 
 # force a rebuild of BPFOBJ when its dependencies are updated
diff --git a/tools/testing/selftests/bpf/test_libbpf.sh b/tools/testing/selftests/bpf/test_libbpf.sh
new file mode 100755
index 000000000000..d97dc914cd49
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_libbpf.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+export TESTNAME=test_libbpf
+
+# Determine selftest success via shell exit code
+exit_handler()
+{
+	if (( $? == 0 )); then
+		echo "selftests: $TESTNAME [PASS]";
+	else
+		echo "$TESTNAME: failed at file $LAST_LOADED" 1>&2
+		echo "selftests: $TESTNAME [FAILED]";
+	fi
+}
+
+libbpf_open_file()
+{
+	LAST_LOADED=$1
+	if [ -n "$VERBOSE" ]; then
+	    ./test_libbpf_open $1
+	else
+	    ./test_libbpf_open --quiet $1
+	fi
+}
+
+# Exit script immediately (well catched by trap handler) if any
+# program/thing exits with a non-zero status.
+set -e
+
+# (Use 'trap -l' to list meaning of numbers)
+trap exit_handler 0 2 3 6 9
+
+libbpf_open_file test_l4lb.o
+
+# TODO: fix libbpf to load noinline functions
+# [warning] libbpf: incorrect bpf_call opcode
+#libbpf_open_file test_l4lb_noinline.o
+
+# TODO: fix test_xdp_meta.c to load with libbpf
+# [warning] libbpf: test_xdp_meta.o doesn't provide kernel version
+#libbpf_open_file test_xdp_meta.o
+
+# TODO: fix libbpf to handle .eh_frame
+# [warning] libbpf: relocation failed: no section(10)
+#libbpf_open_file ../../../../samples/bpf/tracex3_kern.o
+
+# Success
+exit 0

From e3d91b0ca523d53158f435a3e13df7f0cb360ea2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:32 +0100
Subject: [PATCH 71/82] tools/libbpf: handle issues with bpf ELF objects
 containing .eh_frames

V3: More generic skipping of relo-section (suggested by Daniel)

If clang >= 4.0.1 is missing the option '-target bpf', it will cause
llc/llvm to create two ELF sections for "Exception Frames", with
section names '.eh_frame' and '.rel.eh_frame'.

The BPF ELF loader library libbpf fails when loading files with these
sections.  The other in-kernel BPF ELF loader in samples/bpf/bpf_load.c,
handle this gracefully. And iproute2 loader also seems to work with these
"eh" sections.

The issue in libbpf is caused by bpf_object__elf_collect() skipping
some sections, and later when performing relocation it will be
pointing to a skipped section, as these sections cannot be found by
bpf_object__find_prog_by_idx() in bpf_object__collect_reloc().

This is a general issue that also occurs for other sections, like
debug sections which are also skipped and can have relo section.

As suggested by Daniel.  To avoid keeping state about all skipped
sections, instead perform a direct qlookup in the ELF object.  Lookup
the section that the relo-section points to and check if it contains
executable machine instructions (denoted by the sh_flags
SHF_EXECINSTR).  Use this check to also skip irrelevant relo-sections.

Note, for samples/bpf/ the '-target bpf' parameter to clang cannot be used
due to incompatibility with asm embedded headers, that some of the samples
include. This is explained in more details by Yonghong Song in bpf_devel_QA.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0c794f7fc050..97073d649c1a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -742,6 +742,24 @@ bpf_object__init_maps(struct bpf_object *obj)
 	return 0;
 }
 
+static bool section_have_execinstr(struct bpf_object *obj, int idx)
+{
+	Elf_Scn *scn;
+	GElf_Shdr sh;
+
+	scn = elf_getscn(obj->efile.elf, idx);
+	if (!scn)
+		return false;
+
+	if (gelf_getshdr(scn, &sh) != &sh)
+		return false;
+
+	if (sh.sh_flags & SHF_EXECINSTR)
+		return true;
+
+	return false;
+}
+
 static int bpf_object__elf_collect(struct bpf_object *obj)
 {
 	Elf *elf = obj->efile.elf;
@@ -825,6 +843,14 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		} else if (sh.sh_type == SHT_REL) {
 			void *reloc = obj->efile.reloc;
 			int nr_reloc = obj->efile.nr_reloc + 1;
+			int sec = sh.sh_info; /* points to other section */
+
+			/* Only do relo for section with exec instructions */
+			if (!section_have_execinstr(obj, sec)) {
+				pr_debug("skip relo %s(%d) for section(%d)\n",
+					 name, idx, sec);
+				continue;
+			}
 
 			reloc = realloc(reloc,
 					sizeof(*obj->efile.reloc) * nr_reloc);

From faefaa97215a0c05105d7ae180fe1a3b5979ad1f Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 11:41:09 -0600
Subject: [PATCH 72/82] ibmvnic: Reset long term map ID counter

When allocating RX or TX buffer pools, the driver needs to provide a
unique mapping ID to firmware for each pool. This value is assigned
using a counter which is incremented after a new pool is created. The
ID can be an integer ranging from 1-255. When migrating to a device
that requests a different number of queues, this value was not being
reset properly. As a result, after enough migrations, the counter
exceeded the upper bound and pool creation failed. This is fixed by
resetting the counter to one in this case.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index dd4a2946e1da..ce127a72cda2 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1644,6 +1644,7 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 				return rc;
 		} else if (adapter->req_rx_queues != old_num_rx_queues ||
 			   adapter->req_tx_queues != old_num_tx_queues) {
+			adapter->map_id = 1;
 			release_rx_pools(adapter);
 			release_tx_pools(adapter);
 			init_rx_pools(netdev);

From 1b84ca187510f60f00f4e15255043ce19bb30410 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 9 Feb 2018 17:22:45 +0100
Subject: [PATCH 73/82] net: stmmac: discard disabled flags in interrupt status
 register

The interrupt status register in both dwmac1000 and dwmac4 ignores
interrupt enable (for dwmac4) / interrupt mask (for dwmac1000).
Therefore, if we want to check only the bits that can actually trigger
an irq, we have to filter the interrupt status register manually.

Commit 0a764db10337 ("stmmac: Discard masked flags in interrupt status
register") fixed this for dwmac1000. Fix the same issue for dwmac4.

Just like commit 0a764db10337 ("stmmac: Discard masked flags in
interrupt status register"), this makes sure that we do not get
spurious link up/link down prints.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index ed222b20fcf1..1e0a7668b752 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -572,10 +572,12 @@ static int dwmac4_irq_status(struct mac_device_info *hw,
 			     struct stmmac_extra_stats *x)
 {
 	void __iomem *ioaddr = hw->pcsr;
-	u32 intr_status;
+	u32 intr_status = readl(ioaddr + GMAC_INT_STATUS);
+	u32 intr_enable = readl(ioaddr + GMAC_INT_EN);
 	int ret = 0;
 
-	intr_status = readl(ioaddr + GMAC_INT_STATUS);
+	/* Discard disabled bits */
+	intr_status &= intr_enable;
 
 	/* Not used events (e.g. MMC interrupts) are not handled. */
 	if ((intr_status & mmc_tx_irq))

From e879b7ab3739d8f9990961fc7df2f63861bd780b Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 9 Feb 2018 17:22:46 +0100
Subject: [PATCH 74/82] net: stmmac: rename GMAC_INT_DEFAULT_MASK for dwmac4

GMAC_INT_DEFAULT_MASK is written to the interrupt enable register.
In previous versions of the IP (e.g. dwmac1000), this register was
instead an interrupt mask register.
To improve clarity and reflect reality, rename GMAC_INT_DEFAULT_MASK
to GMAC_INT_DEFAULT_ENABLE.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      | 2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 789dad8a07b5..7761a26ec9c5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -98,7 +98,7 @@
 #define	GMAC_PCS_IRQ_DEFAULT	(GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK |	\
 				 GMAC_INT_PCS_ANE)
 
-#define	GMAC_INT_DEFAULT_MASK	(GMAC_INT_PMT_EN | GMAC_INT_LPI_EN)
+#define	GMAC_INT_DEFAULT_ENABLE	(GMAC_INT_PMT_EN | GMAC_INT_LPI_EN)
 
 enum dwmac4_irq_status {
 	time_stamp_irq = 0x00001000,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 1e0a7668b752..6badc63d8e6d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -61,8 +61,8 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 
 	writel(value, ioaddr + GMAC_CONFIG);
 
-	/* Mask GMAC interrupts */
-	value = GMAC_INT_DEFAULT_MASK;
+	/* Enable GMAC interrupts */
+	value = GMAC_INT_DEFAULT_ENABLE;
 	if (hw->pmt)
 		value |= GMAC_INT_PMT_EN;
 	if (hw->pcs)

From 1029117127540fef4edcf4f0887dc3e1f7d5adb2 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 9 Feb 2018 17:22:47 +0100
Subject: [PATCH 75/82] net: stmmac: remove redundant enable of PMT irq

For dwmac4, GMAC_INT_DEFAULT_ENABLE already includes
GMAC_INT_PMT_EN, so it is redundant to check if hw->pmt
is set, and if so, setting the bit again.

For dwmac1000, GMAC_INT_DEFAULT_MASK does not include
GMAC_INT_DISABLE_PMT, so it is redundant to check if
hw->pmt is set, and if so, clearing an already cleared bit.

Improve code readability by removing this redundant code.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 --
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c    | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index 540d21786a43..ef10baf14186 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -74,8 +74,6 @@ static void dwmac1000_core_init(struct mac_device_info *hw,
 	/* Mask GMAC interrupts */
 	value = GMAC_INT_DEFAULT_MASK;
 
-	if (hw->pmt)
-		value &= ~GMAC_INT_DISABLE_PMT;
 	if (hw->pcs)
 		value &= ~GMAC_INT_DISABLE_PCS;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 6badc63d8e6d..63795ecafc8d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -63,8 +63,7 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 
 	/* Enable GMAC interrupts */
 	value = GMAC_INT_DEFAULT_ENABLE;
-	if (hw->pmt)
-		value |= GMAC_INT_PMT_EN;
+
 	if (hw->pcs)
 		value |= GMAC_PCS_IRQ_DEFAULT;
 

From 6e6e41c3112276288ccaf80c70916779b84bb276 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 9 Feb 2018 17:45:49 +0800
Subject: [PATCH 76/82] ptr_ring: fail early if queue occupies more than
 KMALLOC_MAX_SIZE

To avoid slab to warn about exceeded size, fail early if queue
occupies more than KMALLOC_MAX_SIZE.

Reported-by: syzbot+e4d4f9ddd4295539735d@syzkaller.appspotmail.com
Fixes: 2e0ab8ca83c12 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 1883d6137e9b..6051a5fe6913 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -466,6 +466,8 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 
 static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
+	if (size * sizeof(void *) > KMALLOC_MAX_SIZE)
+		return NULL;
 	return kcalloc(size, sizeof(void *), gfp);
 }
 

From 0bf7800f1799b5b1fd7d4f024e9ece53ac489011 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 9 Feb 2018 17:45:50 +0800
Subject: [PATCH 77/82] ptr_ring: try vmalloc() when kmalloc() fails

This patch switch to use kvmalloc_array() for using a vmalloc()
fallback to help in case kmalloc() fails.

Reported-by: syzbot+e4d4f9ddd4295539735d@syzkaller.appspotmail.com
Fixes: 2e0ab8ca83c12 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6051a5fe6913..b884b7794187 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -464,11 +464,14 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 	__PTR_RING_PEEK_CALL_v; \
 })
 
+/* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See
+ * documentation for vmalloc for which of them are legal.
+ */
 static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
 	if (size * sizeof(void *) > KMALLOC_MAX_SIZE)
 		return NULL;
-	return kcalloc(size, sizeof(void *), gfp);
+	return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO);
 }
 
 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -603,7 +606,7 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 	spin_unlock(&(r)->producer_lock);
 	spin_unlock_irqrestore(&(r)->consumer_lock, flags);
 
-	kfree(old);
+	kvfree(old);
 
 	return 0;
 }
@@ -643,7 +646,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
 	}
 
 	for (i = 0; i < nrings; ++i)
-		kfree(queues[i]);
+		kvfree(queues[i]);
 
 	kfree(queues);
 
@@ -651,7 +654,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
 
 nomem:
 	while (--i >= 0)
-		kfree(queues[i]);
+		kvfree(queues[i]);
 
 	kfree(queues);
 
@@ -666,7 +669,7 @@ static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
 	if (destroy)
 		while ((ptr = ptr_ring_consume(r)))
 			destroy(ptr);
-	kfree(r->queue);
+	kvfree(r->queue);
 }
 
 #endif /* _LINUX_PTR_RING_H  */

From 89271c65edd599207dd982007900506283c90ae3 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 11:03:49 +0100
Subject: [PATCH 78/82] s390/qeth: fix underestimated count of buffer elements

For a memory range/skb where the last byte falls onto a page boundary
(ie. 'end' is of the form xxx...xxx001), the PFN_UP() part of the
calculation currently doesn't round up to the next PFN due to an
off-by-one error.
Thus qeth believes that the skb occupies one page less than it
actually does, and may select a IO buffer that doesn't have enough spare
buffer elements to fit all of the skb's data.
HW detects this as a malformed buffer descriptor, and raises an
exception which then triggers device recovery.

Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count")
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index db42107bf2f5..c33fbc4c2e91 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -846,7 +846,7 @@ struct qeth_trap_id {
  */
 static inline int qeth_get_elements_for_range(addr_t start, addr_t end)
 {
-	return PFN_UP(end - 1) - PFN_DOWN(start);
+	return PFN_UP(end) - PFN_DOWN(start);
 }
 
 static inline int qeth_get_micros(void)

From 1c5b2216fbb973a9410e0b06389740b5c1289171 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 11:03:50 +0100
Subject: [PATCH 79/82] s390/qeth: fix SETIP command handling

send_control_data() applies some special handling to SETIP v4 IPA
commands. But current code parses *all* command types for the SETIP
command code. Limit the command code check to IPA commands.

Fixes: 5b54e16f1a54 ("qeth: do not spin for SETIP ip assist command")
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      |  5 +++++
 drivers/s390/net/qeth_core_main.c | 14 ++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index c33fbc4c2e91..959c65cf75d9 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -591,6 +591,11 @@ struct qeth_cmd_buffer {
 	void (*callback) (struct qeth_channel *, struct qeth_cmd_buffer *);
 };
 
+static inline struct qeth_ipa_cmd *__ipa_cmd(struct qeth_cmd_buffer *iob)
+{
+	return (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE);
+}
+
 /**
  * definition of a qeth channel, used for read and write
  */
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 6abd3bc285e4..ca72f3311004 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -2120,7 +2120,7 @@ int qeth_send_control_data(struct qeth_card *card, int len,
 	unsigned long flags;
 	struct qeth_reply *reply = NULL;
 	unsigned long timeout, event_timeout;
-	struct qeth_ipa_cmd *cmd;
+	struct qeth_ipa_cmd *cmd = NULL;
 
 	QETH_CARD_TEXT(card, 2, "sendctl");
 
@@ -2146,10 +2146,13 @@ int qeth_send_control_data(struct qeth_card *card, int len,
 	while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ;
 	qeth_prepare_control_data(card, len, iob);
 
-	if (IS_IPA(iob->data))
+	if (IS_IPA(iob->data)) {
+		cmd = __ipa_cmd(iob);
 		event_timeout = QETH_IPA_TIMEOUT;
-	else
+	} else {
 		event_timeout = QETH_TIMEOUT;
+	}
+
 	timeout = jiffies + event_timeout;
 
 	QETH_CARD_TEXT(card, 6, "noirqpnd");
@@ -2174,9 +2177,8 @@ int qeth_send_control_data(struct qeth_card *card, int len,
 
 	/* we have only one long running ipassist, since we can ensure
 	   process context of this command we can sleep */
-	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
-	if ((cmd->hdr.command == IPA_CMD_SETIP) &&
-	    (cmd->hdr.prot_version == QETH_PROT_IPV4)) {
+	if (cmd && cmd->hdr.command == IPA_CMD_SETIP &&
+	    cmd->hdr.prot_version == QETH_PROT_IPV4) {
 		if (!wait_event_timeout(reply->wait_q,
 		    atomic_read(&reply->received), event_timeout))
 			goto time_err;

From 07f2c7ab6f8d0a7e7c5764c4e6cc9c52951b9d9c Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 9 Feb 2018 17:35:23 +0300
Subject: [PATCH 80/82] sctp: verify size of a new chunk in _sctp_make_chunk()

When SCTP makes INIT or INIT_ACK packet the total chunk length
can exceed SCTP_MAX_CHUNK_LEN which leads to kernel panic when
transmitting these packets, e.g. the crash on sending INIT_ACK:

[  597.804948] skbuff: skb_over_panic: text:00000000ffae06e4 len:120168
               put:120156 head:000000007aa47635 data:00000000d991c2de
               tail:0x1d640 end:0xfec0 dev:<NULL>
...
[  597.976970] ------------[ cut here ]------------
[  598.033408] kernel BUG at net/core/skbuff.c:104!
[  600.314841] Call Trace:
[  600.345829]  <IRQ>
[  600.371639]  ? sctp_packet_transmit+0x2095/0x26d0 [sctp]
[  600.436934]  skb_put+0x16c/0x200
[  600.477295]  sctp_packet_transmit+0x2095/0x26d0 [sctp]
[  600.540630]  ? sctp_packet_config+0x890/0x890 [sctp]
[  600.601781]  ? __sctp_packet_append_chunk+0x3b4/0xd00 [sctp]
[  600.671356]  ? sctp_cmp_addr_exact+0x3f/0x90 [sctp]
[  600.731482]  sctp_outq_flush+0x663/0x30d0 [sctp]
[  600.788565]  ? sctp_make_init+0xbf0/0xbf0 [sctp]
[  600.845555]  ? sctp_check_transmitted+0x18f0/0x18f0 [sctp]
[  600.912945]  ? sctp_outq_tail+0x631/0x9d0 [sctp]
[  600.969936]  sctp_cmd_interpreter.isra.22+0x3be1/0x5cb0 [sctp]
[  601.041593]  ? sctp_sf_do_5_1B_init+0x85f/0xc30 [sctp]
[  601.104837]  ? sctp_generate_t1_cookie_event+0x20/0x20 [sctp]
[  601.175436]  ? sctp_eat_data+0x1710/0x1710 [sctp]
[  601.233575]  sctp_do_sm+0x182/0x560 [sctp]
[  601.284328]  ? sctp_has_association+0x70/0x70 [sctp]
[  601.345586]  ? sctp_rcv+0xef4/0x32f0 [sctp]
[  601.397478]  ? sctp6_rcv+0xa/0x20 [sctp]
...

Here the chunk size for INIT_ACK packet becomes too big, mostly
because of the state cookie (INIT packet has large size with
many address parameters), plus additional server parameters.

Later this chunk causes the panic in skb_put_data():

  skb_packet_transmit()
      sctp_packet_pack()
          skb_put_data(nskb, chunk->skb->data, chunk->skb->len);

'nskb' (head skb) was previously allocated with packet->size
from u16 'chunk->chunk_hdr->length'.

As suggested by Marcelo we should check the chunk's length in
_sctp_make_chunk() before trying to allocate skb for it and
discard a chunk if its size bigger than SCTP_MAX_CHUNK_LEN.

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leinter@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 793b05ec692b..d01475f5f710 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1380,9 +1380,14 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
 	struct sctp_chunk *retval;
 	struct sk_buff *skb;
 	struct sock *sk;
+	int chunklen;
+
+	chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen);
+	if (chunklen > SCTP_MAX_CHUNK_LEN)
+		goto nodata;
 
 	/* No need to allocate LL here, as this is only a chunk. */
-	skb = alloc_skb(SCTP_PAD4(sizeof(*chunk_hdr) + paylen), gfp);
+	skb = alloc_skb(chunklen, gfp);
 	if (!skb)
 		goto nodata;
 

From 941ff6f11c020913f5cddf543a9ec63475d7c082 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 9 Feb 2018 14:49:44 +0100
Subject: [PATCH 81/82] bpf: fix rlimit in reuseport net selftest

Fix two issues in the reuseport_bpf selftests that were
reported by Linaro CI:

  [...]
  + ./reuseport_bpf
  ---- IPv4 UDP ----
  Testing EBPF mod 10...
  Reprograming, testing mod 5...
  ./reuseport_bpf: ebpf error. log:
  0: (bf) r6 = r1
  1: (20) r0 = *(u32 *)skb[0]
  2: (97) r0 %= 10
  3: (95) exit
  processed 4 insns
  : Operation not permitted
  + echo FAIL
  [...]
  ---- IPv4 TCP ----
  Testing EBPF mod 10...
  ./reuseport_bpf: failed to bind send socket: Address already in use
  + echo FAIL
  [...]

For the former adjust rlimit since this was the cause of
failure for loading the BPF prog, and for the latter add
SO_REUSEADDR.

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Link: https://bugs.linaro.org/show_bug.cgi?id=3502
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/reuseport_bpf.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c
index 4a8217448f20..cad14cd0ea92 100644
--- a/tools/testing/selftests/net/reuseport_bpf.c
+++ b/tools/testing/selftests/net/reuseport_bpf.c
@@ -21,6 +21,7 @@
 #include <sys/epoll.h>
 #include <sys/types.h>
 #include <sys/socket.h>
+#include <sys/resource.h>
 #include <unistd.h>
 
 #ifndef ARRAY_SIZE
@@ -190,11 +191,14 @@ static void send_from(struct test_params p, uint16_t sport, char *buf,
 	struct sockaddr * const saddr = new_any_sockaddr(p.send_family, sport);
 	struct sockaddr * const daddr =
 		new_loopback_sockaddr(p.send_family, p.recv_port);
-	const int fd = socket(p.send_family, p.protocol, 0);
+	const int fd = socket(p.send_family, p.protocol, 0), one = 1;
 
 	if (fd < 0)
 		error(1, errno, "failed to create send socket");
 
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)))
+		error(1, errno, "failed to set reuseaddr");
+
 	if (bind(fd, saddr, sockaddr_size()))
 		error(1, errno, "failed to bind send socket");
 
@@ -433,6 +437,21 @@ void enable_fastopen(void)
 	}
 }
 
+static struct rlimit rlim_old, rlim_new;
+
+static  __attribute__((constructor)) void main_ctor(void)
+{
+	getrlimit(RLIMIT_MEMLOCK, &rlim_old);
+	rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20);
+	rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20);
+	setrlimit(RLIMIT_MEMLOCK, &rlim_new);
+}
+
+static __attribute__((destructor)) void main_dtor(void)
+{
+	setrlimit(RLIMIT_MEMLOCK, &rlim_old);
+}
+
 int main(void)
 {
 	fprintf(stderr, "---- IPv4 UDP ----\n");

From 2fa56a494484f19e06bf4f3464b2155a92beafac Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 13:19:46 -0600
Subject: [PATCH 82/82] ibmvnic: Remove skb->protocol checks in ibmvnic_xmit

Having these checks in ibmvnic_xmit causes problems with VLAN
tagging and balance-alb/tlb bonding modes. The restriction they
imposed can be removed.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index ce127a72cda2..27447260215d 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1419,10 +1419,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		hdrs += 2;
 	}
 	/* determine if l2/3/4 headers are sent to firmware */
-	if ((*hdrs >> 7) & 1 &&
-	    (skb->protocol == htons(ETH_P_IP) ||
-	     skb->protocol == htons(ETH_P_IPV6) ||
-	     skb->protocol == htons(ETH_P_ARP))) {
+	if ((*hdrs >> 7) & 1) {
 		build_hdr_descs_arr(tx_buff, &num_entries, *hdrs);
 		tx_crq.v1.n_crq_elem = num_entries;
 		tx_buff->indir_arr[0] = tx_crq;