From 671796dd96b6cd85b75fba9d3007bcf7e5f7c309 Mon Sep 17 00:00:00 2001
From: Ronald Wahl <ronald.wahl@raritan.com>
Date: Thu, 7 Aug 2014 14:15:50 +0200
Subject: [PATCH 01/42] carl9170: fix sending URBs with wrong type when using
 full-speed

The driver assumes that endpoint 4 is always an interrupt endpoint.
Unfortunately the type differs between high-speed and full-speed
configurations while in the former case it is indeed an interrupt
endpoint this is not true for the latter case - here it is a bulk
endpoint. When sending URBs with the wrong type the kernel will
generate a warning message including backtrace. In this specific
case there will be a huge amount of warnings which can bring the system
to freeze.

To fix this we are now sending URBs to endpoint 4 using the type
found in the endpoint descriptor.

A side note: The carl9170 firmware currently specifies endpoint 4 as
interrupt endpoint even in the full-speed configuration but this has
no relevance because before this firmware is loaded the endpoint type
is as described above and after the firmware is running the stick is not
reenumerated and so the old descriptor is used.

Signed-off-by: Ronald Wahl <ronald.wahl@raritan.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath/carl9170/carl9170.h |  1 +
 drivers/net/wireless/ath/carl9170/usb.c      | 31 +++++++++++++++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/carl9170/carl9170.h b/drivers/net/wireless/ath/carl9170/carl9170.h
index 8596aba34f96..237d0cda1bcb 100644
--- a/drivers/net/wireless/ath/carl9170/carl9170.h
+++ b/drivers/net/wireless/ath/carl9170/carl9170.h
@@ -256,6 +256,7 @@ struct ar9170 {
 	atomic_t rx_work_urbs;
 	atomic_t rx_pool_urbs;
 	kernel_ulong_t features;
+	bool usb_ep_cmd_is_bulk;
 
 	/* firmware settings */
 	struct completion fw_load_wait;
diff --git a/drivers/net/wireless/ath/carl9170/usb.c b/drivers/net/wireless/ath/carl9170/usb.c
index f35c7f30f9a6..c9f93310c0d6 100644
--- a/drivers/net/wireless/ath/carl9170/usb.c
+++ b/drivers/net/wireless/ath/carl9170/usb.c
@@ -621,9 +621,16 @@ int __carl9170_exec_cmd(struct ar9170 *ar, struct carl9170_cmd *cmd,
 		goto err_free;
 	}
 
-	usb_fill_int_urb(urb, ar->udev, usb_sndintpipe(ar->udev,
-		AR9170_USB_EP_CMD), cmd, cmd->hdr.len + 4,
-		carl9170_usb_cmd_complete, ar, 1);
+	if (ar->usb_ep_cmd_is_bulk)
+		usb_fill_bulk_urb(urb, ar->udev,
+				  usb_sndbulkpipe(ar->udev, AR9170_USB_EP_CMD),
+				  cmd, cmd->hdr.len + 4,
+				  carl9170_usb_cmd_complete, ar);
+	else
+		usb_fill_int_urb(urb, ar->udev,
+				 usb_sndintpipe(ar->udev, AR9170_USB_EP_CMD),
+				 cmd, cmd->hdr.len + 4,
+				 carl9170_usb_cmd_complete, ar, 1);
 
 	if (free_buf)
 		urb->transfer_flags |= URB_FREE_BUFFER;
@@ -1032,9 +1039,10 @@ static void carl9170_usb_firmware_step2(const struct firmware *fw,
 static int carl9170_usb_probe(struct usb_interface *intf,
 			      const struct usb_device_id *id)
 {
+	struct usb_endpoint_descriptor *ep;
 	struct ar9170 *ar;
 	struct usb_device *udev;
-	int err;
+	int i, err;
 
 	err = usb_reset_device(interface_to_usbdev(intf));
 	if (err)
@@ -1050,6 +1058,21 @@ static int carl9170_usb_probe(struct usb_interface *intf,
 	ar->intf = intf;
 	ar->features = id->driver_info;
 
+	/* We need to remember the type of endpoint 4 because it differs
+	 * between high- and full-speed configuration. The high-speed
+	 * configuration specifies it as interrupt and the full-speed
+	 * configuration as bulk endpoint. This information is required
+	 * later when sending urbs to that endpoint.
+	 */
+	for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; ++i) {
+		ep = &intf->cur_altsetting->endpoint[i].desc;
+
+		if (usb_endpoint_num(ep) == AR9170_USB_EP_CMD &&
+		    usb_endpoint_dir_out(ep) &&
+		    usb_endpoint_type(ep) == USB_ENDPOINT_XFER_BULK)
+			ar->usb_ep_cmd_is_bulk = true;
+	}
+
 	usb_set_intfdata(intf, ar);
 	SET_IEEE80211_DEV(ar->hw, &intf->dev);
 

From ebcc2f517d2a667d4f034171ed8f373f85b035df Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend@broadcom.com>
Date: Thu, 7 Aug 2014 14:45:08 +0200
Subject: [PATCH 02/42] brcmfmac: fix curly brace mistake in
 brcmf_pcie_handle_mb_data()

Running coccicheck on brcm80211 drivers resulted in following report:

$ make coccicheck MODE=report M=drivers/net/wireless/brcm80211

  drivers/net/wireless/brcm80211/brcmfmac/pcie.c:595:2-43:
    code aligned with following code on line 596

It revealed that due to a merge failure a block statement lost its
curly braces where it should not.

Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/brcm80211/brcmfmac/pcie.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/brcm80211/brcmfmac/pcie.c
index bc972c0ba5f8..e5101b287e4e 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/pcie.c
@@ -591,12 +591,13 @@ static void brcmf_pcie_handle_mb_data(struct brcmf_pciedev_info *devinfo)
 	}
 	if (dtoh_mb_data & BRCMF_D2H_DEV_DS_EXIT_NOTE)
 		brcmf_dbg(PCIE, "D2H_MB_DATA: DEEP SLEEP EXIT\n");
-	if (dtoh_mb_data & BRCMF_D2H_DEV_D3_ACK)
+	if (dtoh_mb_data & BRCMF_D2H_DEV_D3_ACK) {
 		brcmf_dbg(PCIE, "D2H_MB_DATA: D3 ACK\n");
 		if (waitqueue_active(&devinfo->mbdata_resp_wait)) {
 			devinfo->mbdata_completed = true;
 			wake_up(&devinfo->mbdata_resp_wait);
 		}
+	}
 }
 
 

From 2d116b8849474bfce8f8dac5ef671d0d7053c6fc Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend@broadcom.com>
Date: Thu, 7 Aug 2014 14:45:09 +0200
Subject: [PATCH 03/42] brcmfmac: fix memory leakage in msgbuf

The kbuild robot came up with the following warning:

tree:   .../kernel/git/linville/wireless-next.git master
head:   dc6be9f54a4ecb0a09765d1f515ed947d86b7528
commit: 9a1bb60250d2b6b546a62e5b73f55c4f1d22016b
	 [5/13] brcmfmac: Adding msgbuf protocol.

coccinelle warnings:
 drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c:1309:1-28:
   alloc with no test, possible model on line 1318

Looking into the issue, it turned out that the referred allocation
buffer was not being released in failure path nor upon module
unload.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Franky (Zhenhui) Lin <frankyl@broadcom.com>
Reviewed-by: Daniel (Deognyoun) Kim <dekim@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieterpg@broadcom.com>
Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c b/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c
index 535c7eb01b3a..8f8b9373de95 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c
@@ -1318,6 +1318,8 @@ int brcmf_proto_msgbuf_attach(struct brcmf_pub *drvr)
 	msgbuf->nrof_flowrings = if_msgbuf->nrof_flowrings;
 	msgbuf->flowring_dma_handle = kzalloc(msgbuf->nrof_flowrings *
 		sizeof(*msgbuf->flowring_dma_handle), GFP_ATOMIC);
+	if (!msgbuf->flowring_dma_handle)
+		goto fail;
 
 	msgbuf->rx_dataoffset = if_msgbuf->rx_dataoffset;
 	msgbuf->max_rxbufpost = if_msgbuf->max_rxbufpost;
@@ -1362,6 +1364,7 @@ fail:
 		kfree(msgbuf->flow_map);
 		kfree(msgbuf->txstatus_done_map);
 		brcmf_msgbuf_release_pktids(msgbuf);
+		kfree(msgbuf->flowring_dma_handle);
 		if (msgbuf->ioctbuf)
 			dma_free_coherent(drvr->bus_if->dev,
 					  BRCMF_TX_IOCTL_MAX_MSG_SIZE,
@@ -1391,6 +1394,7 @@ void brcmf_proto_msgbuf_detach(struct brcmf_pub *drvr)
 				  BRCMF_TX_IOCTL_MAX_MSG_SIZE,
 				  msgbuf->ioctbuf, msgbuf->ioctbuf_handle);
 		brcmf_msgbuf_release_pktids(msgbuf);
+		kfree(msgbuf->flowring_dma_handle);
 		kfree(msgbuf);
 		drvr->proto->pd = NULL;
 	}

From 2da2c5854ed759a64f8a9231a7f544b472c3aa6d Mon Sep 17 00:00:00 2001
From: Rickard Strandqvist <rickard_strandqvist@spectrumdigital.se>
Date: Sun, 10 Aug 2014 01:39:28 +0200
Subject: [PATCH 04/42] net: wireless: ipw2x00: ipw2200.c: Cleaning up missing
 null-terminate after strncpy call

Added a guaranteed null-terminate after call to strncpy.

Signed-off-by: Rickard Strandqvist <rickard_strandqvist@spectrumdigital.se>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ipw2x00/ipw2200.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index c5aa404069f3..389656bd1a74 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -9853,6 +9853,7 @@ static int ipw_wx_get_wireless_mode(struct net_device *dev,
 		strncpy(extra, "unknown", MAX_WX_STRING);
 		break;
 	}
+	extra[MAX_WX_STRING - 1] = '\0';
 
 	IPW_DEBUG_WX("PRIV GET MODE: %s\n", extra);
 

From e222ade63f038378eec6266c0587db0b98cfc779 Mon Sep 17 00:00:00 2001
From: Lucas Tanure <tanure@linux.com>
Date: Tue, 12 Aug 2014 06:30:23 +0000
Subject: [PATCH 05/42] i40e: Fix missing uapi/linux/dcbnl.h include in
 i40e_fcoe.c

Fix missing include in Intel i40e driver. Without this include linux next
tree won't compile.

Signed-off-by: Lucas Tanure <tanure@linux.com>
Tested-by:  Jim Young <jamesx.m.young@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_fcoe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
index 6938fc1ad877..5d01db1d789b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
@@ -33,6 +33,7 @@
 #include <scsi/fc/fc_fcoe.h>
 #include <scsi/libfc.h>
 #include <scsi/libfcoe.h>
+#include <uapi/linux/dcbnl.h>
 
 #include "i40e.h"
 #include "i40e_fcoe.h"

From 952d9639dba84e2a979cf4dd8edab3542b477ba3 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Wed, 30 Jul 2014 09:02:53 +0000
Subject: [PATCH 06/42] i40e: fix sparse non static symbol warning

Fixes the following sparse warnings:

drivers/net/ethernet/intel/i40e/i40e_nvm.c:254:13: warning:
 symbol 'i40e_write_nvm_aq' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Tested-By: Jim Young <jamesx.m.young@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_nvm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index 97bda3dffd49..25c4f9a3011f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -251,9 +251,9 @@ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset,
  *
  * Writes a 16 bit words buffer to the Shadow RAM using the admin command.
  **/
-i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
-			      u32 offset, u16 words, void *data,
-			      bool last_command)
+static i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer,
+				     u32 offset, u16 words, void *data,
+				     bool last_command)
 {
 	i40e_status ret_code = I40E_ERR_NVM;
 

From f57e4fbdcc8f65b94d59ce5566c781c46a298c28 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Wed, 30 Jul 2014 03:11:09 +0000
Subject: [PATCH 07/42] i40e: use correct structure type name in sizeof

Correct typo in the name of the type given to sizeof.  Because it is the
size of a pointer that is wanted, the typo has no impact on compilation or
execution.

This problem was found using Coccinelle (http://coccinelle.lip6.fr/).  The
semantic patch used can be found in message 0 of this patch series.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Tested-By: Jim Young <jamesx.m.young@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 51bc03072ed3..2ac8f45d7c19 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6293,7 +6293,7 @@ static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors)
 
 	if (alloc_qvectors) {
 		/* allocate memory for q_vector pointers */
-		size = sizeof(struct i40e_q_vectors *) * vsi->num_q_vectors;
+		size = sizeof(struct i40e_q_vector *) * vsi->num_q_vectors;
 		vsi->q_vectors = kzalloc(size, GFP_KERNEL);
 		if (!vsi->q_vectors) {
 			ret = -ENOMEM;

From 35a7d8045931d384c3d02f28287e0eb42d6038f4 Mon Sep 17 00:00:00 2001
From: Rickard Strandqvist <rickard_strandqvist@spectrumdigital.se>
Date: Tue, 29 Jul 2014 09:26:25 +0000
Subject: [PATCH 08/42] i40e: Cleaning up missing null-terminate in conjunction
 with strncpy

Replacing strncpy with strlcpy to avoid strings that lacks null terminate.

Signed-off-by: Rickard Strandqvist <rickard_strandqvist@spectrumdigital.se>
Tested-By: Jim Young <jamesx.m.young@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2ac8f45d7c19..871474f6fe62 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -4415,13 +4415,13 @@ static void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 
 	switch (vsi->back->hw.phy.link_info.link_speed) {
 	case I40E_LINK_SPEED_40GB:
-		strncpy(speed, "40 Gbps", SPEED_SIZE);
+		strlcpy(speed, "40 Gbps", SPEED_SIZE);
 		break;
 	case I40E_LINK_SPEED_10GB:
-		strncpy(speed, "10 Gbps", SPEED_SIZE);
+		strlcpy(speed, "10 Gbps", SPEED_SIZE);
 		break;
 	case I40E_LINK_SPEED_1GB:
-		strncpy(speed, "1000 Mbps", SPEED_SIZE);
+		strlcpy(speed, "1000 Mbps", SPEED_SIZE);
 		break;
 	default:
 		break;
@@ -4429,16 +4429,16 @@ static void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 
 	switch (vsi->back->hw.fc.current_mode) {
 	case I40E_FC_FULL:
-		strncpy(fc, "RX/TX", FC_SIZE);
+		strlcpy(fc, "RX/TX", FC_SIZE);
 		break;
 	case I40E_FC_TX_PAUSE:
-		strncpy(fc, "TX", FC_SIZE);
+		strlcpy(fc, "TX", FC_SIZE);
 		break;
 	case I40E_FC_RX_PAUSE:
-		strncpy(fc, "RX", FC_SIZE);
+		strlcpy(fc, "RX", FC_SIZE);
 		break;
 	default:
-		strncpy(fc, "None", FC_SIZE);
+		strlcpy(fc, "None", FC_SIZE);
 		break;
 	}
 
@@ -5839,7 +5839,7 @@ static void i40e_send_version(struct i40e_pf *pf)
 	dv.minor_version = DRV_VERSION_MINOR;
 	dv.build_version = DRV_VERSION_BUILD;
 	dv.subbuild_version = 0;
-	strncpy(dv.driver_string, DRV_VERSION, sizeof(dv.driver_string));
+	strlcpy(dv.driver_string, DRV_VERSION, sizeof(dv.driver_string));
 	i40e_aq_send_driver_version(&pf->hw, &dv, NULL);
 }
 

From da8ed50b8b1f2f5a146ebd068ccbb1e721ee28db Mon Sep 17 00:00:00 2001
From: Jean Sacren <sakiwit@gmail.com>
Date: Tue, 5 Aug 2014 08:01:03 +0000
Subject: [PATCH 09/42] e1000e: fix trivial kernel doc typos

The macro E1000_success is meant to be E1000_SUCCESS. As the return
statement in the function is good as is, let's simply correct the
comment for this trivial matter.

Additionally E1000_ERR_HOST_INTERFACE_COMMAND is supposed to be
-E1000_ERR_HOST_INTERFACE_COMMAND.

Signed-off-by: Jean Sacren <sakiwit@gmail.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/manage.c b/drivers/net/ethernet/intel/e1000e/manage.c
index 58856032298d..e156664d5ba1 100644
--- a/drivers/net/ethernet/intel/e1000e/manage.c
+++ b/drivers/net/ethernet/intel/e1000e/manage.c
@@ -47,7 +47,7 @@ static u8 e1000_calculate_checksum(u8 *buffer, u32 length)
  *  e1000_mng_enable_host_if - Checks host interface is enabled
  *  @hw: pointer to the HW structure
  *
- *  Returns E1000_success upon success, else E1000_ERR_HOST_INTERFACE_COMMAND
+ *  Returns 0 upon success, else -E1000_ERR_HOST_INTERFACE_COMMAND
  *
  *  This function checks whether the HOST IF is enabled for command operation
  *  and also checks whether the previous command is completed.  It busy waits

From 5e815d84180edcc6bf8e3d9c7d3b2ee0cf79b6ba Mon Sep 17 00:00:00 2001
From: Jean Sacren <sakiwit@gmail.com>
Date: Tue, 5 Aug 2014 08:01:31 +0000
Subject: [PATCH 10/42] e1000e: delete excessive space character in debug
 message

There is an excessive space character between the word and the
period in the debug message. So delete it.

Signed-off-by: Jean Sacren <sakiwit@gmail.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/manage.c b/drivers/net/ethernet/intel/e1000e/manage.c
index e156664d5ba1..06edfca1a35e 100644
--- a/drivers/net/ethernet/intel/e1000e/manage.c
+++ b/drivers/net/ethernet/intel/e1000e/manage.c
@@ -78,7 +78,7 @@ static s32 e1000_mng_enable_host_if(struct e1000_hw *hw)
 	}
 
 	if (i == E1000_MNG_DHCP_COMMAND_TIMEOUT) {
-		e_dbg("Previous command timeout failed .\n");
+		e_dbg("Previous command timeout failed.\n");
 		return -E1000_ERR_HOST_INTERFACE_COMMAND;
 	}
 

From 3791b3f6fb74d265c93d493d9bbf29c1e769ceae Mon Sep 17 00:00:00 2001
From: Christoph Jaeger <cj@linux.com>
Date: Tue, 12 Aug 2014 09:27:57 +0200
Subject: [PATCH 11/42] openvswitch: Fix memory leak in ovs_vport_alloc() error
 path

ovs_vport_alloc() bails out without freeing the memory 'vport' points to.

Picked up by Coverity - CID 1230503.

Fixes: 5cd667b0a4 ("openvswitch: Allow each vport to have an array of 'port_id's.")
Signed-off-by: Christoph Jaeger <cj@linux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/vport.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 702fb21bfe15..6d8f2ec481d9 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -137,8 +137,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
 	vport->ops = ops;
 	INIT_HLIST_NODE(&vport->dp_hash_node);
 
-	if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids))
+	if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) {
+		kfree(vport);
 		return ERR_PTR(-EINVAL);
+	}
 
 	vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!vport->percpu_stats) {

From 10545937e866ccdbb7ab583031dbdcc6b14e4eb4 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 12 Aug 2014 10:35:19 +0200
Subject: [PATCH 12/42] myri10ge: check for DMA mapping errors

On IOMMU systems DMA mapping can fail, we need to check for
that possibility.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/myricom/myri10ge/myri10ge.c  | 88 ++++++++++++-------
 1 file changed, 58 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index 69c26f04d8ce..679db026f4be 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -873,6 +873,10 @@ static int myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
 		return -ENOMEM;
 	dmatest_bus = pci_map_page(mgp->pdev, dmatest_page, 0, PAGE_SIZE,
 				   DMA_BIDIRECTIONAL);
+	if (unlikely(pci_dma_mapping_error(mgp->pdev, dmatest_bus))) {
+		__free_page(dmatest_page);
+		return -ENOMEM;
+	}
 
 	/* Run a small DMA test.
 	 * The magic multipliers to the length tell the firmware
@@ -1294,6 +1298,7 @@ myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
 			int bytes, int watchdog)
 {
 	struct page *page;
+	dma_addr_t bus;
 	int idx;
 #if MYRI10GE_ALLOC_SIZE > 4096
 	int end_offset;
@@ -1318,11 +1323,21 @@ myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
 					rx->watchdog_needed = 1;
 				return;
 			}
+
+			bus = pci_map_page(mgp->pdev, page, 0,
+					   MYRI10GE_ALLOC_SIZE,
+					   PCI_DMA_FROMDEVICE);
+			if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) {
+				__free_pages(page, MYRI10GE_ALLOC_ORDER);
+				if (rx->fill_cnt - rx->cnt < 16)
+					rx->watchdog_needed = 1;
+				return;
+			}
+
 			rx->page = page;
 			rx->page_offset = 0;
-			rx->bus = pci_map_page(mgp->pdev, page, 0,
-					       MYRI10GE_ALLOC_SIZE,
-					       PCI_DMA_FROMDEVICE);
+			rx->bus = bus;
+
 		}
 		rx->info[idx].page = rx->page;
 		rx->info[idx].page_offset = rx->page_offset;
@@ -2764,6 +2779,35 @@ myri10ge_submit_req(struct myri10ge_tx_buf *tx, struct mcp_kreq_ether_send *src,
 	mb();
 }
 
+static void myri10ge_unmap_tx_dma(struct myri10ge_priv *mgp,
+				  struct myri10ge_tx_buf *tx, int idx)
+{
+	unsigned int len;
+	int last_idx;
+
+	/* Free any DMA resources we've alloced and clear out the skb slot */
+	last_idx = (idx + 1) & tx->mask;
+	idx = tx->req & tx->mask;
+	do {
+		len = dma_unmap_len(&tx->info[idx], len);
+		if (len) {
+			if (tx->info[idx].skb != NULL)
+				pci_unmap_single(mgp->pdev,
+						 dma_unmap_addr(&tx->info[idx],
+								bus), len,
+						 PCI_DMA_TODEVICE);
+			else
+				pci_unmap_page(mgp->pdev,
+					       dma_unmap_addr(&tx->info[idx],
+							      bus), len,
+					       PCI_DMA_TODEVICE);
+			dma_unmap_len_set(&tx->info[idx], len, 0);
+			tx->info[idx].skb = NULL;
+		}
+		idx = (idx + 1) & tx->mask;
+	} while (idx != last_idx);
+}
+
 /*
  * Transmit a packet.  We need to split the packet so that a single
  * segment does not cross myri10ge->tx_boundary, so this makes segment
@@ -2787,7 +2831,7 @@ static netdev_tx_t myri10ge_xmit(struct sk_buff *skb,
 	u32 low;
 	__be32 high_swapped;
 	unsigned int len;
-	int idx, last_idx, avail, frag_cnt, frag_idx, count, mss, max_segments;
+	int idx, avail, frag_cnt, frag_idx, count, mss, max_segments;
 	u16 pseudo_hdr_offset, cksum_offset, queue;
 	int cum_len, seglen, boundary, rdma_count;
 	u8 flags, odd_flag;
@@ -2884,9 +2928,12 @@ again:
 
 	/* map the skb for DMA */
 	len = skb_headlen(skb);
+	bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE);
+	if (unlikely(pci_dma_mapping_error(mgp->pdev, bus)))
+		goto drop;
+
 	idx = tx->req & tx->mask;
 	tx->info[idx].skb = skb;
-	bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE);
 	dma_unmap_addr_set(&tx->info[idx], bus, bus);
 	dma_unmap_len_set(&tx->info[idx], len, len);
 
@@ -2985,12 +3032,16 @@ again:
 			break;
 
 		/* map next fragment for DMA */
-		idx = (count + tx->req) & tx->mask;
 		frag = &skb_shinfo(skb)->frags[frag_idx];
 		frag_idx++;
 		len = skb_frag_size(frag);
 		bus = skb_frag_dma_map(&mgp->pdev->dev, frag, 0, len,
 				       DMA_TO_DEVICE);
+		if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) {
+			myri10ge_unmap_tx_dma(mgp, tx, idx);
+			goto drop;
+		}
+		idx = (count + tx->req) & tx->mask;
 		dma_unmap_addr_set(&tx->info[idx], bus, bus);
 		dma_unmap_len_set(&tx->info[idx], len, len);
 	}
@@ -3021,31 +3072,8 @@ again:
 	return NETDEV_TX_OK;
 
 abort_linearize:
-	/* Free any DMA resources we've alloced and clear out the skb
-	 * slot so as to not trip up assertions, and to avoid a
-	 * double-free if linearizing fails */
+	myri10ge_unmap_tx_dma(mgp, tx, idx);
 
-	last_idx = (idx + 1) & tx->mask;
-	idx = tx->req & tx->mask;
-	tx->info[idx].skb = NULL;
-	do {
-		len = dma_unmap_len(&tx->info[idx], len);
-		if (len) {
-			if (tx->info[idx].skb != NULL)
-				pci_unmap_single(mgp->pdev,
-						 dma_unmap_addr(&tx->info[idx],
-								bus), len,
-						 PCI_DMA_TODEVICE);
-			else
-				pci_unmap_page(mgp->pdev,
-					       dma_unmap_addr(&tx->info[idx],
-							      bus), len,
-					       PCI_DMA_TODEVICE);
-			dma_unmap_len_set(&tx->info[idx], len, 0);
-			tx->info[idx].skb = NULL;
-		}
-		idx = (idx + 1) & tx->mask;
-	} while (idx != last_idx);
 	if (skb_is_gso(skb)) {
 		netdev_err(mgp->dev, "TSO but wanted to linearize?!?!?\n");
 		goto drop;

From de713b57947a7a1b9cc605c0296ee14065cd86b6 Mon Sep 17 00:00:00 2001
From: chas williams - CONTRACTOR <chas@cmf.nrl.navy.mil>
Date: Tue, 12 Aug 2014 08:12:26 -0400
Subject: [PATCH 13/42] atm/svc: Fix blocking in wait loop

One should not call blocking primitives inside a wait loop, since both
require task_struct::state to sleep, so the inner will destroy the
outer state.

sigd_enq() will possibly sleep for alloc_skb().  Move sigd_enq() before
prepare_to_wait() to avoid sleeping while waiting interruptibly.  You do
not actually need to call sigd_enq() after the initial prepare_to_wait()
because we test the termination condition before calling schedule().

Based on suggestions from Peter Zijlstra.

Signed-off-by: Chas Williams <chas@cmf.n4rl.navy.mil>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/svc.c | 60 +++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/net/atm/svc.c b/net/atm/svc.c
index d8e5d0c2ebbc..1ba23f5018e7 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -50,12 +50,12 @@ static void svc_disconnect(struct atm_vcc *vcc)
 
 	pr_debug("%p\n", vcc);
 	if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
-		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 		sigd_enq(vcc, as_close, NULL, NULL, NULL);
-		while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
+		for (;;) {
+			prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+			if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd)
+				break;
 			schedule();
-			prepare_to_wait(sk_sleep(sk), &wait,
-					TASK_UNINTERRUPTIBLE);
 		}
 		finish_wait(sk_sleep(sk), &wait);
 	}
@@ -126,11 +126,12 @@ static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
 	}
 	vcc->local = *addr;
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
-	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-		schedule();
+	for (;;) {
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+		if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+			break;
+		schedule();
 	}
 	finish_wait(sk_sleep(sk), &wait);
 	clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
@@ -202,15 +203,14 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
 		}
 		vcc->remote = *addr;
 		set_bit(ATM_VF_WAITING, &vcc->flags);
-		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
 		if (flags & O_NONBLOCK) {
-			finish_wait(sk_sleep(sk), &wait);
 			sock->state = SS_CONNECTING;
 			error = -EINPROGRESS;
 			goto out;
 		}
 		error = 0;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 		while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
 			schedule();
 			if (!signal_pending(current)) {
@@ -297,11 +297,12 @@ static int svc_listen(struct socket *sock, int backlog)
 		goto out;
 	}
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
-	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-		schedule();
+	for (;;) {
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+		if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+			break;
+		schedule();
 	}
 	finish_wait(sk_sleep(sk), &wait);
 	if (!sigd) {
@@ -387,15 +388,15 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
 		}
 		/* wait should be short, so we ignore the non-blocking flag */
 		set_bit(ATM_VF_WAITING, &new_vcc->flags);
-		prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
-				TASK_UNINTERRUPTIBLE);
 		sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
-		while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) {
+		for (;;) {
+			prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd)
+				break;
 			release_sock(sk);
 			schedule();
 			lock_sock(sk);
-			prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
-					TASK_UNINTERRUPTIBLE);
 		}
 		finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
 		if (!sigd) {
@@ -433,12 +434,14 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
 	DEFINE_WAIT(wait);
 
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
 	sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
-	while (test_bit(ATM_VF_WAITING, &vcc->flags) &&
-	       !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
-		schedule();
+	for (;;) {
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+		if (!test_bit(ATM_VF_WAITING, &vcc->flags) ||
+		    test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) {
+			break;
+		}
+		schedule();
 	}
 	finish_wait(sk_sleep(sk), &wait);
 	if (!sigd)
@@ -529,18 +532,18 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
 
 	lock_sock(sk);
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	sigd_enq(vcc, as_addparty, NULL, NULL,
 		 (struct sockaddr_atmsvc *) sockaddr);
 	if (flags & O_NONBLOCK) {
-		finish_wait(sk_sleep(sk), &wait);
 		error = -EINPROGRESS;
 		goto out;
 	}
 	pr_debug("added wait queue\n");
-	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-		schedule();
+	for (;;) {
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+			break;
+		schedule();
 	}
 	finish_wait(sk_sleep(sk), &wait);
 	error = xchg(&sk->sk_err_soft, 0);
@@ -558,11 +561,12 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
 
 	lock_sock(sk);
 	set_bit(ATM_VF_WAITING, &vcc->flags);
-	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
-	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
-		schedule();
+	for (;;) {
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+			break;
+		schedule();
 	}
 	finish_wait(sk_sleep(sk), &wait);
 	if (!sigd) {

From 8356f9d564ed3da4f98e8c44cbd9f397ff9c49a5 Mon Sep 17 00:00:00 2001
From: chas williams - CONTRACTOR <chas@cmf.nrl.navy.mil>
Date: Tue, 12 Aug 2014 09:00:36 -0400
Subject: [PATCH 14/42] lec: Fix bug introduced by
 b67bfe0d42cac56c512dd5da4b1b347a23f4b70a

b67bfe0d42cac56c512dd5da4b1b347a23f4b70a (hlist: drop the node
parameter from iterators) dropped the node parameter from
iterators which lec_tbl_walk() was using to iterate the list.

Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/lec.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/atm/lec.c b/net/atm/lec.c
index 4c5b8ba0f84f..e4853b50cf40 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -833,7 +833,6 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
 			  loff_t *l)
 {
 	struct hlist_node *e = state->node;
-	struct lec_arp_table *tmp;
 
 	if (!e)
 		e = tbl->first;
@@ -842,9 +841,7 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
 		--*l;
 	}
 
-	tmp = container_of(e, struct lec_arp_table, next);
-
-	hlist_for_each_entry_from(tmp, next) {
+	for (; e; e = e->next) {
 		if (--*l < 0)
 			break;
 	}

From 1f6394e382f137f07136182c591ca8a7eeba6a06 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 12 Aug 2014 10:27:24 -0400
Subject: [PATCH 15/42] sunvnet: Do not ask for an ACK for every dring transmit

No need to ask for an ack with every vnet_start_xmit()- the single
ACK with DRING_STOPPED is sufficient for the protocol, and we free
the sk_buff in vnet_start_xmit itself, so we dont need an ACK back.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Raghuram Kothakota <raghuram.kothakota@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sun/sunvnet.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index d813bfb1a847..238434dcfe02 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -691,7 +691,15 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
 	}
 
-	d->hdr.ack = VIO_ACK_ENABLE;
+	/* We don't rely on the ACKs to free the skb in vnet_start_xmit(),
+	 * thus it is safe to not set VIO_ACK_ENABLE for each transmission:
+	 * the protocol itself does not require it as long as the peer
+	 * sends a VIO_SUBTYPE_ACK for VIO_DRING_STOPPED.
+	 *
+	 * An ACK for every packet in the ring is expensive as the
+	 * sending of LDC messages is slow and affects performance.
+	 */
+	d->hdr.ack = VIO_ACK_DISABLE;
 	d->size = len;
 	d->ncookies = port->tx_bufs[dr->prod].ncookies;
 	for (i = 0; i < d->ncookies; i++)

From adddc32d6fde82156c779997f928865100542e55 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 12 Aug 2014 10:33:10 -0400
Subject: [PATCH 16/42] sunvnet: Do not spin in an infinite loop when
 vio_ldc_send() returns EAGAIN

ldc_rx -> vnet_rx -> .. -> vnet_walk_rx->vnet_send_ack should not
spin into an infinite loop waiting  EAGAIN to lift.

The sender could have sent us a burst, and gone to lunch without
doing any more ldc_read()'s. That should not cause the receiver to
loop infinitely till soft-lockup kicks in.

Similarly __vnet_tx_trigger should only loop on EAGAIN a finite
number of times. The caller (vnet_start_xmit()) already has code
to reset the dring state and bail on errors from __vnet_tx_trigger

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Raghuram Kothakota <raghuram.kothakota@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sun/sunvnet.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 238434dcfe02..a5871791e452 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -32,6 +32,11 @@ MODULE_DESCRIPTION("Sun LDOM virtual network driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
+/* Heuristic for the number of times to exponentially backoff and
+ * retry sending an LDC trigger when EAGAIN is encountered
+ */
+#define	VNET_MAX_RETRIES	10
+
 /* Ordered from largest major to lowest */
 static struct vio_version vnet_versions[] = {
 	{ .major = 1, .minor = 0 },
@@ -260,6 +265,7 @@ static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr,
 		.state			= vio_dring_state,
 	};
 	int err, delay;
+	int retries = 0;
 
 	hdr.seq = dr->snd_nxt;
 	delay = 1;
@@ -272,6 +278,13 @@ static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr,
 		udelay(delay);
 		if ((delay <<= 1) > 128)
 			delay = 128;
+		if (retries++ > VNET_MAX_RETRIES) {
+			pr_info("ECONNRESET %x:%x:%x:%x:%x:%x\n",
+				port->raddr[0], port->raddr[1],
+				port->raddr[2], port->raddr[3],
+				port->raddr[4], port->raddr[5]);
+			err = -ECONNRESET;
+		}
 	} while (err == -EAGAIN);
 
 	return err;
@@ -593,6 +606,7 @@ static int __vnet_tx_trigger(struct vnet_port *port)
 		.end_idx		= (u32) -1,
 	};
 	int err, delay;
+	int retries = 0;
 
 	hdr.seq = dr->snd_nxt;
 	delay = 1;
@@ -605,6 +619,8 @@ static int __vnet_tx_trigger(struct vnet_port *port)
 		udelay(delay);
 		if ((delay <<= 1) > 128)
 			delay = 128;
+		if (retries++ > VNET_MAX_RETRIES)
+			break;
 	} while (err == -EAGAIN);
 
 	return err;

From 1d311ad2f9fe5d75cf228372174170d0f435e3ef Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Wed, 13 Aug 2014 10:29:41 -0400
Subject: [PATCH 17/42] sunvnet: Schedule maybe_tx_wakeup() as a tasklet from
 ldc_rx path

At the tail of vnet_event(), if we hit the maybe_tx_wakeup()
condition, we try to take the netif_tx_lock() in the
recv-interrupt-context and can deadlock with dev_watchdog().
vnet_event() should schedule maybe_tx_wakeup() as a tasklet
to avoid this deadlock

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sun/sunvnet.c | 12 ++++++++++--
 drivers/net/ethernet/sun/sunvnet.h |  4 ++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index a5871791e452..23c89ab5a6ad 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -488,8 +488,9 @@ static int handle_mcast(struct vnet_port *port, void *msgbuf)
 	return 0;
 }
 
-static void maybe_tx_wakeup(struct vnet *vp)
+static void maybe_tx_wakeup(unsigned long param)
 {
+	struct vnet *vp = (struct vnet *)param;
 	struct net_device *dev = vp->dev;
 
 	netif_tx_lock(dev);
@@ -586,8 +587,13 @@ static void vnet_event(void *arg, int event)
 			break;
 	}
 	spin_unlock(&vio->lock);
+	/* Kick off a tasklet to wake the queue.  We cannot call
+	 * maybe_tx_wakeup directly here because we could deadlock on
+	 * netif_tx_lock() with dev_watchdog()
+	 */
 	if (unlikely(tx_wakeup && err != -ECONNRESET))
-		maybe_tx_wakeup(port->vp);
+		tasklet_schedule(&port->vp->vnet_tx_wakeup);
+
 	local_irq_restore(flags);
 }
 
@@ -1070,6 +1076,7 @@ static struct vnet *vnet_new(const u64 *local_mac)
 	vp = netdev_priv(dev);
 
 	spin_lock_init(&vp->lock);
+	tasklet_init(&vp->vnet_tx_wakeup, maybe_tx_wakeup, (unsigned long)vp);
 	vp->dev = dev;
 
 	INIT_LIST_HEAD(&vp->port_list);
@@ -1129,6 +1136,7 @@ static void vnet_cleanup(void)
 		vp = list_first_entry(&vnet_list, struct vnet, list);
 		list_del(&vp->list);
 		dev = vp->dev;
+		tasklet_kill(&vp->vnet_tx_wakeup);
 		/* vio_unregister_driver() should have cleaned up port_list */
 		BUG_ON(!list_empty(&vp->port_list));
 		unregister_netdev(dev);
diff --git a/drivers/net/ethernet/sun/sunvnet.h b/drivers/net/ethernet/sun/sunvnet.h
index d347a5bf24b0..de5c2c64996f 100644
--- a/drivers/net/ethernet/sun/sunvnet.h
+++ b/drivers/net/ethernet/sun/sunvnet.h
@@ -1,6 +1,8 @@
 #ifndef _SUNVNET_H
 #define _SUNVNET_H
 
+#include <linux/interrupt.h>
+
 #define DESC_NCOOKIES(entry_size)	\
 	((entry_size) - sizeof(struct vio_net_desc))
 
@@ -78,6 +80,8 @@ struct vnet {
 
 	struct list_head	list;
 	u64			local_mac;
+
+	struct tasklet_struct	vnet_tx_wakeup;
 };
 
 #endif /* _SUNVNET_H */

From 7b31b4deda7679120379c005df8d3d647161eff7 Mon Sep 17 00:00:00 2001
From: Govindarajulu Varadarajan <_govind@gmx.com>
Date: Wed, 13 Aug 2014 13:04:56 +0530
Subject: [PATCH 18/42] tg3: fix return value in tg3_get_stats64

When tp->hw_stats is 0, tg3_get_stats64 should display previously
recorded stats. So it returns &tp->net_stats_prev. But the caller,
dev_get_stats, ignores the return value.

Fix this by assigning tp->net_stats_prev to stats and returning stats.

Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
Acked-by: Prashant Sreedharan <prashant@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index a3dd5dc64f4c..4296b3d26f02 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14093,8 +14093,9 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
 
 	spin_lock_bh(&tp->lock);
 	if (!tp->hw_stats) {
+		*stats = tp->net_stats_prev;
 		spin_unlock_bh(&tp->lock);
-		return &tp->net_stats_prev;
+		return stats;
 	}
 
 	tg3_get_nstats(tp, stats);

From 8d21797df9ce6537a46ff58e20f359ef05c8faf2 Mon Sep 17 00:00:00 2001
From: Anish Bhatt <anish@chelsio.com>
Date: Tue, 12 Aug 2014 22:38:26 -0700
Subject: [PATCH 19/42] libcxgbi/cxgb4i : Fix ipv6 build failure caught with
 randconfig

Previous guard of IS_ENABLED(CONFIG_IPV6) is not sufficient when cxgbi drivers
are built into kernel but ipv6 is not.

v2: Use Kconfig to disable compiling cxgbi built into kernel when ipv6 is
compiled as a module

Fixes: e81fbf6cd652 ("libcxgbi:cxgb4i Guard ipv6 code with a config check")
Fixes: fc8d0590d914 ("libcxgbi: Add ipv6 api to driver")
Signed-off-by: Anish Bhatt <anish@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/cxgbi/cxgb3i/Kconfig | 2 +-
 drivers/scsi/cxgbi/cxgb4i/Kconfig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/cxgbi/cxgb3i/Kconfig b/drivers/scsi/cxgbi/cxgb3i/Kconfig
index 6bbc36fbd6ec..e4603985dce3 100644
--- a/drivers/scsi/cxgbi/cxgb3i/Kconfig
+++ b/drivers/scsi/cxgbi/cxgb3i/Kconfig
@@ -1,6 +1,6 @@
 config SCSI_CXGB3_ISCSI
 	tristate "Chelsio T3 iSCSI support"
-	depends on PCI && INET
+	depends on PCI && INET && (IPV6 || IPV6=n)
 	select NETDEVICES
 	select ETHERNET
 	select NET_VENDOR_CHELSIO
diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig
index 16b2c7d26617..8c4e423037b6 100644
--- a/drivers/scsi/cxgbi/cxgb4i/Kconfig
+++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig
@@ -1,6 +1,6 @@
 config SCSI_CXGB4_ISCSI
 	tristate "Chelsio T4 iSCSI support"
-	depends on PCI && INET
+	depends on PCI && INET && (IPV6 || IPV6=n)
 	select NETDEVICES
 	select ETHERNET
 	select NET_VENDOR_CHELSIO

From efd5029010c5b13c20bd6364fce9b3eae454419a Mon Sep 17 00:00:00 2001
From: Maks Naumov <maksqwe1@ukr.net>
Date: Tue, 12 Aug 2014 11:28:41 -0700
Subject: [PATCH 20/42] irda: Fix rd_frame control field initialization in
 irlap_send_rd_frame()

Signed-off-by: Maks Naumov <maksqwe1@ukr.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/irlap_frame.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 9ea0c933b9ff..a37998c6273d 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -622,7 +622,7 @@ void irlap_send_rd_frame(struct irlap_cb *self)
 	frame = (struct rd_frame *)skb_put(tx_skb, 2);
 
 	frame->caddr = self->caddr;
-	frame->caddr = RD_RSP | PF_BIT;
+	frame->control = RD_RSP | PF_BIT;
 
 	irlap_queue_xmit(self, tx_skb);
 }

From cd094927c68718c8bda432c18e85b6ec3a9ee3d1 Mon Sep 17 00:00:00 2001
From: Libo Chen <clbchenlibo.chen@huawei.com>
Date: Tue, 12 Aug 2014 13:31:54 -0700
Subject: [PATCH 21/42] drivers/net/irda/donauboe.c: convert to
 module_pci_driver

Signed-off-by: Libo Chen <libo.chen@huawei.com>
Cc: Samuel Ortiz <samuel@sortiz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/irda/donauboe.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c
index 768dfe9a9315..6d3e2093bf7f 100644
--- a/drivers/net/irda/donauboe.c
+++ b/drivers/net/irda/donauboe.c
@@ -1755,17 +1755,4 @@ static struct pci_driver donauboe_pci_driver = {
 	.resume		= toshoboe_wakeup 
 };
 
-static int __init
-donauboe_init (void)
-{
-  return pci_register_driver(&donauboe_pci_driver);
-}
-
-static void __exit
-donauboe_cleanup (void)
-{
-  pci_unregister_driver(&donauboe_pci_driver);
-}
-
-module_init(donauboe_init);
-module_exit(donauboe_cleanup);
+module_pci_driver(donauboe_pci_driver);

From 712a72213fad36cc9e6ec706b5e020d7eb6e03bc Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 12 Aug 2014 14:53:16 -0400
Subject: [PATCH 22/42] net-timestamp: fix missing ACK timestamp

ACK timestamps are generated in tcp_clean_rtx_queue. The TSO datapath
can break out early, causing the timestamp code to be skipped. Move
the code up before the break.

Reported-by: David S. Miller <davem@davemloft.net>

Also fix a boundary condition: tp->snd_una is the next unacknowledged
byte and between tests inclusive (a <= b <= c), so generate a an ACK
timestamp if (prior_snd_una <= tskey <= tp->snd_una - 1).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a3d47af01906..1a8e89fdd331 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3050,10 +3050,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	first_ackt.v64 = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
+		if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+		    between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
 			if (tcp_skb_pcount(skb) == 1 ||
@@ -3107,11 +3112,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tp->retrans_stamp = 0;
 		}
 
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
-		    between(skb_shinfo(skb)->tskey, prior_snd_una,
-			    tp->snd_una + 1))
-			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
-
 		if (!fully_acked)
 			break;
 

From 490cc7d03c21871eef5949ba77a18b0c7ef70ef5 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 12 Aug 2014 15:08:12 -0400
Subject: [PATCH 23/42] net-timestamp: fix missing tcp fragmentation cases

Bytestream timestamps are correlated with a single byte in the skbuff,
recorded in skb_shinfo(skb)->tskey. When fragmenting skbuffs, ensure
that the tskey is set for the fragment in which the tskey falls
(seqno <= tskey < end_seqno).

The original implementation did not address fragmentation in
tcp_fragment or tso_fragment. Add code to inspect the sequence numbers
and move both tskey and the relevant tx_flags if necessary.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8fcfc91964ec..ef4a051de018 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1069,6 +1069,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	tcp_verify_left_out(tp);
 }
 
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+		shinfo->tx_flags &= ~tsflags;
+		shinfo2->tx_flags |= tsflags;
+		swap(shinfo->tskey, shinfo2->tskey);
+	}
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -1136,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
 	buff->tstamp = skb->tstamp;
+	tcp_fragment_tstamp(skb, buff);
 
 	old_factor = tcp_skb_pcount(skb);
 
@@ -1652,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
+	tcp_fragment_tstamp(skb, buff);
 
 	/* Fix up tso_factor for both original and new SKB.  */
 	tcp_set_skb_tso_segs(sk, skb, mss_now);

From 5c807005fa60deef2db6616d9b7b24fc4c436be9 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 12 Aug 2014 11:59:29 +0100
Subject: [PATCH 24/42] xen-netback: fix debugfs write length check

Enlarge buffer size and check input length properly, so that we don't
misuse -ENOSPC.

Note that command like "kickXXXX" is still allowed, that's one patch for
another day if we really want to be very strict on this.

Reported-by: SeeChen Ng <seechen81@gmail.com>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Zoltan Kiss <zoltan.kiss@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/xenbus.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 580517d857bf..4c9041e60e7d 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -116,6 +116,7 @@ static int xenvif_read_io_ring(struct seq_file *m, void *v)
 }
 
 #define XENVIF_KICK_STR "kick"
+#define BUFFER_SIZE     32
 
 static ssize_t
 xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count,
@@ -124,22 +125,24 @@ xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count,
 	struct xenvif_queue *queue =
 		((struct seq_file *)filp->private_data)->private;
 	int len;
-	char write[sizeof(XENVIF_KICK_STR)];
+	char write[BUFFER_SIZE];
 
 	/* don't allow partial writes and check the length */
 	if (*ppos != 0)
 		return 0;
-	if (count < sizeof(XENVIF_KICK_STR) - 1)
+	if (count >= sizeof(write))
 		return -ENOSPC;
 
 	len = simple_write_to_buffer(write,
-				     sizeof(write),
+				     sizeof(write) - 1,
 				     ppos,
 				     buf,
 				     count);
 	if (len < 0)
 		return len;
 
+	write[len] = '\0';
+
 	if (!strncmp(write, XENVIF_KICK_STR, sizeof(XENVIF_KICK_STR) - 1))
 		xenvif_interrupt(0, (void *)queue);
 	else {

From 628fa76b09d7b0923c142631fc25b6affbfb868d Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 12 Aug 2014 11:59:30 +0100
Subject: [PATCH 25/42] xen-netback: fix debugfs entry creation

The original code is bogus. The function gets called in a loop which
leaks entries created in previous rounds.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Zoltan Kiss <zoltan.kiss@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/xenbus.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 4c9041e60e7d..9c47b897b6d2 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -174,10 +174,9 @@ static const struct file_operations xenvif_dbg_io_ring_ops_fops = {
 	.write = xenvif_write_io_ring,
 };
 
-static void xenvif_debugfs_addif(struct xenvif_queue *queue)
+static void xenvif_debugfs_addif(struct xenvif *vif)
 {
 	struct dentry *pfile;
-	struct xenvif *vif = queue->vif;
 	int i;
 
 	if (IS_ERR_OR_NULL(xen_netback_dbg_root))
@@ -736,11 +735,12 @@ static void connect(struct backend_info *be)
 			be->vif->num_queues = queue_index;
 			goto err;
 		}
-#ifdef CONFIG_DEBUG_FS
-		xenvif_debugfs_addif(queue);
-#endif /* CONFIG_DEBUG_FS */
 	}
 
+#ifdef CONFIG_DEBUG_FS
+	xenvif_debugfs_addif(be->vif);
+#endif /* CONFIG_DEBUG_FS */
+
 	/* Initialisation completed, tell core driver the number of
 	 * active queues.
 	 */

From ea2c5e134237eadc9924ce821ded678750024549 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 12 Aug 2014 11:48:06 +0100
Subject: [PATCH 26/42] xen-netback: move NAPI add/remove calls

Originally netif_napi_add was in xenvif_init_queue and netif_napi_del
was in xenvif_deinit_queue, while kthreads were handled in
xenvif_connect and xenvif_disconnect. Move netif_napi_add and
netif_napi_del to xenvif_connect and xenvif_disconnect so that they
reside together with kthread operations.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/interface.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index bfd10cb9c8de..5f3d6c06fcf7 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -524,9 +524,6 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 
 	init_timer(&queue->rx_stalled);
 
-	netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
-			XENVIF_NAPI_WEIGHT);
-
 	return 0;
 }
 
@@ -614,6 +611,9 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref,
 	wake_up_process(queue->task);
 	wake_up_process(queue->dealloc_task);
 
+	netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
+			XENVIF_NAPI_WEIGHT);
+
 	return 0;
 
 err_rx_unbind:
@@ -672,6 +672,8 @@ void xenvif_disconnect(struct xenvif *vif)
 	for (queue_index = 0; queue_index < num_queues; ++queue_index) {
 		queue = &vif->queues[queue_index];
 
+		netif_napi_del(&queue->napi);
+
 		if (queue->task) {
 			del_timer_sync(&queue->rx_stalled);
 			kthread_stop(queue->task);
@@ -704,7 +706,6 @@ void xenvif_disconnect(struct xenvif *vif)
 void xenvif_deinit_queue(struct xenvif_queue *queue)
 {
 	free_xenballooned_pages(MAX_PENDING_REQS, queue->mmap_pages);
-	netif_napi_del(&queue->napi);
 }
 
 void xenvif_free(struct xenvif *vif)

From a64bd934528e26e8956112e43a279fba2ee0634e Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 12 Aug 2014 11:48:07 +0100
Subject: [PATCH 27/42] xen-netback: don't stop dealloc kthread too early

Reference count the number of packets in host stack, so that we don't
stop the deallocation thread too early. If not, we can end up with
xenvif_free permanently waiting for deallocation thread to unmap grefs.

Reported-by: Thomas Leonard <talex5@gmail.com>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/common.h    |  5 +++++
 drivers/net/xen-netback/interface.c | 18 ++++++++++++++++++
 drivers/net/xen-netback/netback.c   | 26 +++++++++++++++++++-------
 3 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index ef3026f46a37..d4eb8d2e9cb7 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -165,6 +165,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
+	atomic_t inflight_packets;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
@@ -329,4 +330,8 @@ extern unsigned int xenvif_max_queues;
 extern struct dentry *xen_netback_dbg_root;
 #endif
 
+void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue,
+				 struct sk_buff *skb);
+void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue);
+
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 5f3d6c06fcf7..0aaca902699a 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -43,6 +43,23 @@
 #define XENVIF_QUEUE_LENGTH 32
 #define XENVIF_NAPI_WEIGHT  64
 
+/* This function is used to set SKBTX_DEV_ZEROCOPY as well as
+ * increasing the inflight counter. We need to increase the inflight
+ * counter because core driver calls into xenvif_zerocopy_callback
+ * which calls xenvif_skb_zerocopy_complete.
+ */
+void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue,
+				 struct sk_buff *skb)
+{
+	skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	atomic_inc(&queue->inflight_packets);
+}
+
+void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue)
+{
+	atomic_dec(&queue->inflight_packets);
+}
+
 static inline void xenvif_stop_queue(struct xenvif_queue *queue)
 {
 	struct net_device *dev = queue->vif->dev;
@@ -557,6 +574,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref,
 
 	init_waitqueue_head(&queue->wq);
 	init_waitqueue_head(&queue->dealloc_wq);
+	atomic_set(&queue->inflight_packets, 0);
 
 	if (tx_evtchn == rx_evtchn) {
 		/* feature-split-event-channels == 0 */
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 4734472aa620..08f65996534c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1525,10 +1525,12 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s
 	/* remove traces of mapped pages and frag_list */
 	skb_frag_list_init(skb);
 	uarg = skb_shinfo(skb)->destructor_arg;
+	/* increase inflight counter to offset decrement in callback */
+	atomic_inc(&queue->inflight_packets);
 	uarg->callback(uarg, true);
 	skb_shinfo(skb)->destructor_arg = NULL;
 
-	skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	xenvif_skb_zerocopy_prepare(queue, nskb);
 	kfree_skb(nskb);
 
 	return 0;
@@ -1589,7 +1591,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
 				if (net_ratelimit())
 					netdev_err(queue->vif->dev,
 						   "Not enough memory to consolidate frag_list!\n");
-				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+				xenvif_skb_zerocopy_prepare(queue, skb);
 				kfree_skb(skb);
 				continue;
 			}
@@ -1609,7 +1611,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
 				   "Can't setup checksum in net_tx_action\n");
 			/* We have to set this flag to trigger the callback */
 			if (skb_shinfo(skb)->destructor_arg)
-				skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+				xenvif_skb_zerocopy_prepare(queue, skb);
 			kfree_skb(skb);
 			continue;
 		}
@@ -1641,7 +1643,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
 		 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
 		 */
 		if (skb_shinfo(skb)->destructor_arg) {
-			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			xenvif_skb_zerocopy_prepare(queue, skb);
 			queue->stats.tx_zerocopy_sent++;
 		}
 
@@ -1681,6 +1683,7 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
 		queue->stats.tx_zerocopy_success++;
 	else
 		queue->stats.tx_zerocopy_fail++;
+	xenvif_skb_zerocopy_complete(queue);
 }
 
 static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
@@ -2058,15 +2061,24 @@ int xenvif_kthread_guest_rx(void *data)
 	return 0;
 }
 
+static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue)
+{
+	/* Dealloc thread must remain running until all inflight
+	 * packets complete.
+	 */
+	return kthread_should_stop() &&
+		!atomic_read(&queue->inflight_packets);
+}
+
 int xenvif_dealloc_kthread(void *data)
 {
 	struct xenvif_queue *queue = data;
 
-	while (!kthread_should_stop()) {
+	for (;;) {
 		wait_event_interruptible(queue->dealloc_wq,
 					 tx_dealloc_work_todo(queue) ||
-					 kthread_should_stop());
-		if (kthread_should_stop())
+					 xenvif_dealloc_kthread_should_stop(queue));
+		if (xenvif_dealloc_kthread_should_stop(queue))
 			break;
 
 		xenvif_tx_dealloc_action(queue);

From b1252858213f39700dac1bc3295b6e88f6cce24b Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 12 Aug 2014 11:48:08 +0100
Subject: [PATCH 28/42] xen-netback: remove loop waiting function

The original implementation relies on a loop to check if all inflight
packets are freed. Now we have proper reference counting, there's no
need to use loop anymore.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: Zoltan Kiss <zoltan.kiss@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/xen-netback/interface.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 0aaca902699a..e29e15dca86e 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -660,25 +660,6 @@ void xenvif_carrier_off(struct xenvif *vif)
 	rtnl_unlock();
 }
 
-static void xenvif_wait_unmap_timeout(struct xenvif_queue *queue,
-				      unsigned int worst_case_skb_lifetime)
-{
-	int i, unmap_timeout = 0;
-
-	for (i = 0; i < MAX_PENDING_REQS; ++i) {
-		if (queue->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
-			unmap_timeout++;
-			schedule_timeout(msecs_to_jiffies(1000));
-			if (unmap_timeout > worst_case_skb_lifetime &&
-			    net_ratelimit())
-				netdev_err(queue->vif->dev,
-					   "Page still granted! Index: %x\n",
-					   i);
-			i = -1;
-		}
-	}
-}
-
 void xenvif_disconnect(struct xenvif *vif)
 {
 	struct xenvif_queue *queue = NULL;
@@ -731,21 +712,11 @@ void xenvif_free(struct xenvif *vif)
 	struct xenvif_queue *queue = NULL;
 	unsigned int num_queues = vif->num_queues;
 	unsigned int queue_index;
-	/* Here we want to avoid timeout messages if an skb can be legitimately
-	 * stuck somewhere else. Realistically this could be an another vif's
-	 * internal or QDisc queue. That another vif also has this
-	 * rx_drain_timeout_msecs timeout, so give it time to drain out.
-	 * Although if that other guest wakes up just before its timeout happens
-	 * and takes only one skb from QDisc, it can hold onto other skbs for a
-	 * longer period.
-	 */
-	unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000);
 
 	unregister_netdev(vif->dev);
 
 	for (queue_index = 0; queue_index < num_queues; ++queue_index) {
 		queue = &vif->queues[queue_index];
-		xenvif_wait_unmap_timeout(queue, worst_case_skb_lifetime);
 		xenvif_deinit_queue(queue);
 	}
 

From 77b2f2865956b7573f9b040db7a9f808b434acd1 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Thu, 14 Aug 2014 19:32:16 +0300
Subject: [PATCH 29/42] iwlwifi: mvm: disable scheduled scan to prevent
 firmware crash

There are firmwares which don't support scheduled scan.
Disable it for now.
Linus's system encoutered this issue.
Thanks to David Spinadel for his help.

Tested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
---
 drivers/net/wireless/iwlwifi/mvm/mac80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/iwlwifi/mvm/mac80211.c
index 0d6a8b768a68..7c8796584c25 100644
--- a/drivers/net/wireless/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c
@@ -396,7 +396,8 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm)
 	else
 		hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT;
 
-	hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN;
+	/* TODO: enable that only for firmwares that don't crash */
+	/* hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; */
 	hw->wiphy->max_sched_scan_ssids = PROBE_OPTION_MAX;
 	hw->wiphy->max_match_sets = IWL_SCAN_MAX_PROFILES;
 	/* we create the 802.11 header and zero length SSID IE. */

From 5e3c516b512c0f8f18359413b04918f6347f67e7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 14 Aug 2014 14:32:49 -0700
Subject: [PATCH 30/42] Revert "macvlan: simplify the structure port"

This reverts commit a188a54d11629bef2169052297e61f3767ca8ce5.

It causes crashes

====================
[   80.643286] BUG: unable to handle kernel NULL pointer dereference at 0000000000000878
[   80.670103] IP: [<ffffffff810832e4>] try_to_grab_pending+0x64/0x1f0
[   80.691289] PGD 22c102067 PUD 235bf0067 PMD 0
[   80.706611] Oops: 0002 [#1] SMP
[   80.717836] Modules linked in: macvlan nfsd lockd nfs_acl exportfs auth_rpcgss sunrpc oid_registry ioatdma ixgbe(-) mdio igb dca
[   80.757935] CPU: 37 PID: 6724 Comm: rmmod Not tainted 3.16.0-net-next-08-12-2014-FCoE+ #1
[   80.785688] Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.03.0003.041920141333 04/19/2014
[   80.820310] task: ffff880235a9eae0 ti: ffff88022e844000 task.ti: ffff88022e844000
[   80.845770] RIP: 0010:[<ffffffff810832e4>]  [<ffffffff810832e4>] try_to_grab_pending+0x64/0x1f0
[   80.875326] RSP: 0018:ffff88022e847b28  EFLAGS: 00010046
[   80.893251] RAX: 0000000000037a6a RBX: 0000000000000878 RCX: 0000000000000000
[   80.917187] RDX: ffff880235a9eae0 RSI: 0000000000000001 RDI: ffffffff810832db
[   80.941125] RBP: ffff88022e847b58 R08: 0000000000000000 R09: 0000000000000000
[   80.965056] R10: 0000000000000001 R11: 0000000000000001 R12: ffff88022e847b70
[   80.988994] R13: 0000000000000000 R14: ffff88022e847be8 R15: ffffffff81ebe440
[   81.012929] FS:  00007fab90b07700(0000) GS:ffff88043f7a0000(0000) knlGS:0000000000000000
[   81.040400] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   81.059757] CR2: 0000000000000878 CR3: 0000000235a42000 CR4: 00000000001407e0
[   81.083689] Stack:
[   81.090739]  ffff880235a9eae0 0000000000000878 ffff88022e847b70 0000000000000000
[   81.116253]  ffff88022e847be8 ffffffff81ebe440 ffff88022e847b98 ffffffff810847f1
[   81.141766]  ffff88022e847b78 0000000000000286 ffff880234200000 0000000000000000
[   81.167282] Call Trace:
[   81.175768]  [<ffffffff810847f1>] __cancel_work_timer+0x31/0x170
[   81.195985]  [<ffffffff8108494b>] cancel_work_sync+0xb/0x10
[   81.214769]  [<ffffffffa015ae68>] macvlan_port_destroy+0x28/0x60 [macvlan]
[   81.237844]  [<ffffffffa015b930>] macvlan_uninit+0x40/0x50 [macvlan]
[   81.259209]  [<ffffffff816bf6e2>] rollback_registered_many+0x1a2/0x2c0
[   81.281140]  [<ffffffff816bf81a>] unregister_netdevice_many+0x1a/0xb0
[   81.302786]  [<ffffffffa015a4ff>] macvlan_device_event+0x1ef/0x240 [macvlan]
[   81.326439]  [<ffffffff8108a13d>] notifier_call_chain+0x4d/0x70
[   81.346366]  [<ffffffff8108a201>] raw_notifier_call_chain+0x11/0x20
[   81.367439]  [<ffffffff816bf25b>] call_netdevice_notifiers_info+0x3b/0x70
[   81.390228]  [<ffffffff816bf2a1>] call_netdevice_notifiers+0x11/0x20
[   81.411587]  [<ffffffff816bf6bd>] rollback_registered_many+0x17d/0x2c0
[   81.433518]  [<ffffffff816bf925>] unregister_netdevice_queue+0x75/0x110
[   81.455735]  [<ffffffff816bfb2b>] unregister_netdev+0x1b/0x30
[   81.475094]  [<ffffffffa0039b50>] ixgbe_remove+0x170/0x1d0 [ixgbe]
[   81.495886]  [<ffffffff813512a2>] pci_device_remove+0x32/0x60
[   81.515246]  [<ffffffff814c75c4>] __device_release_driver+0x64/0xd0
[   81.536321]  [<ffffffff814c76f8>] driver_detach+0xc8/0xd0
[   81.554530]  [<ffffffff814c656e>] bus_remove_driver+0x4e/0xa0
[   81.573888]  [<ffffffff814c828b>] driver_unregister+0x2b/0x60
[   81.593246]  [<ffffffff8135143e>] pci_unregister_driver+0x1e/0xa0
[   81.613749]  [<ffffffffa005db18>] ixgbe_exit_module+0x1c/0x2e [ixgbe]
[   81.635401]  [<ffffffff810e738b>] SyS_delete_module+0x15b/0x1e0
[   81.655334]  [<ffffffff8187a395>] ? sysret_check+0x22/0x5d
[   81.673833]  [<ffffffff810abd2d>] ? trace_hardirqs_on_caller+0x11d/0x1e0
[   81.696339]  [<ffffffff8132bfde>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[   81.717985]  [<ffffffff8187a369>] system_call_fastpath+0x16/0x1b
[   81.738199] Code: 00 48 83 3d 6e bb da 00 00 48 89 c2 0f 84 67 01 00 00 fa 66 0f 1f 44 00 00 49 89 14 24 e8 b5 4b 02 00 45 84 ed 0f 85 ac 00 00 00 <f0> 0f ba 2b 00 72 1d 31 c0 48 8b 5d d8 4c 8b 65 e0 4c 8b 6d e8
[   81.807026] RIP  [<ffffffff810832e4>] try_to_grab_pending+0x64/0x1f0
[   81.828468]  RSP <ffff88022e847b28>
[   81.840384] CR2: 0000000000000878
[   81.851731] ---[ end trace 9f6c7232e3464e11 ]---
====================

This bug could be triggered by these steps:

modprobe ixgbe ; modprobe macvlan
ip link add link p96p1 address 00:1B:21:6E:06:00 macvlan0 type macvlan
ip link add link p96p1 address 00:1B:21:6E:06:01 macvlan1 type macvlan
ip link add link p96p1 address 00:1B:21:6E:06:02 macvlan2 type macvlan
ip link add link p96p1 address 00:1B:21:6E:06:03 macvlan3 type macvlan
rmmod ixgbe

Reported-by: "Keller, Jacob E" <jacob.e.keller@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index ef8a5c20236a..60e4ca01ccbb 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -45,10 +45,9 @@ struct macvlan_port {
 	struct sk_buff_head	bc_queue;
 	struct work_struct	bc_work;
 	bool 			passthru;
+	int			count;
 };
 
-#define MACVLAN_PORT_IS_EMPTY(port)    list_empty(&port->vlans)
-
 struct macvlan_skb_cb {
 	const struct macvlan_dev *src;
 };
@@ -667,7 +666,8 @@ static void macvlan_uninit(struct net_device *dev)
 
 	free_percpu(vlan->pcpu_stats);
 
-	if (MACVLAN_PORT_IS_EMPTY(port))
+	port->count -= 1;
+	if (!port->count)
 		macvlan_port_destroy(port->dev);
 }
 
@@ -1020,12 +1020,13 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 		vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
 
 	if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
-		if (!MACVLAN_PORT_IS_EMPTY(port))
+		if (port->count)
 			return -EINVAL;
 		port->passthru = true;
 		eth_hw_addr_inherit(dev, lowerdev);
 	}
 
+	port->count += 1;
 	err = register_netdevice(dev);
 	if (err < 0)
 		goto destroy_port;
@@ -1043,7 +1044,8 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 unregister_netdev:
 	unregister_netdevice(dev);
 destroy_port:
-	if (MACVLAN_PORT_IS_EMPTY(port))
+	port->count -= 1;
+	if (!port->count)
 		macvlan_port_destroy(lowerdev);
 
 	return err;

From fdd42e4400fcb5fe837a20a8d837ef094d5a26fb Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Wed, 13 Aug 2014 13:54:22 +0200
Subject: [PATCH 31/42] net: xilinx: Remove .owner field for driver

There is no need to init .owner field.

Based on the patch from Peter Griffin <peter.griffin@linaro.org>
"mmc: remove .owner field for drivers using module_platform_driver"

This patch removes the superflous .owner field for drivers which
use the module_platform_driver API, as this is overriden in
platform_driver_register anyway."

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac_main.c       | 1 -
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 1 -
 drivers/net/ethernet/xilinx/xilinx_emaclite.c     | 1 -
 3 files changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 36f4459520c3..fda5891835d4 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -1170,7 +1170,6 @@ static struct platform_driver temac_of_driver = {
 	.probe = temac_of_probe,
 	.remove = temac_of_remove,
 	.driver = {
-		.owner = THIS_MODULE,
 		.name = "xilinx_temac",
 		.of_match_table = temac_of_match,
 	},
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 30e8608ff050..c8fd94133ecd 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1645,7 +1645,6 @@ static struct platform_driver axienet_of_driver = {
 	.probe = axienet_of_probe,
 	.remove = axienet_of_remove,
 	.driver = {
-		 .owner = THIS_MODULE,
 		 .name = "xilinx_axienet",
 		 .of_match_table = axienet_of_match,
 	},
diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index 782bb9373cd8..28dbbdc393eb 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -1245,7 +1245,6 @@ MODULE_DEVICE_TABLE(of, xemaclite_of_match);
 static struct platform_driver xemaclite_of_driver = {
 	.driver = {
 		.name = DRIVER_NAME,
-		.owner = THIS_MODULE,
 		.of_match_table = xemaclite_of_match,
 	},
 	.probe		= xemaclite_of_probe,

From 9d186cac7ffb1831e9f34cb4a3a8b22abb9dd9d4 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Wed, 13 Aug 2014 16:03:10 +0400
Subject: [PATCH 32/42] tcp: don't use timestamp from repaired skb-s to
 calculate RTT (v2)

We don't know right timestamp for repaired skb-s. Wrong RTT estimations
isn't good, because some congestion modules heavily depends on it.

This patch adds the TCPCB_REPAIRED flag, which is included in
TCPCB_RETRANS.

Thanks to Eric for the advice how to fix this issue.

This patch fixes the warning:
[  879.562947] WARNING: CPU: 0 PID: 2825 at net/ipv4/tcp_input.c:3078 tcp_ack+0x11f5/0x1380()
[  879.567253] CPU: 0 PID: 2825 Comm: socket-tcpbuf-l Not tainted 3.16.0-next-20140811 #1
[  879.567829] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  879.568177]  0000000000000000 00000000c532680c ffff880039643d00 ffffffff817aa2d2
[  879.568776]  0000000000000000 ffff880039643d38 ffffffff8109afbd ffff880039d6ba80
[  879.569386]  ffff88003a449800 000000002983d6bd 0000000000000000 000000002983d6bc
[  879.569982] Call Trace:
[  879.570264]  [<ffffffff817aa2d2>] dump_stack+0x4d/0x66
[  879.570599]  [<ffffffff8109afbd>] warn_slowpath_common+0x7d/0xa0
[  879.570935]  [<ffffffff8109b0ea>] warn_slowpath_null+0x1a/0x20
[  879.571292]  [<ffffffff816d0a05>] tcp_ack+0x11f5/0x1380
[  879.571614]  [<ffffffff816d10bd>] tcp_rcv_established+0x1ed/0x710
[  879.571958]  [<ffffffff816dc9da>] tcp_v4_do_rcv+0x10a/0x370
[  879.572315]  [<ffffffff81657459>] release_sock+0x89/0x1d0
[  879.572642]  [<ffffffff816c81a0>] do_tcp_setsockopt.isra.36+0x120/0x860
[  879.573000]  [<ffffffff8110a52e>] ? rcu_read_lock_held+0x6e/0x80
[  879.573352]  [<ffffffff816c8912>] tcp_setsockopt+0x32/0x40
[  879.573678]  [<ffffffff81654ac4>] sock_common_setsockopt+0x14/0x20
[  879.574031]  [<ffffffff816537b0>] SyS_setsockopt+0x80/0xf0
[  879.574393]  [<ffffffff817b40a9>] system_call_fastpath+0x16/0x1b
[  879.574730] ---[ end trace a17cbc38eb8c5c00 ]---

v2: moving setting of skb->when for repaired skb-s in tcp_write_xmit,
    where it's set for other skb-s.

Fixes: 431a91242d8d ("tcp: timestamp SYN+DATA messages")
Fixes: 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution")
Cc: Eric Dumazet <edumazet@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  4 +++-
 net/ipv4/tcp.c        | 14 +++++++-------
 net/ipv4/tcp_output.c |  5 ++++-
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index dafa1cbc149b..36f55254573f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -705,8 +705,10 @@ struct tcp_skb_cb {
 #define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
 #define TCPCB_LOST		0x04	/* SKB is lost			*/
 #define TCPCB_TAGBITS		0x07	/* All tag bits			*/
+#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp)	*/
 #define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
-#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
+#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
+				TCPCB_REPAIRED)
 
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	/* 1 byte hole */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 181b70ebd964..541f26a67ba2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1187,13 +1187,6 @@ new_segment:
 				if (!skb)
 					goto wait_for_memory;
 
-				/*
-				 * All packets are restored as if they have
-				 * already been sent.
-				 */
-				if (tp->repair)
-					TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
 				/*
 				 * Check whether we can use HW checksum.
 				 */
@@ -1203,6 +1196,13 @@ new_segment:
 				skb_entail(sk, skb);
 				copy = size_goal;
 				max = size_goal;
+
+				/* All packets are restored as if they have
+				 * already been sent. skb_mstamp isn't set to
+				 * avoid wrong rtt estimation.
+				 */
+				if (tp->repair)
+					TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
 			}
 
 			/* Try to append data to the end of skb. */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ef4a051de018..ff3f0f75cc6c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1934,8 +1934,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
-		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+			/* "when" is used as a start point for the retransmit timer */
+			TCP_SKB_CB(skb)->when = tcp_time_stamp;
 			goto repair; /* Skip network transmission */
+		}
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {

From 9b9ba821b6c47b6d1a711b8f6fff931277217bab Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Thu, 14 Aug 2014 13:59:42 +0200
Subject: [PATCH 33/42] net: xgene: Check negative return value of
 xgene_enet_get_ring_size()

xgene_enet_get_ring_size() returns a negative value in case of an error,
but its only caller in xgene_enet_create_desc_ring() currently uses the
return value directly as u32. Instead, check for a negative value first and
error out in case. Also move the call to xgene_enet_get_ring_size() before
devm_kzalloc() so we don't need to free anything in the error path.

This fixes the following issue reported by the Coverity Scanner:

** CID 1231336:  Improper use of negative value  (NEGATIVE_RETURNS)
/drivers/net/ethernet/apm/xgene/xgene_enet_main.c: 596 in xgene_enet_create_desc_ring()

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index af7c40ac1455..e1a8f4e19983 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -581,7 +581,11 @@ static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring(
 	struct xgene_enet_desc_ring *ring;
 	struct xgene_enet_pdata *pdata = netdev_priv(ndev);
 	struct device *dev = ndev_to_dev(ndev);
-	u32 size;
+	int size;
+
+	size = xgene_enet_get_ring_size(dev, cfgsize);
+	if (size < 0)
+		return NULL;
 
 	ring = devm_kzalloc(dev, sizeof(struct xgene_enet_desc_ring),
 			    GFP_KERNEL);
@@ -593,7 +597,6 @@ static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring(
 	ring->cfgsize = cfgsize;
 	ring->id = ring_id;
 
-	size = xgene_enet_get_ring_size(dev, cfgsize);
 	ring->desc_addr = dma_zalloc_coherent(dev, size, &ring->dma,
 					      GFP_KERNEL);
 	if (!ring->desc_addr) {

From 3b3e0ea8bc0f46e8291295003f70225e5a66acbc Mon Sep 17 00:00:00 2001
From: Andreas Ruprecht <rupran@einserver.de>
Date: Thu, 14 Aug 2014 14:26:23 +0200
Subject: [PATCH 34/42] net: ethernet: ibm: ehea: Remove duplicate object from
 Makefile

In the Makefile, ehea_phyp.o is included twice in the list of
object files compile into ehea.o.

This change removes one instance.

Signed-off-by: Andreas Ruprecht <rupran@einserver.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ehea/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ehea/Makefile b/drivers/net/ethernet/ibm/ehea/Makefile
index 775d9969b5c2..cd473e295242 100644
--- a/drivers/net/ethernet/ibm/ehea/Makefile
+++ b/drivers/net/ethernet/ibm/ehea/Makefile
@@ -1,6 +1,6 @@
 #
 # Makefile for the eHEA ethernet device driver for IBM eServer System p
 #
-ehea-y = ehea_main.o ehea_phyp.o ehea_qmr.o ehea_ethtool.o ehea_phyp.o
+ehea-y = ehea_main.o ehea_phyp.o ehea_qmr.o ehea_ethtool.o
 obj-$(CONFIG_EHEA) += ehea.o
 

From bc8fc7b8f825ef17a0fb9e68c18ce94fa66ab337 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Thu, 14 Aug 2014 15:27:20 +0300
Subject: [PATCH 35/42] sit: Fix ipip6_tunnel_lookup device matching criteria

As of 4fddbf5d78 ("sit: strictly restrict incoming traffic to tunnel link device"),
when looking up a tunnel, tunnel's underlying interface (t->parms.link)
is verified to match incoming traffic's ingress device.

However the comparison was incorrectly based on skb->dev->iflink.

Instead, dev->ifindex should be used, which correctly represents the
interface from which the IP stack hands the ipip6 packets.

This allows setting up sit tunnels bound to vlan interfaces (otherwise
incoming ipip6 traffic on the vlan interface was dropped due to
ipip6_tunnel_lookup match failure).

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 2e9ba035fb5f..6163f851dc01 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
 	for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
-		    (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+		    (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
 		    (t->dev->flags & IFF_UP))
 			return t;
 	}
 	for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) {
 		if (remote == t->parms.iph.daddr &&
-		    (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+		    (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
 		    (t->dev->flags & IFF_UP))
 			return t;
 	}
 	for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) {
 		if (local == t->parms.iph.saddr &&
-		    (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+		    (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
 		    (t->dev->flags & IFF_UP))
 			return t;
 	}

From 4fab9071950c2021d846e18351e0f46a1cffd67b Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Thu, 14 Aug 2014 12:40:05 -0400
Subject: [PATCH 36/42] tcp: fix tcp_release_cb() to dispatch via address
 family for mtu_reduced()

Make sure we use the correct address-family-specific function for
handling MTU reductions from within tcp_release_cb().

Previously AF_INET6 sockets were incorrectly always using the IPv6
code path when sometimes they were handling IPv4 traffic and thus had
an IPv4 dst.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Diagnosed-by: Willem de Bruijn <willemb@google.com>
Fixes: 563d34d057862 ("tcp: dont drop MTU reduction indications")
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 1 +
 include/net/sock.h                 | 1 -
 include/net/tcp.h                  | 1 +
 net/ipv4/tcp_ipv4.c                | 5 +++--
 net/ipv4/tcp_output.c              | 2 +-
 net/ipv6/tcp_ipv6.c                | 3 ++-
 6 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7a4313887568..5fbe6568c3cf 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -62,6 +62,7 @@ struct inet_connection_sock_af_ops {
 	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
 	int	    (*bind_conflict)(const struct sock *sk,
 				     const struct inet_bind_bucket *tb, bool relax);
+	void	    (*mtu_reduced)(struct sock *sk);
 };
 
 /** inet_connection_sock - INET connection oriented sock
diff --git a/include/net/sock.h b/include/net/sock.h
index 38805fa02e48..7f2ab72f321a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -987,7 +987,6 @@ struct proto {
 						struct sk_buff *skb);
 
 	void		(*release_cb)(struct sock *sk);
-	void		(*mtu_reduced)(struct sock *sk);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 36f55254573f..e337e05035be 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -448,6 +448,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
  */
 
 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+void tcp_v4_mtu_reduced(struct sock *sk);
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(struct sock *sk,
 				      struct request_sock *req,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dceff5fe8e66..cd17f009aede 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(tcp_v4_connect);
  * It can be called through tcp_release_cb() if socket was owned by user
  * at the time tcp_v4_err() was called to handle ICMP message.
  */
-static void tcp_v4_mtu_reduced(struct sock *sk)
+void tcp_v4_mtu_reduced(struct sock *sk)
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
@@ -302,6 +302,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
 		tcp_simple_retransmit(sk);
 	} /* else let the usual retransmit timer handle it */
 }
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 
 static void do_redirect(struct sk_buff *skb, struct sock *sk)
 {
@@ -1787,6 +1788,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.compat_setsockopt = compat_ip_setsockopt,
 	.compat_getsockopt = compat_ip_getsockopt,
 #endif
+	.mtu_reduced	   = tcp_v4_mtu_reduced,
 };
 EXPORT_SYMBOL(ipv4_specific);
 
@@ -2406,7 +2408,6 @@ struct proto tcp_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.release_cb		= tcp_release_cb,
-	.mtu_reduced		= tcp_v4_mtu_reduced,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ff3f0f75cc6c..5a7c41fbc6d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk)
 		__sock_put(sk);
 	}
 	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
-		sk->sk_prot->mtu_reduced(sk);
+		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f2ce95502392..29964c3d363c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1595,6 +1595,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
 #endif
+	.mtu_reduced	   = tcp_v6_mtu_reduced,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1625,6 +1626,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
 	.compat_setsockopt = compat_ipv6_setsockopt,
 	.compat_getsockopt = compat_ipv6_getsockopt,
 #endif
+	.mtu_reduced	   = tcp_v4_mtu_reduced,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1864,7 +1866,6 @@ struct proto tcpv6_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
-	.mtu_reduced		= tcp_v6_mtu_reduced,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,

From a26552afe89438eefbe097512b3187f2c7e929fd Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu, 14 Aug 2014 22:06:12 +0200
Subject: [PATCH 37/42] tcp: don't allow syn packets without timestamps to pass
 tcp_tw_recycle logic

tcp_tw_recycle heavily relies on tcp timestamps to build a per-host
ordering of incoming connections and teardowns without the need to
hold state on a specific quadruple for TCP_TIMEWAIT_LEN, but only for
the last measured RTO. To do so, we keep the last seen timestamp in a
per-host indexed data structure and verify if the incoming timestamp
in a connection request is strictly greater than the saved one during
last connection teardown. Thus we can verify later on that no old data
packets will be accepted by the new connection.

During moving a socket to time-wait state we already verify if timestamps
where seen on a connection. Only if that was the case we let the
time-wait socket expire after the RTO, otherwise normal TCP_TIMEWAIT_LEN
will be used. But we don't verify this on incoming SYN packets. If a
connection teardown was less than TCP_PAWS_MSL seconds in the past we
cannot guarantee to not accept data packets from an old connection if
no timestamps are present. We should drop this SYN packet. This patch
closes this loophole.

Please note, this patch does not make tcp_tw_recycle in any way more
usable but only adds another safety check:
Sporadic drops of SYN packets because of reordering in the network or
in the socket backlog queues can happen. Users behing NAT trying to
connect to a tcp_tw_recycle enabled server can get caught in blackholes
and their connection requests may regullary get dropped because hosts
behind an address translator don't have synchronized tcp timestamp clocks.
tcp_tw_recycle cannot work if peers don't have tcp timestamps enabled.

In general, use of tcp_tw_recycle is disadvised.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      | 2 +-
 net/ipv4/tcp_input.c   | 9 ++++++---
 net/ipv4/tcp_metrics.c | 6 ++++--
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e337e05035be..590e01a476ac 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -417,7 +417,7 @@ void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
 void tcp_metrics_init(void);
 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
-			bool paws_check);
+			bool paws_check, bool timestamps);
 bool tcp_remember_stamp(struct sock *sk);
 bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
 void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1a8e89fdd331..4f6cfbc57775 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5979,12 +5979,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+		if (tcp_death_row.sysctl_tw_recycle) {
 			bool strict;
 
 			dst = af_ops->route_req(sk, &fl, req, &strict);
+
 			if (dst && strict &&
-			    !tcp_peer_is_proven(req, dst, true)) {
+			    !tcp_peer_is_proven(req, dst, true,
+						tmp_opt.saw_tstamp)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -5993,7 +5995,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		else if (!sysctl_tcp_syncookies &&
 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
-			 !tcp_peer_is_proven(req, dst, false)) {
+			 !tcp_peer_is_proven(req, dst, false,
+					     tmp_opt.saw_tstamp)) {
 			/* Without syncookies last quarter of
 			 * backlog is filled with destinations,
 			 * proven to be alive.
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0d54e59b9ea8..ed9c9a91851c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -576,7 +576,8 @@ reset:
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+			bool paws_check, bool timestamps)
 {
 	struct tcp_metrics_block *tm;
 	bool ret;
@@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa
 	if (paws_check) {
 		if (tm &&
 		    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
-		    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+		    ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+		     !timestamps))
 			ret = false;
 		else
 			ret = true;

From 0c9ab09223fe9922baeb22546c9a90d774a4bde6 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Thu, 14 Aug 2014 16:13:07 -0400
Subject: [PATCH 38/42] tcp: fix ssthresh and undo for consecutive short FRTO
 episodes

Fix TCP FRTO logic so that it always notices when snd_una advances,
indicating that any RTO after that point will be a new and distinct
loss episode.

Previously there was a very specific sequence that could cause FRTO to
fail to notice a new loss episode had started:

(1) RTO timer fires, enter FRTO and retransmit packet 1 in write queue
(2) receiver ACKs packet 1
(3) FRTO sends 2 more packets
(4) RTO timer fires again (should start a new loss episode)

The problem was in step (3) above, where tcp_process_loss() returned
early (in the spot marked "Step 2.b"), so that it never got to the
logic to clear icsk_retransmits. Thus icsk_retransmits stayed
non-zero. Thus in step (4) tcp_enter_loss() would see the non-zero
icsk_retransmits, decide that this RTO is not a new episode, and
decide not to cut ssthresh and remember the current cwnd and ssthresh
for undo.

There were two main consequences to the bug that we have
observed. First, ssthresh was not decreased in step (4). Second, when
there was a series of such FRTO (1-4) sequences that happened to be
followed by an FRTO undo, we would restore the cwnd and ssthresh from
before the entire series started (instead of the cwnd and ssthresh
from before the most recent RTO). This could result in cwnd and
ssthresh being restored to values much bigger than the proper values.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Fixes: e33099f96d99c ("tcp: implement RFC5682 F-RTO")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4f6cfbc57775..a906e0200ff2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2687,7 +2687,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  */
 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool recovered = !before(tp->snd_una, tp->high_seq);
 
@@ -2713,12 +2712,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
 	if (recovered) {
 		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
-		icsk->icsk_retransmits = 0;
 		tcp_try_undo_recovery(sk);
 		return;
 	}
-	if (flag & FLAG_DATA_ACKED)
-		icsk->icsk_retransmits = 0;
 	if (tcp_is_reno(tp)) {
 		/* A Reno DUPACK means new data in F-RTO step 2.b above are
 		 * delivered. Lower inflight to clock out (re)tranmissions.
@@ -3405,8 +3401,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
-	if (after(ack, prior_snd_una))
+	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
+		icsk->icsk_retransmits = 0;
+	}
 
 	prior_fackets = tp->fackets_out;
 

From 5300fdcb7b7e97d83033bc7196582705524d35ea Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 13 Aug 2014 16:38:29 +0200
Subject: [PATCH 39/42] rhashtable: RCU annotations for next pointers

Properly annotate next pointers as access is RCU protected in
the lookup path.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 4 ++--
 lib/rhashtable.c           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 9cda293c867d..8c6048e77f29 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -21,7 +21,7 @@
 #include <linux/rculist.h>
 
 struct rhash_head {
-	struct rhash_head		*next;
+	struct rhash_head __rcu		*next;
 };
 
 #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL)
@@ -97,7 +97,7 @@ u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr);
 void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node, gfp_t);
 bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node, gfp_t);
 void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-			     struct rhash_head **pprev, gfp_t flags);
+			     struct rhash_head __rcu **pprev, gfp_t flags);
 
 bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size);
 bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index e6940cf16628..338dd7aa5e13 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
  * deletion when combined with walking or lookup.
  */
 void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
-			     struct rhash_head **pprev, gfp_t flags)
+			     struct rhash_head __rcu **pprev, gfp_t flags)
 {
 	struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
 

From c91eee56dc4f8c3d9ae834bacb835596d47a709e Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 13 Aug 2014 16:38:30 +0200
Subject: [PATCH 40/42] rhashtable: unexport and make rht_obj() static

No need to export rht_obj(), all inner to outer object translations
occur internally. It was intended to be used with rht_for_each() which
now primarily serves as the iterator for rhashtable_remove_pprev() to
effectively flush and free the full table.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 1 -
 lib/rhashtable.c           | 8 +-------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8c6048e77f29..af967c4c7591 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -117,7 +117,6 @@ void rhashtable_destroy(const struct rhashtable *ht);
 #define rht_dereference_rcu(p, ht) \
 	rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))
 
-/* Internal, use rht_obj() instead */
 #define rht_entry(ptr, type, member) container_of(ptr, type, member)
 #define rht_entry_safe(ptr, type, member) \
 ({ \
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 338dd7aa5e13..a2c78810ebc1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -38,16 +38,10 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
 EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 #endif
 
-/**
- * rht_obj - cast hash head to outer object
- * @ht:		hash table
- * @he:		hashed node
- */
-void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
+static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
 {
 	return (void *) he - ht->p.head_offset;
 }
-EXPORT_SYMBOL_GPL(rht_obj);
 
 static u32 __hashfn(const struct rhashtable *ht, const void *key,
 		      u32 len, u32 hsize)

From 93f560811e80216e98f3fcec220aa0f8836b09af Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 13 Aug 2014 16:38:31 +0200
Subject: [PATCH 41/42] rhashtable: fix annotations for
 rht_for_each_entry_rcu()

Call rcu_deference_raw() directly from within rht_for_each_entry_rcu()
as list_for_each_entry_rcu() does.

Fixes the following sparse warnings:
net/netlink/af_netlink.c:2906:25:    expected struct rhash_head const *__mptr
net/netlink/af_netlink.c:2906:25:    got struct rhash_head [noderef] <asn:4>*<noident>

Fixes: e341694e3eb57fc ("netlink: Convert netlink_lookup() to use RCU protected hash table")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index af967c4c7591..36826c0166c5 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -123,11 +123,6 @@ void rhashtable_destroy(const struct rhashtable *ht);
 	typeof(ptr) __ptr = (ptr); \
 	   __ptr ? rht_entry(__ptr, type, member) : NULL; \
 })
-#define rht_entry_safe_rcu(ptr, type, member) \
-({ \
-	typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \
-	__ptr ? container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member) : NULL; \
-})
 
 #define rht_next_entry_safe(pos, ht, member) \
 ({ \
@@ -204,9 +199,10 @@ void rhashtable_destroy(const struct rhashtable *ht);
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_entry_rcu(pos, head, member) \
-	for (pos = rht_entry_safe_rcu(head, typeof(*(pos)), member); \
+	for (pos = rht_entry_safe(rcu_dereference_raw(head), \
+				  typeof(*(pos)), member); \
 	     pos; \
-	     pos = rht_entry_safe_rcu((pos)->member.next, \
-				      typeof(*(pos)), member))
+	     pos = rht_entry_safe(rcu_dereference_raw((pos)->member.next), \
+				  typeof(*(pos)), member))
 
 #endif /* _LINUX_RHASHTABLE_H */

From 9ce12eb16ffb143f3a509da86283ddd0b10bcdb3 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 13 Aug 2014 16:38:32 +0200
Subject: [PATCH 42/42] netlink: Annotate RCU locking for seq_file walker

Silences the following sparse warnings:
net/netlink/af_netlink.c:2926:21: warning: context imbalance in 'netlink_seq_start' - wrong count at exit
net/netlink/af_netlink.c:2972:13: warning: context imbalance in 'netlink_seq_stop' - unexpected unlock

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2e152e5f2186..c416725d28c4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2921,6 +2921,7 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
 {
 	rcu_read_lock();
 	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -2970,6 +2971,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
 {
 	rcu_read_unlock();
 }