From c197facc8ea08062f8f949aade6a33649ee06771 Mon Sep 17 00:00:00 2001 From: "hummerbliss@gmail.com" Date: Mon, 20 Apr 2009 17:12:35 +0200 Subject: [PATCH 01/32] netfilter: bridge: allow fragmentation of VLAN packets traversing a bridge br_nf_dev_queue_xmit only checks for ETH_P_IP packets for fragmenting but not VLAN packets. This results in dropping of large VLAN packets. This can be observed when connection tracking is enabled. Connection tracking re-assembles fragmented packets, and these have to re-fragmented when transmitting out. Also, make sure only refragmented packets are defragmented as per suggestion from Patrick McHardy. Signed-off-by: Saikiran Madugula Signed-off-by: Patrick McHardy --- net/bridge/br_netfilter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3953ac4214c8..e4a418fcb35b 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -788,15 +788,23 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff *skb, return NF_STOLEN; } +#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) static int br_nf_dev_queue_xmit(struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP) && + if (skb->nfct != NULL && + (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) && skb->len > skb->dev->mtu && !skb_is_gso(skb)) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); } +#else +static int br_nf_dev_queue_xmit(struct sk_buff *skb) +{ + return br_dev_queue_push_xmit(skb); +} +#endif /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, From 5ff482940f5aa2cdc3424c4a8ea94b9833b2af5f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 24 Apr 2009 15:37:44 +0200 Subject: [PATCH 02/32] netfilter: nf_ct_dccp/udplite: fix protocol registration error Commit d0dba725 (netfilter: ctnetlink: add callbacks to the per-proto nlattrs) changed the protocol registration function to abort if the to-be registered protocol doesn't provide a new callback function. The DCCP and UDP-Lite IPv6 protocols were missed in this conversion, add the required callback pointer. 
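For context, the registration failure this fixes comes from a consistency check performed when an l4proto is registered: a tracker that fills in to_nlattr without nlattr_size, or tuple_to_nlattr without nlattr_tuple_size, is rejected. The sketch below only illustrates that check; the function name and exact conditions are assumptions rather than a quote of nf_conntrack_proto.c, but the struct fields match the ones touched by this patch.

#include <linux/errno.h>
#include <net/netfilter/nf_conntrack_l4proto.h>

/* Simplified sketch of the registration-time sanity check. */
static int l4proto_nlattr_sanity(const struct nf_conntrack_l4proto *l4proto)
{
	/* Dumping per-protocol state requires advertising its nlattr size. */
	if (l4proto->to_nlattr && !l4proto->nlattr_size)
		return -EINVAL;

	/* Dumping tuples requires advertising the tuple nlattr size. */
	if (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)
		return -EINVAL;

	return 0;
}

Before this change, dccp_proto6 hit the first case (to_nlattr set, nlattr_size missing) and nf_conntrack_l4proto_udplite6 the second, which is consistent with the two one-line additions below.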
Reported-and-tested-by: Steven Jan Springl Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_dccp.c | 1 + net/netfilter/nf_conntrack_proto_udplite.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 50dac8dbe7d8..5411d63f31a5 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -777,6 +777,7 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .print_conntrack = dccp_print_conntrack, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 4614696c1b88..0badedc542d3 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -204,6 +204,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .error = udplite_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif From 4b0706624930dc75c3b0d0df463d89759ef7de29 Mon Sep 17 00:00:00 2001 From: Laszlo Attila Toth Date: Fri, 24 Apr 2009 16:55:25 +0200 Subject: [PATCH 03/32] netfilter: Kconfig: TProxy doesn't depend on NF_CONNTRACK Signed-off-by: Laszlo Attila Toth Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 2329c5f50551..881203c4a142 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -275,6 +275,8 @@ config NF_CT_NETLINK help This option enables support for a netlink-based userspace interface +endif # NF_CONNTRACK + # transparent proxy support config NETFILTER_TPROXY tristate "Transparent proxying support (EXPERIMENTAL)" @@ -290,8 +292,6 @@ config NETFILTER_TPROXY To compile it as a module, choose M here. If unsure, say N. -endif # NF_CONNTRACK - config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n From 71951b64a5a87c09eb6fde59ce51aaab2fdaeab2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2009 16:58:41 +0200 Subject: [PATCH 04/32] netfilter: nf_ct_dccp: add missing role attributes for DCCP This patch adds missing role attribute to the DCCP type, otherwise the creation of entries is not of any use. The attribute added is CTA_PROTOINFO_DCCP_ROLE which contains the role of the conntrack original tuple. 
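As a usage illustration, a creation request from userspace now has somewhere to carry the role: it goes into the CTA_PROTOINFO / CTA_PROTOINFO_DCCP nest next to the state. The fragment below is a hypothetical sketch using libmnl; the helper name is an assumption and the role value passed in would have to match the kernel's internal CT_DCCP_ROLE_* numbering, only the attribute constants come from the header extended by this patch.

#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Append CTA_PROTOINFO > CTA_PROTOINFO_DCCP > {STATE, ROLE} to a
 * partially built ctnetlink "new conntrack" request. */
static void put_dccp_protoinfo(struct nlmsghdr *nlh, uint8_t state, uint8_t role)
{
	struct nlattr *pi   = mnl_attr_nest_start(nlh, CTA_PROTOINFO);
	struct nlattr *dccp = mnl_attr_nest_start(nlh, CTA_PROTOINFO_DCCP);

	mnl_attr_put_u8(nlh, CTA_PROTOINFO_DCCP_STATE, state);
	mnl_attr_put_u8(nlh, CTA_PROTOINFO_DCCP_ROLE, role); /* role of the original tuple */

	mnl_attr_nest_end(nlh, dccp);
	mnl_attr_nest_end(nlh, pi);
}

On the kernel side, nlattr_to_dccp() (below) derives the reply-direction role from this single original-direction value.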
Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_proto_dccp.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 29fe9ea1d346..1a865e48b8eb 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -100,6 +100,7 @@ enum ctattr_protoinfo_tcp { enum ctattr_protoinfo_dccp { CTA_PROTOINFO_DCCP_UNSPEC, CTA_PROTOINFO_DCCP_STATE, + CTA_PROTOINFO_DCCP_ROLE, __CTA_PROTOINFO_DCCP_MAX, }; #define CTA_PROTOINFO_DCCP_MAX (__CTA_PROTOINFO_DCCP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5411d63f31a5..8e757dd53396 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -633,6 +633,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); nla_nest_end(skb, nest_parms); read_unlock_bh(&dccp_lock); return 0; @@ -644,6 +646,7 @@ nla_put_failure: static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) @@ -661,11 +664,21 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) return err; if (!tb[CTA_PROTOINFO_DCCP_STATE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { return -EINVAL; + } write_lock_bh(&dccp_lock); ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } write_unlock_bh(&dccp_lock); return 0; } From 37e55cf0ceb8803256bf69a3e45bd668bf90b76f Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 24 Apr 2009 17:05:21 +0200 Subject: [PATCH 05/32] netfilter: xt_recent: fix stack overread in compat code Related-to: commit 325fb5b4d26038cba665dd0d8ee09555321061f0 The compat path suffers from a similar problem. It only uses a __be32 when all of the recent code uses, and expects, an nf_inet_addr everywhere. As a result, addresses stored by xt_recents were filled with whatever other stuff was on the stack following the be32. Signed-off-by: Jan Engelhardt With a minor compile fix from Roman. 
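The overread is easy to reproduce outside the kernel. A minimal user-space sketch of the same mistake, with simplified stand-in types rather than the kernel's union nf_inet_addr (what the garbage bytes contain depends on the compiler's stack layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

union inet_addr {                       /* stand-in for union nf_inet_addr */
	uint32_t ip;                    /* IPv4 */
	uint32_t ip6[4];                /* IPv6 */
};

static union inet_addr stored;

static void entry_init(const union inet_addr *addr)
{
	stored = *addr;                 /* copies all 16 bytes */
}

static void buggy_caller(void)          /* mirrors the old compat path */
{
	uint32_t addr = 0x0100007f;     /* only 4 bytes live on the stack */

	entry_init((const union inet_addr *)&addr); /* undefined: reads 12 bytes past addr */
}

static void fixed_caller(void)          /* mirrors the patched code */
{
	union inet_addr addr;

	memset(&addr, 0, sizeof(addr)); /* full-size, zeroed object */
	addr.ip = 0x0100007f;
	entry_init(&addr);
}

int main(void)
{
	buggy_caller();
	printf("buggy ip6[1..3]: %08x %08x %08x (stack garbage)\n",
	       stored.ip6[1], stored.ip6[2], stored.ip6[3]);
	fixed_caller();
	printf("fixed ip6[1..3]: %08x %08x %08x\n",
	       stored.ip6[1], stored.ip6[2], stored.ip6[3]);
	return 0;
}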
Reported-and-tested-by: Roman Hoog Antink Signed-off-by: Patrick McHardy --- net/netfilter/xt_recent.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 791e030ea903..eb0ceb846527 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -474,7 +474,7 @@ static ssize_t recent_old_proc_write(struct file *file, struct recent_table *t = pde->data; struct recent_entry *e; char buf[sizeof("+255.255.255.255")], *c = buf; - __be32 addr; + union nf_inet_addr addr = {}; int add; if (size > sizeof(buf)) @@ -506,14 +506,13 @@ static ssize_t recent_old_proc_write(struct file *file, add = 1; break; } - addr = in_aton(c); + addr.ip = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0); + e = recent_entry_lookup(t, &addr, NFPROTO_IPV4, 0); if (e == NULL) { if (add) - recent_entry_init(t, (const void *)&addr, - NFPROTO_IPV4, 0); + recent_entry_init(t, &addr, NFPROTO_IPV4, 0); } else { if (add) recent_entry_update(t, e); From adc667e84f086aa110d810f3476c494e48eaabaa Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Sat, 25 Apr 2009 18:03:35 -0700 Subject: [PATCH 06/32] vlan: update vlan carrier state for admin up/down Currently, the VLAN event handler does not adjust the VLAN device's carrier state when the real device or the VLAN device is set administratively up or down. The following patch adds a transfer of operating state from the real device to the VLAN device when the real device is administratively set up or down, and sets the carrier state up or down during init, open and close of the VLAN device. This permits observers above the VLAN device that care about the carrier state (bonding's link monitor, for example) to receive updates for administrative changes by more closely mimicing the behavior of real devices. Signed-off-by: Jay Vosburgh Signed-off-by: Patrick McHardy --- net/8021q/vlan.c | 2 ++ net/8021q/vlan_dev.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2b7390e377b3..d1e10546eb85 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -492,6 +492,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, continue; dev_change_flags(vlandev, flgs & ~IFF_UP); + vlan_transfer_operstate(dev, vlandev); } break; @@ -507,6 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, continue; dev_change_flags(vlandev, flgs | IFF_UP); + vlan_transfer_operstate(dev, vlandev); } break; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 6b0921364014..b4b9068e55a7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -462,6 +462,7 @@ static int vlan_dev_open(struct net_device *dev) if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); + netif_carrier_on(dev); return 0; clear_allmulti: @@ -471,6 +472,7 @@ del_unicast: if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN); out: + netif_carrier_off(dev); return err; } @@ -492,6 +494,7 @@ static int vlan_dev_stop(struct net_device *dev) if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len); + netif_carrier_off(dev); return 0; } @@ -612,6 +615,8 @@ static int vlan_dev_init(struct net_device *dev) struct net_device *real_dev = vlan_dev_info(dev)->real_dev; int subclass = 0; + netif_carrier_off(dev); + /* IFF_BROADCAST|IFF_MULTICAST; ??? 
*/ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI); dev->iflink = real_dev->ifindex; From a4233304bb43f87f97fc2ac9143b513814dcf094 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sun, 26 Apr 2009 20:41:34 +0000 Subject: [PATCH 07/32] mlx4_en: Fix cleanup flow on cq activation In case of mlx4_en_activate_cq() failure, the cleanup code would go to rx_err and try to disable unactivated rings. Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 438678ab2a10..7bcc49de1637 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -583,7 +583,7 @@ int mlx4_en_start_port(struct net_device *dev) err = mlx4_en_activate_cq(priv, cq); if (err) { mlx4_err(mdev, "Failed activating Rx CQ\n"); - goto rx_err; + goto cq_err; } for (j = 0; j < cq->size; j++) cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; From 785a0982eaaeae2fbe3372d1c9c769e8156a7a5a Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sun, 26 Apr 2009 20:42:57 +0000 Subject: [PATCH 08/32] mlx4_en: Handle page allocation failure during receive If we failed to allocate new fragments for receive buffer, the packet should be dropped and packets should be reused. Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_rx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index 0cbb78ca7b29..7942c4d3cd88 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -610,6 +610,10 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags, skb_shinfo(skb)->frags, page_alloc, length); + if (unlikely(!used_frags)) { + kfree_skb(skb); + return NULL; + } skb_shinfo(skb)->nr_frags = used_frags; /* Copy headers into the skb linear buffer */ From c759a6b4e1cae6aff71f58c9c85404ebcd81b6e0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 27 Apr 2009 02:36:20 -0700 Subject: [PATCH 09/32] net: Fix LL_MAX_HEADER for CONFIG_TR_MODULE Unless I miss anything this should fix a bug. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7783f4a755..453be9a674c0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -104,7 +104,7 @@ struct wireless_dev; # else # define LL_MAX_HEADER 96 # endif -#elif defined(CONFIG_TR) +#elif defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) # define LL_MAX_HEADER 48 #else # define LL_MAX_HEADER 32 From 8f955d7f042e4ac44891a400d5000928f8db9f58 Mon Sep 17 00:00:00 2001 From: Ayaz Abdulla Date: Sat, 25 Apr 2009 09:17:56 +0000 Subject: [PATCH 10/32] forcedeth: tx timeout fix This patch fixes the tx_timeout() to properly handle the clean up of the tx ring. It also sets the tx put pointer back to the correct position to be in sync with HW. Signed-off-by: Ayaz Abdulla Signed-off-by: David S. 
Miller --- drivers/net/forcedeth.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 11d5db16ed9c..f9a846b1b92f 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1880,6 +1880,7 @@ static void nv_init_tx(struct net_device *dev) np->tx_pkts_in_progress = 0; np->tx_change_owner = NULL; np->tx_end_flip = NULL; + np->tx_stop = 0; for (i = 0; i < np->tx_ring_size; i++) { if (!nv_optimized(np)) { @@ -2530,6 +2531,8 @@ static void nv_tx_timeout(struct net_device *dev) struct fe_priv *np = netdev_priv(dev); u8 __iomem *base = get_hwbase(dev); u32 status; + union ring_type put_tx; + int saved_tx_limit; if (np->msi_flags & NV_MSI_X_ENABLED) status = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK; @@ -2589,24 +2592,32 @@ static void nv_tx_timeout(struct net_device *dev) /* 1) stop tx engine */ nv_stop_tx(dev); - /* 2) check that the packets were not sent already: */ + /* 2) complete any outstanding tx and do not give HW any limited tx pkts */ + saved_tx_limit = np->tx_limit; + np->tx_limit = 0; /* prevent giving HW any limited pkts */ + np->tx_stop = 0; /* prevent waking tx queue */ if (!nv_optimized(np)) nv_tx_done(dev, np->tx_ring_size); else nv_tx_done_optimized(dev, np->tx_ring_size); - /* 3) if there are dead entries: clear everything */ - if (np->get_tx_ctx != np->put_tx_ctx) { - printk(KERN_DEBUG "%s: tx_timeout: dead entries!\n", dev->name); - nv_drain_tx(dev); - nv_init_tx(dev); - setup_hw_rings(dev, NV_SETUP_TX_RING); - } + /* save current HW postion */ + if (np->tx_change_owner) + put_tx.ex = np->tx_change_owner->first_tx_desc; + else + put_tx = np->put_tx; - netif_wake_queue(dev); + /* 3) clear all tx state */ + nv_drain_tx(dev); + nv_init_tx(dev); - /* 4) restart tx engine */ + /* 4) restore state to current HW position */ + np->get_tx = np->put_tx = put_tx; + np->tx_limit = saved_tx_limit; + + /* 5) restart tx engine */ nv_start_tx(dev); + netif_wake_queue(dev); spin_unlock_irq(&np->lock); } From 6a783c9067e3f71aac61a9262fe42c1f68efd4fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 27 Apr 2009 02:58:59 -0700 Subject: [PATCH 11/32] xfrm: wrong hash value for temporary SA When kernel inserts a temporary SA for IKE, it uses the wrong hash value for dst list. Two hash values were calcultated before: one with source address and one with a wildcard source address. Bug hinted by Junwei Zhang Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/xfrm/xfrm_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 82271720d970..5f1f86565f16 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -794,7 +794,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); - unsigned int h; + unsigned int h, h_wildcard; struct hlist_node *entry; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; @@ -819,8 +819,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, if (best) goto found; - h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { if (x->props.family == family && x->props.reqid == tmpl->reqid && !(x->props.flags & XFRM_STATE_WILDRECV) && From ae0e8e82205c903978a79ebf5e31c670b61fa5b4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 27 Apr 2009 03:04:58 -0700 Subject: [PATCH 12/32] veth: prevent oops caused by netdev destructor From: Stephen Hemminger The veth driver will oops if sysfs hooks are open while module is removed. The net device destructor can not point to code in a module; basically there are only two possible safe values: NULL - no destructor, or free_netdev - free on last use Signed-off-by: David S. Miller --- drivers/net/veth.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 015db1cece72..8e56fcf0a0e3 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -210,14 +210,11 @@ rx_drop: static struct net_device_stats *veth_get_stats(struct net_device *dev) { - struct veth_priv *priv; - struct net_device_stats *dev_stats; - int cpu; + struct veth_priv *priv = netdev_priv(dev); + struct net_device_stats *dev_stats = &dev->stats; + unsigned int cpu; struct veth_net_stats *stats; - priv = netdev_priv(dev); - dev_stats = &dev->stats; - dev_stats->rx_packets = 0; dev_stats->tx_packets = 0; dev_stats->rx_bytes = 0; @@ -225,16 +222,17 @@ static struct net_device_stats *veth_get_stats(struct net_device *dev) dev_stats->tx_dropped = 0; dev_stats->rx_dropped = 0; - for_each_online_cpu(cpu) { - stats = per_cpu_ptr(priv->stats, cpu); + if (priv->stats) + for_each_online_cpu(cpu) { + stats = per_cpu_ptr(priv->stats, cpu); - dev_stats->rx_packets += stats->rx_packets; - dev_stats->tx_packets += stats->tx_packets; - dev_stats->rx_bytes += stats->rx_bytes; - dev_stats->tx_bytes += stats->tx_bytes; - dev_stats->tx_dropped += stats->tx_dropped; - dev_stats->rx_dropped += stats->rx_dropped; - } + dev_stats->rx_packets += stats->rx_packets; + dev_stats->tx_packets += stats->tx_packets; + dev_stats->rx_bytes += stats->rx_bytes; + dev_stats->tx_bytes += stats->tx_bytes; + dev_stats->tx_dropped += stats->tx_dropped; + dev_stats->rx_dropped += stats->rx_dropped; + } return dev_stats; } @@ -261,6 +259,8 @@ static int veth_close(struct net_device *dev) netif_carrier_off(dev); netif_carrier_off(priv->peer); + free_percpu(priv->stats); + priv->stats = NULL; return 0; } @@ -291,15 +291,6 @@ static int veth_dev_init(struct net_device *dev) return 0; } -static void veth_dev_free(struct net_device *dev) -{ - struct veth_priv *priv; - - priv = netdev_priv(dev); - free_percpu(priv->stats); - 
free_netdev(dev); -} - static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -317,7 +308,7 @@ static void veth_setup(struct net_device *dev) dev->netdev_ops = &veth_netdev_ops; dev->ethtool_ops = &veth_ethtool_ops; dev->features |= NETIF_F_LLTX; - dev->destructor = veth_dev_free; + dev->destructor = free_netdev; } /* From 495dce123ceabbae035552437fcaa0f69247ff08 Mon Sep 17 00:00:00 2001 From: "Waskiewicz Jr, Peter P" Date: Thu, 23 Apr 2009 11:15:18 +0000 Subject: [PATCH 13/32] ixgbe: Fix WoL functionality for 82599 KX4 devices The current code writes the PME enabled bit in PCI config space which is wrong. This was needed for pre-release hardware, and was not removed from the driver. Also, we need to clear the WUS (wake up status) after we resume. Otherwise we can't wake for the same event again since it's still asserted in the hardware. Plus, the multicast lists were being written improperly, causing multicast WoL to fail. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ixgbe/ixgbe_common.c | 51 ++------------------------------ drivers/net/ixgbe/ixgbe_main.c | 10 +++---- 2 files changed, 6 insertions(+), 55 deletions(-) diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c index 5567519676d5..186a65069b33 100644 --- a/drivers/net/ixgbe/ixgbe_common.c +++ b/drivers/net/ixgbe/ixgbe_common.c @@ -50,7 +50,6 @@ static u16 ixgbe_calc_eeprom_checksum(struct ixgbe_hw *hw); static void ixgbe_enable_rar(struct ixgbe_hw *hw, u32 index); static void ixgbe_disable_rar(struct ixgbe_hw *hw, u32 index); static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr); -static void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr); static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); /** @@ -1377,8 +1376,7 @@ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list, * Clear accounting of old secondary address list, * don't count RAR[0] */ - uc_addr_in_use = hw->addr_ctrl.rar_used_count - - hw->addr_ctrl.mc_addr_in_rar_count - 1; + uc_addr_in_use = hw->addr_ctrl.rar_used_count - 1; hw->addr_ctrl.rar_used_count -= uc_addr_in_use; hw->addr_ctrl.overflow_promisc = 0; @@ -1492,40 +1490,6 @@ static void ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr) IXGBE_WRITE_REG(hw, IXGBE_MTA(vector_reg), mta_reg); } -/** - * ixgbe_add_mc_addr - Adds a multicast address. - * @hw: pointer to hardware structure - * @mc_addr: new multicast address - * - * Adds it to unused receive address register or to the multicast table. 
- **/ -static void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr) -{ - u32 rar_entries = hw->mac.num_rar_entries; - u32 rar; - - hw_dbg(hw, " MC Addr =%.2X %.2X %.2X %.2X %.2X %.2X\n", - mc_addr[0], mc_addr[1], mc_addr[2], - mc_addr[3], mc_addr[4], mc_addr[5]); - - /* - * Place this multicast address in the RAR if there is room, - * else put it in the MTA - */ - if (hw->addr_ctrl.rar_used_count < rar_entries) { - /* use RAR from the end up for multicast */ - rar = rar_entries - hw->addr_ctrl.mc_addr_in_rar_count - 1; - hw->mac.ops.set_rar(hw, rar, mc_addr, 0, IXGBE_RAH_AV); - hw_dbg(hw, "Added a multicast address to RAR[%d]\n", rar); - hw->addr_ctrl.rar_used_count++; - hw->addr_ctrl.mc_addr_in_rar_count++; - } else { - ixgbe_set_mta(hw, mc_addr); - } - - hw_dbg(hw, "ixgbe_add_mc_addr Complete\n"); -} - /** * ixgbe_update_mc_addr_list_generic - Updates MAC list of multicast addresses * @hw: pointer to hardware structure @@ -1542,7 +1506,6 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr next) { u32 i; - u32 rar_entries = hw->mac.num_rar_entries; u32 vmdq; /* @@ -1550,18 +1513,8 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, * use. */ hw->addr_ctrl.num_mc_addrs = mc_addr_count; - hw->addr_ctrl.rar_used_count -= hw->addr_ctrl.mc_addr_in_rar_count; - hw->addr_ctrl.mc_addr_in_rar_count = 0; hw->addr_ctrl.mta_in_use = 0; - /* Zero out the other receive addresses. */ - hw_dbg(hw, "Clearing RAR[%d-%d]\n", hw->addr_ctrl.rar_used_count, - rar_entries - 1); - for (i = hw->addr_ctrl.rar_used_count; i < rar_entries; i++) { - IXGBE_WRITE_REG(hw, IXGBE_RAL(i), 0); - IXGBE_WRITE_REG(hw, IXGBE_RAH(i), 0); - } - /* Clear the MTA */ hw_dbg(hw, " Clearing MTA\n"); for (i = 0; i < hw->mac.mcft_size; i++) @@ -1570,7 +1523,7 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, /* Add the new addresses */ for (i = 0; i < mc_addr_count; i++) { hw_dbg(hw, " Adding the multicast addresses:\n"); - ixgbe_add_mc_addr(hw, next(hw, &mc_addr_list, &vmdq)); + ixgbe_set_mta(hw, next(hw, &mc_addr_list, &vmdq)); } /* Enable mta */ diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 01884256f4c9..07e778d3e5d2 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -3646,6 +3646,8 @@ static int ixgbe_resume(struct pci_dev *pdev) ixgbe_reset(adapter); + IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0); + if (netif_running(netdev)) { err = ixgbe_open(adapter->netdev); if (err) @@ -4575,7 +4577,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data]; static int cards_found; int i, err, pci_using_dac; - u16 pm_value = 0; u32 part_num, eec; err = pci_enable_device(pdev); @@ -4763,11 +4764,8 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, switch (pdev->device) { case IXGBE_DEV_ID_82599_KX4: -#define IXGBE_PCIE_PMCSR 0x44 - adapter->wol = IXGBE_WUFC_MAG; - pci_read_config_word(pdev, IXGBE_PCIE_PMCSR, &pm_value); - pci_write_config_word(pdev, IXGBE_PCIE_PMCSR, - (pm_value | (1 << 8))); + adapter->wol = (IXGBE_WUFC_MAG | IXGBE_WUFC_EX | + IXGBE_WUFC_MC | IXGBE_WUFC_BC); break; default: adapter->wol = 0; From 7ced70c47f68ad672f50781de5adc6d41e6d2866 Mon Sep 17 00:00:00 2001 From: Tilman Schmidt Date: Thu, 23 Apr 2009 02:24:21 +0000 Subject: [PATCH 14/32] update Documentation/isdn/00-INDEX After the merging of mISDN, state which files refer only to the old isdn4linux subsystem. 
Also add a few missing files. Signed-off-by: Tilman Schmidt Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- Documentation/isdn/00-INDEX | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX index 9fee5f2e5c62..d9660ee5ab27 100644 --- a/Documentation/isdn/00-INDEX +++ b/Documentation/isdn/00-INDEX @@ -2,8 +2,12 @@ - this file (info on ISDN implementation for Linux) CREDITS - list of the kind folks that brought you this stuff. +HiSax.cert + - information about the ITU approval certification of the HiSax driver. INTERFACE - - description of Linklevel and Hardwarelevel ISDN interface. + - description of isdn4linux Link Level and Hardware Level interfaces. +INTERFACE.fax + - description of the fax subinterface of isdn4linux. README - general info on what you need and what to do for Linux ISDN. README.FAQ @@ -12,6 +16,8 @@ README.audio - info for running audio over ISDN. README.fax - info for using Fax over ISDN. +README.gigaset + - info on the drivers for Siemens Gigaset ISDN adapters. README.icn - info on the ICN-ISDN-card and its driver. README.HiSax @@ -37,7 +43,8 @@ README.diversion README.sc - info on driver for Spellcaster cards. README.x25 - _ info for running X.25 over ISDN. + - info for running X.25 over ISDN. README.hysdn - - info on driver for Hypercope active HYSDN cards - + - info on driver for Hypercope active HYSDN cards +README.mISDN + - info on the Modular ISDN subsystem (mISDN). From 554f200e22a13e19bd407d0037e41be0ec8a0a2e Mon Sep 17 00:00:00 2001 From: Tilman Schmidt Date: Thu, 23 Apr 2009 02:24:21 +0000 Subject: [PATCH 15/32] Documentation/isdn/INTERFACE.CAPI isdn: document Kernel CAPI driver interface Create a file Documentation/isdn/INTERFACE.CAPI describing the interface between the kernel CAPI subsystem and ISDN device drivers, analogous to the existing Documentation/isdn/INTERFACE for the old isdn4linux subsystem. Also add kerneldoc comments to the exported functions in drivers/isdn/capi/kcapi.c. Impact: Documentation Signed-off-by: Tilman Schmidt Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- Documentation/isdn/00-INDEX | 2 + Documentation/isdn/INTERFACE.CAPI | 207 ++++++++++++++++++++++++++++++ drivers/isdn/capi/kcapi.c | 171 ++++++++++++++++++++++++ 3 files changed, 380 insertions(+) create mode 100644 Documentation/isdn/INTERFACE.CAPI diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX index d9660ee5ab27..5a2d69989a8c 100644 --- a/Documentation/isdn/00-INDEX +++ b/Documentation/isdn/00-INDEX @@ -8,6 +8,8 @@ INTERFACE - description of isdn4linux Link Level and Hardware Level interfaces. INTERFACE.fax - description of the fax subinterface of isdn4linux. +INTERFACE.CAPI + - description of kernel CAPI Link Level to Hardware Level interface. README - general info on what you need and what to do for Linux ISDN. README.FAQ diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI new file mode 100644 index 000000000000..8947ffcda16e --- /dev/null +++ b/Documentation/isdn/INTERFACE.CAPI @@ -0,0 +1,207 @@ +Kernel CAPI Interface to Hardware Drivers +----------------------------------------- + +1. Overview + +Kernel CAPI operates as a dispatching layer between CAPI applications and CAPI +hardware drivers. Hardware drivers register ISDN devices (controllers, in CAPI +lingo) with Kernel CAPI to indicate their readiness to provide their service +to CAPI applications. 
CAPI applications also register with Kernel CAPI, +requesting association with a CAPI device. Kernel CAPI then dispatches the +application registration to an available device, forwarding it to the +corresponding hardware driver. Kernel CAPI then forwards CAPI messages in both +directions between the application and the hardware driver. + + +2. Driver and Device Registration + +CAPI drivers optionally register themselves with Kernel CAPI by calling the +Kernel CAPI function register_capi_driver() with a pointer to a struct +capi_driver. This structure must be filled with the name and revision of the +driver, and optionally a pointer to a callback function, add_card(). The +registration can be revoked by calling the function unregister_capi_driver() +with a pointer to the same struct capi_driver. + +CAPI drivers must register each of the ISDN devices they control with Kernel +CAPI by calling the Kernel CAPI function attach_capi_ctr() with a pointer to a +struct capi_ctr before they can be used. This structure must be filled with +the names of the driver and controller, and a number of callback function +pointers which are subsequently used by Kernel CAPI for communicating with the +driver. The registration can be revoked by calling the function +detach_capi_ctr() with a pointer to the same struct capi_ctr. + +Before the device can be actually used, the driver must fill in the device +information fields 'manu', 'version', 'profile' and 'serial' in the capi_ctr +structure of the device, and signal its readiness by calling capi_ctr_ready(). +From then on, Kernel CAPI may call the registered callback functions for the +device. + +If the device becomes unusable for any reason (shutdown, disconnect ...), the +driver has to call capi_ctr_reseted(). This will prevent further calls to the +callback functions by Kernel CAPI. + + +3. Application Registration and Communication + +Kernel CAPI forwards registration requests from applications (calls to CAPI +operation CAPI_REGISTER) to an appropriate hardware driver by calling its +register_appl() callback function. A unique Application ID (ApplID, u16) is +allocated by Kernel CAPI and passed to register_appl() along with the +parameter structure provided by the application. This is analogous to the +open() operation on regular files or character devices. + +After a successful return from register_appl(), CAPI messages from the +application may be passed to the driver for the device via calls to the +send_message() callback function. The CAPI message to send is stored in the +data portion of a skb. Conversely, the driver may call Kernel CAPI's +capi_ctr_handle_message() function to pass a received CAPI message to Kernel +CAPI for forwarding to an application, specifying its ApplID. + +Format and semantics of CAPI messages are specified in the CAPI 2.0 standard. + +Deregistration requests (CAPI operation CAPI_RELEASE) from applications are +forwarded as calls to the release_appl() callback function, passing the same +ApplID as with register_appl(). After return from release_appl(), no CAPI +messages for that application may be passed to or from the device anymore. + + +4. Data Structures + +4.1 struct capi_driver + +This structure describes a Kernel CAPI driver itself. 
It is used in the +register_capi_driver() and unregister_capi_driver() functions, and contains +the following non-private fields, all to be set by the driver before calling +register_capi_driver(): + +char name[32] + the name of the driver, as a zero terminated ASCII string +char revision[32] + the revision number of the driver, as a zero terminated ASCII string +int (*add_card)(struct capi_driver *driver, capicardparams *data) + a callback function pointer (may be NULL) + + +4.2 struct capi_ctr + +This structure describes an ISDN device (controller) handled by a Kernel CAPI +driver. After registration via the attach_capi_ctr() function it is passed to +all controller specific lower layer interface and callback functions to +identify the controller to operate on. + +It contains the following non-private fields: + +- to be set by the driver before calling attach_capi_ctr(): + +struct module *owner + pointer to the driver module owning the device + +void *driverdata + an opaque pointer to driver specific data, not touched by Kernel CAPI + +char name[32] + the name of the controller, as a zero terminated ASCII string + +char *driver_name + the name of the driver, as a zero terminated ASCII string + +int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata) + (optional) pointer to a callback function for sending firmware and + configuration data to the device + +void (*reset_ctr)(struct capi_ctr *ctrlr) + pointer to a callback function for performing a reset on the device, + releasing all registered applications + +void (*register_appl)(struct capi_ctr *ctrlr, u16 applid, + capi_register_params *rparam) +void (*release_appl)(struct capi_ctr *ctrlr, u16 applid) + pointers to callback functions for registration and deregistration of + applications with the device + +u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb) + pointer to a callback function for sending a CAPI message to the + device + +char *(*procinfo)(struct capi_ctr *ctrlr) + pointer to a callback function returning the entry for the device in + the CAPI controller info table, /proc/capi/controller + +read_proc_t *ctr_read_proc + pointer to the read_proc callback function for the device's proc file + system entry, /proc/capi/controllers/; will be called with a + pointer to the device's capi_ctr structure as the last (data) argument + +- to be filled in before calling capi_ctr_ready(): + +u8 manu[CAPI_MANUFACTURER_LEN] + value to return for CAPI_GET_MANUFACTURER + +capi_version version + value to return for CAPI_GET_VERSION + +capi_profile profile + value to return for CAPI_GET_PROFILE + +u8 serial[CAPI_SERIAL_LEN] + value to return for CAPI_GET_SERIAL + + +5. Lower Layer Interface Functions + +(declared in ) + +void register_capi_driver(struct capi_driver *drvr) +void unregister_capi_driver(struct capi_driver *drvr) + register/unregister a driver with Kernel CAPI + +int attach_capi_ctr(struct capi_ctr *ctrlr) +int detach_capi_ctr(struct capi_ctr *ctrlr) + register/unregister a device (controller) with Kernel CAPI + +void capi_ctr_ready(struct capi_ctr *ctrlr) +void capi_ctr_reseted(struct capi_ctr *ctrlr) + signal controller ready/not ready + +void capi_ctr_suspend_output(struct capi_ctr *ctrlr) +void capi_ctr_resume_output(struct capi_ctr *ctrlr) + signal suspend/resume + +void capi_ctr_handle_message(struct capi_ctr * ctrlr, u16 applid, + struct sk_buff *skb) + pass a received CAPI message to Kernel CAPI + for forwarding to the specified application + + +6. 
Helper Functions and Macros + +Library functions (from ): + +void capilib_new_ncci(struct list_head *head, u16 applid, + u32 ncci, u32 winsize) +void capilib_free_ncci(struct list_head *head, u16 applid, u32 ncci) +void capilib_release_appl(struct list_head *head, u16 applid) +void capilib_release(struct list_head *head) +void capilib_data_b3_conf(struct list_head *head, u16 applid, + u32 ncci, u16 msgid) +u16 capilib_data_b3_req(struct list_head *head, u16 applid, + u32 ncci, u16 msgid) + + +Macros to extract/set element values from/in a CAPI message header +(from ): + +Get Macro Set Macro Element (Type) + +CAPIMSG_LEN(m) CAPIMSG_SETLEN(m, len) Total Length (u16) +CAPIMSG_APPID(m) CAPIMSG_SETAPPID(m, applid) ApplID (u16) +CAPIMSG_COMMAND(m) CAPIMSG_SETCOMMAND(m,cmd) Command (u8) +CAPIMSG_SUBCOMMAND(m) CAPIMSG_SETSUBCOMMAND(m, cmd) Subcommand (u8) +CAPIMSG_CMD(m) - Command*256 + + Subcommand (u16) +CAPIMSG_MSGID(m) CAPIMSG_SETMSGID(m, msgid) Message Number (u16) + +CAPIMSG_CONTROL(m) CAPIMSG_SETCONTROL(m, contr) Controller/PLCI/NCCI + (u32) +CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16) + diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 5360c4fd4739..f33170368cd1 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -270,6 +270,15 @@ static void recv_handler(struct work_struct *work) mutex_unlock(&ap->recv_mtx); } +/** + * capi_ctr_handle_message() - handle incoming CAPI message + * @card: controller descriptor structure. + * @appl: application ID. + * @skb: message. + * + * Called by hardware driver to pass a CAPI message to the application. + */ + void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb) { struct capi20_appl *ap; @@ -348,6 +357,13 @@ error: EXPORT_SYMBOL(capi_ctr_handle_message); +/** + * capi_ctr_ready() - signal CAPI controller ready + * @card: controller descriptor structure. + * + * Called by hardware driver to signal that the controller is up and running. + */ + void capi_ctr_ready(struct capi_ctr * card) { card->cardstate = CARD_RUNNING; @@ -360,6 +376,14 @@ void capi_ctr_ready(struct capi_ctr * card) EXPORT_SYMBOL(capi_ctr_ready); +/** + * capi_ctr_reseted() - signal CAPI controller reset + * @card: controller descriptor structure. + * + * Called by hardware driver to signal that the controller is down and + * unavailable for use. + */ + void capi_ctr_reseted(struct capi_ctr * card) { u16 appl; @@ -391,6 +415,13 @@ void capi_ctr_reseted(struct capi_ctr * card) EXPORT_SYMBOL(capi_ctr_reseted); +/** + * capi_ctr_suspend_output() - suspend controller + * @card: controller descriptor structure. + * + * Called by hardware driver to stop data flow. + */ + void capi_ctr_suspend_output(struct capi_ctr *card) { if (!card->blocked) { @@ -401,6 +432,13 @@ void capi_ctr_suspend_output(struct capi_ctr *card) EXPORT_SYMBOL(capi_ctr_suspend_output); +/** + * capi_ctr_resume_output() - resume controller + * @card: controller descriptor structure. + * + * Called by hardware driver to resume data flow. + */ + void capi_ctr_resume_output(struct capi_ctr *card) { if (card->blocked) { @@ -413,6 +451,14 @@ EXPORT_SYMBOL(capi_ctr_resume_output); /* ------------------------------------------------------------- */ +/** + * attach_capi_ctr() - register CAPI controller + * @card: controller descriptor structure. + * + * Called by hardware driver to register a controller with the CAPI subsystem. 
+ * Return value: 0 on success, error code < 0 on error + */ + int attach_capi_ctr(struct capi_ctr *card) { @@ -459,6 +505,15 @@ attach_capi_ctr(struct capi_ctr *card) EXPORT_SYMBOL(attach_capi_ctr); +/** + * detach_capi_ctr() - unregister CAPI controller + * @card: controller descriptor structure. + * + * Called by hardware driver to remove the registration of a controller + * with the CAPI subsystem. + * Return value: 0 on success, error code < 0 on error + */ + int detach_capi_ctr(struct capi_ctr *card) { if (card->cardstate != CARD_DETECTED) @@ -479,6 +534,13 @@ int detach_capi_ctr(struct capi_ctr *card) EXPORT_SYMBOL(detach_capi_ctr); +/** + * register_capi_driver() - register CAPI driver + * @driver: driver descriptor structure. + * + * Called by hardware driver to register itself with the CAPI subsystem. + */ + void register_capi_driver(struct capi_driver *driver) { unsigned long flags; @@ -490,6 +552,13 @@ void register_capi_driver(struct capi_driver *driver) EXPORT_SYMBOL(register_capi_driver); +/** + * unregister_capi_driver() - unregister CAPI driver + * @driver: driver descriptor structure. + * + * Called by hardware driver to unregister itself from the CAPI subsystem. + */ + void unregister_capi_driver(struct capi_driver *driver) { unsigned long flags; @@ -505,6 +574,13 @@ EXPORT_SYMBOL(unregister_capi_driver); /* -------- CAPI2.0 Interface ---------------------------------- */ /* ------------------------------------------------------------- */ +/** + * capi20_isinstalled() - CAPI 2.0 operation CAPI_INSTALLED + * + * Return value: CAPI result code (CAPI_NOERROR if at least one ISDN controller + * is ready for use, CAPI_REGNOTINSTALLED otherwise) + */ + u16 capi20_isinstalled(void) { int i; @@ -517,6 +593,18 @@ u16 capi20_isinstalled(void) EXPORT_SYMBOL(capi20_isinstalled); +/** + * capi20_register() - CAPI 2.0 operation CAPI_REGISTER + * @ap: CAPI application descriptor structure. + * + * Register an application's presence with CAPI. + * A unique application ID is assigned and stored in @ap->applid. + * After this function returns successfully, the message receive + * callback function @ap->recv_message() may be called at any time + * until capi20_release() has been called for the same @ap. + * Return value: CAPI result code + */ + u16 capi20_register(struct capi20_appl *ap) { int i; @@ -571,6 +659,16 @@ u16 capi20_register(struct capi20_appl *ap) EXPORT_SYMBOL(capi20_register); +/** + * capi20_release() - CAPI 2.0 operation CAPI_RELEASE + * @ap: CAPI application descriptor structure. + * + * Terminate an application's registration with CAPI. + * After this function returns successfully, the message receive + * callback function @ap->recv_message() will no longer be called. + * Return value: CAPI result code + */ + u16 capi20_release(struct capi20_appl *ap) { int i; @@ -603,6 +701,15 @@ u16 capi20_release(struct capi20_appl *ap) EXPORT_SYMBOL(capi20_release); +/** + * capi20_put_message() - CAPI 2.0 operation CAPI_PUT_MESSAGE + * @ap: CAPI application descriptor structure. + * @skb: CAPI message. + * + * Transfer a single message to CAPI. + * Return value: CAPI result code + */ + u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) { struct capi_ctr *card; @@ -668,6 +775,16 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) EXPORT_SYMBOL(capi20_put_message); +/** + * capi20_get_manufacturer() - CAPI 2.0 operation CAPI_GET_MANUFACTURER + * @contr: controller number. + * @buf: result buffer (64 bytes). 
+ * + * Retrieve information about the manufacturer of the specified ISDN controller + * or (for @contr == 0) the driver itself. + * Return value: CAPI result code + */ + u16 capi20_get_manufacturer(u32 contr, u8 *buf) { struct capi_ctr *card; @@ -685,6 +802,16 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) EXPORT_SYMBOL(capi20_get_manufacturer); +/** + * capi20_get_version() - CAPI 2.0 operation CAPI_GET_VERSION + * @contr: controller number. + * @verp: result structure. + * + * Retrieve version information for the specified ISDN controller + * or (for @contr == 0) the driver itself. + * Return value: CAPI result code + */ + u16 capi20_get_version(u32 contr, struct capi_version *verp) { struct capi_ctr *card; @@ -703,6 +830,16 @@ u16 capi20_get_version(u32 contr, struct capi_version *verp) EXPORT_SYMBOL(capi20_get_version); +/** + * capi20_get_serial() - CAPI 2.0 operation CAPI_GET_SERIAL_NUMBER + * @contr: controller number. + * @serial: result buffer (8 bytes). + * + * Retrieve the serial number of the specified ISDN controller + * or (for @contr == 0) the driver itself. + * Return value: CAPI result code + */ + u16 capi20_get_serial(u32 contr, u8 *serial) { struct capi_ctr *card; @@ -721,6 +858,16 @@ u16 capi20_get_serial(u32 contr, u8 *serial) EXPORT_SYMBOL(capi20_get_serial); +/** + * capi20_get_profile() - CAPI 2.0 operation CAPI_GET_PROFILE + * @contr: controller number. + * @profp: result structure. + * + * Retrieve capability information for the specified ISDN controller + * or (for @contr == 0) the number of installed controllers. + * Return value: CAPI result code + */ + u16 capi20_get_profile(u32 contr, struct capi_profile *profp) { struct capi_ctr *card; @@ -903,6 +1050,15 @@ static int old_capi_manufacturer(unsigned int cmd, void __user *data) } #endif +/** + * capi20_manufacturer() - CAPI 2.0 operation CAPI_MANUFACTURER + * @cmd: command. + * @data: parameter. + * + * Perform manufacturer specific command. + * Return value: CAPI result code + */ + int capi20_manufacturer(unsigned int cmd, void __user *data) { struct capi_ctr *card; @@ -981,6 +1137,21 @@ int capi20_manufacturer(unsigned int cmd, void __user *data) EXPORT_SYMBOL(capi20_manufacturer); /* temporary hack */ + +/** + * capi20_set_callback() - set CAPI application notification callback function + * @ap: CAPI application descriptor structure. + * @callback: callback function (NULL to remove). + * + * If not NULL, the callback function will be called to notify the + * application of the addition or removal of a controller. + * The first argument (cmd) will tell whether the controller was added + * (KCI_CONTRUP) or removed (KCI_CONTRDOWN). + * The second argument (contr) will be the controller number. + * For cmd==KCI_CONTRUP the third argument (data) will be a pointer to the + * new controller's capability profile structure. + */ + void capi20_set_callback(struct capi20_appl *ap, void (*callback) (unsigned int cmd, __u32 contr, void *data)) { From 2296e5a0136f7ba64c99f3a48a55a687aa9abcc8 Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Thu, 23 Apr 2009 02:24:21 +0000 Subject: [PATCH 16/32] Add reference to CAPI 2.0 standard Move the entry about CAPI 2.0 to the beginning and add a URL. Incorporate changes suggested by Randy Dunlap, thanks for proofreading. Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- Documentation/isdn/INTERFACE.CAPI | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI index 8947ffcda16e..786d619b36e5 100644 --- a/Documentation/isdn/INTERFACE.CAPI +++ b/Documentation/isdn/INTERFACE.CAPI @@ -3,6 +3,11 @@ Kernel CAPI Interface to Hardware Drivers 1. Overview +From the CAPI 2.0 specification: +COMMON-ISDN-API (CAPI) is an application programming interface standard used +to access ISDN equipment connected to basic rate interfaces (BRI) and primary +rate interfaces (PRI). + Kernel CAPI operates as a dispatching layer between CAPI applications and CAPI hardware drivers. Hardware drivers register ISDN devices (controllers, in CAPI lingo) with Kernel CAPI to indicate their readiness to provide their service @@ -12,6 +17,9 @@ application registration to an available device, forwarding it to the corresponding hardware driver. Kernel CAPI then forwards CAPI messages in both directions between the application and the hardware driver. +Format and semantics of CAPI messages are specified in the CAPI 2.0 standard. +This standard is freely available from http://www.capi.org. + 2. Driver and Device Registration @@ -53,12 +61,10 @@ open() operation on regular files or character devices. After a successful return from register_appl(), CAPI messages from the application may be passed to the driver for the device via calls to the send_message() callback function. The CAPI message to send is stored in the -data portion of a skb. Conversely, the driver may call Kernel CAPI's +data portion of an skb. Conversely, the driver may call Kernel CAPI's capi_ctr_handle_message() function to pass a received CAPI message to Kernel CAPI for forwarding to an application, specifying its ApplID. -Format and semantics of CAPI messages are specified in the CAPI 2.0 standard. - Deregistration requests (CAPI operation CAPI_RELEASE) from applications are forwarded as calls to the release_appl() callback function, passing the same ApplID as with register_appl(). After return from release_appl(), no CAPI @@ -75,9 +81,9 @@ the following non-private fields, all to be set by the driver before calling register_capi_driver(): char name[32] - the name of the driver, as a zero terminated ASCII string + the name of the driver, as a zero-terminated ASCII string char revision[32] - the revision number of the driver, as a zero terminated ASCII string + the revision number of the driver, as a zero-terminated ASCII string int (*add_card)(struct capi_driver *driver, capicardparams *data) a callback function pointer (may be NULL) @@ -100,10 +106,10 @@ void *driverdata an opaque pointer to driver specific data, not touched by Kernel CAPI char name[32] - the name of the controller, as a zero terminated ASCII string + the name of the controller, as a zero-terminated ASCII string char *driver_name - the name of the driver, as a zero terminated ASCII string + the name of the driver, as a zero-terminated ASCII string int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata) (optional) pointer to a callback function for sending firmware and From c9503e0fe052020e0294cd07d0ecd982eb7c9177 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 27 Apr 2009 05:42:24 -0700 Subject: [PATCH 17/32] ipv4: Limit size of route cache hash table Right now we have no upper limit on the size of the route cache hash table. 
On a 128GB POWER6 box it ends up as 32MB: IP route cache hash table entries: 4194304 (order: 9, 33554432 bytes) It would be nice to cap this for memory consumption reasons, but a massive hashtable also causes a significant spike when measuring OS jitter. With a 32MB hashtable and 4 million entries, rt_worker_func is taking 5 ms to complete. On another system with more memory it's taking 14 ms. Even though rt_worker_func does call cond_sched() to limit its impact, in an HPC environment we want to keep all sources of OS jitter to a minimum. With the patch applied we limit the number of entries to 512k which can still be overriden by using the rt_entries boot option: IP route cache hash table entries: 524288 (order: 6, 4194304 bytes) With this patch rt_worker_func now takes 0.460 ms on the same system. Signed-off-by: Anton Blanchard Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c40debe51b38..c4c60e9f068a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3397,7 +3397,7 @@ int __init ip_rt_init(void) 0, &rt_hash_log, &rt_hash_mask, - 0); + rhash_entries ? 0 : 512 * 1024); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); From 37b607c5ac3b7c92a6a3624bb29f1cdcdcf7044a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Apr 2009 05:45:54 -0700 Subject: [PATCH 18/32] net: Fix typo in net_device_ops description. Signed-off-by: Mike Rapoport Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 453be9a674c0..5a96a1a406e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -500,7 +500,7 @@ struct netdev_queue { * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address - * needs to be changed. If not this interface is not defined, the + * needs to be changed. If this interface is not defined, the * mac address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); From bf368e4e70cd4e0f880923c44e95a4273d725ab4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2009 02:24:21 -0700 Subject: [PATCH 19/32] net: Avoid extra wakeups of threads blocked in wait_for_packet() In 2.6.25 we added UDP mem accounting. This unfortunatly added a penalty when a frame is transmitted, since we have at TX completion time to call sock_wfree() to perform necessary memory accounting. This calls sock_def_write_space() and utimately scheduler if any thread is waiting on the socket. Thread(s) waiting for an incoming frame was scheduled, then had to sleep again as event was meaningless. (All threads waiting on a socket are using same sk_sleep anchor) This adds lot of extra wakeups and increases latencies, as noted by Christoph Lameter, and slows down softirq handler. Reference : http://marc.info/?l=linux-netdev&m=124060437012283&w=2 Fortunatly, Davide Libenzi recently added concept of keyed wakeups into kernel, and particularly for sockets (see commit 37e5540b3c9d838eb20f2ca8ea2eb8072271e403 epoll keyed wakeups: make sockets use keyed wakeups) Davide goal was to optimize epoll, but this new wakeup infrastructure can help non epoll users as well, if they care to setup an appropriate handler. 
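Concretely, such a handler is a wait-queue callback that inspects the wakeup key and bails out early for events the sleeper does not care about, so the scheduler is never involved for them. A generic, hedged sketch of the pattern follows (identifiers are illustrative, not taken from the patch; DEFINE_WAIT_FUNC() is the helper added below):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/poll.h>

/* Only wake up for input or error events; ignore keyed wakeups for output. */
static int input_wake_function(wait_queue_t *wait, unsigned mode, int sync,
			       void *key)
{
	unsigned long events = (unsigned long)key;

	if (events && !(events & (POLLIN | POLLERR)))
		return 0;               /* uninteresting event: stay asleep */
	return autoremove_wake_function(wait, mode, sync, key);
}

static long wait_for_input(wait_queue_head_t *wq, int *ready, long timeo)
{
	DEFINE_WAIT_FUNC(wait, input_wake_function);

	prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
	if (!*ready)                    /* re-check the condition after queueing */
		timeo = schedule_timeout(timeo);
	finish_wait(wq, &wait);
	return timeo;
}

The wait_for_packet() change below is exactly this substitution: DEFINE_WAIT() becomes DEFINE_WAIT_FUNC(wait, receiver_wake_function), and the new callback filters on POLLIN | POLLERR.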
This patch introduces new DEFINE_WAIT_FUNC() helper and uses it in wait_for_packet(), so that only relevant event can wakeup a thread blocked in this function. Trace of function calls from bnx2 TX completion bnx2_poll_work() is : __kfree_skb() skb_release_head_state() sock_wfree() sock_def_write_space() __wake_up_sync_key() __wake_up_common() receiver_wake_function() : Stops here since thread is waiting for an INPUT Reported-by: Christoph Lameter Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/wait.h | 6 ++++-- net/core/datagram.c | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d631c17eaee..bc024632f365 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -440,13 +440,15 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -#define DEFINE_WAIT(name) \ +#define DEFINE_WAIT_FUNC(name, function) \ wait_queue_t name = { \ .private = current, \ - .func = autoremove_wake_function, \ + .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } +#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) + #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378d..b01a76abe1d2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} /* * Wait for a packet.. */ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); From f3784d834c71689336fa272df420b45345cb6b84 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 23 Apr 2009 14:50:54 +0300 Subject: [PATCH 20/32] Bluetooth: Ensure that HCI sysfs add/del is preempt safe Use a different work_struct variables for add_conn() and del_conn() and use single work queue instead of two for adding and deleting connections. 
It eliminates the following error on a preemptible kernel: [ 204.358032] Unable to handle kernel NULL pointer dereference at virtual address 0000000c [ 204.370697] pgd = c0004000 [ 204.373443] [0000000c] *pgd=00000000 [ 204.378601] Internal error: Oops: 17 [#1] PREEMPT [ 204.383361] Modules linked in: vfat fat rfcomm sco l2cap sd_mod scsi_mod iphb pvr2d drm omaplfb ps [ 204.438537] CPU: 0 Not tainted (2.6.28-maemo2 #1) [ 204.443664] PC is at klist_put+0x2c/0xb4 [ 204.447601] LR is at klist_put+0x18/0xb4 [ 204.451568] pc : [] lr : [] psr: a0000113 [ 204.451568] sp : cf1b3f10 ip : cf1b3f10 fp : cf1b3f2c [ 204.463104] r10: 00000000 r9 : 00000000 r8 : bf08029c [ 204.468353] r7 : c7869200 r6 : cfbe2690 r5 : c78692c8 r4 : 00000001 [ 204.474945] r3 : 00000001 r2 : cf1b2000 r1 : 00000001 r0 : 00000000 [ 204.481506] Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment kernel [ 204.488861] Control: 10c5387d Table: 887fc018 DAC: 00000017 [ 204.494628] Process btdelconn (pid: 515, stack limit = 0xcf1b22e0) Signed-off-by: Roger Quadros Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 ++- net/bluetooth/hci_sysfs.c | 37 ++++++++++++++------------------ 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 01f9316b4c23..1224bba24bdd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -180,7 +180,8 @@ struct hci_conn { struct timer_list disc_timer; struct timer_list idle_timer; - struct work_struct work; + struct work_struct work_add; + struct work_struct work_del; struct device dev; diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index ed82796d4a0f..b7c51082ddeb 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -9,8 +9,7 @@ struct class *bt_class = NULL; EXPORT_SYMBOL_GPL(bt_class); -static struct workqueue_struct *btaddconn; -static struct workqueue_struct *btdelconn; +static struct workqueue_struct *bluetooth; static inline char *link_typetostr(int type) { @@ -88,9 +87,10 @@ static struct device_type bt_link = { static void add_conn(struct work_struct *work) { - struct hci_conn *conn = container_of(work, struct hci_conn, work); + struct hci_conn *conn = container_of(work, struct hci_conn, work_add); - flush_workqueue(btdelconn); + /* ensure previous add/del is complete */ + flush_workqueue(bluetooth); if (device_add(&conn->dev) < 0) { BT_ERR("Failed to register connection device"); @@ -114,9 +114,9 @@ void hci_conn_add_sysfs(struct hci_conn *conn) device_initialize(&conn->dev); - INIT_WORK(&conn->work, add_conn); + INIT_WORK(&conn->work_add, add_conn); - queue_work(btaddconn, &conn->work); + queue_work(bluetooth, &conn->work_add); } /* @@ -131,9 +131,12 @@ static int __match_tty(struct device *dev, void *data) static void del_conn(struct work_struct *work) { - struct hci_conn *conn = container_of(work, struct hci_conn, work); + struct hci_conn *conn = container_of(work, struct hci_conn, work_del); struct hci_dev *hdev = conn->hdev; + /* ensure previous add/del is complete */ + flush_workqueue(bluetooth); + while (1) { struct device *dev; @@ -156,9 +159,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn) if (!device_is_registered(&conn->dev)) return; - INIT_WORK(&conn->work, del_conn); + INIT_WORK(&conn->work_del, del_conn); - queue_work(btdelconn, &conn->work); + queue_work(bluetooth, &conn->work_del); } static inline char *host_typetostr(int type) @@ -435,20 +438,13 @@ void hci_unregister_sysfs(struct hci_dev *hdev) 
int __init bt_sysfs_init(void) { - btaddconn = create_singlethread_workqueue("btaddconn"); - if (!btaddconn) + bluetooth = create_singlethread_workqueue("bluetooth"); + if (!bluetooth) return -ENOMEM; - btdelconn = create_singlethread_workqueue("btdelconn"); - if (!btdelconn) { - destroy_workqueue(btaddconn); - return -ENOMEM; - } - bt_class = class_create(THIS_MODULE, "bluetooth"); if (IS_ERR(bt_class)) { - destroy_workqueue(btdelconn); - destroy_workqueue(btaddconn); + destroy_workqueue(bluetooth); return PTR_ERR(bt_class); } @@ -457,8 +453,7 @@ int __init bt_sysfs_init(void) void bt_sysfs_cleanup(void) { - destroy_workqueue(btaddconn); - destroy_workqueue(btdelconn); + destroy_workqueue(bluetooth); class_destroy(bt_class); } From 052b30b0a8eec8db5b18ad49effdf2a9ba4c1e1a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 26 Apr 2009 20:01:22 +0200 Subject: [PATCH 21/32] Bluetooth: Add different pairing timeout for Legacy Pairing The Bluetooth stack uses reference counting for all established ACL links, and if no user (L2CAP connection) is present, the link will be terminated to save power. The problematic part is dedicated pairing when using Legacy Pairing (Bluetooth 2.0 and before). At that point no user is present and pairing attempts will be disconnected within 10 seconds or less. In previous kernel versions this was not a problem since the disconnect timeout wasn't triggered on incoming connections for the first time. However this caused issues with broken host stacks that kept the connections around after dedicated pairing. When the support for Simple Pairing got added, the link establishment procedure needed to be changed and now causes issues when using Legacy Pairing. When using Simple Pairing it is possible to do proper reference counting of ACL link users. With Legacy Pairing this is not possible since the specification is unclear in some areas and too many broken Bluetooth devices have already been deployed. So instead of trying to deal with all the broken devices, a special pairing timeout will be introduced that increases the timeout to 60 seconds when pairing is triggered. If a broken device now puts the stack into an unforeseen state, the worst that happens is that the disconnect timeout triggers after 120 seconds instead of 4 seconds. This allows successful pairings with legacy and broken devices now.
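For illustration, the timeout selection described above boils down to something like the following sketch (the helper name my_disc_timeout() is made up; conn->out, conn->disc_timeout, HCI_DISCONN_TIMEOUT, HCI_PAIRING_TIMEOUT and msecs_to_jiffies() are the fields, constants and helpers used by the patch below):

/* Pick the idle-disconnect delay for an ACL link. */
static unsigned long my_disc_timeout(struct hci_conn *conn)
{
	/* disc_timeout normally holds HCI_DISCONN_TIMEOUT (2 seconds) and
	 * is raised to HCI_PAIRING_TIMEOUT (60 seconds) while a PIN code
	 * request is outstanding, i.e. while legacy pairing has no L2CAP
	 * user holding the link. */
	unsigned long timeo = msecs_to_jiffies(conn->disc_timeout);

	/* Incoming links get twice the budget, hence 4 vs. 120 seconds. */
	if (!conn->out)
		timeo *= 2;
	return timeo;
}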
Based on a report by Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 5 +++-- net/bluetooth/hci_conn.c | 1 + net/bluetooth/hci_event.c | 36 +++++++++++++++++++++++++++++++- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index f69f015bbcc0..ed3aea1605e8 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -101,6 +101,7 @@ enum { /* HCI timeouts */ #define HCI_CONNECT_TIMEOUT (40000) /* 40 seconds */ #define HCI_DISCONN_TIMEOUT (2000) /* 2 seconds */ +#define HCI_PAIRING_TIMEOUT (60000) /* 60 seconds */ #define HCI_IDLE_TIMEOUT (6000) /* 6 seconds */ #define HCI_INIT_TIMEOUT (10000) /* 10 seconds */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1224bba24bdd..be5bd713d2c9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -171,6 +171,7 @@ struct hci_conn { __u8 auth_type; __u8 sec_level; __u8 power_save; + __u16 disc_timeout; unsigned long pend; unsigned int sent; @@ -349,9 +350,9 @@ static inline void hci_conn_put(struct hci_conn *conn) if (conn->type == ACL_LINK) { del_timer(&conn->idle_timer); if (conn->state == BT_CONNECTED) { - timeo = msecs_to_jiffies(HCI_DISCONN_TIMEOUT); + timeo = msecs_to_jiffies(conn->disc_timeout); if (!conn->out) - timeo *= 5; + timeo *= 2; } else timeo = msecs_to_jiffies(10); } else diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 1181db08d9de..75ebbe2221a3 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -215,6 +215,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) conn->state = BT_OPEN; conn->power_save = 1; + conn->disc_timeout = HCI_DISCONN_TIMEOUT; switch (type) { case ACL_LINK: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 15f40ea8d544..4e7cb88e5da9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -883,6 +883,7 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; @@ -1063,9 +1064,14 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s hci_proto_connect_cfm(conn, ev->status); hci_conn_put(conn); } - } else + } else { hci_auth_cfm(conn, ev->status); + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { if (!ev->status) { struct hci_cp_set_conn_encrypt cp; @@ -1479,7 +1485,21 @@ static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct hci_ev_pin_code_req *ev = (void *) skb->data; + struct hci_conn *conn; + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + hci_conn_put(conn); + } + + hci_dev_unlock(hdev); } static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -1489,7 +1509,21 @@ static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct hci_ev_link_key_notify *ev = (void 
*) skb->data; + struct hci_conn *conn; + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + + hci_dev_unlock(hdev); } static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) From 3fdca1e1370ffe89980927cdef0583bebcd8caaf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 28 Apr 2009 09:04:55 -0700 Subject: [PATCH 22/32] Bluetooth: Fix connection establishment with low security requirement The Bluetooth 2.1 specification introduced four different security modes that can be mapped using Legacy Pairing and Simple Pairing. With the usage of Simple Pairing it is required that all connections (except the ones for SDP) are encrypted. So even the low security requirement mandates an encrypted connection when using Simple Pairing. When using Legacy Pairing (for Bluetooth 2.0 devices and older) this is not required since it causes interoperability issues. To support this properly, the low security requirement translates into different host controller transactions depending on whether Simple Pairing is supported or not. However, in the case of Simple Pairing the command to switch on encryption after a successful authentication is not triggered for the low security mode. This patch fixes this and actually makes the logic to differentiate between Simple Pairing and Legacy Pairing a lot simpler. Based on a report by Ville Tervo Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 75ebbe2221a3..375f4b4f7f79 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -425,12 +425,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (sec_level == BT_SECURITY_SDP) return 1; - if (sec_level == BT_SECURITY_LOW) { - if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) - return hci_conn_auth(conn, sec_level, auth_type); - else - return 1; - } + if (sec_level == BT_SECURITY_LOW && + (!conn->ssp_mode || !conn->hdev->ssp_mode)) + return 1; if (conn->link_mode & HCI_LM_ENCRYPT) return hci_conn_auth(conn, sec_level, auth_type); From 6269b731560d69c5eaa929909891edec39496d71 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 22 Apr 2009 15:11:05 +1000 Subject: [PATCH 23/32] wireless: remove unneeded EXPORT_SYMBOL that tickles a powerpc compiler bug: drivers/net/wireless/iwlwifi/iwl3945-base.c:1415: error: __ksymtab_iwl3945_rx_queue_reset causes a section type conflict I am pretty sure that this is a compiler bug, so not to worry. However, as far as I can see, iwl-3945.o (the only user) and iwl3945-base.o are always linked into the same module, so the EXPORT_SYMBOL (which causes the problem) should not be needed. Correct? Signed-off-by: Stephen Rothwell Signed-off-by: John W.
Linville --- drivers/net/wireless/iwlwifi/iwl3945-base.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 617c4235d971..70a00c8ee42e 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -1694,7 +1694,6 @@ void iwl3945_rx_queue_reset(struct iwl_priv *priv, struct iwl_rx_queue *rxq) rxq->free_count = 0; spin_unlock_irqrestore(&rxq->lock, flags); } -EXPORT_SYMBOL(iwl3945_rx_queue_reset); /* * this should be called while priv->lock is locked From e805e4d0b53506dff4255a2792483f094e7fcd2c Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 22 Apr 2009 10:59:37 +0300 Subject: [PATCH 24/32] rndis_wlan: fix initialization order for workqueue&workers rndis_wext_link_change() might be called from rndis_command() at initialization stage and priv->workqueue/priv->work have not been initialized yet. This causes invalid opcode at rndis_wext_bind on some brands of bcm4320. Fix by initializing workqueue/workers in rndis_wext_bind() before rndis_command is used. This bug has existed since 2.6.25, reported at: http://bugzilla.kernel.org/show_bug.cgi?id=12794 Signed-off-by: Jussi Kivilinna Signed-off-by: John W. Linville --- drivers/net/wireless/rndis_wlan.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index db91db776508..bebf735cd4bd 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -2558,6 +2558,11 @@ static int rndis_wext_bind(struct usbnet *usbdev, struct usb_interface *intf) mutex_init(&priv->command_lock); spin_lock_init(&priv->stats_lock); + /* because rndis_command() sleeps we need to use workqueue */ + priv->workqueue = create_singlethread_workqueue("rndis_wlan"); + INIT_WORK(&priv->work, rndis_wext_worker); + INIT_DELAYED_WORK(&priv->stats_work, rndis_update_wireless_stats); + /* try bind rndis_host */ retval = generic_rndis_bind(usbdev, intf, FLAG_RNDIS_PHYM_WIRELESS); if (retval < 0) @@ -2603,16 +2608,17 @@ static int rndis_wext_bind(struct usbnet *usbdev, struct usb_interface *intf) disassociate(usbdev, 1); netif_carrier_off(usbdev->net); - /* because rndis_command() sleeps we need to use workqueue */ - priv->workqueue = create_singlethread_workqueue("rndis_wlan"); - INIT_DELAYED_WORK(&priv->stats_work, rndis_update_wireless_stats); queue_delayed_work(priv->workqueue, &priv->stats_work, round_jiffies_relative(STATS_UPDATE_JIFFIES)); - INIT_WORK(&priv->work, rndis_wext_worker); return 0; fail: + cancel_delayed_work_sync(&priv->stats_work); + cancel_work_sync(&priv->work); + flush_workqueue(priv->workqueue); + destroy_workqueue(priv->workqueue); + kfree(priv); return retval; } From 74aa9be0ea0ffeb233f45c39f3cf594b68bbbb89 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2009 10:45:04 +0200 Subject: [PATCH 25/32] iwlwifi: notify on scan completion even when shutting down Under certain circumstances iwlwifi can get stuck and will no longer accept scan requests, because the core code (cfg80211) thinks that it's still processing one. This fixes one of the points where it can happen, but I've still seen it (although only with my radio-off-when-idle patch). Signed-off-by: Johannes Berg Acked-by: Reinette Chatre Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwlwifi/iwl-scan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c index 23644cf884f1..e7c65c4f741b 100644 --- a/drivers/net/wireless/iwlwifi/iwl-scan.c +++ b/drivers/net/wireless/iwlwifi/iwl-scan.c @@ -925,11 +925,11 @@ void iwl_bg_scan_completed(struct work_struct *work) IWL_DEBUG_SCAN(priv, "SCAN complete scan\n"); + ieee80211_scan_completed(priv->hw, false); + if (test_bit(STATUS_EXIT_PENDING, &priv->status)) return; - ieee80211_scan_completed(priv->hw, false); - /* Since setting the TXPOWER may have been deferred while * performing the scan, fire one off */ mutex_lock(&priv->mutex); From b7fcb5c4a4c27da2f6d86cb03d18687e537442cf Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Mon, 27 Apr 2009 22:12:43 -0400 Subject: [PATCH 26/32] ath5k: fix buffer overrun in rate debug code char bname[5] is too small for the string "X GHz" when the null terminator is taken into account. Thus, turning on rate debugging can crash unless we have lucky stack alignment. Cc: stable@kernel.org Reported-by: Paride Legovini Signed-off-by: Bob Copeland Signed-off-by: John W. Linville --- drivers/net/wireless/ath5k/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath5k/debug.c b/drivers/net/wireless/ath5k/debug.c index 9770bb3d40f9..4904a07e4b59 100644 --- a/drivers/net/wireless/ath5k/debug.c +++ b/drivers/net/wireless/ath5k/debug.c @@ -424,7 +424,7 @@ ath5k_debug_dump_bands(struct ath5k_softc *sc) for (b = 0; b < IEEE80211_NUM_BANDS; b++) { struct ieee80211_supported_band *band = &sc->sbands[b]; - char bname[5]; + char bname[6]; switch (band->band) { case IEEE80211_BAND_2GHZ: strcpy(bname, "2 GHz"); From 942e4a2bd680c606af0211e64eb216be2e19bf61 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 Apr 2009 22:36:33 -0700 Subject: [PATCH 27/32] netfilter: revised locking for x_tables The x_tables are organized with a table structure and per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table which was a performance bottleneck. In 2.6.30-rc, this was converted to use RCU for the counters/rules, which solved the performance problems for do_table but made replacing rules much slower because of the necessary RCU grace period. This version uses a per-cpu set of spinlocks and counters to allow table processing to proceed without the cache thrashing of a global reader lock and keeps the same performance for table updates. Signed-off-by: Stephen Hemminger Acked-by: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 73 +++++++++++++++-- net/ipv4/netfilter/arp_tables.c | 125 +++++++++------------------- net/ipv4/netfilter/ip_tables.c | 126 ++++++++--------------------- net/ipv6/netfilter/ip6_tables.c | 123 +++++++++------------------- net/netfilter/x_tables.c | 53 ++++++------ 5 files changed, 204 insertions(+), 296 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 7b1a652066c0..1b2e43502ef7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -354,9 +354,6 @@ struct xt_table /* What hooks you will enter on */ unsigned int valid_hooks; - /* Lock for the curtain */ - struct mutex lock; - /* Man behind the curtain... 
*/ struct xt_table_info *private; @@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af); extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); -extern void xt_table_entry_swap_rcu(struct xt_table_info *old, - struct xt_table_info *new); + +/* + * Per-CPU spinlock associated with per-cpu table entries, and + * with a counter for the "reading" side that allows a recursive + * reader to avoid taking the lock and deadlocking. + * + * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. + * It needs to ensure that the rules are not being changed while the packet + * is being processed. In some cases, the read lock will be acquired + * twice on the same CPU; this is okay because of the count. + * + * "writing" is used when reading counters. + * During replace any readers that are using the old tables have to complete + * before freeing the old table. This is handled by the write locking + * necessary for reading the counters. + */ +struct xt_info_lock { + spinlock_t lock; + unsigned char readers; +}; +DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); + +/* + * Note: we need to ensure that preemption is disabled before acquiring + * the per-cpu-variable, so we do it as a two step process rather than + * using "spin_lock_bh()". + * + * We _also_ need to disable bottom half processing before updating our + * nesting count, to make sure that the only kind of re-entrancy is this + * code being called by itself: since the count+lock is not an atomic + * operation, we can allow no races. + * + * _Only_ that special combination of being per-cpu and never getting + * re-entered asynchronously means that the count is safe. + */ +static inline void xt_info_rdlock_bh(void) +{ + struct xt_info_lock *lock; + + local_bh_disable(); + lock = &__get_cpu_var(xt_info_locks); + if (!lock->readers++) + spin_lock(&lock->lock); +} + +static inline void xt_info_rdunlock_bh(void) +{ + struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); + + if (!--lock->readers) + spin_unlock(&lock->lock); + local_bh_enable(); +} + +/* + * The "writer" side needs to get exclusive access to the lock, + * regardless of readers. This must be called with bottom half + * processing (and thus also preemption) disabled. + */ +static inline void xt_info_wrlock(unsigned int cpu) +{ + spin_lock(&per_cpu(xt_info_locks, cpu).lock); +} + +static inline void xt_info_wrunlock(unsigned int cpu) +{ + spin_unlock(&per_cpu(xt_info_locks, cpu).lock); +} /* * This helper is performance critical and must be inlined diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5ba533d234db..831fe1879dc0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? 
out->name : nulldevname; - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + (2 * skb->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); t = arpt_get_target(e); @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); if (hotdrop) return NF_DROP; @@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; ARPT_ENTRY_ITERATE(t->entries[curcpu], @@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct arpt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has 
cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); - + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 810c0b62c7d4..2ec8d7290c40 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters - * with data used by 'current' CPU - * We dont care about preemption here. + * with data used by 'current' CPU. 
+ * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IPT_ENTRY_ITERATE(t->entries[curcpu], @@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } - -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } - -static inline int -zero_entry_counter(struct ipt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 800ae8542471..219e165aea10 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IP6T_ENTRY_ITERATE(t->entries[curcpu], @@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct ip6t_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + xt_info_wrlock(curcpu); + loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); + unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 509a95621f9f..150e5cf62f85 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, - struct xt_table_info *newinfo) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - void *p = oldinfo->entries[cpu]; - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); - newinfo->entries[cpu] = p; - } - -} -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); - /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); + + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { - struct xt_table_info *oldinfo, *private; + struct xt_table_info *private; /* Do the substitution. */ - mutex_lock(&table->lock); + local_bh_disable(); private = table->private; + /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - mutex_unlock(&table->lock); + local_bh_enable(); *error = -EAGAIN; return NULL; } - oldinfo = private; - rcu_assign_pointer(table->private, newinfo); - newinfo->initial_entries = oldinfo->initial_entries; - mutex_unlock(&table->lock); - synchronize_net(); - return oldinfo; + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. + */ + local_bh_enable(); + + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, /* Simplifies replace_table code. 
*/ table->private = bootstrap; - mutex_init(&table->lock); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; @@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = { static int __init xt_init(void) { - int i, rv; + unsigned int i; + int rv; + + for_each_possible_cpu(i) { + struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); + spin_lock_init(&lock->lock); + lock->readers = 0; + } xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) From ac7c992cac0c8f276aa8e4a8273204a6db707bb3 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 28 Apr 2009 22:42:39 -0700 Subject: [PATCH 28/32] e100: do not go D3 in shutdown unless system is powering off After experimenting with kexec with the last merges after 2.6.29, I've had some problems when probing e100. It would not read the eeprom. After some bisects, I realized this has been like that since forever (at least 2.6.18). The problem is that shutdown is doing the same thing that suspend does and puts the device in D3 state. I couldn't find a way to get the device back to a sane state in the probe function. So, based on some similar patches from Rafael J. Wysocki for e1000, e1000e, and ixgbe, I wrote this one for e100. Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: "Rafael J. Wysocki" Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/e100.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 5c0b457c7868..0f9ee1348552 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -2728,7 +2728,7 @@ static void __devexit e100_remove(struct pci_dev *pdev) #define E100_82552_SMARTSPEED 0x14 /* SmartSpeed Ctrl register */ #define E100_82552_REV_ANEG 0x0200 /* Reverse auto-negotiation */ #define E100_82552_ANEG_NOW 0x0400 /* Auto-negotiate now */ -static int e100_suspend(struct pci_dev *pdev, pm_message_t state) +static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake) { struct net_device *netdev = pci_get_drvdata(pdev); struct nic *nic = netdev_priv(netdev); @@ -2749,19 +2749,32 @@ static int e100_suspend(struct pci_dev *pdev, pm_message_t state) E100_82552_SMARTSPEED, smartspeed | E100_82552_REV_ANEG | E100_82552_ANEG_NOW); } - if (pci_enable_wake(pdev, PCI_D3cold, true)) - pci_enable_wake(pdev, PCI_D3hot, true); + *enable_wake = true; } else { - pci_enable_wake(pdev, PCI_D3hot, false); + *enable_wake = false; } pci_disable_device(pdev); - pci_set_power_state(pdev, PCI_D3hot); +} - return 0; +static int __e100_power_off(struct pci_dev *pdev, bool wake) +{ + if (wake) { + return pci_prepare_to_sleep(pdev); + } else { + pci_wake_from_d3(pdev, false); + return pci_set_power_state(pdev, PCI_D3hot); + } } #ifdef CONFIG_PM +static int e100_suspend(struct pci_dev *pdev, pm_message_t state) +{ + bool wake; + __e100_shutdown(pdev, &wake); + return __e100_power_off(pdev, wake); +} + static int e100_resume(struct pci_dev *pdev) { struct net_device *netdev = pci_get_drvdata(pdev); @@ -2792,7 +2805,10 @@ static int e100_resume(struct pci_dev *pdev) static void e100_shutdown(struct pci_dev *pdev) { - e100_suspend(pdev, PMSG_SUSPEND); + bool wake; + __e100_shutdown(pdev, &wake); + if (system_state == SYSTEM_POWER_OFF) + __e100_power_off(pdev, wake); } /* ------------------ PCI Error Recovery infrastructure -------------- */ From d4c4a9a1bce1912ed5681251f0037fd4f2364a3e Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Apr 2009 11:41:24 +0100 Subject: [PATCH 29/32] 
mac80211: fix modprobe deadlock by not calling wep_init under rtnl_lock - ieee80211_wep_init(), which is called with rtnl_lock held, blocks in request_module() [waiting for modprobe to load a crypto module]. - modprobe blocks in a call to flush_workqueue(), when it closes a TTY [presumably when it exits]. - The workqueue item linkwatch_event() blocks on rtnl_lock. There's no reason for wep_init() to be called with rtnl_lock held, so just move it outside the critical section. Signed-off-by: Alan Jenkins Signed-off-by: John W. Linville --- net/mac80211/main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fbcbed6cad01..00968c2d22bb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -909,6 +909,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (result < 0) goto fail_sta_info; + result = ieee80211_wep_init(local); + if (result < 0) { + printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", + wiphy_name(local->hw.wiphy), result); + goto fail_wep; + } + rtnl_lock(); result = dev_alloc_name(local->mdev, local->mdev->name); if (result < 0) @@ -930,14 +937,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_rate; } - result = ieee80211_wep_init(local); - - if (result < 0) { - printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", - wiphy_name(local->hw.wiphy), result); - goto fail_wep; - } - /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION)) { result = ieee80211_if_add(local, "wlan%d", NULL, @@ -967,13 +966,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return 0; -fail_wep: - rate_control_deinitialize(local); fail_rate: unregister_netdevice(local->mdev); local->mdev = NULL; fail_dev: rtnl_unlock(); + ieee80211_wep_free(local); +fail_wep: sta_info_stop(local); fail_sta_info: debugfs_hw_del(local); From c428c89201a57a0ce24c37ed79e540d1f4101cf3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 29 Apr 2009 00:28:18 +0200 Subject: [PATCH 30/32] mac80211: default to automatic power control In "mac80211: correct wext transmit power handler" I fixed the wext handler, but forgot to make the default of the user_power_level -1 (aka "auto"), so that now the transmit power is always set to 0, causing associations to time out and similar problems since we're transmitting with very little power. Correct this by correcting the default user_power_level to -1. Signed-off-by: Johannes Berg Bisected-by: Niel Lambrechts Signed-off-by: John W. 
Linville --- net/mac80211/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 00968c2d22bb..14134193cd17 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -757,6 +757,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, local->hw.conf.long_frame_max_tx_count = 4; local->hw.conf.short_frame_max_tx_count = 7; local->hw.conf.radio_enabled = true; + local->user_power_level = -1; INIT_LIST_HEAD(&local->interfaces); mutex_init(&local->iflist_mtx); From 1319ebadf185933e6b7ff95211d3cef9004e9754 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Wed, 29 Apr 2009 11:57:34 +0000 Subject: [PATCH 31/32] mv643xx_eth: OOM handling fixes Currently, when OOM occurs during rx ring refill, mv643xx_eth will get into an infinite loop, due to the refill function setting the OOM bit but not clearing the 'rx refill needed' bit for this queue, while the calling function (the NAPI poll handler) will call the refill function in a loop until the 'rx refill needed' bit goes off, without checking the OOM bit. This patch fixes this by checking the OOM bit in the NAPI poll handler before attempting to do rx refill. This means that once OOM occurs, we won't try to do any memory allocations again until the next invocation of the poll handler. While we're at it, change the OOM flag to be a single bit instead of one bit per receive queue since OOM is a system state rather than a per-queue state, and cancel the OOM timer on entry to the NAPI poll handler if it's running to prevent it from firing when we've already come out of OOM. Signed-off-by: Lennert Buytenhek Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/mv643xx_eth.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index b3185bf2c158..038beff7da80 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -393,12 +393,12 @@ struct mv643xx_eth_private { struct work_struct tx_timeout_task; struct napi_struct napi; + u8 oom; u8 work_link; u8 work_tx; u8 work_tx_end; u8 work_rx; u8 work_rx_refill; - u8 work_rx_oom; int skb_size; struct sk_buff_head rx_recycle; @@ -661,7 +661,7 @@ static int rxq_refill(struct rx_queue *rxq, int budget) dma_get_cache_alignment() - 1); if (skb == NULL) { - mp->work_rx_oom |= 1 << rxq->index; + mp->oom = 1; goto oom; } @@ -2167,8 +2167,10 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget) mp = container_of(napi, struct mv643xx_eth_private, napi); - mp->work_rx_refill |= mp->work_rx_oom; - mp->work_rx_oom = 0; + if (unlikely(mp->oom)) { + mp->oom = 0; + del_timer(&mp->rx_oom); + } work_done = 0; while (work_done < budget) { @@ -2182,8 +2184,10 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget) continue; } - queue_mask = mp->work_tx | mp->work_tx_end | - mp->work_rx | mp->work_rx_refill; + queue_mask = mp->work_tx | mp->work_tx_end | mp->work_rx; + if (likely(!mp->oom)) + queue_mask |= mp->work_rx_refill; + if (!queue_mask) { if (mv643xx_eth_collect_events(mp)) continue; @@ -2204,7 +2208,7 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget) txq_maybe_wake(mp->txq + queue); } else if (mp->work_rx & queue_mask) { work_done += rxq_process(mp->rxq + queue, work_tbd); - } else if (mp->work_rx_refill & queue_mask) { + } else if (!mp->oom && (mp->work_rx_refill & queue_mask)) { work_done += rxq_refill(mp->rxq + queue, work_tbd); } else { BUG(); @@ -2212,7 +2216,7 @@ static int 
mv643xx_eth_poll(struct napi_struct *napi, int budget) } if (work_done < budget) { - if (mp->work_rx_oom) + if (mp->oom) mod_timer(&mp->rx_oom, jiffies + (HZ / 10)); napi_complete(napi); wrlp(mp, INT_MASK, INT_TX_END | INT_RX | INT_EXT); } @@ -2372,7 +2376,7 @@ static int mv643xx_eth_open(struct net_device *dev) rxq_refill(mp->rxq + i, INT_MAX); } - if (mp->work_rx_oom) { + if (mp->oom) { mp->rx_oom.expires = jiffies + (HZ / 10); add_timer(&mp->rx_oom); } From 93af7aca44f0e82e67bda10a0fb73d383edcc8bd Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Wed, 29 Apr 2009 11:58:18 +0000 Subject: [PATCH 32/32] mv643xx_eth: 64bit mib counter read fix On several mv643xx_eth hardware versions, the two 64bit mib counters for 'good octets received' and 'good octets sent' are actually 32bit counters, and reading from the upper half of the register has the same effect as reading from the lower half of the register: an atomic read-and-clear of the entire 32bit counter value. Under heavy traffic this can occasionally lead to small numbers being added to the upper half of the 64bit mib counter even though no 32bit wrap has occurred. Since we poll the mib counters at least every 30 seconds anyway, we might as well just skip the reads of the upper halves of the hardware counters without breaking the stats, which this patch does. Signed-off-by: Lennert Buytenhek Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/mv643xx_eth.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 038beff7da80..a400d7115f78 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -1255,7 +1255,6 @@ static void mib_counters_update(struct mv643xx_eth_private *mp) spin_lock_bh(&mp->mib_counters_lock); p->good_octets_received += mib_read(mp, 0x00); - p->good_octets_received += (u64)mib_read(mp, 0x04) << 32; p->bad_octets_received += mib_read(mp, 0x08); p->internal_mac_transmit_err += mib_read(mp, 0x0c); p->good_frames_received += mib_read(mp, 0x10); @@ -1269,7 +1268,6 @@ static void mib_counters_update(struct mv643xx_eth_private *mp) p->frames_512_to_1023_octets += mib_read(mp, 0x30); p->frames_1024_to_max_octets += mib_read(mp, 0x34); p->good_octets_sent += mib_read(mp, 0x38); - p->good_octets_sent += (u64)mib_read(mp, 0x3c) << 32; p->good_frames_sent += mib_read(mp, 0x40); p->excessive_collision += mib_read(mp, 0x44); p->multicast_frames_sent += mib_read(mp, 0x48);
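As a closing illustration of the MIB fix (a sketch, not driver code: mib_read() and the 0x00 register offset are taken from the patch above, while my_poll_good_octets_received() and the u64 accumulator are made up), dropping the upper-half reads leaves the 64-bit totals maintained purely by draining the 32-bit read-and-clear register before it can wrap. For a rough sense of the margin, at 1 Gb/s a 32-bit octet counter wraps after about 2^32 / 125,000,000 ≈ 34 seconds, so polling at least every 30 seconds is frequent enough:

static void my_poll_good_octets_received(struct mv643xx_eth_private *mp,
					 u64 *total)
{
	/* mib_read() returns and clears the 32-bit hardware counter, so as
	 * long as this runs more often than the counter can wrap, the
	 * software 64-bit total stays correct without ever touching the
	 * upper-half register at offset 0x04. */
	*total += mib_read(mp, 0x00);
}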