From c197facc8ea08062f8f949aade6a33649ee06771 Mon Sep 17 00:00:00 2001 From: "hummerbliss@gmail.com" Date: Mon, 20 Apr 2009 17:12:35 +0200 Subject: [PATCH 01/32] netfilter: bridge: allow fragmentation of VLAN packets traversing a bridge br_nf_dev_queue_xmit only checks for ETH_P_IP packets for fragmenting but not VLAN packets. This results in dropping of large VLAN packets. This can be observed when connection tracking is enabled. Connection tracking re-assembles fragmented packets, and these have to re-fragmented when transmitting out. Also, make sure only refragmented packets are defragmented as per suggestion from Patrick McHardy. Signed-off-by: Saikiran Madugula Signed-off-by: Patrick McHardy --- net/bridge/br_netfilter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3953ac4214c8..e4a418fcb35b 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -788,15 +788,23 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff *skb, return NF_STOLEN; } +#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) static int br_nf_dev_queue_xmit(struct sk_buff *skb) { - if (skb->protocol == htons(ETH_P_IP) && + if (skb->nfct != NULL && + (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) && skb->len > skb->dev->mtu && !skb_is_gso(skb)) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); } +#else +static int br_nf_dev_queue_xmit(struct sk_buff *skb) +{ + return br_dev_queue_push_xmit(skb); +} +#endif /* PF_BRIDGE/POST_ROUTING ********************************************/ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, From 5ff482940f5aa2cdc3424c4a8ea94b9833b2af5f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 24 Apr 2009 15:37:44 +0200 Subject: [PATCH 02/32] netfilter: nf_ct_dccp/udplite: fix protocol registration error Commit d0dba725 (netfilter: ctnetlink: add callbacks to the per-proto nlattrs) changed the protocol registration function to abort if the to-be registered protocol doesn't provide a new callback function. The DCCP and UDP-Lite IPv6 protocols were missed in this conversion, add the required callback pointer. 
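For context, the registration failure this fixes comes from a consistency check performed when an l4proto is registered: a tracker that fills in to_nlattr without nlattr_size, or tuple_to_nlattr without nlattr_tuple_size, is rejected. The sketch below only illustrates that check; the function name and exact conditions are assumptions rather than a quote of nf_conntrack_proto.c, but the struct fields match the ones touched by this patch.

#include <linux/errno.h>
#include <net/netfilter/nf_conntrack_l4proto.h>

/* Simplified sketch of the registration-time sanity check. */
static int l4proto_nlattr_sanity(const struct nf_conntrack_l4proto *l4proto)
{
	/* Dumping per-protocol state requires advertising its nlattr size. */
	if (l4proto->to_nlattr && !l4proto->nlattr_size)
		return -EINVAL;

	/* Dumping tuples requires advertising the tuple nlattr size. */
	if (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)
		return -EINVAL;

	return 0;
}

Before this change, dccp_proto6 hit the first case (to_nlattr set, nlattr_size missing) and nf_conntrack_l4proto_udplite6 the second, which is consistent with the two one-line additions below.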
Reported-and-tested-by: Steven Jan Springl Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_dccp.c | 1 + net/netfilter/nf_conntrack_proto_udplite.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 50dac8dbe7d8..5411d63f31a5 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -777,6 +777,7 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { .print_conntrack = dccp_print_conntrack, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 4614696c1b88..0badedc542d3 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -204,6 +204,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .error = udplite_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif From 4b0706624930dc75c3b0d0df463d89759ef7de29 Mon Sep 17 00:00:00 2001 From: Laszlo Attila Toth Date: Fri, 24 Apr 2009 16:55:25 +0200 Subject: [PATCH 03/32] netfilter: Kconfig: TProxy doesn't depend on NF_CONNTRACK Signed-off-by: Laszlo Attila Toth Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 2329c5f50551..881203c4a142 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -275,6 +275,8 @@ config NF_CT_NETLINK help This option enables support for a netlink-based userspace interface +endif # NF_CONNTRACK + # transparent proxy support config NETFILTER_TPROXY tristate "Transparent proxying support (EXPERIMENTAL)" @@ -290,8 +292,6 @@ config NETFILTER_TPROXY To compile it as a module, choose M here. If unsure, say N. -endif # NF_CONNTRACK - config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n From 71951b64a5a87c09eb6fde59ce51aaab2fdaeab2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Apr 2009 16:58:41 +0200 Subject: [PATCH 04/32] netfilter: nf_ct_dccp: add missing role attributes for DCCP This patch adds missing role attribute to the DCCP type, otherwise the creation of entries is not of any use. The attribute added is CTA_PROTOINFO_DCCP_ROLE which contains the role of the conntrack original tuple. 
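As a usage illustration, a creation request from userspace now has somewhere to carry the role: it goes into the CTA_PROTOINFO / CTA_PROTOINFO_DCCP nest next to the state. The fragment below is a hypothetical sketch using libmnl; the helper name is an assumption and the role value passed in would have to match the kernel's internal CT_DCCP_ROLE_* numbering, only the attribute constants come from the header extended by this patch.

#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Append CTA_PROTOINFO > CTA_PROTOINFO_DCCP > {STATE, ROLE} to a
 * partially built ctnetlink "new conntrack" request. */
static void put_dccp_protoinfo(struct nlmsghdr *nlh, uint8_t state, uint8_t role)
{
	struct nlattr *pi   = mnl_attr_nest_start(nlh, CTA_PROTOINFO);
	struct nlattr *dccp = mnl_attr_nest_start(nlh, CTA_PROTOINFO_DCCP);

	mnl_attr_put_u8(nlh, CTA_PROTOINFO_DCCP_STATE, state);
	mnl_attr_put_u8(nlh, CTA_PROTOINFO_DCCP_ROLE, role); /* role of the original tuple */

	mnl_attr_nest_end(nlh, dccp);
	mnl_attr_nest_end(nlh, pi);
}

On the kernel side, nlattr_to_dccp() (below) derives the reply-direction role from this single original-direction value.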
Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- include/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_proto_dccp.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 29fe9ea1d346..1a865e48b8eb 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -100,6 +100,7 @@ enum ctattr_protoinfo_tcp { enum ctattr_protoinfo_dccp { CTA_PROTOINFO_DCCP_UNSPEC, CTA_PROTOINFO_DCCP_STATE, + CTA_PROTOINFO_DCCP_ROLE, __CTA_PROTOINFO_DCCP_MAX, }; #define CTA_PROTOINFO_DCCP_MAX (__CTA_PROTOINFO_DCCP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5411d63f31a5..8e757dd53396 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -633,6 +633,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, if (!nest_parms) goto nla_put_failure; NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); nla_nest_end(skb, nest_parms); read_unlock_bh(&dccp_lock); return 0; @@ -644,6 +646,7 @@ nla_put_failure: static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) @@ -661,11 +664,21 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) return err; if (!tb[CTA_PROTOINFO_DCCP_STATE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { return -EINVAL; + } write_lock_bh(&dccp_lock); ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } write_unlock_bh(&dccp_lock); return 0; } From 37e55cf0ceb8803256bf69a3e45bd668bf90b76f Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 24 Apr 2009 17:05:21 +0200 Subject: [PATCH 05/32] netfilter: xt_recent: fix stack overread in compat code Related-to: commit 325fb5b4d26038cba665dd0d8ee09555321061f0 The compat path suffers from a similar problem. It only uses a __be32 when all of the recent code uses, and expects, an nf_inet_addr everywhere. As a result, addresses stored by xt_recents were filled with whatever other stuff was on the stack following the be32. Signed-off-by: Jan Engelhardt With a minor compile fix from Roman. 
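The overread is easy to reproduce outside the kernel. A minimal user-space sketch of the same mistake, with simplified stand-in types rather than the kernel's union nf_inet_addr (what the garbage bytes contain depends on the compiler's stack layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

union inet_addr {                       /* stand-in for union nf_inet_addr */
	uint32_t ip;                    /* IPv4 */
	uint32_t ip6[4];                /* IPv6 */
};

static union inet_addr stored;

static void entry_init(const union inet_addr *addr)
{
	stored = *addr;                 /* copies all 16 bytes */
}

static void buggy_caller(void)          /* mirrors the old compat path */
{
	uint32_t addr = 0x0100007f;     /* only 4 bytes live on the stack */

	entry_init((const union inet_addr *)&addr); /* undefined: reads 12 bytes past addr */
}

static void fixed_caller(void)          /* mirrors the patched code */
{
	union inet_addr addr;

	memset(&addr, 0, sizeof(addr)); /* full-size, zeroed object */
	addr.ip = 0x0100007f;
	entry_init(&addr);
}

int main(void)
{
	buggy_caller();
	printf("buggy ip6[1..3]: %08x %08x %08x (stack garbage)\n",
	       stored.ip6[1], stored.ip6[2], stored.ip6[3]);
	fixed_caller();
	printf("fixed ip6[1..3]: %08x %08x %08x\n",
	       stored.ip6[1], stored.ip6[2], stored.ip6[3]);
	return 0;
}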
Reported-and-tested-by: Roman Hoog Antink Signed-off-by: Patrick McHardy --- net/netfilter/xt_recent.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 791e030ea903..eb0ceb846527 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -474,7 +474,7 @@ static ssize_t recent_old_proc_write(struct file *file, struct recent_table *t = pde->data; struct recent_entry *e; char buf[sizeof("+255.255.255.255")], *c = buf; - __be32 addr; + union nf_inet_addr addr = {}; int add; if (size > sizeof(buf)) @@ -506,14 +506,13 @@ static ssize_t recent_old_proc_write(struct file *file, add = 1; break; } - addr = in_aton(c); + addr.ip = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0); + e = recent_entry_lookup(t, &addr, NFPROTO_IPV4, 0); if (e == NULL) { if (add) - recent_entry_init(t, (const void *)&addr, - NFPROTO_IPV4, 0); + recent_entry_init(t, &addr, NFPROTO_IPV4, 0); } else { if (add) recent_entry_update(t, e); From adc667e84f086aa110d810f3476c494e48eaabaa Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Sat, 25 Apr 2009 18:03:35 -0700 Subject: [PATCH 06/32] vlan: update vlan carrier state for admin up/down Currently, the VLAN event handler does not adjust the VLAN device's carrier state when the real device or the VLAN device is set administratively up or down. The following patch adds a transfer of operating state from the real device to the VLAN device when the real device is administratively set up or down, and sets the carrier state up or down during init, open and close of the VLAN device. This permits observers above the VLAN device that care about the carrier state (bonding's link monitor, for example) to receive updates for administrative changes by more closely mimicing the behavior of real devices. Signed-off-by: Jay Vosburgh Signed-off-by: Patrick McHardy --- net/8021q/vlan.c | 2 ++ net/8021q/vlan_dev.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2b7390e377b3..d1e10546eb85 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -492,6 +492,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, continue; dev_change_flags(vlandev, flgs & ~IFF_UP); + vlan_transfer_operstate(dev, vlandev); } break; @@ -507,6 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, continue; dev_change_flags(vlandev, flgs | IFF_UP); + vlan_transfer_operstate(dev, vlandev); } break; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 6b0921364014..b4b9068e55a7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -462,6 +462,7 @@ static int vlan_dev_open(struct net_device *dev) if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); + netif_carrier_on(dev); return 0; clear_allmulti: @@ -471,6 +472,7 @@ del_unicast: if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN); out: + netif_carrier_off(dev); return err; } @@ -492,6 +494,7 @@ static int vlan_dev_stop(struct net_device *dev) if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len); + netif_carrier_off(dev); return 0; } @@ -612,6 +615,8 @@ static int vlan_dev_init(struct net_device *dev) struct net_device *real_dev = vlan_dev_info(dev)->real_dev; int subclass = 0; + netif_carrier_off(dev); + /* IFF_BROADCAST|IFF_MULTICAST; ??? 
*/ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI); dev->iflink = real_dev->ifindex; From a4233304bb43f87f97fc2ac9143b513814dcf094 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sun, 26 Apr 2009 20:41:34 +0000 Subject: [PATCH 07/32] mlx4_en: Fix cleanup flow on cq activation In case of mlx4_en_activate_cq() failure, the cleanup code would go to rx_err and try to disable unactivated rings. Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index 438678ab2a10..7bcc49de1637 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -583,7 +583,7 @@ int mlx4_en_start_port(struct net_device *dev) err = mlx4_en_activate_cq(priv, cq); if (err) { mlx4_err(mdev, "Failed activating Rx CQ\n"); - goto rx_err; + goto cq_err; } for (j = 0; j < cq->size; j++) cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; From 785a0982eaaeae2fbe3372d1c9c769e8156a7a5a Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sun, 26 Apr 2009 20:42:57 +0000 Subject: [PATCH 08/32] mlx4_en: Handle page allocation failure during receive If we failed to allocate new fragments for receive buffer, the packet should be dropped and packets should be reused. Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_rx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index 0cbb78ca7b29..7942c4d3cd88 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -610,6 +610,10 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags, skb_shinfo(skb)->frags, page_alloc, length); + if (unlikely(!used_frags)) { + kfree_skb(skb); + return NULL; + } skb_shinfo(skb)->nr_frags = used_frags; /* Copy headers into the skb linear buffer */ From c759a6b4e1cae6aff71f58c9c85404ebcd81b6e0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 27 Apr 2009 02:36:20 -0700 Subject: [PATCH 09/32] net: Fix LL_MAX_HEADER for CONFIG_TR_MODULE Unless I miss anything this should fix a bug. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7783f4a755..453be9a674c0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -104,7 +104,7 @@ struct wireless_dev; # else # define LL_MAX_HEADER 96 # endif -#elif defined(CONFIG_TR) +#elif defined(CONFIG_TR) || defined(CONFIG_TR_MODULE) # define LL_MAX_HEADER 48 #else # define LL_MAX_HEADER 32 From 8f955d7f042e4ac44891a400d5000928f8db9f58 Mon Sep 17 00:00:00 2001 From: Ayaz Abdulla Date: Sat, 25 Apr 2009 09:17:56 +0000 Subject: [PATCH 10/32] forcedeth: tx timeout fix This patch fixes the tx_timeout() to properly handle the clean up of the tx ring. It also sets the tx put pointer back to the correct position to be in sync with HW. Signed-off-by: Ayaz Abdulla Signed-off-by: David S. 
Miller --- drivers/net/forcedeth.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 11d5db16ed9c..f9a846b1b92f 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1880,6 +1880,7 @@ static void nv_init_tx(struct net_device *dev) np->tx_pkts_in_progress = 0; np->tx_change_owner = NULL; np->tx_end_flip = NULL; + np->tx_stop = 0; for (i = 0; i < np->tx_ring_size; i++) { if (!nv_optimized(np)) { @@ -2530,6 +2531,8 @@ static void nv_tx_timeout(struct net_device *dev) struct fe_priv *np = netdev_priv(dev); u8 __iomem *base = get_hwbase(dev); u32 status; + union ring_type put_tx; + int saved_tx_limit; if (np->msi_flags & NV_MSI_X_ENABLED) status = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK; @@ -2589,24 +2592,32 @@ static void nv_tx_timeout(struct net_device *dev) /* 1) stop tx engine */ nv_stop_tx(dev); - /* 2) check that the packets were not sent already: */ + /* 2) complete any outstanding tx and do not give HW any limited tx pkts */ + saved_tx_limit = np->tx_limit; + np->tx_limit = 0; /* prevent giving HW any limited pkts */ + np->tx_stop = 0; /* prevent waking tx queue */ if (!nv_optimized(np)) nv_tx_done(dev, np->tx_ring_size); else nv_tx_done_optimized(dev, np->tx_ring_size); - /* 3) if there are dead entries: clear everything */ - if (np->get_tx_ctx != np->put_tx_ctx) { - printk(KERN_DEBUG "%s: tx_timeout: dead entries!\n", dev->name); - nv_drain_tx(dev); - nv_init_tx(dev); - setup_hw_rings(dev, NV_SETUP_TX_RING); - } + /* save current HW postion */ + if (np->tx_change_owner) + put_tx.ex = np->tx_change_owner->first_tx_desc; + else + put_tx = np->put_tx; - netif_wake_queue(dev); + /* 3) clear all tx state */ + nv_drain_tx(dev); + nv_init_tx(dev); - /* 4) restart tx engine */ + /* 4) restore state to current HW position */ + np->get_tx = np->put_tx = put_tx; + np->tx_limit = saved_tx_limit; + + /* 5) restart tx engine */ nv_start_tx(dev); + netif_wake_queue(dev); spin_unlock_irq(&np->lock); } From 6a783c9067e3f71aac61a9262fe42c1f68efd4fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 27 Apr 2009 02:58:59 -0700 Subject: [PATCH 11/32] xfrm: wrong hash value for temporary SA When kernel inserts a temporary SA for IKE, it uses the wrong hash value for dst list. Two hash values were calcultated before: one with source address and one with a wildcard source address. Bug hinted by Junwei Zhang Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/xfrm/xfrm_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 82271720d970..5f1f86565f16 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -794,7 +794,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); - unsigned int h; + unsigned int h, h_wildcard; struct hlist_node *entry; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; @@ -819,8 +819,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, if (best) goto found; - h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); - hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { + h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); + hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) { if (x->props.family == family && x->props.reqid == tmpl->reqid && !(x->props.flags & XFRM_STATE_WILDRECV) && From ae0e8e82205c903978a79ebf5e31c670b61fa5b4 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 27 Apr 2009 03:04:58 -0700 Subject: [PATCH 12/32] veth: prevent oops caused by netdev destructor From: Stephen Hemminger The veth driver will oops if sysfs hooks are open while module is removed. The net device destructor can not point to code in a module; basically there are only two possible safe values: NULL - no destructor, or free_netdev - free on last use Signed-off-by: David S. Miller --- drivers/net/veth.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 015db1cece72..8e56fcf0a0e3 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -210,14 +210,11 @@ rx_drop: static struct net_device_stats *veth_get_stats(struct net_device *dev) { - struct veth_priv *priv; - struct net_device_stats *dev_stats; - int cpu; + struct veth_priv *priv = netdev_priv(dev); + struct net_device_stats *dev_stats = &dev->stats; + unsigned int cpu; struct veth_net_stats *stats; - priv = netdev_priv(dev); - dev_stats = &dev->stats; - dev_stats->rx_packets = 0; dev_stats->tx_packets = 0; dev_stats->rx_bytes = 0; @@ -225,16 +222,17 @@ static struct net_device_stats *veth_get_stats(struct net_device *dev) dev_stats->tx_dropped = 0; dev_stats->rx_dropped = 0; - for_each_online_cpu(cpu) { - stats = per_cpu_ptr(priv->stats, cpu); + if (priv->stats) + for_each_online_cpu(cpu) { + stats = per_cpu_ptr(priv->stats, cpu); - dev_stats->rx_packets += stats->rx_packets; - dev_stats->tx_packets += stats->tx_packets; - dev_stats->rx_bytes += stats->rx_bytes; - dev_stats->tx_bytes += stats->tx_bytes; - dev_stats->tx_dropped += stats->tx_dropped; - dev_stats->rx_dropped += stats->rx_dropped; - } + dev_stats->rx_packets += stats->rx_packets; + dev_stats->tx_packets += stats->tx_packets; + dev_stats->rx_bytes += stats->rx_bytes; + dev_stats->tx_bytes += stats->tx_bytes; + dev_stats->tx_dropped += stats->tx_dropped; + dev_stats->rx_dropped += stats->rx_dropped; + } return dev_stats; } @@ -261,6 +259,8 @@ static int veth_close(struct net_device *dev) netif_carrier_off(dev); netif_carrier_off(priv->peer); + free_percpu(priv->stats); + priv->stats = NULL; return 0; } @@ -291,15 +291,6 @@ static int veth_dev_init(struct net_device *dev) return 0; } -static void veth_dev_free(struct net_device *dev) -{ - struct veth_priv *priv; - - priv = netdev_priv(dev); - free_percpu(priv->stats); - 
free_netdev(dev); -} - static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -317,7 +308,7 @@ static void veth_setup(struct net_device *dev) dev->netdev_ops = &veth_netdev_ops; dev->ethtool_ops = &veth_ethtool_ops; dev->features |= NETIF_F_LLTX; - dev->destructor = veth_dev_free; + dev->destructor = free_netdev; } /* From 495dce123ceabbae035552437fcaa0f69247ff08 Mon Sep 17 00:00:00 2001 From: "Waskiewicz Jr, Peter P" Date: Thu, 23 Apr 2009 11:15:18 +0000 Subject: [PATCH 13/32] ixgbe: Fix WoL functionality for 82599 KX4 devices The current code writes the PME enabled bit in PCI config space which is wrong. This was needed for pre-release hardware, and was not removed from the driver. Also, we need to clear the WUS (wake up status) after we resume. Otherwise we can't wake for the same event again since it's still asserted in the hardware. Plus, the multicast lists were being written improperly, causing multicast WoL to fail. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ixgbe/ixgbe_common.c | 51 ++------------------------------ drivers/net/ixgbe/ixgbe_main.c | 10 +++---- 2 files changed, 6 insertions(+), 55 deletions(-) diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c index 5567519676d5..186a65069b33 100644 --- a/drivers/net/ixgbe/ixgbe_common.c +++ b/drivers/net/ixgbe/ixgbe_common.c @@ -50,7 +50,6 @@ static u16 ixgbe_calc_eeprom_checksum(struct ixgbe_hw *hw); static void ixgbe_enable_rar(struct ixgbe_hw *hw, u32 index); static void ixgbe_disable_rar(struct ixgbe_hw *hw, u32 index); static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr); -static void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr); static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); /** @@ -1377,8 +1376,7 @@ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list, * Clear accounting of old secondary address list, * don't count RAR[0] */ - uc_addr_in_use = hw->addr_ctrl.rar_used_count - - hw->addr_ctrl.mc_addr_in_rar_count - 1; + uc_addr_in_use = hw->addr_ctrl.rar_used_count - 1; hw->addr_ctrl.rar_used_count -= uc_addr_in_use; hw->addr_ctrl.overflow_promisc = 0; @@ -1492,40 +1490,6 @@ static void ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr) IXGBE_WRITE_REG(hw, IXGBE_MTA(vector_reg), mta_reg); } -/** - * ixgbe_add_mc_addr - Adds a multicast address. - * @hw: pointer to hardware structure - * @mc_addr: new multicast address - * - * Adds it to unused receive address register or to the multicast table. 
- **/ -static void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr) -{ - u32 rar_entries = hw->mac.num_rar_entries; - u32 rar; - - hw_dbg(hw, " MC Addr =%.2X %.2X %.2X %.2X %.2X %.2X\n", - mc_addr[0], mc_addr[1], mc_addr[2], - mc_addr[3], mc_addr[4], mc_addr[5]); - - /* - * Place this multicast address in the RAR if there is room, - * else put it in the MTA - */ - if (hw->addr_ctrl.rar_used_count < rar_entries) { - /* use RAR from the end up for multicast */ - rar = rar_entries - hw->addr_ctrl.mc_addr_in_rar_count - 1; - hw->mac.ops.set_rar(hw, rar, mc_addr, 0, IXGBE_RAH_AV); - hw_dbg(hw, "Added a multicast address to RAR[%d]\n", rar); - hw->addr_ctrl.rar_used_count++; - hw->addr_ctrl.mc_addr_in_rar_count++; - } else { - ixgbe_set_mta(hw, mc_addr); - } - - hw_dbg(hw, "ixgbe_add_mc_addr Complete\n"); -} - /** * ixgbe_update_mc_addr_list_generic - Updates MAC list of multicast addresses * @hw: pointer to hardware structure @@ -1542,7 +1506,6 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr next) { u32 i; - u32 rar_entries = hw->mac.num_rar_entries; u32 vmdq; /* @@ -1550,18 +1513,8 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, * use. */ hw->addr_ctrl.num_mc_addrs = mc_addr_count; - hw->addr_ctrl.rar_used_count -= hw->addr_ctrl.mc_addr_in_rar_count; - hw->addr_ctrl.mc_addr_in_rar_count = 0; hw->addr_ctrl.mta_in_use = 0; - /* Zero out the other receive addresses. */ - hw_dbg(hw, "Clearing RAR[%d-%d]\n", hw->addr_ctrl.rar_used_count, - rar_entries - 1); - for (i = hw->addr_ctrl.rar_used_count; i < rar_entries; i++) { - IXGBE_WRITE_REG(hw, IXGBE_RAL(i), 0); - IXGBE_WRITE_REG(hw, IXGBE_RAH(i), 0); - } - /* Clear the MTA */ hw_dbg(hw, " Clearing MTA\n"); for (i = 0; i < hw->mac.mcft_size; i++) @@ -1570,7 +1523,7 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, /* Add the new addresses */ for (i = 0; i < mc_addr_count; i++) { hw_dbg(hw, " Adding the multicast addresses:\n"); - ixgbe_add_mc_addr(hw, next(hw, &mc_addr_list, &vmdq)); + ixgbe_set_mta(hw, next(hw, &mc_addr_list, &vmdq)); } /* Enable mta */ diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 01884256f4c9..07e778d3e5d2 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -3646,6 +3646,8 @@ static int ixgbe_resume(struct pci_dev *pdev) ixgbe_reset(adapter); + IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0); + if (netif_running(netdev)) { err = ixgbe_open(adapter->netdev); if (err) @@ -4575,7 +4577,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data]; static int cards_found; int i, err, pci_using_dac; - u16 pm_value = 0; u32 part_num, eec; err = pci_enable_device(pdev); @@ -4763,11 +4764,8 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, switch (pdev->device) { case IXGBE_DEV_ID_82599_KX4: -#define IXGBE_PCIE_PMCSR 0x44 - adapter->wol = IXGBE_WUFC_MAG; - pci_read_config_word(pdev, IXGBE_PCIE_PMCSR, &pm_value); - pci_write_config_word(pdev, IXGBE_PCIE_PMCSR, - (pm_value | (1 << 8))); + adapter->wol = (IXGBE_WUFC_MAG | IXGBE_WUFC_EX | + IXGBE_WUFC_MC | IXGBE_WUFC_BC); break; default: adapter->wol = 0; From 7ced70c47f68ad672f50781de5adc6d41e6d2866 Mon Sep 17 00:00:00 2001 From: Tilman Schmidt Date: Thu, 23 Apr 2009 02:24:21 +0000 Subject: [PATCH 14/32] update Documentation/isdn/00-INDEX After the merging of mISDN, state which files refer only to the old isdn4linux subsystem. 
Also add a few missing files. Signed-off-by: Tilman Schmidt Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- Documentation/isdn/00-INDEX | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX index 9fee5f2e5c62..d9660ee5ab27 100644 --- a/Documentation/isdn/00-INDEX +++ b/Documentation/isdn/00-INDEX @@ -2,8 +2,12 @@ - this file (info on ISDN implementation for Linux) CREDITS - list of the kind folks that brought you this stuff. +HiSax.cert + - information about the ITU approval certification of the HiSax driver. INTERFACE - - description of Linklevel and Hardwarelevel ISDN interface. + - description of isdn4linux Link Level and Hardware Level interfaces. +INTERFACE.fax + - description of the fax subinterface of isdn4linux. README - general info on what you need and what to do for Linux ISDN. README.FAQ @@ -12,6 +16,8 @@ README.audio - info for running audio over ISDN. README.fax - info for using Fax over ISDN. +README.gigaset + - info on the drivers for Siemens Gigaset ISDN adapters. README.icn - info on the ICN-ISDN-card and its driver. README.HiSax @@ -37,7 +43,8 @@ README.diversion README.sc - info on driver for Spellcaster cards. README.x25 - _ info for running X.25 over ISDN. + - info for running X.25 over ISDN. README.hysdn - - info on driver for Hypercope active HYSDN cards - + - info on driver for Hypercope active HYSDN cards +README.mISDN + - info on the Modular ISDN subsystem (mISDN). From 554f200e22a13e19bd407d0037e41be0ec8a0a2e Mon Sep 17 00:00:00 2001 From: Tilman Schmidt Date: Thu, 23 Apr 2009 02:24:21 +0000 Subject: [PATCH 15/32] Documentation/isdn/INTERFACE.CAPI isdn: document Kernel CAPI driver interface Create a file Documentation/isdn/INTERFACE.CAPI describing the interface between the kernel CAPI subsystem and ISDN device drivers, analogous to the existing Documentation/isdn/INTERFACE for the old isdn4linux subsystem. Also add kerneldoc comments to the exported functions in drivers/isdn/capi/kcapi.c. Impact: Documentation Signed-off-by: Tilman Schmidt Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- Documentation/isdn/00-INDEX | 2 + Documentation/isdn/INTERFACE.CAPI | 207 ++++++++++++++++++++++++++++++ drivers/isdn/capi/kcapi.c | 171 ++++++++++++++++++++++++ 3 files changed, 380 insertions(+) create mode 100644 Documentation/isdn/INTERFACE.CAPI diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX index d9660ee5ab27..5a2d69989a8c 100644 --- a/Documentation/isdn/00-INDEX +++ b/Documentation/isdn/00-INDEX @@ -8,6 +8,8 @@ INTERFACE - description of isdn4linux Link Level and Hardware Level interfaces. INTERFACE.fax - description of the fax subinterface of isdn4linux. +INTERFACE.CAPI + - description of kernel CAPI Link Level to Hardware Level interface. README - general info on what you need and what to do for Linux ISDN. README.FAQ diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI new file mode 100644 index 000000000000..8947ffcda16e --- /dev/null +++ b/Documentation/isdn/INTERFACE.CAPI @@ -0,0 +1,207 @@ +Kernel CAPI Interface to Hardware Drivers +----------------------------------------- + +1. Overview + +Kernel CAPI operates as a dispatching layer between CAPI applications and CAPI +hardware drivers. Hardware drivers register ISDN devices (controllers, in CAPI +lingo) with Kernel CAPI to indicate their readiness to provide their service +to CAPI applications. 
CAPI applications also register with Kernel CAPI, +requesting association with a CAPI device. Kernel CAPI then dispatches the +application registration to an available device, forwarding it to the +corresponding hardware driver. Kernel CAPI then forwards CAPI messages in both +directions between the application and the hardware driver. + + +2. Driver and Device Registration + +CAPI drivers optionally register themselves with Kernel CAPI by calling the +Kernel CAPI function register_capi_driver() with a pointer to a struct +capi_driver. This structure must be filled with the name and revision of the +driver, and optionally a pointer to a callback function, add_card(). The +registration can be revoked by calling the function unregister_capi_driver() +with a pointer to the same struct capi_driver. + +CAPI drivers must register each of the ISDN devices they control with Kernel +CAPI by calling the Kernel CAPI function attach_capi_ctr() with a pointer to a +struct capi_ctr before they can be used. This structure must be filled with +the names of the driver and controller, and a number of callback function +pointers which are subsequently used by Kernel CAPI for communicating with the +driver. The registration can be revoked by calling the function +detach_capi_ctr() with a pointer to the same struct capi_ctr. + +Before the device can be actually used, the driver must fill in the device +information fields 'manu', 'version', 'profile' and 'serial' in the capi_ctr +structure of the device, and signal its readiness by calling capi_ctr_ready(). +From then on, Kernel CAPI may call the registered callback functions for the +device. + +If the device becomes unusable for any reason (shutdown, disconnect ...), the +driver has to call capi_ctr_reseted(). This will prevent further calls to the +callback functions by Kernel CAPI. + + +3. Application Registration and Communication + +Kernel CAPI forwards registration requests from applications (calls to CAPI +operation CAPI_REGISTER) to an appropriate hardware driver by calling its +register_appl() callback function. A unique Application ID (ApplID, u16) is +allocated by Kernel CAPI and passed to register_appl() along with the +parameter structure provided by the application. This is analogous to the +open() operation on regular files or character devices. + +After a successful return from register_appl(), CAPI messages from the +application may be passed to the driver for the device via calls to the +send_message() callback function. The CAPI message to send is stored in the +data portion of a skb. Conversely, the driver may call Kernel CAPI's +capi_ctr_handle_message() function to pass a received CAPI message to Kernel +CAPI for forwarding to an application, specifying its ApplID. + +Format and semantics of CAPI messages are specified in the CAPI 2.0 standard. + +Deregistration requests (CAPI operation CAPI_RELEASE) from applications are +forwarded as calls to the release_appl() callback function, passing the same +ApplID as with register_appl(). After return from release_appl(), no CAPI +messages for that application may be passed to or from the device anymore. + + +4. Data Structures + +4.1 struct capi_driver + +This structure describes a Kernel CAPI driver itself. 
It is used in the +register_capi_driver() and unregister_capi_driver() functions, and contains +the following non-private fields, all to be set by the driver before calling +register_capi_driver(): + +char name[32] + the name of the driver, as a zero terminated ASCII string +char revision[32] + the revision number of the driver, as a zero terminated ASCII string +int (*add_card)(struct capi_driver *driver, capicardparams *data) + a callback function pointer (may be NULL) + + +4.2 struct capi_ctr + +This structure describes an ISDN device (controller) handled by a Kernel CAPI +driver. After registration via the attach_capi_ctr() function it is passed to +all controller specific lower layer interface and callback functions to +identify the controller to operate on. + +It contains the following non-private fields: + +- to be set by the driver before calling attach_capi_ctr(): + +struct module *owner + pointer to the driver module owning the device + +void *driverdata + an opaque pointer to driver specific data, not touched by Kernel CAPI + +char name[32] + the name of the controller, as a zero terminated ASCII string + +char *driver_name + the name of the driver, as a zero terminated ASCII string + +int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata) + (optional) pointer to a callback function for sending firmware and + configuration data to the device + +void (*reset_ctr)(struct capi_ctr *ctrlr) + pointer to a callback function for performing a reset on the device, + releasing all registered applications + +void (*register_appl)(struct capi_ctr *ctrlr, u16 applid, + capi_register_params *rparam) +void (*release_appl)(struct capi_ctr *ctrlr, u16 applid) + pointers to callback functions for registration and deregistration of + applications with the device + +u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb) + pointer to a callback function for sending a CAPI message to the + device + +char *(*procinfo)(struct capi_ctr *ctrlr) + pointer to a callback function returning the entry for the device in + the CAPI controller info table, /proc/capi/controller + +read_proc_t *ctr_read_proc + pointer to the read_proc callback function for the device's proc file + system entry, /proc/capi/controllers/; will be called with a + pointer to the device's capi_ctr structure as the last (data) argument + +- to be filled in before calling capi_ctr_ready(): + +u8 manu[CAPI_MANUFACTURER_LEN] + value to return for CAPI_GET_MANUFACTURER + +capi_version version + value to return for CAPI_GET_VERSION + +capi_profile profile + value to return for CAPI_GET_PROFILE + +u8 serial[CAPI_SERIAL_LEN] + value to return for CAPI_GET_SERIAL + + +5. Lower Layer Interface Functions + +(declared in ) + +void register_capi_driver(struct capi_driver *drvr) +void unregister_capi_driver(struct capi_driver *drvr) + register/unregister a driver with Kernel CAPI + +int attach_capi_ctr(struct capi_ctr *ctrlr) +int detach_capi_ctr(struct capi_ctr *ctrlr) + register/unregister a device (controller) with Kernel CAPI + +void capi_ctr_ready(struct capi_ctr *ctrlr) +void capi_ctr_reseted(struct capi_ctr *ctrlr) + signal controller ready/not ready + +void capi_ctr_suspend_output(struct capi_ctr *ctrlr) +void capi_ctr_resume_output(struct capi_ctr *ctrlr) + signal suspend/resume + +void capi_ctr_handle_message(struct capi_ctr * ctrlr, u16 applid, + struct sk_buff *skb) + pass a received CAPI message to Kernel CAPI + for forwarding to the specified application + + +6. 
Helper Functions and Macros + +Library functions (from ): + +void capilib_new_ncci(struct list_head *head, u16 applid, + u32 ncci, u32 winsize) +void capilib_free_ncci(struct list_head *head, u16 applid, u32 ncci) +void capilib_release_appl(struct list_head *head, u16 applid) +void capilib_release(struct list_head *head) +void capilib_data_b3_conf(struct list_head *head, u16 applid, + u32 ncci, u16 msgid) +u16 capilib_data_b3_req(struct list_head *head, u16 applid, + u32 ncci, u16 msgid) + + +Macros to extract/set element values from/in a CAPI message header +(from ): + +Get Macro Set Macro Element (Type) + +CAPIMSG_LEN(m) CAPIMSG_SETLEN(m, len) Total Length (u16) +CAPIMSG_APPID(m) CAPIMSG_SETAPPID(m, applid) ApplID (u16) +CAPIMSG_COMMAND(m) CAPIMSG_SETCOMMAND(m,cmd) Command (u8) +CAPIMSG_SUBCOMMAND(m) CAPIMSG_SETSUBCOMMAND(m, cmd) Subcommand (u8) +CAPIMSG_CMD(m) - Command*256 + + Subcommand (u16) +CAPIMSG_MSGID(m) CAPIMSG_SETMSGID(m, msgid) Message Number (u16) + +CAPIMSG_CONTROL(m) CAPIMSG_SETCONTROL(m, contr) Controller/PLCI/NCCI + (u32) +CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16) + diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 5360c4fd4739..f33170368cd1 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -270,6 +270,15 @@ static void recv_handler(struct work_struct *work) mutex_unlock(&ap->recv_mtx); } +/** + * capi_ctr_handle_message() - handle incoming CAPI message + * @card: controller descriptor structure. + * @appl: application ID. + * @skb: message. + * + * Called by hardware driver to pass a CAPI message to the application. + */ + void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb) { struct capi20_appl *ap; @@ -348,6 +357,13 @@ error: EXPORT_SYMBOL(capi_ctr_handle_message); +/** + * capi_ctr_ready() - signal CAPI controller ready + * @card: controller descriptor structure. + * + * Called by hardware driver to signal that the controller is up and running. + */ + void capi_ctr_ready(struct capi_ctr * card) { card->cardstate = CARD_RUNNING; @@ -360,6 +376,14 @@ void capi_ctr_ready(struct capi_ctr * card) EXPORT_SYMBOL(capi_ctr_ready); +/** + * capi_ctr_reseted() - signal CAPI controller reset + * @card: controller descriptor structure. + * + * Called by hardware driver to signal that the controller is down and + * unavailable for use. + */ + void capi_ctr_reseted(struct capi_ctr * card) { u16 appl; @@ -391,6 +415,13 @@ void capi_ctr_reseted(struct capi_ctr * card) EXPORT_SYMBOL(capi_ctr_reseted); +/** + * capi_ctr_suspend_output() - suspend controller + * @card: controller descriptor structure. + * + * Called by hardware driver to stop data flow. + */ + void capi_ctr_suspend_output(struct capi_ctr *card) { if (!card->blocked) { @@ -401,6 +432,13 @@ void capi_ctr_suspend_output(struct capi_ctr *card) EXPORT_SYMBOL(capi_ctr_suspend_output); +/** + * capi_ctr_resume_output() - resume controller + * @card: controller descriptor structure. + * + * Called by hardware driver to resume data flow. + */ + void capi_ctr_resume_output(struct capi_ctr *card) { if (card->blocked) { @@ -413,6 +451,14 @@ EXPORT_SYMBOL(capi_ctr_resume_output); /* ------------------------------------------------------------- */ +/** + * attach_capi_ctr() - register CAPI controller + * @card: controller descriptor structure. + * + * Called by hardware driver to register a controller with the CAPI subsystem. 
+ * Return value: 0 on success, error code < 0 on error + */ + int attach_capi_ctr(struct capi_ctr *card) { @@ -459,6 +505,15 @@ attach_capi_ctr(struct capi_ctr *card) EXPORT_SYMBOL(attach_capi_ctr); +/** + * detach_capi_ctr() - unregister CAPI controller + * @card: controller descriptor structure. + * + * Called by hardware driver to remove the registration of a controller + * with the CAPI subsystem. + * Return value: 0 on success, error code < 0 on error + */ + int detach_capi_ctr(struct capi_ctr *card) { if (card->cardstate != CARD_DETECTED) @@ -479,6 +534,13 @@ int detach_capi_ctr(struct capi_ctr *card) EXPORT_SYMBOL(detach_capi_ctr); +/** + * register_capi_driver() - register CAPI driver + * @driver: driver descriptor structure. + * + * Called by hardware driver to register itself with the CAPI subsystem. + */ + void register_capi_driver(struct capi_driver *driver) { unsigned long flags; @@ -490,6 +552,13 @@ void register_capi_driver(struct capi_driver *driver) EXPORT_SYMBOL(register_capi_driver); +/** + * unregister_capi_driver() - unregister CAPI driver + * @driver: driver descriptor structure. + * + * Called by hardware driver to unregister itself from the CAPI subsystem. + */ + void unregister_capi_driver(struct capi_driver *driver) { unsigned long flags; @@ -505,6 +574,13 @@ EXPORT_SYMBOL(unregister_capi_driver); /* -------- CAPI2.0 Interface ---------------------------------- */ /* ------------------------------------------------------------- */ +/** + * capi20_isinstalled() - CAPI 2.0 operation CAPI_INSTALLED + * + * Return value: CAPI result code (CAPI_NOERROR if at least one ISDN controller + * is ready for use, CAPI_REGNOTINSTALLED otherwise) + */ + u16 capi20_isinstalled(void) { int i; @@ -517,6 +593,18 @@ u16 capi20_isinstalled(void) EXPORT_SYMBOL(capi20_isinstalled); +/** + * capi20_register() - CAPI 2.0 operation CAPI_REGISTER + * @ap: CAPI application descriptor structure. + * + * Register an application's presence with CAPI. + * A unique application ID is assigned and stored in @ap->applid. + * After this function returns successfully, the message receive + * callback function @ap->recv_message() may be called at any time + * until capi20_release() has been called for the same @ap. + * Return value: CAPI result code + */ + u16 capi20_register(struct capi20_appl *ap) { int i; @@ -571,6 +659,16 @@ u16 capi20_register(struct capi20_appl *ap) EXPORT_SYMBOL(capi20_register); +/** + * capi20_release() - CAPI 2.0 operation CAPI_RELEASE + * @ap: CAPI application descriptor structure. + * + * Terminate an application's registration with CAPI. + * After this function returns successfully, the message receive + * callback function @ap->recv_message() will no longer be called. + * Return value: CAPI result code + */ + u16 capi20_release(struct capi20_appl *ap) { int i; @@ -603,6 +701,15 @@ u16 capi20_release(struct capi20_appl *ap) EXPORT_SYMBOL(capi20_release); +/** + * capi20_put_message() - CAPI 2.0 operation CAPI_PUT_MESSAGE + * @ap: CAPI application descriptor structure. + * @skb: CAPI message. + * + * Transfer a single message to CAPI. + * Return value: CAPI result code + */ + u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) { struct capi_ctr *card; @@ -668,6 +775,16 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) EXPORT_SYMBOL(capi20_put_message); +/** + * capi20_get_manufacturer() - CAPI 2.0 operation CAPI_GET_MANUFACTURER + * @contr: controller number. + * @buf: result buffer (64 bytes). 
+ * + * Retrieve information about the manufacturer of the specified ISDN controller + * or (for @contr == 0) the driver itself. + * Return value: CAPI result code + */ + u16 capi20_get_manufacturer(u32 contr, u8 *buf) { struct capi_ctr *card; @@ -685,6 +802,16 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) EXPORT_SYMBOL(capi20_get_manufacturer); +/** + * capi20_get_version() - CAPI 2.0 operation CAPI_GET_VERSION + * @contr: controller number. + * @verp: result structure. + * + * Retrieve version information for the specified ISDN controller + * or (for @contr == 0) the driver itself. + * Return value: CAPI result code + */ + u16 capi20_get_version(u32 contr, struct capi_version *verp) { struct capi_ctr *card; @@ -703,6 +830,16 @@ u16 capi20_get_version(u32 contr, struct capi_version *verp) EXPORT_SYMBOL(capi20_get_version); +/** + * capi20_get_serial() - CAPI 2.0 operation CAPI_GET_SERIAL_NUMBER + * @contr: controller number. + * @serial: result buffer (8 bytes). + * + * Retrieve the serial number of the specified ISDN controller + * or (for @contr == 0) the driver itself. + * Return value: CAPI result code + */ + u16 capi20_get_serial(u32 contr, u8 *serial) { struct capi_ctr *card; @@ -721,6 +858,16 @@ u16 capi20_get_serial(u32 contr, u8 *serial) EXPORT_SYMBOL(capi20_get_serial); +/** + * capi20_get_profile() - CAPI 2.0 operation CAPI_GET_PROFILE + * @contr: controller number. + * @profp: result structure. + * + * Retrieve capability information for the specified ISDN controller + * or (for @contr == 0) the number of installed controllers. + * Return value: CAPI result code + */ + u16 capi20_get_profile(u32 contr, struct capi_profile *profp) { struct capi_ctr *card; @@ -903,6 +1050,15 @@ static int old_capi_manufacturer(unsigned int cmd, void __user *data) } #endif +/** + * capi20_manufacturer() - CAPI 2.0 operation CAPI_MANUFACTURER + * @cmd: command. + * @data: parameter. + * + * Perform manufacturer specific command. + * Return value: CAPI result code + */ + int capi20_manufacturer(unsigned int cmd, void __user *data) { struct capi_ctr *card; @@ -981,6 +1137,21 @@ int capi20_manufacturer(unsigned int cmd, void __user *data) EXPORT_SYMBOL(capi20_manufacturer); /* temporary hack */ + +/** + * capi20_set_callback() - set CAPI application notification callback function + * @ap: CAPI application descriptor structure. + * @callback: callback function (NULL to remove). + * + * If not NULL, the callback function will be called to notify the + * application of the addition or removal of a controller. + * The first argument (cmd) will tell whether the controller was added + * (KCI_CONTRUP) or removed (KCI_CONTRDOWN). + * The second argument (contr) will be the controller number. + * For cmd==KCI_CONTRUP the third argument (data) will be a pointer to the + * new controller's capability profile structure. + */ + void capi20_set_callback(struct capi20_appl *ap, void (*callback) (unsigned int cmd, __u32 contr, void *data)) { From 2296e5a0136f7ba64c99f3a48a55a687aa9abcc8 Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Thu, 23 Apr 2009 02:24:21 +0000 Subject: [PATCH 16/32] Add reference to CAPI 2.0 standard Move the entry about CAPI 2.0 to the beginning and add a URL. Incorporate changes suggested by Randy Dunlap, thanks for proofreading. Signed-off-by: Karsten Keil Signed-off-by: David S. 
Miller --- Documentation/isdn/INTERFACE.CAPI | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI index 8947ffcda16e..786d619b36e5 100644 --- a/Documentation/isdn/INTERFACE.CAPI +++ b/Documentation/isdn/INTERFACE.CAPI @@ -3,6 +3,11 @@ Kernel CAPI Interface to Hardware Drivers 1. Overview +From the CAPI 2.0 specification: +COMMON-ISDN-API (CAPI) is an application programming interface standard used +to access ISDN equipment connected to basic rate interfaces (BRI) and primary +rate interfaces (PRI). + Kernel CAPI operates as a dispatching layer between CAPI applications and CAPI hardware drivers. Hardware drivers register ISDN devices (controllers, in CAPI lingo) with Kernel CAPI to indicate their readiness to provide their service @@ -12,6 +17,9 @@ application registration to an available device, forwarding it to the corresponding hardware driver. Kernel CAPI then forwards CAPI messages in both directions between the application and the hardware driver. +Format and semantics of CAPI messages are specified in the CAPI 2.0 standard. +This standard is freely available from http://www.capi.org. + 2. Driver and Device Registration @@ -53,12 +61,10 @@ open() operation on regular files or character devices. After a successful return from register_appl(), CAPI messages from the application may be passed to the driver for the device via calls to the send_message() callback function. The CAPI message to send is stored in the -data portion of a skb. Conversely, the driver may call Kernel CAPI's +data portion of an skb. Conversely, the driver may call Kernel CAPI's capi_ctr_handle_message() function to pass a received CAPI message to Kernel CAPI for forwarding to an application, specifying its ApplID. -Format and semantics of CAPI messages are specified in the CAPI 2.0 standard. - Deregistration requests (CAPI operation CAPI_RELEASE) from applications are forwarded as calls to the release_appl() callback function, passing the same ApplID as with register_appl(). After return from release_appl(), no CAPI @@ -75,9 +81,9 @@ the following non-private fields, all to be set by the driver before calling register_capi_driver(): char name[32] - the name of the driver, as a zero terminated ASCII string + the name of the driver, as a zero-terminated ASCII string char revision[32] - the revision number of the driver, as a zero terminated ASCII string + the revision number of the driver, as a zero-terminated ASCII string int (*add_card)(struct capi_driver *driver, capicardparams *data) a callback function pointer (may be NULL) @@ -100,10 +106,10 @@ void *driverdata an opaque pointer to driver specific data, not touched by Kernel CAPI char name[32] - the name of the controller, as a zero terminated ASCII string + the name of the controller, as a zero-terminated ASCII string char *driver_name - the name of the driver, as a zero terminated ASCII string + the name of the driver, as a zero-terminated ASCII string int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata) (optional) pointer to a callback function for sending firmware and From c9503e0fe052020e0294cd07d0ecd982eb7c9177 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 27 Apr 2009 05:42:24 -0700 Subject: [PATCH 17/32] ipv4: Limit size of route cache hash table Right now we have no upper limit on the size of the route cache hash table. 
On a 128GB POWER6 box it ends up as 32MB: IP route cache hash table entries: 4194304 (order: 9, 33554432 bytes) It would be nice to cap this for memory consumption reasons, but a massive hashtable also causes a significant spike when measuring OS jitter. With a 32MB hashtable and 4 million entries, rt_worker_func is taking 5 ms to complete. On another system with more memory it's taking 14 ms. Even though rt_worker_func does call cond_sched() to limit its impact, in an HPC environment we want to keep all sources of OS jitter to a minimum. With the patch applied we limit the number of entries to 512k which can still be overriden by using the rt_entries boot option: IP route cache hash table entries: 524288 (order: 6, 4194304 bytes) With this patch rt_worker_func now takes 0.460 ms on the same system. Signed-off-by: Anton Blanchard Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c40debe51b38..c4c60e9f068a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3397,7 +3397,7 @@ int __init ip_rt_init(void) 0, &rt_hash_log, &rt_hash_mask, - 0); + rhash_entries ? 0 : 512 * 1024); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); From 37b607c5ac3b7c92a6a3624bb29f1cdcdcf7044a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Apr 2009 05:45:54 -0700 Subject: [PATCH 18/32] net: Fix typo in net_device_ops description. Signed-off-by: Mike Rapoport Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 453be9a674c0..5a96a1a406e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -500,7 +500,7 @@ struct netdev_queue { * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address - * needs to be changed. If not this interface is not defined, the + * needs to be changed. If this interface is not defined, the * mac address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); From bf368e4e70cd4e0f880923c44e95a4273d725ab4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2009 02:24:21 -0700 Subject: [PATCH 19/32] net: Avoid extra wakeups of threads blocked in wait_for_packet() In 2.6.25 we added UDP mem accounting. This unfortunatly added a penalty when a frame is transmitted, since we have at TX completion time to call sock_wfree() to perform necessary memory accounting. This calls sock_def_write_space() and utimately scheduler if any thread is waiting on the socket. Thread(s) waiting for an incoming frame was scheduled, then had to sleep again as event was meaningless. (All threads waiting on a socket are using same sk_sleep anchor) This adds lot of extra wakeups and increases latencies, as noted by Christoph Lameter, and slows down softirq handler. Reference : http://marc.info/?l=linux-netdev&m=124060437012283&w=2 Fortunatly, Davide Libenzi recently added concept of keyed wakeups into kernel, and particularly for sockets (see commit 37e5540b3c9d838eb20f2ca8ea2eb8072271e403 epoll keyed wakeups: make sockets use keyed wakeups) Davide goal was to optimize epoll, but this new wakeup infrastructure can help non epoll users as well, if they care to setup an appropriate handler. 
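Concretely, such a handler is a wait-queue callback that inspects the wakeup key and bails out early for events the sleeper does not care about, so the scheduler is never involved for them. A generic, hedged sketch of the pattern follows (identifiers are illustrative, not taken from the patch; DEFINE_WAIT_FUNC() is the helper added below):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/poll.h>

/* Only wake up for input or error events; ignore keyed wakeups for output. */
static int input_wake_function(wait_queue_t *wait, unsigned mode, int sync,
			       void *key)
{
	unsigned long events = (unsigned long)key;

	if (events && !(events & (POLLIN | POLLERR)))
		return 0;               /* uninteresting event: stay asleep */
	return autoremove_wake_function(wait, mode, sync, key);
}

static long wait_for_input(wait_queue_head_t *wq, int *ready, long timeo)
{
	DEFINE_WAIT_FUNC(wait, input_wake_function);

	prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
	if (!*ready)                    /* re-check the condition after queueing */
		timeo = schedule_timeout(timeo);
	finish_wait(wq, &wait);
	return timeo;
}

The wait_for_packet() change below is exactly this substitution: DEFINE_WAIT() becomes DEFINE_WAIT_FUNC(wait, receiver_wake_function), and the new callback filters on POLLIN | POLLERR.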
This patch introduces new DEFINE_WAIT_FUNC() helper and uses it in wait_for_packet(), so that only relevant event can wakeup a thread blocked in this function. Trace of function calls from bnx2 TX completion bnx2_poll_work() is : __kfree_skb() skb_release_head_state() sock_wfree() sock_def_write_space() __wake_up_sync_key() __wake_up_common() receiver_wake_function() : Stops here since thread is waiting for an INPUT Reported-by: Christoph Lameter Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/wait.h | 6 ++++-- net/core/datagram.c | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d631c17eaee..bc024632f365 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -440,13 +440,15 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -#define DEFINE_WAIT(name) \ +#define DEFINE_WAIT_FUNC(name, function) \ wait_queue_t name = { \ .private = current, \ - .func = autoremove_wake_function, \ + .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } +#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) + #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378d..b01a76abe1d2 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} /* * Wait for a packet.. */ static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, receiver_wake_function); prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); From f3784d834c71689336fa272df420b45345cb6b84 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 23 Apr 2009 14:50:54 +0300 Subject: [PATCH 20/32] Bluetooth: Ensure that HCI sysfs add/del is preempt safe Use a different work_struct variables for add_conn() and del_conn() and use single work queue instead of two for adding and deleting connections. 
It eliminates the following error on a preemptible kernel: [ 204.358032] Unable to handle kernel NULL pointer dereference at virtual address 0000000c [ 204.370697] pgd = c0004000 [ 204.373443] [0000000c] *pgd=00000000 [ 204.378601] Internal error: Oops: 17 [#1] PREEMPT [ 204.383361] Modules linked in: vfat fat rfcomm sco l2cap sd_mod scsi_mod iphb pvr2d drm omaplfb ps [ 204.438537] CPU: 0 Not tainted (2.6.28-maemo2 #1) [ 204.443664] PC is at klist_put+0x2c/0xb4 [ 204.447601] LR is at klist_put+0x18/0xb4 [ 204.451568] pc : [] lr : [] psr: a0000113 [ 204.451568] sp : cf1b3f10 ip : cf1b3f10 fp : cf1b3f2c [ 204.463104] r10: 00000000 r9 : 00000000 r8 : bf08029c [ 204.468353] r7 : c7869200 r6 : cfbe2690 r5 : c78692c8 r4 : 00000001 [ 204.474945] r3 : 00000001 r2 : cf1b2000 r1 : 00000001 r0 : 00000000 [ 204.481506] Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment kernel [ 204.488861] Control: 10c5387d Table: 887fc018 DAC: 00000017 [ 204.494628] Process btdelconn (pid: 515, stack limit = 0xcf1b22e0) Signed-off-by: Roger Quadros Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 ++- net/bluetooth/hci_sysfs.c | 37 ++++++++++++++------------------ 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 01f9316b4c23..1224bba24bdd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -180,7 +180,8 @@ struct hci_conn { struct timer_list disc_timer; struct timer_list idle_timer; - struct work_struct work; + struct work_struct work_add; + struct work_struct work_del; struct device dev; diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index ed82796d4a0f..b7c51082ddeb 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -9,8 +9,7 @@ struct class *bt_class = NULL; EXPORT_SYMBOL_GPL(bt_class); -static struct workqueue_struct *btaddconn; -static struct workqueue_struct *btdelconn; +static struct workqueue_struct *bluetooth; static inline char *link_typetostr(int type) { @@ -88,9 +87,10 @@ static struct device_type bt_link = { static void add_conn(struct work_struct *work) { - struct hci_conn *conn = container_of(work, struct hci_conn, work); + struct hci_conn *conn = container_of(work, struct hci_conn, work_add); - flush_workqueue(btdelconn); + /* ensure previous add/del is complete */ + flush_workqueue(bluetooth); if (device_add(&conn->dev) < 0) { BT_ERR("Failed to register connection device"); @@ -114,9 +114,9 @@ void hci_conn_add_sysfs(struct hci_conn *conn) device_initialize(&conn->dev); - INIT_WORK(&conn->work, add_conn); + INIT_WORK(&conn->work_add, add_conn); - queue_work(btaddconn, &conn->work); + queue_work(bluetooth, &conn->work_add); } /* @@ -131,9 +131,12 @@ static int __match_tty(struct device *dev, void *data) static void del_conn(struct work_struct *work) { - struct hci_conn *conn = container_of(work, struct hci_conn, work); + struct hci_conn *conn = container_of(work, struct hci_conn, work_del); struct hci_dev *hdev = conn->hdev; + /* ensure previous add/del is complete */ + flush_workqueue(bluetooth); + while (1) { struct device *dev; @@ -156,9 +159,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn) if (!device_is_registered(&conn->dev)) return; - INIT_WORK(&conn->work, del_conn); + INIT_WORK(&conn->work_del, del_conn); - queue_work(btdelconn, &conn->work); + queue_work(bluetooth, &conn->work_del); } static inline char *host_typetostr(int type) @@ -435,20 +438,13 @@ void hci_unregister_sysfs(struct hci_dev *hdev) 
int __init bt_sysfs_init(void) { - btaddconn = create_singlethread_workqueue("btaddconn"); - if (!btaddconn) + bluetooth = create_singlethread_workqueue("bluetooth"); + if (!bluetooth) return -ENOMEM; - btdelconn = create_singlethread_workqueue("btdelconn"); - if (!btdelconn) { - destroy_workqueue(btaddconn); - return -ENOMEM; - } - bt_class = class_create(THIS_MODULE, "bluetooth"); if (IS_ERR(bt_class)) { - destroy_workqueue(btdelconn); - destroy_workqueue(btaddconn); + destroy_workqueue(bluetooth); return PTR_ERR(bt_class); } @@ -457,8 +453,7 @@ int __init bt_sysfs_init(void) void bt_sysfs_cleanup(void) { - destroy_workqueue(btaddconn); - destroy_workqueue(btdelconn); + destroy_workqueue(bluetooth); class_destroy(bt_class); } From 052b30b0a8eec8db5b18ad49effdf2a9ba4c1e1a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 26 Apr 2009 20:01:22 +0200 Subject: [PATCH 21/32] Bluetooth: Add different pairing timeout for Legacy Pairing The Bluetooth stack uses reference counting for all established ACL links, and if no user (L2CAP connection) is present, the link will be terminated to save power. The problematic part is dedicated pairing when using Legacy Pairing (Bluetooth 2.0 and before). At that point no user is present and pairing attempts will be disconnected within 10 seconds or less. In previous kernel versions this was not a problem since the disconnect timeout wasn't triggered on incoming connections for the first time. However this caused issues with broken host stacks that kept the connections around after dedicated pairing. When the support for Simple Pairing got added, the link establishment procedure needed to be changed and now causes issues when using Legacy Pairing. When using Simple Pairing it is possible to do proper reference counting of ACL link users. With Legacy Pairing this is not possible since the specification is unclear in some areas and too many broken Bluetooth devices have already been deployed. So instead of trying to deal with all the broken devices, a special pairing timeout will be introduced that increases the timeout to 60 seconds when pairing is triggered. If a broken device now puts the stack into an unforeseen state, the worst that happens is that the disconnect timeout triggers after 120 seconds instead of 4 seconds. This allows successful pairings with legacy and broken devices now.
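For illustration, the timeout selection described above boils down to something like the following sketch (the helper name my_disc_timeout() is made up; conn->out, conn->disc_timeout, HCI_DISCONN_TIMEOUT, HCI_PAIRING_TIMEOUT and msecs_to_jiffies() are the fields, constants and helpers used by the patch below):

/* Pick the idle-disconnect delay for an ACL link. */
static unsigned long my_disc_timeout(struct hci_conn *conn)
{
	/* disc_timeout normally holds HCI_DISCONN_TIMEOUT (2 seconds) and
	 * is raised to HCI_PAIRING_TIMEOUT (60 seconds) while a PIN code
	 * request is outstanding, i.e. while legacy pairing has no L2CAP
	 * user holding the link. */
	unsigned long timeo = msecs_to_jiffies(conn->disc_timeout);

	/* Incoming links get twice the budget, hence 4 vs. 120 seconds. */
	if (!conn->out)
		timeo *= 2;
	return timeo;
}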
Based on a report by Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 5 +++-- net/bluetooth/hci_conn.c | 1 + net/bluetooth/hci_event.c | 36 +++++++++++++++++++++++++++++++- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index f69f015bbcc0..ed3aea1605e8 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -101,6 +101,7 @@ enum { /* HCI timeouts */ #define HCI_CONNECT_TIMEOUT (40000) /* 40 seconds */ #define HCI_DISCONN_TIMEOUT (2000) /* 2 seconds */ +#define HCI_PAIRING_TIMEOUT (60000) /* 60 seconds */ #define HCI_IDLE_TIMEOUT (6000) /* 6 seconds */ #define HCI_INIT_TIMEOUT (10000) /* 10 seconds */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1224bba24bdd..be5bd713d2c9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -171,6 +171,7 @@ struct hci_conn { __u8 auth_type; __u8 sec_level; __u8 power_save; + __u16 disc_timeout; unsigned long pend; unsigned int sent; @@ -349,9 +350,9 @@ static inline void hci_conn_put(struct hci_conn *conn) if (conn->type == ACL_LINK) { del_timer(&conn->idle_timer); if (conn->state == BT_CONNECTED) { - timeo = msecs_to_jiffies(HCI_DISCONN_TIMEOUT); + timeo = msecs_to_jiffies(conn->disc_timeout); if (!conn->out) - timeo *= 5; + timeo *= 2; } else timeo = msecs_to_jiffies(10); } else diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 1181db08d9de..75ebbe2221a3 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -215,6 +215,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) conn->state = BT_OPEN; conn->power_save = 1; + conn->disc_timeout = HCI_DISCONN_TIMEOUT; switch (type) { case ACL_LINK: diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 15f40ea8d544..4e7cb88e5da9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -883,6 +883,7 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; @@ -1063,9 +1064,14 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s hci_proto_connect_cfm(conn, ev->status); hci_conn_put(conn); } - } else + } else { hci_auth_cfm(conn, ev->status); + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { if (!ev->status) { struct hci_cp_set_conn_encrypt cp; @@ -1479,7 +1485,21 @@ static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct hci_ev_pin_code_req *ev = (void *) skb->data; + struct hci_conn *conn; + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + hci_conn_put(conn); + } + + hci_dev_unlock(hdev); } static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) @@ -1489,7 +1509,21 @@ static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct hci_ev_link_key_notify *ev = (void 
*) skb->data; + struct hci_conn *conn; + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + hci_conn_hold(conn); + conn->disc_timeout = HCI_DISCONN_TIMEOUT; + hci_conn_put(conn); + } + + hci_dev_unlock(hdev); } static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) From 3fdca1e1370ffe89980927cdef0583bebcd8caaf Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 28 Apr 2009 09:04:55 -0700 Subject: [PATCH 22/32] Bluetooth: Fix connection establishment with low security requirement The Bluetooth 2.1 specification introduced four different security modes that can be mapped using Legacy Pairing and Simple Pairing. With the usage of Simple Pairing it is required that all connections (except the ones for SDP) are encrypted. So even the low security requirement mandates an encrypted connection when using Simple Pairing. When using Legacy Pairing (for Bluetooth 2.0 devices and older) this is not required since it causes interoperability issues. To support this properly, the low security requirement translates into different host controller transactions depending on whether Simple Pairing is supported or not. However, in the case of Simple Pairing the command to switch on encryption after a successful authentication is not triggered for the low security mode. This patch fixes this and actually makes the logic to differentiate between Simple Pairing and Legacy Pairing a lot simpler. Based on a report by Ville Tervo Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 75ebbe2221a3..375f4b4f7f79 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -425,12 +425,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (sec_level == BT_SECURITY_SDP) return 1; - if (sec_level == BT_SECURITY_LOW) { - if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) - return hci_conn_auth(conn, sec_level, auth_type); - else - return 1; - } + if (sec_level == BT_SECURITY_LOW && + (!conn->ssp_mode || !conn->hdev->ssp_mode)) + return 1; if (conn->link_mode & HCI_LM_ENCRYPT) return hci_conn_auth(conn, sec_level, auth_type); From 6269b731560d69c5eaa929909891edec39496d71 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 22 Apr 2009 15:11:05 +1000 Subject: [PATCH 23/32] wireless: remove unneeded EXPORT_SYMBOL that tickles a powerpc compiler bug: drivers/net/wireless/iwlwifi/iwl3945-base.c:1415: error: __ksymtab_iwl3945_rx_queue_reset causes a section type conflict I am pretty sure that this is a compiler bug, so not to worry. However, as far as I can see, iwl-3945.o (the only user) and iwl3945-base.o are always linked into the same module, so the EXPORT_SYMBOL (which causes the problem) should not be needed. Correct? Signed-off-by: Stephen Rothwell Signed-off-by: John W.
Linville --- drivers/net/wireless/iwlwifi/iwl3945-base.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 617c4235d971..70a00c8ee42e 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -1694,7 +1694,6 @@ void iwl3945_rx_queue_reset(struct iwl_priv *priv, struct iwl_rx_queue *rxq) rxq->free_count = 0; spin_unlock_irqrestore(&rxq->lock, flags); } -EXPORT_SYMBOL(iwl3945_rx_queue_reset); /* * this should be called while priv->lock is locked From e805e4d0b53506dff4255a2792483f094e7fcd2c Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 22 Apr 2009 10:59:37 +0300 Subject: [PATCH 24/32] rndis_wlan: fix initialization order for workqueue&workers rndis_wext_link_change() might be called from rndis_command() at initialization stage and priv->workqueue/priv->work have not been initialized yet. This causes invalid opcode at rndis_wext_bind on some brands of bcm4320. Fix by initializing workqueue/workers in rndis_wext_bind() before rndis_command is used. This bug has existed since 2.6.25, reported at: http://bugzilla.kernel.org/show_bug.cgi?id=12794 Signed-off-by: Jussi Kivilinna Signed-off-by: John W. Linville --- drivers/net/wireless/rndis_wlan.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index db91db776508..bebf735cd4bd 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -2558,6 +2558,11 @@ static int rndis_wext_bind(struct usbnet *usbdev, struct usb_interface *intf) mutex_init(&priv->command_lock); spin_lock_init(&priv->stats_lock); + /* because rndis_command() sleeps we need to use workqueue */ + priv->workqueue = create_singlethread_workqueue("rndis_wlan"); + INIT_WORK(&priv->work, rndis_wext_worker); + INIT_DELAYED_WORK(&priv->stats_work, rndis_update_wireless_stats); + /* try bind rndis_host */ retval = generic_rndis_bind(usbdev, intf, FLAG_RNDIS_PHYM_WIRELESS); if (retval < 0) @@ -2603,16 +2608,17 @@ static int rndis_wext_bind(struct usbnet *usbdev, struct usb_interface *intf) disassociate(usbdev, 1); netif_carrier_off(usbdev->net); - /* because rndis_command() sleeps we need to use workqueue */ - priv->workqueue = create_singlethread_workqueue("rndis_wlan"); - INIT_DELAYED_WORK(&priv->stats_work, rndis_update_wireless_stats); queue_delayed_work(priv->workqueue, &priv->stats_work, round_jiffies_relative(STATS_UPDATE_JIFFIES)); - INIT_WORK(&priv->work, rndis_wext_worker); return 0; fail: + cancel_delayed_work_sync(&priv->stats_work); + cancel_work_sync(&priv->work); + flush_workqueue(priv->workqueue); + destroy_workqueue(priv->workqueue); + kfree(priv); return retval; } From 74aa9be0ea0ffeb233f45c39f3cf594b68bbbb89 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Apr 2009 10:45:04 +0200 Subject: [PATCH 25/32] iwlwifi: notify on scan completion even when shutting down Under certain circumstances iwlwifi can get stuck and will no longer accept scan requests, because the core code (cfg80211) thinks that it's still processing one. This fixes one of the points where it can happen, but I've still seen it (although only with my radio-off-when-idle patch). Signed-off-by: Johannes Berg Acked-by: Reinette Chatre Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwlwifi/iwl-scan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c index 23644cf884f1..e7c65c4f741b 100644 --- a/drivers/net/wireless/iwlwifi/iwl-scan.c +++ b/drivers/net/wireless/iwlwifi/iwl-scan.c @@ -925,11 +925,11 @@ void iwl_bg_scan_completed(struct work_struct *work) IWL_DEBUG_SCAN(priv, "SCAN complete scan\n"); + ieee80211_scan_completed(priv->hw, false); + if (test_bit(STATUS_EXIT_PENDING, &priv->status)) return; - ieee80211_scan_completed(priv->hw, false); - /* Since setting the TXPOWER may have been deferred while * performing the scan, fire one off */ mutex_lock(&priv->mutex); From b7fcb5c4a4c27da2f6d86cb03d18687e537442cf Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Mon, 27 Apr 2009 22:12:43 -0400 Subject: [PATCH 26/32] ath5k: fix buffer overrun in rate debug code char bname[5] is too small for the string "X GHz" when the null terminator is taken into account. Thus, turning on rate debugging can crash unless we have lucky stack alignment. Cc: stable@kernel.org Reported-by: Paride Legovini Signed-off-by: Bob Copeland Signed-off-by: John W. Linville --- drivers/net/wireless/ath5k/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath5k/debug.c b/drivers/net/wireless/ath5k/debug.c index 9770bb3d40f9..4904a07e4b59 100644 --- a/drivers/net/wireless/ath5k/debug.c +++ b/drivers/net/wireless/ath5k/debug.c @@ -424,7 +424,7 @@ ath5k_debug_dump_bands(struct ath5k_softc *sc) for (b = 0; b < IEEE80211_NUM_BANDS; b++) { struct ieee80211_supported_band *band = &sc->sbands[b]; - char bname[5]; + char bname[6]; switch (band->band) { case IEEE80211_BAND_2GHZ: strcpy(bname, "2 GHz"); From 942e4a2bd680c606af0211e64eb216be2e19bf61 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 28 Apr 2009 22:36:33 -0700 Subject: [PATCH 27/32] netfilter: revised locking for x_tables The x_tables are organized with a table structure and per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table which was a performance bottleneck. In 2.6.30-rc, this was converted to use RCU for the counters/rules, which solved the performance problems for do_table but made replacing rules much slower because of the necessary RCU grace period. This version uses a per-cpu set of spinlocks and counters to allow table processing to proceed without the cache thrashing of a global reader lock and keeps the same performance for table updates. Signed-off-by: Stephen Hemminger Acked-by: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 73 +++++++++++++++-- net/ipv4/netfilter/arp_tables.c | 125 +++++++++------------------- net/ipv4/netfilter/ip_tables.c | 126 ++++++++--------------------- net/ipv6/netfilter/ip6_tables.c | 123 +++++++++------------------- net/netfilter/x_tables.c | 53 ++++++------ 5 files changed, 204 insertions(+), 296 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 7b1a652066c0..1b2e43502ef7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -354,9 +354,6 @@ struct xt_table /* What hooks you will enter on */ unsigned int valid_hooks; - /* Lock for the curtain */ - struct mutex lock; - /* Man behind the curtain... 
*/ struct xt_table_info *private; @@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af); extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); -extern void xt_table_entry_swap_rcu(struct xt_table_info *old, - struct xt_table_info *new); + +/* + * Per-CPU spinlock associated with per-cpu table entries, and + * with a counter for the "reading" side that allows a recursive + * reader to avoid taking the lock and deadlocking. + * + * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. + * It needs to ensure that the rules are not being changed while the packet + * is being processed. In some cases, the read lock will be acquired + * twice on the same CPU; this is okay because of the count. + * + * "writing" is used when reading counters. + * During replace any readers that are using the old tables have to complete + * before freeing the old table. This is handled by the write locking + * necessary for reading the counters. + */ +struct xt_info_lock { + spinlock_t lock; + unsigned char readers; +}; +DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); + +/* + * Note: we need to ensure that preemption is disabled before acquiring + * the per-cpu-variable, so we do it as a two step process rather than + * using "spin_lock_bh()". + * + * We _also_ need to disable bottom half processing before updating our + * nesting count, to make sure that the only kind of re-entrancy is this + * code being called by itself: since the count+lock is not an atomic + * operation, we can allow no races. + * + * _Only_ that special combination of being per-cpu and never getting + * re-entered asynchronously means that the count is safe. + */ +static inline void xt_info_rdlock_bh(void) +{ + struct xt_info_lock *lock; + + local_bh_disable(); + lock = &__get_cpu_var(xt_info_locks); + if (!lock->readers++) + spin_lock(&lock->lock); +} + +static inline void xt_info_rdunlock_bh(void) +{ + struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); + + if (!--lock->readers) + spin_unlock(&lock->lock); + local_bh_enable(); +} + +/* + * The "writer" side needs to get exclusive access to the lock, + * regardless of readers. This must be called with bottom half + * processing (and thus also preemption) disabled. + */ +static inline void xt_info_wrlock(unsigned int cpu) +{ + spin_lock(&per_cpu(xt_info_locks, cpu).lock); +} + +static inline void xt_info_wrunlock(unsigned int cpu) +{ + spin_unlock(&per_cpu(xt_info_locks, cpu).lock); +} /* * This helper is performance critical and must be inlined diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 5ba533d234db..831fe1879dc0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? 
out->name : nulldevname; - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + (2 * skb->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); t = arpt_get_target(e); @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); if (hotdrop) return NF_DROP; @@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; ARPT_ENTRY_ITERATE(t->entries[curcpu], @@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct arpt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has 
cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); - + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 810c0b62c7d4..2ec8d7290c40 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters - * with data used by 'current' CPU - * We dont care about preemption here. + * with data used by 'current' CPU. 
+ * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IPT_ENTRY_ITERATE(t->entries[curcpu], @@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } - -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } - -static inline int -zero_entry_counter(struct ipt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 800ae8542471..219e165aea10 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t, /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. + * + * Bottom half has to be disabled to prevent deadlock + * if new softirq were to run and call ipt_do_table */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; IP6T_ENTRY_ITERATE(t->entries[curcpu], @@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t, if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct ip6t_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); - /* Get the old counters. */ + /* Get the old counters, and synchronize with replace */ get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, @@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len) return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, goto free; } - mutex_lock(&t->lock); + + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + xt_info_wrlock(curcpu); + loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); + unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 509a95621f9f..150e5cf62f85 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, - struct xt_table_info *newinfo) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - void *p = oldinfo->entries[cpu]; - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); - newinfo->entries[cpu] = p; - } - -} -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); - /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); + + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { - struct xt_table_info *oldinfo, *private; + struct xt_table_info *private; /* Do the substitution. */ - mutex_lock(&table->lock); + local_bh_disable(); private = table->private; + /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - mutex_unlock(&table->lock); + local_bh_enable(); *error = -EAGAIN; return NULL; } - oldinfo = private; - rcu_assign_pointer(table->private, newinfo); - newinfo->initial_entries = oldinfo->initial_entries; - mutex_unlock(&table->lock); - synchronize_net(); - return oldinfo; + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. + */ + local_bh_enable(); + + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table, /* Simplifies replace_table code. 
*/ table->private = bootstrap; - mutex_init(&table->lock); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; @@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = { static int __init xt_init(void) { - int i, rv; + unsigned int i; + int rv; + + for_each_possible_cpu(i) { + struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); + spin_lock_init(&lock->lock); + lock->readers = 0; + } xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) From ac7c992cac0c8f276aa8e4a8273204a6db707bb3 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 28 Apr 2009 22:42:39 -0700 Subject: [PATCH 28/32] e100: do not go D3 in shutdown unless system is powering off After experimenting with kexec with the last merges after 2.6.29, I've had some problems when probing e100. It would not read the eeprom. After some bisects, I realized this has been like that since forever (at least 2.6.18). The problem is that shutdown is doing the same thing that suspend does and puts the device in D3 state. I couldn't find a way to get the device back to a sane state in the probe function. So, based on some similar patches from Rafael J. Wysocki for e1000, e1000e, and ixgbe, I wrote this one for e100. Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: "Rafael J. Wysocki" Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/e100.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 5c0b457c7868..0f9ee1348552 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -2728,7 +2728,7 @@ static void __devexit e100_remove(struct pci_dev *pdev) #define E100_82552_SMARTSPEED 0x14 /* SmartSpeed Ctrl register */ #define E100_82552_REV_ANEG 0x0200 /* Reverse auto-negotiation */ #define E100_82552_ANEG_NOW 0x0400 /* Auto-negotiate now */ -static int e100_suspend(struct pci_dev *pdev, pm_message_t state) +static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake) { struct net_device *netdev = pci_get_drvdata(pdev); struct nic *nic = netdev_priv(netdev); @@ -2749,19 +2749,32 @@ static int e100_suspend(struct pci_dev *pdev, pm_message_t state) E100_82552_SMARTSPEED, smartspeed | E100_82552_REV_ANEG | E100_82552_ANEG_NOW); } - if (pci_enable_wake(pdev, PCI_D3cold, true)) - pci_enable_wake(pdev, PCI_D3hot, true); + *enable_wake = true; } else { - pci_enable_wake(pdev, PCI_D3hot, false); + *enable_wake = false; } pci_disable_device(pdev); - pci_set_power_state(pdev, PCI_D3hot); +} - return 0; +static int __e100_power_off(struct pci_dev *pdev, bool wake) +{ + if (wake) { + return pci_prepare_to_sleep(pdev); + } else { + pci_wake_from_d3(pdev, false); + return pci_set_power_state(pdev, PCI_D3hot); + } } #ifdef CONFIG_PM +static int e100_suspend(struct pci_dev *pdev, pm_message_t state) +{ + bool wake; + __e100_shutdown(pdev, &wake); + return __e100_power_off(pdev, wake); +} + static int e100_resume(struct pci_dev *pdev) { struct net_device *netdev = pci_get_drvdata(pdev); @@ -2792,7 +2805,10 @@ static int e100_resume(struct pci_dev *pdev) static void e100_shutdown(struct pci_dev *pdev) { - e100_suspend(pdev, PMSG_SUSPEND); + bool wake; + __e100_shutdown(pdev, &wake); + if (system_state == SYSTEM_POWER_OFF) + __e100_power_off(pdev, wake); } /* ------------------ PCI Error Recovery infrastructure -------------- */ From d4c4a9a1bce1912ed5681251f0037fd4f2364a3e Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Apr 2009 11:41:24 +0100 Subject: [PATCH 29/32] 
mac80211: fix modprobe deadlock by not calling wep_init under rtnl_lock - ieee80211_wep_init(), which is called with rtnl_lock held, blocks in request_module() [waiting for modprobe to load a crypto module]. - modprobe blocks in a call to flush_workqueue(), when it closes a TTY [presumably when it exits]. - The workqueue item linkwatch_event() blocks on rtnl_lock. There's no reason for wep_init() to be called with rtnl_lock held, so just move it outside the critical section. Signed-off-by: Alan Jenkins Signed-off-by: John W. Linville --- net/mac80211/main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fbcbed6cad01..00968c2d22bb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -909,6 +909,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (result < 0) goto fail_sta_info; + result = ieee80211_wep_init(local); + if (result < 0) { + printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", + wiphy_name(local->hw.wiphy), result); + goto fail_wep; + } + rtnl_lock(); result = dev_alloc_name(local->mdev, local->mdev->name); if (result < 0) @@ -930,14 +937,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_rate; } - result = ieee80211_wep_init(local); - - if (result < 0) { - printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", - wiphy_name(local->hw.wiphy), result); - goto fail_wep; - } - /* add one default STA interface if supported */ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION)) { result = ieee80211_if_add(local, "wlan%d", NULL, @@ -967,13 +966,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) return 0; -fail_wep: - rate_control_deinitialize(local); fail_rate: unregister_netdevice(local->mdev); local->mdev = NULL; fail_dev: rtnl_unlock(); + ieee80211_wep_free(local); +fail_wep: sta_info_stop(local); fail_sta_info: debugfs_hw_del(local); From c428c89201a57a0ce24c37ed79e540d1f4101cf3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 29 Apr 2009 00:28:18 +0200 Subject: [PATCH 30/32] mac80211: default to automatic power control In "mac80211: correct wext transmit power handler" I fixed the wext handler, but forgot to make the default of the user_power_level -1 (aka "auto"), so that now the transmit power is always set to 0, causing associations to time out and similar problems since we're transmitting with very little power. Correct this by correcting the default user_power_level to -1. Signed-off-by: Johannes Berg Bisected-by: Niel Lambrechts Signed-off-by: John W. 
Linville --- net/mac80211/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 00968c2d22bb..14134193cd17 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -757,6 +757,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, local->hw.conf.long_frame_max_tx_count = 4; local->hw.conf.short_frame_max_tx_count = 7; local->hw.conf.radio_enabled = true; + local->user_power_level = -1; INIT_LIST_HEAD(&local->interfaces); mutex_init(&local->iflist_mtx); From 1319ebadf185933e6b7ff95211d3cef9004e9754 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Wed, 29 Apr 2009 11:57:34 +0000 Subject: [PATCH 31/32] mv643xx_eth: OOM handling fixes Currently, when OOM occurs during rx ring refill, mv643xx_eth will get into an infinite loop, due to the refill function setting the OOM bit but not clearing the 'rx refill needed' bit for this queue, while the calling function (the NAPI poll handler) will call the refill function in a loop until the 'rx refill needed' bit goes off, without checking the OOM bit. This patch fixes this by checking the OOM bit in the NAPI poll handler before attempting to do rx refill. This means that once OOM occurs, we won't try to do any memory allocations again until the next invocation of the poll handler. While we're at it, change the OOM flag to be a single bit instead of one bit per receive queue since OOM is a system state rather than a per-queue state, and cancel the OOM timer on entry to the NAPI poll handler if it's running to prevent it from firing when we've already come out of OOM. Signed-off-by: Lennert Buytenhek Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/mv643xx_eth.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index b3185bf2c158..038beff7da80 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -393,12 +393,12 @@ struct mv643xx_eth_private { struct work_struct tx_timeout_task; struct napi_struct napi; + u8 oom; u8 work_link; u8 work_tx; u8 work_tx_end; u8 work_rx; u8 work_rx_refill; - u8 work_rx_oom; int skb_size; struct sk_buff_head rx_recycle; @@ -661,7 +661,7 @@ static int rxq_refill(struct rx_queue *rxq, int budget) dma_get_cache_alignment() - 1); if (skb == NULL) { - mp->work_rx_oom |= 1 << rxq->index; + mp->oom = 1; goto oom; } @@ -2167,8 +2167,10 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget) mp = container_of(napi, struct mv643xx_eth_private, napi); - mp->work_rx_refill |= mp->work_rx_oom; - mp->work_rx_oom = 0; + if (unlikely(mp->oom)) { + mp->oom = 0; + del_timer(&mp->rx_oom); + } work_done = 0; while (work_done < budget) { @@ -2182,8 +2184,10 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget) continue; } - queue_mask = mp->work_tx | mp->work_tx_end | - mp->work_rx | mp->work_rx_refill; + queue_mask = mp->work_tx | mp->work_tx_end | mp->work_rx; + if (likely(!mp->oom)) + queue_mask |= mp->work_rx_refill; + if (!queue_mask) { if (mv643xx_eth_collect_events(mp)) continue; @@ -2204,7 +2208,7 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget) txq_maybe_wake(mp->txq + queue); } else if (mp->work_rx & queue_mask) { work_done += rxq_process(mp->rxq + queue, work_tbd); - } else if (mp->work_rx_refill & queue_mask) { + } else if (!mp->oom && (mp->work_rx_refill & queue_mask)) { work_done += rxq_refill(mp->rxq + queue, work_tbd); } else { BUG(); @@ -2212,7 +2216,7 @@ static int 
mv643xx_eth_poll(struct napi_struct *napi, int budget) } if (work_done < budget) { - if (mp->work_rx_oom) + if (mp->oom) mod_timer(&mp->rx_oom, jiffies + (HZ / 10)); napi_complete(napi); wrlp(mp, INT_MASK, INT_TX_END | INT_RX | INT_EXT); } @@ -2372,7 +2376,7 @@ static int mv643xx_eth_open(struct net_device *dev) rxq_refill(mp->rxq + i, INT_MAX); } - if (mp->work_rx_oom) { + if (mp->oom) { mp->rx_oom.expires = jiffies + (HZ / 10); add_timer(&mp->rx_oom); } From 93af7aca44f0e82e67bda10a0fb73d383edcc8bd Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Wed, 29 Apr 2009 11:58:18 +0000 Subject: [PATCH 32/32] mv643xx_eth: 64bit mib counter read fix On several mv643xx_eth hardware versions, the two 64bit mib counters for 'good octets received' and 'good octets sent' are actually 32bit counters, and reading from the upper half of the register has the same effect as reading from the lower half of the register: an atomic read-and-clear of the entire 32bit counter value. Under heavy traffic this can occasionally lead to small numbers being added to the upper half of the 64bit mib counter even though no 32bit wrap has occurred. Since we poll the mib counters at least every 30 seconds anyway, we might as well just skip the reads of the upper halves of the hardware counters without breaking the stats, which this patch does. Signed-off-by: Lennert Buytenhek Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/mv643xx_eth.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 038beff7da80..a400d7115f78 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -1255,7 +1255,6 @@ static void mib_counters_update(struct mv643xx_eth_private *mp) spin_lock_bh(&mp->mib_counters_lock); p->good_octets_received += mib_read(mp, 0x00); - p->good_octets_received += (u64)mib_read(mp, 0x04) << 32; p->bad_octets_received += mib_read(mp, 0x08); p->internal_mac_transmit_err += mib_read(mp, 0x0c); p->good_frames_received += mib_read(mp, 0x10); @@ -1269,7 +1268,6 @@ static void mib_counters_update(struct mv643xx_eth_private *mp) p->frames_512_to_1023_octets += mib_read(mp, 0x30); p->frames_1024_to_max_octets += mib_read(mp, 0x34); p->good_octets_sent += mib_read(mp, 0x38); - p->good_octets_sent += (u64)mib_read(mp, 0x3c) << 32; p->good_frames_sent += mib_read(mp, 0x40); p->excessive_collision += mib_read(mp, 0x44); p->multicast_frames_sent += mib_read(mp, 0x48);
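As a closing illustration of the MIB fix (a sketch, not driver code: mib_read() and the 0x00 register offset are taken from the patch above, while my_poll_good_octets_received() and the u64 accumulator are made up), dropping the upper-half reads leaves the 64-bit totals maintained purely by draining the 32-bit read-and-clear register before it can wrap. For a rough sense of the margin, at 1 Gb/s a 32-bit octet counter wraps after about 2^32 / 125,000,000 ≈ 34 seconds, so polling at least every 30 seconds is frequent enough:

static void my_poll_good_octets_received(struct mv643xx_eth_private *mp,
					 u64 *total)
{
	/* mib_read() returns and clears the 32-bit hardware counter, so as
	 * long as this runs more often than the counter can wrap, the
	 * software 64-bit total stays correct without ever touching the
	 * upper-half register at offset 0x04. */
	*total += mib_read(mp, 0x00);
}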