From 7ea49ed73c8d0d0bdf7c11fc18c61572d2d22176 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 14 Aug 2006 17:08:36 -0700 Subject: [PATCH 01/17] [VLAN]: Make sure bonding packet drop checks get done in hwaccel RX path. Since __vlan_hwaccel_rx() is essentially bypassing the netif_receive_skb() call that would have occurred if we did the VLAN decapsulation in software, we are missing the skb_bond() call and the assosciated checks it does. Export those checks via an inline function, skb_bond_should_drop(), and use this in __vlan_hwaccel_rx(). Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 5 +++++ include/linux/netdevice.h | 24 ++++++++++++++++++++++++ net/core/dev.c | 18 +----------------- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 383627ad328f..ab2740832742 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -155,6 +155,11 @@ static inline int __vlan_hwaccel_rx(struct sk_buff *skb, { struct net_device_stats *stats; + if (skb_bond_should_drop(skb)) { + dev_kfree_skb_any(skb); + return NET_RX_DROP; + } + skb->dev = grp->vlan_devices[vlan_tag & VLAN_VID_MASK]; if (skb->dev == NULL) { dev_kfree_skb_any(skb); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 75f02d8c6ed3..c0c2b46face1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1012,6 +1012,30 @@ static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) unlikely(skb->ip_summed != CHECKSUM_HW)); } +/* On bonding slaves other than the currently active slave, suppress + * duplicates except for 802.3ad ETH_P_SLOW and alb non-mcast/bcast. + */ +static inline int skb_bond_should_drop(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct net_device *master = dev->master; + + if (master && + (dev->priv_flags & IFF_SLAVE_INACTIVE)) { + if (master->priv_flags & IFF_MASTER_ALB) { + if (skb->pkt_type != PACKET_BROADCAST && + skb->pkt_type != PACKET_MULTICAST) + return 0; + } + if (master->priv_flags & IFF_MASTER_8023AD && + skb->protocol == __constant_htons(ETH_P_SLOW)) + return 0; + + return 1; + } + return 0; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/net/core/dev.c b/net/core/dev.c index d95e2626d944..9fe96cde3e19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1619,26 +1619,10 @@ static inline struct net_device *skb_bond(struct sk_buff *skb) struct net_device *dev = skb->dev; if (dev->master) { - /* - * On bonding slaves other than the currently active - * slave, suppress duplicates except for 802.3ad - * ETH_P_SLOW and alb non-mcast/bcast. - */ - if (dev->priv_flags & IFF_SLAVE_INACTIVE) { - if (dev->master->priv_flags & IFF_MASTER_ALB) { - if (skb->pkt_type != PACKET_BROADCAST && - skb->pkt_type != PACKET_MULTICAST) - goto keep; - } - - if (dev->master->priv_flags & IFF_MASTER_8023AD && - skb->protocol == __constant_htons(ETH_P_SLOW)) - goto keep; - + if (skb_bond_should_drop(skb)) { kfree_skb(skb); return NULL; } -keep: skb->dev = dev->master; } From 855751125093f758871b70da2951d8b92b6368cc Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 15 Aug 2006 00:03:01 -0700 Subject: [PATCH 02/17] [NET]: Fix potential stack overflow in net/core/utils.c On High end systems (1024 or so cpus) this can potentially cause stack overflow. Fix the stack usage. Signed-off-by: Suresh Siddha Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/core/utils.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/core/utils.c b/net/core/utils.c index 4f96f389243d..e31c90e05594 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -130,12 +130,13 @@ void __init net_random_init(void) static int net_random_reseed(void) { int i; - unsigned long seed[NR_CPUS]; + unsigned long seed; - get_random_bytes(seed, sizeof(seed)); for_each_possible_cpu(i) { struct nrnd_state *state = &per_cpu(net_rand_state,i); - __net_srandom(state, seed[i]); + + get_random_bytes(&seed, sizeof(seed)); + __net_srandom(state, seed); } return 0; } From deb47c66e12a645f7eec9b1c153c05ed47989439 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 15 Aug 2006 00:04:56 -0700 Subject: [PATCH 03/17] [NETFILTER]: xt_physdev build fix It needs netfilter_bridge.h for brnf_deferred_hooks Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/netfilter/xt_physdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index a9f4f6f3c628..63a965467465 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include From 640c41c77a96dbbfb74d40ae86ab75b759afb911 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Aug 2006 00:06:56 -0700 Subject: [PATCH 04/17] [IPV6] lockdep: annotate __icmpv6_socket Split off __icmpv6_socket's sk->sk_dst_lock class, because it gets used from softirqs, which is safe for __icmpv6_sockets (because they never get directly used via userspace syscalls), but unsafe for normal sockets. Has no effect on non-lockdep kernels. Signed-off-by: Ingo Molnar Acked-by: Herbert Xu Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1044b6fce0d5..3d6e9a351150 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -712,6 +712,11 @@ discard_it: return 0; } +/* + * Special lock-class for __icmpv6_socket: + */ +static struct lock_class_key icmpv6_socket_sk_dst_lock_key; + int __init icmpv6_init(struct net_proto_family *ops) { struct sock *sk; @@ -730,6 +735,14 @@ int __init icmpv6_init(struct net_proto_family *ops) sk = per_cpu(__icmpv6_socket, i)->sk; sk->sk_allocation = GFP_ATOMIC; + /* + * Split off their lock-class, because sk->sk_dst_lock + * gets used from softirqs, which is safe for + * __icmpv6_socket (because those never get directly used + * via userspace syscalls), but unsafe for normal sockets. + */ + lockdep_set_class(&sk->sk_dst_lock, + &icmpv6_socket_sk_dst_lock_key); /* Enough space for 2 64K ICMP packets, including * sk_buff struct overhead. From bb699cbca0096aa3f5f750264ec0af080732375a Mon Sep 17 00:00:00 2001 From: Michal Ruzicka Date: Tue, 15 Aug 2006 00:20:17 -0700 Subject: [PATCH 05/17] [IPV4]: Possible leak of multicast source filter sctructure There is a leak of a socket's multicast source filter list structure on closing a socket with a multicast source filter set on an interface that does not exist any more. Signed-off-by: Michal Ruzicka Acked-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9f4b752f5a33..e981369ebe13 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2199,13 +2199,13 @@ void ip_mc_drop_socket(struct sock *sk) struct in_device *in_dev; inet->mc_list = iml->next; - if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) { - (void) ip_mc_leave_src(sk, iml, in_dev); + in_dev = inetdev_by_index(iml->multi.imr_ifindex); + (void) ip_mc_leave_src(sk, iml, in_dev); + if (in_dev != NULL) { ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); in_dev_put(in_dev); } sock_kfree_s(sk, iml, sizeof(*iml)); - } rtnl_unlock(); } From fb33f82568d32016b3b3c00819574f9526e52be3 Mon Sep 17 00:00:00 2001 From: "Jan \"Yenya\" Kasprzak" Date: Tue, 15 Aug 2006 01:33:50 -0700 Subject: [PATCH 06/17] [NET]: Terminology in ip-sysctl.txt this minor patch fixes the description of net.ipv4.tcp_mem sysctl in ip-sysctl.txt - the headline names the values "min, pressure, max", while the description uses the "low, pressure, high" values. Both tcp_rmem and tcp_wmem descriptions use the "min, pressure, max" values, so I have changed the tcp_mem to match this and not vice versa. Signed-off-by: Jan "Yenya" Kasprzak Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d46338af6002..3e0c017e7877 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -294,15 +294,15 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max Default: 87380*2 bytes. tcp_mem - vector of 3 INTEGERs: min, pressure, max - low: below this number of pages TCP is not bothered about its + min: below this number of pages TCP is not bothered about its memory appetite. pressure: when amount of memory allocated by TCP exceeds this number of pages, TCP moderates its memory consumption and enters memory pressure mode, which is exited when memory consumption falls - under "low". + under "min". - high: number of pages allowed for queueing by all TCP sockets. + max: number of pages allowed for queueing by all TCP sockets. Defaults are calculated at boot time from amount of available memory. From 2f8af120a159a843948749ea88bcacda9779b132 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 15 Aug 2006 01:39:10 -0700 Subject: [PATCH 07/17] [BNX2]: Fix tx race condition. Fix a subtle race condition between bnx2_start_xmit() and bnx2_tx_int() similar to the one in tg3 discovered by Herbert Xu: CPU0 CPU1 bnx2_start_xmit() if (tx_ring_full) { tx_lock bnx2_tx() if (!netif_queue_stopped) netif_stop_queue() if (!tx_ring_full) update_tx_ring netif_wake_queue() tx_unlock } Even though tx_ring is updated before the if statement in bnx2_tx_int() in program order, it can be re-ordered by the CPU as shown above. This scenario can cause the tx queue to be stopped forever if bnx2_tx_int() has just freed up the entire tx_ring. The possibility of this happening should be very rare though. The following changes are made, very much identical to the tg3 fix: 1. Add memory barrier to fix the above race condition. 2. Eliminate the private tx_lock altogether and rely solely on netif_tx_lock. This eliminates one spinlock in bnx2_start_xmit() when the ring is full. 3. Because of 2, use netif_tx_lock in bnx2_tx_int() before calling netif_wake_queue(). 4. Add memory barrier to bnx2_tx_avail(). 5. Add bp->tx_wake_thresh which is set to half the tx ring size. 6. Check for the full wake queue condition before getting netif_tx_lock in tg3_tx(). This reduces the number of unnecessary spinlocks when the tx ring is full in a steady-state condition. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 35 +++++++++++++++++++---------------- drivers/net/bnx2.h | 12 +++++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index db73de0d2511..2099edbbfdfd 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl); static inline u32 bnx2_tx_avail(struct bnx2 *bp) { - u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); + u32 diff; + smp_mb(); + diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); if (diff > MAX_TX_DESC_CNT) diff = (diff & MAX_TX_DESC_CNT) - 1; return (bp->tx_ring_size - diff); @@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp) } bp->tx_cons = sw_cons; + /* Need to make the tx_cons update visible to bnx2_start_xmit() + * before checking for netif_queue_stopped(). Without the + * memory barrier, there is a small possibility that bnx2_start_xmit() + * will miss it and cause the queue to be stopped forever. + */ + smp_mb(); - if (unlikely(netif_queue_stopped(bp->dev))) { - spin_lock(&bp->tx_lock); + if (unlikely(netif_queue_stopped(bp->dev)) && + (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) { + netif_tx_lock(bp->dev); if ((netif_queue_stopped(bp->dev)) && - (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { - + (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) netif_wake_queue(bp->dev); - } - spin_unlock(&bp->tx_lock); + netif_tx_unlock(bp->dev); } } @@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp) struct tx_bd *txbd; u32 val; + bp->tx_wake_thresh = bp->tx_ring_size / 2; + txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT]; txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32; @@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid) #endif /* Called with netif_tx_lock. - * hard_start_xmit is pseudo-lockless - a lock is only required when - * the tx queue is full. This way, we get the benefit of lockless - * operations most of the time without the complexities to handle - * netif_stop_queue/wake_queue race conditions. + * bnx2_tx_int() runs without netif_tx_lock unless it needs to call + * netif_wake_queue(). */ static int bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) dev->trans_start = jiffies; if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { - spin_lock(&bp->tx_lock); netif_stop_queue(dev); - - if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS) + if (bnx2_tx_avail(bp) > bp->tx_wake_thresh) netif_wake_queue(dev); - spin_unlock(&bp->tx_lock); } return NETDEV_TX_OK; @@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->pdev = pdev; spin_lock_init(&bp->phy_lock); - spin_lock_init(&bp->tx_lock); INIT_WORK(&bp->reset_task, bnx2_reset_task, bp); dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0); diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index 658c5ee95c73..fe804763c607 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h @@ -3890,10 +3890,6 @@ struct bnx2 { u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES))); u16 tx_prod; - struct tx_bd *tx_desc_ring; - struct sw_bd *tx_buf_ring; - int tx_ring_size; - u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES))); u16 hw_tx_cons; @@ -3916,9 +3912,11 @@ struct bnx2 { struct sw_bd *rx_buf_ring; struct rx_bd *rx_desc_ring[MAX_RX_RINGS]; - /* Only used to synchronize netif_stop_queue/wake_queue when tx */ - /* ring is full */ - spinlock_t tx_lock; + /* TX constants */ + struct tx_bd *tx_desc_ring; + struct sw_bd *tx_buf_ring; + int tx_ring_size; + u32 tx_wake_thresh; /* End of fields used in the performance code paths. */ From 932f3772cf76cc1b1fd1538ceee3edba9bf2164f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 15 Aug 2006 01:39:36 -0700 Subject: [PATCH 08/17] [BNX2]: Convert to netdev_alloc_skb() Convert dev_alloc_skb() to netdev_alloc_skb() and increase default rx ring size to 255. The old ring size of 100 was too small. Update version to 1.4.44. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/bnx2.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 2099edbbfdfd..652eb05a6c2d 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -56,8 +56,8 @@ #define DRV_MODULE_NAME "bnx2" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "1.4.43" -#define DRV_MODULE_RELDATE "June 28, 2006" +#define DRV_MODULE_VERSION "1.4.44" +#define DRV_MODULE_RELDATE "August 10, 2006" #define RUN_AT(x) (jiffies + (x)) @@ -1571,7 +1571,7 @@ bnx2_alloc_rx_skb(struct bnx2 *bp, u16 index) struct rx_bd *rxbd = &bp->rx_desc_ring[RX_RING(index)][RX_IDX(index)]; unsigned long align; - skb = dev_alloc_skb(bp->rx_buf_size); + skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size); if (skb == NULL) { return -ENOMEM; } @@ -1580,7 +1580,6 @@ bnx2_alloc_rx_skb(struct bnx2 *bp, u16 index) skb_reserve(skb, 8 - align); } - skb->dev = bp->dev; mapping = pci_map_single(bp->pdev, skb->data, bp->rx_buf_use_size, PCI_DMA_FROMDEVICE); @@ -1793,7 +1792,7 @@ bnx2_rx_int(struct bnx2 *bp, int budget) if ((bp->dev->mtu > 1500) && (len <= RX_COPY_THRESH)) { struct sk_buff *new_skb; - new_skb = dev_alloc_skb(len + 2); + new_skb = netdev_alloc_skb(bp->dev, len + 2); if (new_skb == NULL) goto reuse_rx; @@ -1804,7 +1803,6 @@ bnx2_rx_int(struct bnx2 *bp, int budget) skb_reserve(new_skb, 2); skb_put(new_skb, len); - new_skb->dev = bp->dev; bnx2_reuse_rx_skb(bp, skb, sw_ring_cons, sw_ring_prod); @@ -3961,7 +3959,7 @@ bnx2_run_loopback(struct bnx2 *bp, int loopback_mode) return -EINVAL; pkt_size = 1514; - skb = dev_alloc_skb(pkt_size); + skb = netdev_alloc_skb(bp->dev, pkt_size); if (!skb) return -ENOMEM; packet = skb_put(skb, pkt_size); @@ -5754,7 +5752,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->mac_addr[5] = (u8) reg; bp->tx_ring_size = MAX_TX_DESC_CNT; - bnx2_set_rx_ring_size(bp, 100); + bnx2_set_rx_ring_size(bp, 255); bp->rx_csum = 1; From b9c6e3e96669ade31afd3a39f17393e577b609c5 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Tue, 15 Aug 2006 02:02:33 -0700 Subject: [PATCH 09/17] [ATM]: Compile error on ARM atm_proc_exit() is declared as __exit, and thus in .exit.text. On some architectures (ARM) .exit.text is discarded at compile time, and since atm_proc_exit() is called by some other __init functions, it results in a link error. Signed-off-by: Kevin Hilman Signed-off-by: David S. Miller --- net/atm/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/atm/proc.c b/net/atm/proc.c index 3f95b0886a6a..91fe5f53ff11 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -507,7 +507,7 @@ err_out: goto out; } -void __exit atm_proc_exit(void) +void atm_proc_exit(void) { atm_proc_dirs_remove(); } From c0956bd25161bff45304d482cda51ca4b3b572f1 Mon Sep 17 00:00:00 2001 From: Ralf Hildebrandt Date: Tue, 15 Aug 2006 02:12:43 -0700 Subject: [PATCH 10/17] [PKT_SCHED] cls_u32: Fix typo. Signed-off-by: Ralf Hildebrandt Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index eea366966740..0a6cfa0005be 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -796,7 +796,7 @@ static int __init init_u32(void) { printk("u32 classifier\n"); #ifdef CONFIG_CLS_U32_PERF - printk(" Perfomance counters on\n"); + printk(" Performance counters on\n"); #endif #ifdef CONFIG_NET_CLS_POLICE printk(" OLD policer on \n"); From d4274b51a5c8147b5341e15927368e75b632d297 Mon Sep 17 00:00:00 2001 From: Panagiotis Issaris Date: Tue, 15 Aug 2006 16:01:07 -0700 Subject: [PATCH 11/17] [PPP]: handle kmalloc failures and convert to using kzalloc The PPP code contains two kmalloc()s followed by memset()s without handling a possible memory allocation failure. (Suggested by Joe Perches). And furthermore, conversions from kmalloc+memset to kzalloc. [akpm@osdl.org: fix error-path leak] [akpm@osdl.org: cleanups] [paulus@samba.org: don't add useless printk and cardmap_destroy calls] Signed-off-by: Panagiotis Issaris Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras Signed-off-by: David S. Miller --- drivers/net/ppp_generic.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 0ec6e9d57b94..c872f7c6cce3 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -192,7 +192,7 @@ struct cardmap { void *ptr[CARDMAP_WIDTH]; }; static void *cardmap_get(struct cardmap *map, unsigned int nr); -static void cardmap_set(struct cardmap **map, unsigned int nr, void *ptr); +static int cardmap_set(struct cardmap **map, unsigned int nr, void *ptr); static unsigned int cardmap_find_first_free(struct cardmap *map); static void cardmap_destroy(struct cardmap **map); @@ -1995,10 +1995,9 @@ ppp_register_channel(struct ppp_channel *chan) { struct channel *pch; - pch = kmalloc(sizeof(struct channel), GFP_KERNEL); + pch = kzalloc(sizeof(struct channel), GFP_KERNEL); if (pch == 0) return -ENOMEM; - memset(pch, 0, sizeof(struct channel)); pch->ppp = NULL; pch->chan = chan; chan->ppp = pch; @@ -2408,13 +2407,12 @@ ppp_create_interface(int unit, int *retp) int ret = -ENOMEM; int i; - ppp = kmalloc(sizeof(struct ppp), GFP_KERNEL); + ppp = kzalloc(sizeof(struct ppp), GFP_KERNEL); if (!ppp) goto out; dev = alloc_netdev(0, "", ppp_setup); if (!dev) goto out1; - memset(ppp, 0, sizeof(struct ppp)); ppp->mru = PPP_MRU; init_ppp_file(&ppp->file, INTERFACE); @@ -2454,11 +2452,16 @@ ppp_create_interface(int unit, int *retp) } atomic_inc(&ppp_unit_count); - cardmap_set(&all_ppp_units, unit, ppp); + ret = cardmap_set(&all_ppp_units, unit, ppp); + if (ret != 0) + goto out3; + mutex_unlock(&all_ppp_mutex); *retp = 0; return ppp; +out3: + atomic_dec(&ppp_unit_count); out2: mutex_unlock(&all_ppp_mutex); free_netdev(dev); @@ -2695,7 +2698,7 @@ static void *cardmap_get(struct cardmap *map, unsigned int nr) return NULL; } -static void cardmap_set(struct cardmap **pmap, unsigned int nr, void *ptr) +static int cardmap_set(struct cardmap **pmap, unsigned int nr, void *ptr) { struct cardmap *p; int i; @@ -2704,8 +2707,9 @@ static void cardmap_set(struct cardmap **pmap, unsigned int nr, void *ptr) if (p == NULL || (nr >> p->shift) >= CARDMAP_WIDTH) { do { /* need a new top level */ - struct cardmap *np = kmalloc(sizeof(*np), GFP_KERNEL); - memset(np, 0, sizeof(*np)); + struct cardmap *np = kzalloc(sizeof(*np), GFP_KERNEL); + if (!np) + goto enomem; np->ptr[0] = p; if (p != NULL) { np->shift = p->shift + CARDMAP_ORDER; @@ -2719,8 +2723,9 @@ static void cardmap_set(struct cardmap **pmap, unsigned int nr, void *ptr) while (p->shift > 0) { i = (nr >> p->shift) & CARDMAP_MASK; if (p->ptr[i] == NULL) { - struct cardmap *np = kmalloc(sizeof(*np), GFP_KERNEL); - memset(np, 0, sizeof(*np)); + struct cardmap *np = kzalloc(sizeof(*np), GFP_KERNEL); + if (!np) + goto enomem; np->shift = p->shift - CARDMAP_ORDER; np->parent = p; p->ptr[i] = np; @@ -2735,6 +2740,9 @@ static void cardmap_set(struct cardmap **pmap, unsigned int nr, void *ptr) set_bit(i, &p->inuse); else clear_bit(i, &p->inuse); + return 0; + enomem: + return -ENOMEM; } static unsigned int cardmap_find_first_free(struct cardmap *map) From c7fa9d189e93877a1fa08ab00f230e0689125e45 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Aug 2006 16:34:13 -0700 Subject: [PATCH 12/17] [NET]: Disallow whitespace in network device names. It causes way too much trouble and confusion in userspace. Signed-off-by: David S. Miller --- net/core/dev.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9fe96cde3e19..d4a1ec3bded5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -116,6 +116,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -632,14 +633,22 @@ struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mas * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work + * to allow sysfs to work. We also disallow any kind of + * whitespace. */ int dev_valid_name(const char *name) { - return !(*name == '\0' - || !strcmp(name, ".") - || !strcmp(name, "..") - || strchr(name, '/')); + if (*name == '\0') + return 0; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return 0; + + while (*name) { + if (*name == '/' || isspace(*name)) + return 0; + name++; + } + return 1; } /** From acd6e00b8e4db542cb6bc9ddfbb4e18bbe29ce4d Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Thu, 17 Aug 2006 16:27:39 -0700 Subject: [PATCH 13/17] [MCAST]: Fix filter leak on device removal. This fixes source filter leakage when a device is removed and a process leaves the group thereafter. This also includes corresponding fixes for IPv6 multicast source filters on device removal. Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 32 +++++++++++++++++++------------- net/ipv6/mcast.c | 10 ++++++---- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e981369ebe13..8e8117c19e4d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1793,29 +1793,35 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) struct in_device *in_dev; u32 group = imr->imr_multiaddr.s_addr; u32 ifindex; + int ret = -EADDRNOTAVAIL; rtnl_lock(); in_dev = ip_mc_find_dev(imr); - if (!in_dev) { - rtnl_unlock(); - return -ENODEV; - } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { - if (iml->multi.imr_multiaddr.s_addr == group && - iml->multi.imr_ifindex == ifindex) { - (void) ip_mc_leave_src(sk, iml, in_dev); + if (iml->multi.imr_multiaddr.s_addr != group) + continue; + if (ifindex) { + if (iml->multi.imr_ifindex != ifindex) + continue; + } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != + iml->multi.imr_address.s_addr) + continue; - *imlp = iml->next; + (void) ip_mc_leave_src(sk, iml, in_dev); + *imlp = iml->next; + + if (in_dev) ip_mc_dec_group(in_dev, group); - rtnl_unlock(); - sock_kfree_s(sk, iml, sizeof(*iml)); - return 0; - } + rtnl_unlock(); + sock_kfree_s(sk, iml, sizeof(*iml)); + return 0; } + if (!in_dev) + ret = -ENODEV; rtnl_unlock(); - return -EADDRNOTAVAIL; + return ret; } int ip_mc_source(int add, int omode, struct sock *sk, struct diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9d697d4dcffc..639eb20c9f1f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -268,13 +268,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) if ((dev = dev_get_by_index(mc_lst->ifindex)) != NULL) { struct inet6_dev *idev = in6_dev_get(dev); + (void) ip6_mc_leave_src(sk, mc_lst, idev); if (idev) { - (void) ip6_mc_leave_src(sk,mc_lst,idev); __ipv6_dev_mc_dec(idev, &mc_lst->addr); in6_dev_put(idev); } dev_put(dev); - } + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return 0; } @@ -334,13 +335,14 @@ void ipv6_sock_mc_close(struct sock *sk) if (dev) { struct inet6_dev *idev = in6_dev_get(dev); + (void) ip6_mc_leave_src(sk, mc_lst, idev); if (idev) { - (void) ip6_mc_leave_src(sk, mc_lst, idev); __ipv6_dev_mc_dec(idev, &mc_lst->addr); in6_dev_put(idev); } dev_put(dev); - } + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); From 6e8fcbf64024f9056ba122abbb66554aa76bae5d Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Thu, 17 Aug 2006 16:44:46 -0700 Subject: [PATCH 14/17] [IPV4]: severe locking bug in fib_semantics.c Found in 2.4 by Yixin Pan . > When I read fib_semantics.c of Linux-2.4.32, write_lock(&fib_info_lock) = > is used in fib_release_info() instead of write_lock_bh(&fib_info_lock). = > Is the following case possible: a BH interrupts fib_release_info() while = > holding the write lock, and calls ip_check_fib_default() which calls = > read_lock(&fib_info_lock), and spin forever. Signed-off-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9be53a8e72c3..51738000f3dc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -159,7 +159,7 @@ void free_fib_info(struct fib_info *fi) void fib_release_info(struct fib_info *fi) { - write_lock(&fib_info_lock); + write_lock_bh(&fib_info_lock); if (fi && --fi->fib_treeref == 0) { hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) @@ -172,7 +172,7 @@ void fib_release_info(struct fib_info *fi) fi->fib_dead = 1; fib_info_put(fi); } - write_unlock(&fib_info_lock); + write_unlock_bh(&fib_info_lock); } static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) @@ -598,7 +598,7 @@ static void fib_hash_move(struct hlist_head *new_info_hash, unsigned int old_size = fib_hash_size; unsigned int i, bytes; - write_lock(&fib_info_lock); + write_lock_bh(&fib_info_lock); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; fib_hash_size = new_size; @@ -639,7 +639,7 @@ static void fib_hash_move(struct hlist_head *new_info_hash, } fib_info_laddrhash = new_laddrhash; - write_unlock(&fib_info_lock); + write_unlock_bh(&fib_info_lock); bytes = old_size * sizeof(struct hlist_head *); fib_hash_free(old_info_hash, bytes); @@ -820,7 +820,7 @@ link_it: fi->fib_treeref++; atomic_inc(&fi->fib_clntref); - write_lock(&fib_info_lock); + write_lock_bh(&fib_info_lock); hlist_add_head(&fi->fib_hash, &fib_info_hash[fib_info_hashfn(fi)]); if (fi->fib_prefsrc) { @@ -839,7 +839,7 @@ link_it: head = &fib_info_devhash[hash]; hlist_add_head(&nh->nh_hash, head); } endfor_nexthops(fi) - write_unlock(&fib_info_lock); + write_unlock_bh(&fib_info_lock); return fi; err_inval: From d205dc40798d97d63ad348bfaf7394f445d152d4 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 17 Aug 2006 18:12:38 -0700 Subject: [PATCH 15/17] [NETFILTER]: ctnetlink: fix deadlock in table dumping ip_conntrack_put must not be called while holding ip_conntrack_lock since destroy_conntrack takes it again. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 17 +++++++---------- net/netfilter/nf_conntrack_netlink.c | 17 +++++++---------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 33891bb1fde4..0d4cc92391fa 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -415,21 +415,18 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0], *id); read_lock_bh(&ip_conntrack_lock); + last = (struct ip_conntrack *)cb->args[1]; for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { restart: - last = (struct ip_conntrack *)cb->args[1]; list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); - if (last != NULL) { - if (ct == last) { - ip_conntrack_put(last); - cb->args[1] = 0; - last = NULL; - } else + if (cb->args[1]) { + if (ct != last) continue; + cb->args[1] = 0; } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, @@ -440,17 +437,17 @@ restart: goto out; } } - if (last != NULL) { - ip_conntrack_put(last); + if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: read_unlock_bh(&ip_conntrack_lock); + if (last) + ip_conntrack_put(last); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); - return skb->len; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index af4845971f70..6527d4e048d8 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -429,9 +429,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0], *id); read_lock_bh(&nf_conntrack_lock); + last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: - last = (struct nf_conn *)cb->args[1]; list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { h = (struct nf_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -442,13 +442,10 @@ restart: * then dump everything. */ if (l3proto && L3PROTO(ct) != l3proto) continue; - if (last != NULL) { - if (ct == last) { - nf_ct_put(last); - cb->args[1] = 0; - last = NULL; - } else + if (cb->args[1]) { + if (ct != last) continue; + cb->args[1] = 0; } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, @@ -459,17 +456,17 @@ restart: goto out; } } - if (last != NULL) { - nf_ct_put(last); + if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: read_unlock_bh(&nf_conntrack_lock); + if (last) + nf_ct_put(last); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); - return skb->len; } From 8311731afc439f508ab4d759edadedae75afb73e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 17 Aug 2006 18:13:53 -0700 Subject: [PATCH 16/17] [NETFILTER]: ip_tables: fix table locking in ipt_do_table table->private might change because of ruleset changes, don't use it without holding the lock. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_tables.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f316ff5fd8a6..048514f15f2f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -230,7 +230,7 @@ ipt_do_table(struct sk_buff **pskb, const char *indev, *outdev; void *table_base; struct ipt_entry *e, *back; - struct xt_table_info *private = table->private; + struct xt_table_info *private; /* Initialization */ ip = (*pskb)->nh.iph; @@ -247,6 +247,7 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + private = table->private; table_base = (void *)private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); From 78eb887733ec8ff5d6e6c69e3c32a187a9303622 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 17 Aug 2006 18:22:32 -0700 Subject: [PATCH 17/17] [BRIDGE]: Disable SG/GSO if TX checksum is off When the bridge recomputes features, it does not maintain the constraint that SG/GSO must be off if TX checksum is off. This patch adds that constraint. On a completely unrelated note, I've also added TSO6 and TSO_ECN feature bits if GSO is enabled on the underlying device through the new NETIF_F_GSO_SOFTWARE macro. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/bridge/br_if.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0c2b46face1..50a4719512ed 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -320,6 +320,9 @@ struct net_device #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT) #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) + /* List of features with software fallbacks. */ +#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) + #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f55ef682ef84..b1211d5342f6 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -386,12 +386,17 @@ void br_features_recompute(struct net_bridge *br) checksum = 0; if (feature & NETIF_F_GSO) - feature |= NETIF_F_TSO; + feature |= NETIF_F_GSO_SOFTWARE; feature |= NETIF_F_GSO; features &= feature; } + if (!(checksum & NETIF_F_ALL_CSUM)) + features &= ~NETIF_F_SG; + if (!(features & NETIF_F_SG)) + features &= ~NETIF_F_GSO_MASK; + br->dev->features = features | checksum | NETIF_F_LLTX | NETIF_F_GSO_ROBUST; }