1
0
Fork 0
remarkable-linux/net/core/neighbour.c

3292 lines
80 KiB
C
Raw Permalink Normal View History

/*
* Generic address resolution entity
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes:
* Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
* Harald Welte Add neighbour cache statistics like rtstat
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 02:04:11 -06:00
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/log2.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>
#define DEBUG
#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...) \
do { \
if (level <= NEIGH_DEBUG) \
pr_debug(fmt, ##__VA_ARGS__); \
} while (0)
#define PNEIGH_HASHMASK 0xF
static void neigh_timer_handler(unsigned long arg);
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid);
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
struct net_device *dev);
#ifdef CONFIG_PROC_FS
static const struct file_operations neigh_stat_seq_fops;
#endif
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
- All the scans/updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
to protocol backends, no attempts to send something to network.
It will result in deadlocks, if backend/driver wants to use neighbour
cache.
- If the entry requires some non-trivial actions, increase
its reference count and release table lock.
Neighbour entries are protected:
- with reference count.
- with rwlock neigh->lock
Reference count prevents destruction.
neigh->lock mainly serializes ll address data and its validity state.
However, the same lock is used to protect another entry fields:
- timer
- resolution queue
Again, nothing clever shall be made under neigh->lock,
the most complicated procedure, which we allow is dev->hard_header.
It is supposed, that dev->hard_header is simplistic and does
not make callbacks to neighbour tables.
*/
static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
{
kfree_skb(skb);
return -ENETDOWN;
}
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
if (neigh->parms->neigh_cleanup)
neigh->parms->neigh_cleanup(neigh);
__neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
/*
* It is random distribution in the interval (1/2)*base...(3/2)*base.
* It corresponds to default IPv6 settings and is not overridable,
* because it is really reasonable choice.
*/
unsigned long neigh_rand_reach_time(unsigned long base)
{
return base ? (prandom_u32() % base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
static bool neigh_del(struct neighbour *n, __u8 state,
struct neighbour __rcu **np, struct neigh_table *tbl)
{
bool retval = false;
write_lock(&n->lock);
if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) {
struct neighbour *neigh;
neigh = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(*np, neigh);
n->dead = 1;
retval = true;
}
write_unlock(&n->lock);
if (retval)
neigh_cleanup_and_release(n);
return retval;
}
bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
{
struct neigh_hash_table *nht;
void *pkey = ndel->primary_key;
u32 hash_val;
struct neighbour *n;
struct neighbour __rcu **np;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
hash_val = hash_val >> (32 - nht->hash_shift);
np = &nht->hash_buckets[hash_val];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock)))) {
if (n == ndel)
return neigh_del(n, 0, np, tbl);
np = &n->next;
}
return false;
}
static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
int i;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np;
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
/* Neighbour record may be discarded if:
* - nobody refers to it.
* - it is not permanent
*/
if (neigh_del(n, NUD_PERMANENT, np, tbl)) {
shrunk = 1;
continue;
}
np = &n->next;
}
}
tbl->last_flush = jiffies;
write_unlock_bh(&tbl->lock);
return shrunk;
}
static void neigh_add_timer(struct neighbour *n, unsigned long when)
{
neigh_hold(n);
if (unlikely(mod_timer(&n->timer, when))) {
printk("NEIGH: BUG, double timer add, state is %x\n",
n->nud_state);
dump_stack();
}
}
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
del_timer(&n->timer)) {
neigh_release(n);
return 1;
}
return 0;
}
static void pneigh_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = skb_dequeue(list)) != NULL) {
dev_put(skb->dev);
kfree_skb(skb);
}
}
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
{
int i;
struct neigh_hash_table *nht;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
if (dev && n->dev != dev) {
np = &n->next;
continue;
}
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
write_lock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
We must destroy neighbour entry,
but someone still uses it.
The destroy will be delayed until
the last user releases us, but
we must kill timers etc. and move
it to safe state.
*/
__skb_queue_purge(&n->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
else
n->nud_state = NUD_NONE;
neigh_dbg(2, "neigh %p is stray\n", n);
}
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
}
}
}
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
write_unlock_bh(&tbl->lock);
}
EXPORT_SYMBOL(neigh_changeaddr);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
pneigh_ifdown_and_unlock(tbl, dev);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
return 0;
}
EXPORT_SYMBOL(neigh_ifdown);
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries;
entries = atomic_inc_return(&tbl->entries) - 1;
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
entries >= tbl->gc_thresh3) {
net_info_ratelimited("%s: neighbor table overflow!\n",
tbl->id);
NEIGH_CACHE_STAT_INC(tbl, table_fulls);
goto out_entries;
}
}
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
__skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
refcount_set(&n->refcnt, 1);
n->dead = 1;
out:
return n;
out_entries:
atomic_dec(&tbl->entries);
goto out;
}
static void neigh_get_hash_rnd(u32 *x)
{
*x = get_random_u32() | 1;
}
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
size_t size = (1 << shift) * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
struct neighbour __rcu **buckets;
int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
if (size <= PAGE_SIZE)
buckets = kzalloc(size, GFP_ATOMIC);
else
buckets = (struct neighbour __rcu **)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
if (!buckets) {
kfree(ret);
return NULL;
}
ret->hash_buckets = buckets;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
static void neigh_hash_free_rcu(struct rcu_head *head)
{
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
struct neighbour __rcu **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
kfree(buckets);
else
free_pages((unsigned long)buckets, get_order(size));
kfree(nht);
}
static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
unsigned long new_shift)
{
unsigned int i, hash;
struct neigh_hash_table *new_nht, *old_nht;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
old_nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
new_nht = neigh_hash_alloc(new_shift);
if (!new_nht)
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
struct neighbour *n, *next;
for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
lockdep_is_held(&tbl->lock));
n != NULL;
n = next) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
next = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(n->next,
rcu_dereference_protected(
new_nht->hash_buckets[hash],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(new_nht->hash_buckets[hash], n);
}
}
rcu_assign_pointer(tbl->nht, new_nht);
call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
return new_nht;
}
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
struct neighbour *n;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
n = __neigh_lookup_noref(tbl, pkey, dev);
if (n) {
if (!refcount_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
}
rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup);
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
const void *pkey)
{
struct neighbour *n;
int key_len = tbl->key_len;
u32 hash_val;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
n = rcu_dereference_bh(n->next)) {
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
if (!memcmp(n->primary_key, pkey, key_len) &&
net_eq(dev_net(n->dev), net)) {
if (!refcount_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup_nodev);
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
u32 hash_val;
int key_len = tbl->key_len;
int error;
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
}
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
dev_hold(dev);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
if (dev->netdev_ops->ndo_neigh_construct) {
error = dev->netdev_ops->ndo_neigh_construct(dev, n);
if (error < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
}
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock));
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
n->dead = 0;
if (want_ref)
neigh_hold(n);
rcu_assign_pointer(n->next,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
write_unlock_bh(&tbl->lock);
out_neigh_release:
neigh_release(n);
goto out;
}
EXPORT_SYMBOL(__neigh_create);
static u32 pneigh_hash(const void *pkey, int key_len)
{
u32 hash_val = *(u32 *)(pkey + key_len - 4);
hash_val ^= (hash_val >> 16);
hash_val ^= hash_val >> 8;
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
return hash_val;
}
static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
struct net *net,
const void *pkey,
int key_len,
struct net_device *dev)
{
while (n) {
if (!memcmp(n->key, pkey, key_len) &&
net_eq(pneigh_net(n), net) &&
(n->dev == dev || !n->dev))
return n;
n = n->next;
}
return NULL;
}
struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
struct net *net, const void *pkey, struct net_device *dev)
{
int key_len = tbl->key_len;
u32 hash_val = pneigh_hash(pkey, key_len);
return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
net, pkey, key_len, dev);
}
EXPORT_SYMBOL_GPL(__pneigh_lookup);
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
struct net *net, const void *pkey,
struct net_device *dev, int creat)
{
struct pneigh_entry *n;
int key_len = tbl->key_len;
u32 hash_val = pneigh_hash(pkey, key_len);
read_lock_bh(&tbl->lock);
n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
net, pkey, key_len, dev);
read_unlock_bh(&tbl->lock);
if (n || !creat)
goto out;
ASSERT_RTNL();
n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
if (!n)
goto out;
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
if (dev)
dev_hold(dev);
if (tbl->pconstructor && tbl->pconstructor(n)) {
if (dev)
dev_put(dev);
kfree(n);
n = NULL;
goto out;
}
write_lock_bh(&tbl->lock);
n->next = tbl->phash_buckets[hash_val];
tbl->phash_buckets[hash_val] = n;
write_unlock_bh(&tbl->lock);
out:
return n;
}
EXPORT_SYMBOL(pneigh_lookup);
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
struct net_device *dev)
{
struct pneigh_entry *n, **np;
int key_len = tbl->key_len;
u32 hash_val = pneigh_hash(pkey, key_len);
write_lock_bh(&tbl->lock);
for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
np = &n->next) {
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
net_eq(pneigh_net(n), net)) {
*np = n->next;
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
kfree(n);
return 0;
}
}
write_unlock_bh(&tbl->lock);
return -ENOENT;
}
static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
struct net_device *dev)
{
struct pneigh_entry *n, **np, *freelist = NULL;
u32 h;
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
np = &tbl->phash_buckets[h];
while ((n = *np) != NULL) {
if (!dev || n->dev == dev) {
*np = n->next;
n->next = freelist;
freelist = n;
continue;
}
np = &n->next;
}
}
write_unlock_bh(&tbl->lock);
while ((n = freelist)) {
freelist = n->next;
n->next = NULL;
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
kfree(n);
}
return -ENOENT;
}
static void neigh_parms_destroy(struct neigh_parms *parms);
static inline void neigh_parms_put(struct neigh_parms *parms)
{
if (refcount_dec_and_test(&parms->refcnt))
neigh_parms_destroy(parms);
}
/*
* neighbour must already be out of the table;
*
*/
void neigh_destroy(struct neighbour *neigh)
{
struct net_device *dev = neigh->dev;
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
pr_warn("Destroying alive neighbour %p\n", neigh);
dump_stack();
return;
}
if (neigh_del_timer(neigh))
pr_warn("Impossible event\n");
write_lock_bh(&neigh->lock);
__skb_queue_purge(&neigh->arp_queue);
write_unlock_bh(&neigh->lock);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
neigh->arp_queue_len_bytes = 0;
if (dev->netdev_ops->ndo_neigh_destroy)
dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
dev_put(dev);
neigh_parms_put(neigh->parms);
neigh_dbg(2, "neigh %p is destroyed\n", neigh);
atomic_dec(&neigh->tbl->entries);
kfree_rcu(neigh, rcu);
}
EXPORT_SYMBOL(neigh_destroy);
/* Neighbour state is suspicious;
disable fast path.
Called with write_locked neigh.
*/
static void neigh_suspect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->output = neigh->ops->output;
}
/* Neighbour state is OK;
enable fast path.
Called with write_locked neigh.
*/
static void neigh_connect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is connected\n", neigh);
neigh->output = neigh->ops->connected_output;
}
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
struct neighbour *n;
struct neighbour __rcu **np;
unsigned int i;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
/*
* periodically recompute ReachableTime from random function
*/
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
tbl->last_rand = jiffies;
list_for_each_entry(p, &tbl->parms_list, list)
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
unsigned int state;
write_lock(&n->lock);
state = n->nud_state;
if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
write_unlock(&n->lock);
goto next_elt;
}
if (time_before(n->used, n->confirmed))
n->used = n->confirmed;
if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
n->dead = 1;
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
next_elt:
np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
* grows while we are preempted.
*/
write_unlock_bh(&tbl->lock);
cond_resched();
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
}
out:
/* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
* ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
* BASE_REACHABLE_TIME.
*/
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
write_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) +
(n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) :
NEIGH_VAR(p, MCAST_PROBES));
}
static void neigh_invalidate(struct neighbour *neigh)
__releases(neigh->lock)
__acquires(neigh->lock)
{
struct sk_buff *skb;
NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
neigh_dbg(2, "neigh %p is failed\n", neigh);
neigh->updated = jiffies;
/* It is very thin place. report_unreachable is very complicated
routine. Particularly, it can hit the same neighbour entry!
So that, we try to be accurate and avoid dead loop. --ANK
*/
while (neigh->nud_state == NUD_FAILED &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
write_unlock(&neigh->lock);
neigh->ops->error_report(neigh, skb);
write_lock(&neigh->lock);
}
__skb_queue_purge(&neigh->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
neigh->arp_queue_len_bytes = 0;
}
static void neigh_probe(struct neighbour *neigh)
__releases(neigh->lock)
{
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
if (neigh->ops->solicit)
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
kfree_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
static void neigh_timer_handler(unsigned long arg)
{
unsigned long now, next;
struct neighbour *neigh = (struct neighbour *)arg;
unsigned int state;
int notify = 0;
write_lock(&neigh->lock);
state = neigh->nud_state;
now = jiffies;
next = now + HZ;
if (!(state & NUD_IN_TIMER))
goto out;
if (state & NUD_REACHABLE) {
if (time_before_eq(now,
neigh->confirmed + neigh->parms->reachable_time)) {
neigh_dbg(2, "neigh %p is still alive\n", neigh);
next = neigh->confirmed + neigh->parms->reachable_time;
} else if (time_before_eq(now,
neigh->used +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_suspect(neigh);
next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
} else {
neigh_dbg(2, "neigh %p is suspected\n", neigh);
neigh->nud_state = NUD_STALE;
neigh->updated = jiffies;
neigh_suspect(neigh);
notify = 1;
}
} else if (state & NUD_DELAY) {
if (time_before_eq(now,
neigh->confirmed +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
neigh_dbg(2, "neigh %p is now reachable\n", neigh);
neigh->nud_state = NUD_REACHABLE;
neigh->updated = jiffies;
neigh_connect(neigh);
notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
neigh_dbg(2, "neigh %p is probed\n", neigh);
neigh->nud_state = NUD_PROBE;
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
neigh->nud_state = NUD_FAILED;
notify = 1;
neigh_invalidate(neigh);
goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
if (time_before(next, jiffies + HZ/2))
next = jiffies + HZ/2;
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
neigh_probe(neigh);
} else {
out:
write_unlock(&neigh->lock);
}
if (notify)
neigh_update_notify(neigh, 0);
neigh_release(neigh);
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
bool immediate_probe = false;
write_lock_bh(&neigh->lock);
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
if (neigh->dead)
goto out_dead;
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
NEIGH_VAR(neigh->parms, APP_PROBES)) {
unsigned long next, now = jiffies;
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
HZ/2);
neigh_add_timer(neigh, next);
immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_add_timer(neigh, jiffies +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
while (neigh->arp_queue_len_bytes + skb->truesize >
NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
struct sk_buff *buff;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
buff = __skb_dequeue(&neigh->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
if (immediate_probe)
neigh_probe(neigh);
else
write_unlock(&neigh->lock);
local_bh_enable();
return rc;
out_dead:
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
kfree_skb(skb);
return 1;
}
EXPORT_SYMBOL(__neigh_event_send);
static void neigh_update_hhs(struct neighbour *neigh)
{
struct hh_cache *hh;
void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
= NULL;
if (neigh->dev->header_ops)
update = neigh->dev->header_ops->cache_update;
if (update) {
hh = &neigh->hh;
if (hh->hh_len) {
write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_sequnlock_bh(&hh->hh_lock);
}
}
}
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
-- flags
NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
if it is different.
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
lladdr instead of overriding it
if it is different.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router.
Caller MUST hold reference count on the entry.
*/
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
u32 flags, u32 nlmsg_pid)
{
u8 old;
int err;
int notify = 0;
struct net_device *dev;
int update_isrouter = 0;
write_lock_bh(&neigh->lock);
dev = neigh->dev;
old = neigh->nud_state;
err = -EPERM;
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
if (neigh->dead)
goto out;
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
neigh_suspect(neigh);
neigh->nud_state = new;
err = 0;
notify = old & NUD_VALID;
if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
(new & NUD_FAILED)) {
neigh_invalidate(neigh);
notify = 1;
}
goto out;
}
/* Compare new lladdr with cached one */
if (!dev->addr_len) {
/* First case: device needs no address. */
lladdr = neigh->ha;
} else if (lladdr) {
/* The second case: if something is already cached
and a new address is proposed:
- compare new & old
- if they are different, check override flag
*/
if ((old & NUD_VALID) &&
!memcmp(lladdr, neigh->ha, dev->addr_len))
lladdr = neigh->ha;
} else {
/* No address is supplied; if we know something,
use it, otherwise discard the request.
*/
err = -EINVAL;
if (!(old & NUD_VALID))
goto out;
lladdr = neigh->ha;
}
/* Update confirmed timestamp for neighbour entry after we
* received ARP packet even if it doesn't change IP to MAC binding.
*/
if (new & NUD_CONNECTED)
neigh->confirmed = jiffies;
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
err = 0;
update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
if (old & NUD_VALID) {
if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
update_isrouter = 0;
if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
(old & NUD_CONNECTED)) {
lladdr = neigh->ha;
new = NUD_STALE;
} else
goto out;
} else {
if (lladdr == neigh->ha && new == NUD_STALE &&
!(flags & NEIGH_UPDATE_F_ADMIN))
new = old;
}
}
/* Update timestamp only once we know we will make a change to the
neighbour: update neigh timestamps iff update is effective It's a common practice to send gratuitous ARPs after moving an IP address to another device to speed up healing of a service. To fulfill service availability constraints, the timing of network peers updating their caches to point to a new location of an IP address can be particularly important. Sometimes neigh_update calls won't touch neither lladdr nor state, for example if an update arrives in locktime interval. The neigh->updated value is tested by the protocol specific neigh code, which in turn will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the call to neigh_update() or not. As a result, we may effectively ignore the update request, bailing out of touching the neigh entry, except that we still bump its timestamps inside neigh_update. This may be a problem for updates arriving in quick succession. For example, consider the following scenario: A service is moved to another device with its IP address. The new device sends three gratuitous ARP requests into the network with ~1 seconds interval between them. Just before the first request arrives to one of network peer nodes, its neigh entry for the IP address transitions from STALE to DELAY. This transition, among other things, updates neigh->updated. Once the kernel receives the first gratuitous ARP, it ignores it because its arrival time is inside the locktime interval. The kernel still bumps neigh->updated. Then the second gratuitous ARP request arrives, and it's also ignored because it's still in the (new) locktime interval. Same happens for the third request. The node eventually heals itself (after delay_first_probe_time seconds since the initial transition to DELAY state), but it just wasted some time and require a new ARP request/reply round trip. This unfortunate behaviour both puts more load on the network, as well as reduces service availability. This patch changes neigh_update so that it bumps neigh->updated (as well as neigh->confirmed) only once we are sure that either lladdr or entry state will change). In the scenario described above, it means that the second gratuitous ARP request will actually update the entry lladdr. Ideally, we would update the neigh entry on the very first gratuitous ARP request. The locktime mechanism is designed to ignore ARP updates in a short timeframe after a previous ARP update was honoured by the kernel layer. This would require tracking timestamps for state transitions separately from timestamps when actual updates are received. This would probably involve changes in neighbour struct. Therefore, the patch doesn't tackle the issue of the first gratuitous APR ignored, leaving it for a follow-up. Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-16 09:44:24 -06:00
* neighbour entry. Otherwise we risk to move the locktime window with
* noop updates and ignore relevant ARP updates.
*/
if (new != old || lladdr != neigh->ha)
neighbour: update neigh timestamps iff update is effective It's a common practice to send gratuitous ARPs after moving an IP address to another device to speed up healing of a service. To fulfill service availability constraints, the timing of network peers updating their caches to point to a new location of an IP address can be particularly important. Sometimes neigh_update calls won't touch neither lladdr nor state, for example if an update arrives in locktime interval. The neigh->updated value is tested by the protocol specific neigh code, which in turn will influence whether NEIGH_UPDATE_F_OVERRIDE gets set in the call to neigh_update() or not. As a result, we may effectively ignore the update request, bailing out of touching the neigh entry, except that we still bump its timestamps inside neigh_update. This may be a problem for updates arriving in quick succession. For example, consider the following scenario: A service is moved to another device with its IP address. The new device sends three gratuitous ARP requests into the network with ~1 seconds interval between them. Just before the first request arrives to one of network peer nodes, its neigh entry for the IP address transitions from STALE to DELAY. This transition, among other things, updates neigh->updated. Once the kernel receives the first gratuitous ARP, it ignores it because its arrival time is inside the locktime interval. The kernel still bumps neigh->updated. Then the second gratuitous ARP request arrives, and it's also ignored because it's still in the (new) locktime interval. Same happens for the third request. The node eventually heals itself (after delay_first_probe_time seconds since the initial transition to DELAY state), but it just wasted some time and require a new ARP request/reply round trip. This unfortunate behaviour both puts more load on the network, as well as reduces service availability. This patch changes neigh_update so that it bumps neigh->updated (as well as neigh->confirmed) only once we are sure that either lladdr or entry state will change). In the scenario described above, it means that the second gratuitous ARP request will actually update the entry lladdr. Ideally, we would update the neigh entry on the very first gratuitous ARP request. The locktime mechanism is designed to ignore ARP updates in a short timeframe after a previous ARP update was honoured by the kernel layer. This would require tracking timestamps for state transitions separately from timestamps when actual updates are received. This would probably involve changes in neighbour struct. Therefore, the patch doesn't tackle the issue of the first gratuitous APR ignored, leaving it for a follow-up. Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-16 09:44:24 -06:00
neigh->updated = jiffies;
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_PROBE)
atomic_set(&neigh->probes, 0);
if (new & NUD_IN_TIMER)
neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
neigh->parms->reachable_time :
0)));
neigh->nud_state = new;
notify = 1;
}
if (lladdr != neigh->ha) {
write_seqlock(&neigh->ha_lock);
memcpy(&neigh->ha, lladdr, dev->addr_len);
write_sequnlock(&neigh->ha_lock);
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
(NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
notify = 1;
}
if (new == old)
goto out;
if (new & NUD_CONNECTED)
neigh_connect(neigh);
else
neigh_suspect(neigh);
if (!(old & NUD_VALID)) {
struct sk_buff *skb;
/* Again: avoid dead loop if something went wrong */
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
struct dst_entry *dst = skb_dst(skb);
struct neighbour *n2, *n1 = neigh;
write_unlock_bh(&neigh->lock);
neigh: fix rcu splat in neigh_update() when use dst_get_neighbour to get neighbour, we need rcu_read_lock to protect, since dst_get_neighbour uses rcu_dereference. The bug was reported by Ari Savolainen <ari.m.savolainen@gmail.com> [ 105.612095] [ 105.612096] =================================================== [ 105.612100] [ INFO: suspicious rcu_dereference_check() usage. ] [ 105.612101] --------------------------------------------------- [ 105.612103] include/net/dst.h:91 invoked rcu_dereference_check() without protection! [ 105.612105] [ 105.612106] other info that might help us debug this: [ 105.612106] [ 105.612108] [ 105.612108] rcu_scheduler_active = 1, debug_locks = 0 [ 105.612110] 1 lock held by dnsmasq/2618: [ 105.612111] #0: (rtnl_mutex){+.+.+.}, at: [<ffffffff815df8c7>] rtnl_lock+0x17/0x20 [ 105.612120] [ 105.612121] stack backtrace: [ 105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41 [ 105.612125] Call Trace: [ 105.612129] [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0 [ 105.612132] [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0 [ 105.612135] [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220 [ 105.612139] [<ffffffff81639298>] arp_req_set+0xb8/0x230 [ 105.612142] [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310 [ 105.612146] [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60 [ 105.612150] [<ffffffff8163fb75>] inet_ioctl+0x85/0x90 [ 105.612154] [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70 [ 105.612157] [<ffffffff815b55d3>] sock_ioctl+0x73/0x280 [ 105.612162] [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570 [ 105.612165] [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0 [ 105.612168] [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80 [ 105.612172] [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b Reported-by: Ari Savolainen <ari.m.savolainen@gmail.com> Signed-off-by: RongQing <roy.qing.li@gmail.com> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-10-17 16:32:42 -06:00
rcu_read_lock();
/* Why not just use 'neigh' as-is? The problem is that
* things such as shaper, eql, and sch_teql can end up
* using alternative, different, neigh objects to output
* the packet in the output path. So what we need to do
* here is re-lookup the top-level neigh in the path so
* we can reinject the packet there.
*/
n2 = NULL;
if (dst) {
n2 = dst_neigh_lookup_skb(dst, skb);
if (n2)
n1 = n2;
}
n1->output(n1, skb);
if (n2)
neigh_release(n2);
neigh: fix rcu splat in neigh_update() when use dst_get_neighbour to get neighbour, we need rcu_read_lock to protect, since dst_get_neighbour uses rcu_dereference. The bug was reported by Ari Savolainen <ari.m.savolainen@gmail.com> [ 105.612095] [ 105.612096] =================================================== [ 105.612100] [ INFO: suspicious rcu_dereference_check() usage. ] [ 105.612101] --------------------------------------------------- [ 105.612103] include/net/dst.h:91 invoked rcu_dereference_check() without protection! [ 105.612105] [ 105.612106] other info that might help us debug this: [ 105.612106] [ 105.612108] [ 105.612108] rcu_scheduler_active = 1, debug_locks = 0 [ 105.612110] 1 lock held by dnsmasq/2618: [ 105.612111] #0: (rtnl_mutex){+.+.+.}, at: [<ffffffff815df8c7>] rtnl_lock+0x17/0x20 [ 105.612120] [ 105.612121] stack backtrace: [ 105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41 [ 105.612125] Call Trace: [ 105.612129] [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0 [ 105.612132] [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0 [ 105.612135] [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220 [ 105.612139] [<ffffffff81639298>] arp_req_set+0xb8/0x230 [ 105.612142] [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310 [ 105.612146] [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60 [ 105.612150] [<ffffffff8163fb75>] inet_ioctl+0x85/0x90 [ 105.612154] [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70 [ 105.612157] [<ffffffff815b55d3>] sock_ioctl+0x73/0x280 [ 105.612162] [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570 [ 105.612165] [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0 [ 105.612168] [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80 [ 105.612172] [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b Reported-by: Ari Savolainen <ari.m.savolainen@gmail.com> Signed-off-by: RongQing <roy.qing.li@gmail.com> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-10-17 16:32:42 -06:00
rcu_read_unlock();
write_lock_bh(&neigh->lock);
}
__skb_queue_purge(&neigh->arp_queue);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
(neigh->flags | NTF_ROUTER) :
(neigh->flags & ~NTF_ROUTER);
}
write_unlock_bh(&neigh->lock);
if (notify)
neigh_update_notify(neigh, nlmsg_pid);
return err;
}
EXPORT_SYMBOL(neigh_update);
/* Update the neigh to listen temporarily for probe responses, even if it is
* in a NUD_FAILED state. The caller has to hold neigh->lock for writing.
*/
void __neigh_set_probe_once(struct neighbour *neigh)
{
if (neigh->dead)
return;
neigh->updated = jiffies;
if (!(neigh->nud_state & NUD_FAILED))
return;
neigh->nud_state = NUD_INCOMPLETE;
atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
}
EXPORT_SYMBOL(__neigh_set_probe_once);
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
u8 *lladdr, void *saddr,
struct net_device *dev)
{
struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
lladdr || !dev->addr_len);
if (neigh)
neigh_update(neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_OVERRIDE, 0);
return neigh;
}
EXPORT_SYMBOL(neigh_event_ns);
/* called with read_lock_bh(&n->lock); */
static void neigh_hh_init(struct neighbour *n)
{
struct net_device *dev = n->dev;
__be16 prot = n->tbl->protocol;
struct hh_cache *hh = &n->hh;
write_lock_bh(&n->lock);
/* Only one thread can come in here and initialize the
* hh_cache entry.
*/
if (!hh->hh_len)
dev->header_ops->cache(n, hh, prot);
write_unlock_bh(&n->lock);
}
/* Slow and careful. */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
int rc = 0;
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
if (dev->header_ops->cache && !neigh->hh.hh_len)
neigh_hh_init(neigh);
do {
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
EXPORT_SYMBOL(neigh_resolve_output);
/* As fast as possible without hh cache */
int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
{
struct net_device *dev = neigh->dev;
unsigned int seq;
int err;
do {
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
err = dev_queue_xmit(skb);
else {
err = -EINVAL;
kfree_skb(skb);
}
return err;
}
EXPORT_SYMBOL(neigh_connected_output);
int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
{
return dev_queue_xmit(skb);
}
EXPORT_SYMBOL(neigh_direct_output);
static void neigh_proxy_process(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table *)arg;
long sched_next = 0;
unsigned long now = jiffies;
struct sk_buff *skb, *n;
spin_lock(&tbl->proxy_queue.lock);
skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
long tdif = NEIGH_CB(skb)->sched_next - now;
if (tdif <= 0) {
struct net_device *dev = skb->dev;
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 13:32:42 -06:00
__skb_unlink(skb, &tbl->proxy_queue);
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 13:32:42 -06:00
if (tbl->proxy_redo && netif_running(dev)) {
rcu_read_lock();
tbl->proxy_redo(skb);
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 13:32:42 -06:00
rcu_read_unlock();
} else {
kfree_skb(skb);
arp: fix rcu lockdep splat in arp_process() Dave Jones reported a lockdep splat triggered by an arp_process() call from parp_redo(). Commit faa9dcf793be (arp: RCU changes) is the origin of the bug, since it assumed arp_process() was called under rcu_read_lock(), which is not true in this particular path. Instead of adding rcu_read_lock() in parp_redo(), I chose to add it in neigh_proxy_process() to take care of IPv6 side too. =================================================== [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/inetdevice.h:209 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 4 locks held by setfiles/2123: #0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8114cbc4>] walk_component+0x1ef/0x3e8 #1: (&isec->lock){+.+.+.}, at: [<ffffffff81204bca>] inode_doinit_with_dentry+0x3f/0x41f #2: (&tbl->proxy_timer){+.-...}, at: [<ffffffff8106a803>] run_timer_softirq+0x157/0x372 #3: (class){+.-...}, at: [<ffffffff8141f256>] neigh_proxy_process +0x36/0x103 stack backtrace: Pid: 2123, comm: setfiles Tainted: G W 3.1.0-0.rc2.git7.2.fc16.x86_64 #1 Call Trace: <IRQ> [<ffffffff8108ca23>] lockdep_rcu_dereference+0xa7/0xaf [<ffffffff8146a0b7>] __in_dev_get_rcu+0x55/0x5d [<ffffffff8146a751>] arp_process+0x25/0x4d7 [<ffffffff8146ac11>] parp_redo+0xe/0x10 [<ffffffff8141f2ba>] neigh_proxy_process+0x9a/0x103 [<ffffffff8106a8c4>] run_timer_softirq+0x218/0x372 [<ffffffff8106a803>] ? run_timer_softirq+0x157/0x372 [<ffffffff8141f220>] ? neigh_stat_seq_open+0x41/0x41 [<ffffffff8108f2f0>] ? mark_held_locks+0x6d/0x95 [<ffffffff81062bb6>] __do_softirq+0x112/0x25a [<ffffffff8150d27c>] call_softirq+0x1c/0x30 [<ffffffff81010bf5>] do_softirq+0x4b/0xa2 [<ffffffff81062f65>] irq_exit+0x5d/0xcf [<ffffffff8150dc11>] smp_apic_timer_interrupt+0x7c/0x8a [<ffffffff8150baf3>] apic_timer_interrupt+0x73/0x80 <EOI> [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff814fc285>] ? __slab_free+0x30/0x24c [<ffffffff814fc283>] ? __slab_free+0x2e/0x24c [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204e74>] ? inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81130cb0>] kfree+0x108/0x131 [<ffffffff81204e74>] inode_doinit_with_dentry+0x2e9/0x41f [<ffffffff81204fc6>] selinux_d_instantiate+0x1c/0x1e [<ffffffff81200f4f>] security_d_instantiate+0x21/0x23 [<ffffffff81154625>] d_instantiate+0x5c/0x61 [<ffffffff811563ca>] d_splice_alias+0xbc/0xd2 [<ffffffff811b17ff>] ext4_lookup+0xba/0xeb [<ffffffff8114bf1e>] d_alloc_and_lookup+0x45/0x6b [<ffffffff8114cbea>] walk_component+0x215/0x3e8 [<ffffffff8114cdf8>] lookup_last+0x3b/0x3d [<ffffffff8114daf3>] path_lookupat+0x82/0x2af [<ffffffff8110fc53>] ? might_fault+0xa5/0xac [<ffffffff8110fc0a>] ? might_fault+0x5c/0xac [<ffffffff8114c564>] ? getname_flags+0x31/0x1ca [<ffffffff8114dd48>] do_path_lookup+0x28/0x97 [<ffffffff8114df2c>] user_path_at+0x59/0x96 [<ffffffff811467ad>] ? cp_new_stat+0xf7/0x10d [<ffffffff811469a6>] vfs_fstatat+0x44/0x6e [<ffffffff811469ee>] vfs_lstat+0x1e/0x20 [<ffffffff81146b3d>] sys_newlstat+0x1a/0x33 [<ffffffff8108f439>] ? trace_hardirqs_on_caller+0x121/0x158 [<ffffffff812535fe>] ? trace_hardirqs_on_thunk+0x3a/0x3f [<ffffffff8150af82>] system_call_fastpath+0x16/0x1b Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2011-08-22 13:32:42 -06:00
}
dev_put(dev);
} else if (!sched_next || tdif < sched_next)
sched_next = tdif;
}
del_timer(&tbl->proxy_timer);
if (sched_next)
mod_timer(&tbl->proxy_timer, jiffies + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
unsigned long now = jiffies;
unsigned long sched_next = now + (prandom_u32() %
NEIGH_VAR(p, PROXY_DELAY));
if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
NEIGH_CB(skb)->sched_next = sched_next;
NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
if (del_timer(&tbl->proxy_timer)) {
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
EXPORT_SYMBOL(pneigh_enqueue);
static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
struct net *net, int ifindex)
{
struct neigh_parms *p;
list_for_each_entry(p, &tbl->parms_list, list) {
if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
(!p->dev && !ifindex && net_eq(net, &init_net)))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
return p;
}
return NULL;
}
struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
struct neigh_table *tbl)
{
struct neigh_parms *p;
struct net *net = dev_net(dev);
const struct net_device_ops *ops = dev->netdev_ops;
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl = tbl;
refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
dev_hold(dev);
p->dev = dev;
write_pnet(&p->net, net);
p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
dev_put(dev);
kfree(p);
return NULL;
}
write_lock_bh(&tbl->lock);
list_add(&p->list, &tbl->parms.list);
write_unlock_bh(&tbl->lock);
neigh_parms_data_state_cleanall(p);
}
return p;
}
EXPORT_SYMBOL(neigh_parms_alloc);
static void neigh_rcu_free_parms(struct rcu_head *head)
{
struct neigh_parms *parms =
container_of(head, struct neigh_parms, rcu_head);
neigh_parms_put(parms);
}
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
if (!parms || parms == &tbl->parms)
return;
write_lock_bh(&tbl->lock);
list_del(&parms->list);
parms->dead = 1;
write_unlock_bh(&tbl->lock);
if (parms->dev)
dev_put(parms->dev);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
static void neigh_parms_destroy(struct neigh_parms *parms)
{
kfree(parms);
}
static struct lock_class_key neigh_table_proxy_queue_class;
static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
void neigh_table_init(int index, struct neigh_table *tbl)
{
unsigned long now = jiffies;
unsigned long phsize;
INIT_LIST_HEAD(&tbl->parms_list);
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
refcount_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
#ifdef CONFIG_PROC_FS
if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
&neigh_stat_seq_fops, tbl))
panic("cannot create neighbour proc dir entry");
#endif
RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3));
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
if (!tbl->entry_size)
tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) +
tbl->key_len, NEIGH_PRIV_ALIGN);
else
WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
rwlock_init(&tbl->lock);
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.reachable_time);
setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
skb_queue_head_init_class(&tbl->proxy_queue,
&neigh_table_proxy_queue_class);
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
[NEIGH]: Fix IP-over-ATM and ARP interaction. The classical IP over ATM code maintains its own IPv4 <-> <ATM stuff> ARP table, using the standard neighbour-table code. The neigh_table_init function adds this neighbour table to a linked list of all neighbor tables which is used by the functions neigh_delete() neigh_add() and neightbl_set(), all called by the netlink code. Once the ATM neighbour table is added to the list, there are two tables with family == AF_INET there, and ARP entries sent via netlink go into the first table with matching family. This is indeterminate and often wrong. To see the bug, on a kernel with CLIP enabled, create a standard IPv4 ARP entry by pinging an unused address on a local subnet. Then attempt to complete that entry by doing ip neigh replace <ip address> lladdr <some mac address> nud reachable Looking at the ARP tables by using ip neigh show will reveal two ARP entries for the same address. One of these can be found in /proc/net/arp, and the other in /proc/net/atm/arp. This patch adds a new function, neigh_table_init_no_netlink() which does everything the neigh_table_init() does, except add the table to the netlink all-arp-tables chain. In addition neigh_table_init() has a check that all tables on the chain have a distinct address family. The init call in clip.c is changed to call neigh_table_init_no_netlink(). Since ATM ARP tables are rather more complicated than can currently be handled by the available rtattrs in the netlink protocol, no functionality is lost by this patch, and non-ATM ARP manipulation via netlink is rescued. A more complete solution would involve a rtattr for ATM ARP entries and some way for the netlink code to give neigh_add and friends more information than just address family with which to find the correct ARP table. [ I've changed the assertion checking in neigh_table_init() to not use BUG_ON() while holding neigh_tbl_lock. Instead we remember that we found an existing tbl with the same family, and after dropping the lock we'll give a diagnostic kernel log message and a stack dump. -DaveM ] Signed-off-by: Simon Kelley <simon@thekelleys.org.uk> Signed-off-by: David S. Miller <davem@davemloft.net>
2006-05-12 15:56:08 -06:00
neigh_tables[index] = tbl;
}
EXPORT_SYMBOL(neigh_table_init);
int neigh_table_clear(int index, struct neigh_table *tbl)
{
neigh_tables[index] = NULL;
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
neigh_hash_free_rcu);
tbl->nht = NULL;
kfree(tbl->phash_buckets);
tbl->phash_buckets = NULL;
[NET]: Remove /proc/net/stat/*_arp_cache upon module removal neigh_table_init_no_netlink() creates them, but they aren't removed anywhere. Steps to reproduce: modprobe clip rmmod clip cat /proc/net/stat/clip_arp_cache BUG: unable to handle kernel paging request at virtual address f89d7758 printing eip: c05a99da *pdpt = 0000000000004001 *pde = 0000000004408067 *pte = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: atm af_packet ipv6 binfmt_misc sbs sbshc fan dock battery backlight ac power_supply parport loop rtc_cmos rtc_core rtc_lib serio_raw button k8temp hwmon amd_rng sr_mod cdrom shpchp pci_hotplug ehci_hcd ohci_hcd uhci_hcd usbcore Pid: 2082, comm: cat Not tainted (2.6.24-rc1-b1d08ac064268d0ae2281e98bf5e82627e0f0c56-bloat #4) EIP: 0060:[<c05a99da>] EFLAGS: 00210256 CPU: 0 EIP is at neigh_stat_seq_next+0x26/0x3f EAX: 00000001 EBX: f89d7600 ECX: c587bf40 EDX: 00000000 ESI: 00000000 EDI: 00000001 EBP: 00000400 ESP: c587bf1c DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 Process cat (pid: 2082, ti=c587b000 task=c5984e10 task.ti=c587b000) Stack: c06228cc c5313790 c049e5c0 0804f000 c45a7b00 c53137b0 00000000 00000000 00000082 00000001 00000000 00000000 00000000 fffffffb c58d6780 c049e437 c45a7b00 c04b1f93 c587bfa0 00000400 0804f000 00000400 0804f000 c04b1f2f Call Trace: [<c049e5c0>] seq_read+0x189/0x281 [<c049e437>] seq_read+0x0/0x281 [<c04b1f93>] proc_reg_read+0x64/0x77 [<c04b1f2f>] proc_reg_read+0x0/0x77 [<c048907e>] vfs_read+0x80/0xd1 [<c0489491>] sys_read+0x41/0x67 [<c04080fa>] sysenter_past_esp+0x6b/0xc1 ======================= Code: e9 ec 8d 05 00 56 8b 11 53 8b 40 70 8b 58 3c eb 29 0f a3 15 80 91 7b c0 19 c0 85 c0 8d 42 01 74 17 89 c6 c1 fe 1f 89 01 89 71 04 <8b> 83 58 01 00 00 f7 d0 8b 04 90 eb 09 89 c2 83 fa 01 7e d2 31 EIP: [<c05a99da>] neigh_stat_seq_next+0x26/0x3f SS:ESP 0068:c587bf1c Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-11-05 22:28:13 -07:00
remove_proc_entry(tbl->id, init_net.proc_net_stat);
free_percpu(tbl->stats);
tbl->stats = NULL;
return 0;
}
EXPORT_SYMBOL(neigh_table_clear);
static struct neigh_table *neigh_find_table(int family)
{
struct neigh_table *tbl = NULL;
switch (family) {
case AF_INET:
tbl = neigh_tables[NEIGH_ARP_TABLE];
break;
case AF_INET6:
tbl = neigh_tables[NEIGH_ND_TABLE];
break;
case AF_DECnet:
tbl = neigh_tables[NEIGH_DN_TABLE];
break;
}
return tbl;
}
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *dst_attr;
struct neigh_table *tbl;
struct neighbour *neigh;
struct net_device *dev = NULL;
int err = -EINVAL;
ASSERT_RTNL();
if (nlmsg_len(nlh) < sizeof(*ndm))
goto out;
dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
if (dst_attr == NULL)
goto out;
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
}
tbl = neigh_find_table(ndm->ndm_family);
if (tbl == NULL)
return -EAFNOSUPPORT;
if (nla_len(dst_attr) < tbl->key_len)
goto out;
if (ndm->ndm_flags & NTF_PROXY) {
err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
goto out;
}
if (dev == NULL)
goto out;
neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
if (neigh == NULL) {
err = -ENOENT;
goto out;
}
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN,
NETLINK_CB(skb).portid);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
neigh_remove_one(neigh, tbl);
write_unlock_bh(&tbl->lock);
out:
return err;
}
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct neigh_table *tbl;
struct net_device *dev = NULL;
struct neighbour *neigh;
void *dst, *lladdr;
int err;
ASSERT_RTNL();
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
if (err < 0)
goto out;
err = -EINVAL;
if (tb[NDA_DST] == NULL)
goto out;
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
goto out;
}
tbl = neigh_find_table(ndm->ndm_family);
if (tbl == NULL)
return -EAFNOSUPPORT;
if (nla_len(tb[NDA_DST]) < tbl->key_len)
goto out;
dst = nla_data(tb[NDA_DST]);
lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
if (ndm->ndm_flags & NTF_PROXY) {
struct pneigh_entry *pn;
err = -ENOBUFS;
pn = pneigh_lookup(tbl, net, dst, dev, 1);
if (pn) {
pn->flags = ndm->ndm_flags;
err = 0;
}
goto out;
}
if (dev == NULL)
goto out;
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
goto out;
}
neigh = __neigh_lookup_errno(tbl, dst, dev);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
goto out;
}
} else {
if (nlh->nlmsg_flags & NLM_F_EXCL) {
err = -EEXIST;
neigh_release(neigh);
goto out;
}
if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
} else
err = neigh_update(neigh, lladdr, ndm->ndm_state, flags,
NETLINK_CB(skb).portid);
neigh_release(neigh);
out:
return err;
}
static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
{
struct nlattr *nest;
nest = nla_nest_start(skb, NDTA_PARMS);
if (nest == NULL)
return -ENOBUFS;
if ((parms->dev &&
nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
/* approximative value for deprecated QUEUE_LEN (in packets) */
nla_put_u32(skb, NDTPA_QUEUE_LEN,
NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) ||
nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) ||
nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) ||
nla_put_u32(skb, NDTPA_UCAST_PROBES,
NEIGH_VAR(parms, UCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_PROBES,
NEIGH_VAR(parms, MCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_REPROBES,
NEIGH_VAR(parms, MCAST_REPROBES)) ||
nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_GC_STALETIME,
NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME,
NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_RETRANS_TIME,
NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_ANYCAST_DELAY,
NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_PROXY_DELAY,
NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_LOCKTIME,
NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -EMSGSIZE;
}
static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
u32 pid, u32 seq, int type, int flags)
{
struct nlmsghdr *nlh;
struct ndtmsg *ndtmsg;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) ||
nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
goto nla_put_failure;
{
unsigned long now = jiffies;
unsigned int flush_delta = now - tbl->last_flush;
unsigned int rand_delta = now - tbl->last_rand;
struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
.ndtc_entry_size = tbl->entry_size,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
.ndtc_proxy_qlen = tbl->proxy_queue.qlen,
};
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
ndc.ndtc_hash_rnd = nht->hash_rnd[0];
ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
rcu_read_unlock_bh();
if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
goto nla_put_failure;
}
{
int cpu;
struct ndt_stats ndst;
memset(&ndst, 0, sizeof(ndst));
for_each_possible_cpu(cpu) {
struct neigh_statistics *st;
st = per_cpu_ptr(tbl->stats, cpu);
ndst.ndts_allocs += st->allocs;
ndst.ndts_destroys += st->destroys;
ndst.ndts_hash_grows += st->hash_grows;
ndst.ndts_res_failed += st->res_failed;
ndst.ndts_lookups += st->lookups;
ndst.ndts_hits += st->hits;
ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
ndst.ndts_forced_gc_runs += st->forced_gc_runs;
ndst.ndts_table_fulls += st->table_fulls;
}
if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
NDTA_PAD))
goto nla_put_failure;
}
BUG_ON(tbl->parms.dev);
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
read_unlock_bh(&tbl->lock);
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-16 14:09:00 -07:00
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int neightbl_fill_param_info(struct sk_buff *skb,
struct neigh_table *tbl,
struct neigh_parms *parms,
u32 pid, u32 seq, int type,
unsigned int flags)
{
struct ndtmsg *ndtmsg;
struct nlmsghdr *nlh;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
neightbl_fill_parms(skb, parms) < 0)
goto errout;
read_unlock_bh(&tbl->lock);
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-16 14:09:00 -07:00
nlmsg_end(skb, nlh);
return 0;
errout:
read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
[NDTA_NAME] = { .type = NLA_STRING },
[NDTA_THRESH1] = { .type = NLA_U32 },
[NDTA_THRESH2] = { .type = NLA_U32 },
[NDTA_THRESH3] = { .type = NLA_U32 },
[NDTA_GC_INTERVAL] = { .type = NLA_U64 },
[NDTA_PARMS] = { .type = NLA_NESTED },
};
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
[NDTPA_MCAST_PROBES] = { .type = NLA_U32 },
[NDTPA_MCAST_REPROBES] = { .type = NLA_U32 },
[NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 },
[NDTPA_GC_STALETIME] = { .type = NLA_U64 },
[NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 },
[NDTPA_RETRANS_TIME] = { .type = NLA_U64 },
[NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
[NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
};
static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
struct nlattr *tb[NDTA_MAX+1];
bool found = false;
int err, tidx;
err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
nl_neightbl_policy, extack);
if (err < 0)
goto errout;
if (tb[NDTA_NAME] == NULL) {
err = -EINVAL;
goto errout;
}
ndtmsg = nlmsg_data(nlh);
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
tbl = neigh_tables[tidx];
if (!tbl)
continue;
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
found = true;
break;
}
}
if (!found)
return -ENOENT;
/*
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
write_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
struct neigh_parms *p;
int i, ifindex = 0;
err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
nl_ntbl_parm_policy, extack);
if (err < 0)
goto errout_tbl_lock;
if (tbp[NDTPA_IFINDEX])
ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
p = lookup_neigh_parms(tbl, net, ifindex);
if (p == NULL) {
err = -ENOENT;
goto errout_tbl_lock;
}
for (i = 1; i <= NDTPA_MAX; i++) {
if (tbp[i] == NULL)
continue;
switch (i) {
case NDTPA_QUEUE_LEN:
NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
nla_get_u32(tbp[i]) *
SKB_TRUESIZE(ETH_FRAME_LEN));
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
break;
case NDTPA_QUEUE_LENBYTES:
NEIGH_VAR_SET(p, QUEUE_LEN_BYTES,
nla_get_u32(tbp[i]));
break;
case NDTPA_PROXY_QLEN:
NEIGH_VAR_SET(p, PROXY_QLEN,
nla_get_u32(tbp[i]));
break;
case NDTPA_APP_PROBES:
NEIGH_VAR_SET(p, APP_PROBES,
nla_get_u32(tbp[i]));
break;
case NDTPA_UCAST_PROBES:
NEIGH_VAR_SET(p, UCAST_PROBES,
nla_get_u32(tbp[i]));
break;
case NDTPA_MCAST_PROBES:
NEIGH_VAR_SET(p, MCAST_PROBES,
nla_get_u32(tbp[i]));
break;
case NDTPA_MCAST_REPROBES:
NEIGH_VAR_SET(p, MCAST_REPROBES,
nla_get_u32(tbp[i]));
break;
case NDTPA_BASE_REACHABLE_TIME:
NEIGH_VAR_SET(p, BASE_REACHABLE_TIME,
nla_get_msecs(tbp[i]));
/* update reachable_time as well, otherwise, the change will
* only be effective after the next time neigh_periodic_work
* decides to recompute it (can be multiple minutes)
*/
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
break;
case NDTPA_GC_STALETIME:
NEIGH_VAR_SET(p, GC_STALETIME,
nla_get_msecs(tbp[i]));
break;
case NDTPA_DELAY_PROBE_TIME:
NEIGH_VAR_SET(p, DELAY_PROBE_TIME,
nla_get_msecs(tbp[i]));
call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
break;
case NDTPA_RETRANS_TIME:
NEIGH_VAR_SET(p, RETRANS_TIME,
nla_get_msecs(tbp[i]));
break;
case NDTPA_ANYCAST_DELAY:
NEIGH_VAR_SET(p, ANYCAST_DELAY,
nla_get_msecs(tbp[i]));
break;
case NDTPA_PROXY_DELAY:
NEIGH_VAR_SET(p, PROXY_DELAY,
nla_get_msecs(tbp[i]));
break;
case NDTPA_LOCKTIME:
NEIGH_VAR_SET(p, LOCKTIME,
nla_get_msecs(tbp[i]));
break;
}
}
}
err = -ENOENT;
if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
!net_eq(net, &init_net))
goto errout_tbl_lock;
if (tb[NDTA_THRESH1])
tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
if (tb[NDTA_THRESH2])
tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
if (tb[NDTA_THRESH3])
tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
if (tb[NDTA_GC_INTERVAL])
tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
err = 0;
errout_tbl_lock:
write_unlock_bh(&tbl->lock);
errout:
return err;
}
static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
int family, tidx, nidx = 0;
int tbl_skip = cb->args[0];
int neigh_skip = cb->args[1];
struct neigh_table *tbl;
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
tbl = neigh_tables[tidx];
if (!tbl)
continue;
if (tidx < tbl_skip || (family && tbl->family != family))
continue;
if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
NLM_F_MULTI) < 0)
break;
nidx = 0;
p = list_next_entry(&tbl->parms, list);
list_for_each_entry_from(p, &tbl->parms_list, list) {
if (!net_eq(neigh_parms_net(p), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
continue;
if (nidx < neigh_skip)
goto next;
if (neightbl_fill_param_info(skb, tbl, p,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
NLM_F_MULTI) < 0)
goto out;
next:
nidx++;
}
neigh_skip = 0;
}
out:
cb->args[0] = tidx;
cb->args[1] = nidx;
return skb->len;
}
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
u32 pid, u32 seq, int type, unsigned int flags)
{
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
ndm->ndm_family = neigh->ops->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = neigh->flags;
ndm->ndm_type = neigh->type;
ndm->ndm_ifindex = neigh->dev->ifindex;
if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
goto nla_put_failure;
read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
if (neigh->nud_state & NUD_VALID) {
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, neigh, neigh->dev);
if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
read_unlock_bh(&neigh->lock);
goto nla_put_failure;
}
}
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-16 14:09:00 -07:00
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
{
struct nlmsghdr *nlh;
struct ndmsg *ndm;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
ndm->ndm_family = tbl->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
ndm->ndm_flags = pn->flags | NTF_PROXY;
ndm->ndm_type = RTN_UNICAST;
ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
ndm->ndm_state = NUD_NONE;
if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
goto nla_put_failure;
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-16 14:09:00 -07:00
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
{
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
__neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
}
static bool neigh_master_filtered(struct net_device *dev, int master_idx)
{
struct net_device *master;
if (!master_idx)
return false;
master = netdev_master_upper_dev_get(dev);
if (!master || master->ifindex != master_idx)
return true;
return false;
}
static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
{
if (filter_idx && dev->ifindex != filter_idx)
return true;
return false;
}
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
const struct nlmsghdr *nlh = cb->nlh;
struct nlattr *tb[NDA_MAX + 1];
struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
int filter_master_idx = 0, filter_idx = 0;
unsigned int flags = NLM_F_MULTI;
int err;
err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
if (!err) {
net: validate attribute sizes in neigh_dump_table() [ Upstream commit 7dd07c143a4b54d050e748bee4b4b9e94a7b1744 ] Since neigh_dump_table() calls nlmsg_parse() without giving policy constraints, attributes can have arbirary size that we must validate Reported by syzbot/KMSAN : BUG: KMSAN: uninit-value in neigh_master_filtered net/core/neighbour.c:2292 [inline] BUG: KMSAN: uninit-value in neigh_dump_table net/core/neighbour.c:2348 [inline] BUG: KMSAN: uninit-value in neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438 CPU: 1 PID: 3575 Comm: syzkaller268891 Not tainted 4.16.0+ #83 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 neigh_master_filtered net/core/neighbour.c:2292 [inline] neigh_dump_table net/core/neighbour.c:2348 [inline] neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438 netlink_dump+0x9ad/0x1540 net/netlink/af_netlink.c:2225 __netlink_dump_start+0x1167/0x12a0 net/netlink/af_netlink.c:2322 netlink_dump_start include/linux/netlink.h:214 [inline] rtnetlink_rcv_msg+0x1435/0x1560 net/core/rtnetlink.c:4598 netlink_rcv_skb+0x355/0x5f0 net/netlink/af_netlink.c:2447 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4653 netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline] netlink_unicast+0x1672/0x1750 net/netlink/af_netlink.c:1337 netlink_sendmsg+0x1048/0x1310 net/netlink/af_netlink.c:1900 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x43fed9 RSP: 002b:00007ffddbee2798 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fed9 RDX: 0000000000000000 RSI: 0000000020005000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401800 R13: 0000000000401890 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321 slab_post_alloc_hook mm/slab.h:445 [inline] slab_alloc_node mm/slub.c:2737 [inline] __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:984 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1183 [inline] netlink_sendmsg+0x9a6/0x1310 net/netlink/af_netlink.c:1875 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 21fdd092acc7 ("net: Add support for filtering neigh dump by master device") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Ahern <dsa@cumulusnetworks.com> Reported-by: syzbot <syzkaller@googlegroups.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-04-11 15:46:00 -06:00
if (tb[NDA_IFINDEX]) {
if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
return -EINVAL;
filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
net: validate attribute sizes in neigh_dump_table() [ Upstream commit 7dd07c143a4b54d050e748bee4b4b9e94a7b1744 ] Since neigh_dump_table() calls nlmsg_parse() without giving policy constraints, attributes can have arbirary size that we must validate Reported by syzbot/KMSAN : BUG: KMSAN: uninit-value in neigh_master_filtered net/core/neighbour.c:2292 [inline] BUG: KMSAN: uninit-value in neigh_dump_table net/core/neighbour.c:2348 [inline] BUG: KMSAN: uninit-value in neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438 CPU: 1 PID: 3575 Comm: syzkaller268891 Not tainted 4.16.0+ #83 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 neigh_master_filtered net/core/neighbour.c:2292 [inline] neigh_dump_table net/core/neighbour.c:2348 [inline] neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438 netlink_dump+0x9ad/0x1540 net/netlink/af_netlink.c:2225 __netlink_dump_start+0x1167/0x12a0 net/netlink/af_netlink.c:2322 netlink_dump_start include/linux/netlink.h:214 [inline] rtnetlink_rcv_msg+0x1435/0x1560 net/core/rtnetlink.c:4598 netlink_rcv_skb+0x355/0x5f0 net/netlink/af_netlink.c:2447 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4653 netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline] netlink_unicast+0x1672/0x1750 net/netlink/af_netlink.c:1337 netlink_sendmsg+0x1048/0x1310 net/netlink/af_netlink.c:1900 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x43fed9 RSP: 002b:00007ffddbee2798 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fed9 RDX: 0000000000000000 RSI: 0000000020005000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401800 R13: 0000000000401890 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321 slab_post_alloc_hook mm/slab.h:445 [inline] slab_alloc_node mm/slub.c:2737 [inline] __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:984 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1183 [inline] netlink_sendmsg+0x9a6/0x1310 net/netlink/af_netlink.c:1875 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 21fdd092acc7 ("net: Add support for filtering neigh dump by master device") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Ahern <dsa@cumulusnetworks.com> Reported-by: syzbot <syzkaller@googlegroups.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-04-11 15:46:00 -06:00
}
if (tb[NDA_MASTER]) {
if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
return -EINVAL;
filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
net: validate attribute sizes in neigh_dump_table() [ Upstream commit 7dd07c143a4b54d050e748bee4b4b9e94a7b1744 ] Since neigh_dump_table() calls nlmsg_parse() without giving policy constraints, attributes can have arbirary size that we must validate Reported by syzbot/KMSAN : BUG: KMSAN: uninit-value in neigh_master_filtered net/core/neighbour.c:2292 [inline] BUG: KMSAN: uninit-value in neigh_dump_table net/core/neighbour.c:2348 [inline] BUG: KMSAN: uninit-value in neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438 CPU: 1 PID: 3575 Comm: syzkaller268891 Not tainted 4.16.0+ #83 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 neigh_master_filtered net/core/neighbour.c:2292 [inline] neigh_dump_table net/core/neighbour.c:2348 [inline] neigh_dump_info+0x1af0/0x2250 net/core/neighbour.c:2438 netlink_dump+0x9ad/0x1540 net/netlink/af_netlink.c:2225 __netlink_dump_start+0x1167/0x12a0 net/netlink/af_netlink.c:2322 netlink_dump_start include/linux/netlink.h:214 [inline] rtnetlink_rcv_msg+0x1435/0x1560 net/core/rtnetlink.c:4598 netlink_rcv_skb+0x355/0x5f0 net/netlink/af_netlink.c:2447 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4653 netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline] netlink_unicast+0x1672/0x1750 net/netlink/af_netlink.c:1337 netlink_sendmsg+0x1048/0x1310 net/netlink/af_netlink.c:1900 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x43fed9 RSP: 002b:00007ffddbee2798 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fed9 RDX: 0000000000000000 RSI: 0000000020005000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401800 R13: 0000000000401890 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321 slab_post_alloc_hook mm/slab.h:445 [inline] slab_alloc_node mm/slub.c:2737 [inline] __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:984 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1183 [inline] netlink_sendmsg+0x9a6/0x1310 net/netlink/af_netlink.c:1875 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 21fdd092acc7 ("net: Add support for filtering neigh dump by master device") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: David Ahern <dsa@cumulusnetworks.com> Reported-by: syzbot <syzkaller@googlegroups.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-04-11 15:46:00 -06:00
}
if (filter_idx || filter_master_idx)
flags |= NLM_F_DUMP_FILTERED;
}
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
n != NULL;
n = rcu_dereference_bh(n->next)) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
if (neigh_ifindex_filtered(n->dev, filter_idx) ||
neigh_master_filtered(n->dev, filter_master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
flags) < 0) {
rc = -1;
goto out;
}
next:
idx++;
}
}
rc = skb->len;
out:
rcu_read_unlock_bh();
cb->args[1] = h;
cb->args[2] = idx;
return rc;
}
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
struct pneigh_entry *n;
struct net *net = sock_net(skb->sk);
int rc, h, s_h = cb->args[3];
int idx, s_idx = idx = cb->args[4];
read_lock_bh(&tbl->lock);
for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
if (idx < s_idx || pneigh_net(n) != net)
goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI, tbl) < 0) {
read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
}
next:
idx++;
}
}
read_unlock_bh(&tbl->lock);
rc = skb->len;
out:
cb->args[3] = h;
cb->args[4] = idx;
return rc;
}
static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct neigh_table *tbl;
int t, family, s_t;
int proxy = 0;
int err;
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
/* check for full ndmsg structure presence, family member is
* the same for both structures
*/
if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
proxy = 1;
s_t = cb->args[0];
for (t = 0; t < NEIGH_NR_TABLES; t++) {
tbl = neigh_tables[t];
if (!tbl)
continue;
if (t < s_t || (family && tbl->family != family))
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
if (proxy)
err = pneigh_dump_table(tbl, skb, cb);
else
err = neigh_dump_table(tbl, skb, cb);
if (err < 0)
break;
}
cb->args[0] = t;
return skb->len;
}
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
{
int chain;
struct neigh_hash_table *nht;
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
read_lock(&tbl->lock); /* avoid resizes */
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
n != NULL;
n = rcu_dereference_bh(n->next))
cb(n, cookie);
}
read_unlock(&tbl->lock);
rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_for_each);
/* The tbl->lock must be held as a writer and BH disabled. */
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
int chain;
struct neigh_hash_table *nht;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
struct neighbour __rcu **np;
np = &nht->hash_buckets[chain];
while ((n = rcu_dereference_protected(*np,
lockdep_is_held(&tbl->lock))) != NULL) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
n->dead = 1;
} else
np = &n->next;
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
}
}
}
EXPORT_SYMBOL(__neigh_for_each_release);
int neigh_xmit(int index, struct net_device *dev,
const void *addr, struct sk_buff *skb)
{
int err = -EAFNOSUPPORT;
if (likely(index < NEIGH_NR_TABLES)) {
struct neigh_table *tbl;
struct neighbour *neigh;
tbl = neigh_tables[index];
if (!tbl)
goto out;
neigh: Explicitly declare RCU-bh read side critical section in neigh_xmit() neigh_xmit() expects to be called inside an RCU-bh read side critical section, and while one of its two current callers gets this right, the other one doesn't. More specifically, neigh_xmit() has two callers, mpls_forward() and mpls_output(), and while both callers call neigh_xmit() under rcu_read_lock(), this provides sufficient protection for neigh_xmit() only in the case of mpls_forward(), as that is always called from softirq context and therefore doesn't need explicit BH protection, while mpls_output() can be called from process context with softirqs enabled. When mpls_output() is called from process context, with softirqs enabled, we can be preempted by a softirq at any time, and RCU-bh considers the completion of a softirq as signaling the end of any pending read-side critical sections, so if we do get a softirq while we are in the part of neigh_xmit() that expects to be run inside an RCU-bh read side critical section, we can end up with an unexpected RCU grace period running right in the middle of that critical section, making things go boom. This patch fixes this impedance mismatch in the callee, by making neigh_xmit() always take rcu_read_{,un}lock_bh() around the code that expects to be treated as an RCU-bh read side critical section, as this seems a safer option than fixing it in the callers. Fixes: 4fd3d7d9e868f ("neigh: Add helper function neigh_xmit") Signed-off-by: David Barroso <dbarroso@fastly.com> Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Acked-by: Robert Shearman <rshearma@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-28 02:16:43 -06:00
rcu_read_lock_bh();
neigh = __neigh_lookup_noref(tbl, addr, dev);
if (!neigh)
neigh = __neigh_create(tbl, addr, dev, false);
err = PTR_ERR(neigh);
neigh: Explicitly declare RCU-bh read side critical section in neigh_xmit() neigh_xmit() expects to be called inside an RCU-bh read side critical section, and while one of its two current callers gets this right, the other one doesn't. More specifically, neigh_xmit() has two callers, mpls_forward() and mpls_output(), and while both callers call neigh_xmit() under rcu_read_lock(), this provides sufficient protection for neigh_xmit() only in the case of mpls_forward(), as that is always called from softirq context and therefore doesn't need explicit BH protection, while mpls_output() can be called from process context with softirqs enabled. When mpls_output() is called from process context, with softirqs enabled, we can be preempted by a softirq at any time, and RCU-bh considers the completion of a softirq as signaling the end of any pending read-side critical sections, so if we do get a softirq while we are in the part of neigh_xmit() that expects to be run inside an RCU-bh read side critical section, we can end up with an unexpected RCU grace period running right in the middle of that critical section, making things go boom. This patch fixes this impedance mismatch in the callee, by making neigh_xmit() always take rcu_read_{,un}lock_bh() around the code that expects to be treated as an RCU-bh read side critical section, as this seems a safer option than fixing it in the callers. Fixes: 4fd3d7d9e868f ("neigh: Add helper function neigh_xmit") Signed-off-by: David Barroso <dbarroso@fastly.com> Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Acked-by: Robert Shearman <rshearma@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-28 02:16:43 -06:00
if (IS_ERR(neigh)) {
rcu_read_unlock_bh();
goto out_kfree_skb;
neigh: Explicitly declare RCU-bh read side critical section in neigh_xmit() neigh_xmit() expects to be called inside an RCU-bh read side critical section, and while one of its two current callers gets this right, the other one doesn't. More specifically, neigh_xmit() has two callers, mpls_forward() and mpls_output(), and while both callers call neigh_xmit() under rcu_read_lock(), this provides sufficient protection for neigh_xmit() only in the case of mpls_forward(), as that is always called from softirq context and therefore doesn't need explicit BH protection, while mpls_output() can be called from process context with softirqs enabled. When mpls_output() is called from process context, with softirqs enabled, we can be preempted by a softirq at any time, and RCU-bh considers the completion of a softirq as signaling the end of any pending read-side critical sections, so if we do get a softirq while we are in the part of neigh_xmit() that expects to be run inside an RCU-bh read side critical section, we can end up with an unexpected RCU grace period running right in the middle of that critical section, making things go boom. This patch fixes this impedance mismatch in the callee, by making neigh_xmit() always take rcu_read_{,un}lock_bh() around the code that expects to be treated as an RCU-bh read side critical section, as this seems a safer option than fixing it in the callers. Fixes: 4fd3d7d9e868f ("neigh: Add helper function neigh_xmit") Signed-off-by: David Barroso <dbarroso@fastly.com> Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Acked-by: Robert Shearman <rshearma@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-28 02:16:43 -06:00
}
err = neigh->output(neigh, skb);
neigh: Explicitly declare RCU-bh read side critical section in neigh_xmit() neigh_xmit() expects to be called inside an RCU-bh read side critical section, and while one of its two current callers gets this right, the other one doesn't. More specifically, neigh_xmit() has two callers, mpls_forward() and mpls_output(), and while both callers call neigh_xmit() under rcu_read_lock(), this provides sufficient protection for neigh_xmit() only in the case of mpls_forward(), as that is always called from softirq context and therefore doesn't need explicit BH protection, while mpls_output() can be called from process context with softirqs enabled. When mpls_output() is called from process context, with softirqs enabled, we can be preempted by a softirq at any time, and RCU-bh considers the completion of a softirq as signaling the end of any pending read-side critical sections, so if we do get a softirq while we are in the part of neigh_xmit() that expects to be run inside an RCU-bh read side critical section, we can end up with an unexpected RCU grace period running right in the middle of that critical section, making things go boom. This patch fixes this impedance mismatch in the callee, by making neigh_xmit() always take rcu_read_{,un}lock_bh() around the code that expects to be treated as an RCU-bh read side critical section, as this seems a safer option than fixing it in the callers. Fixes: 4fd3d7d9e868f ("neigh: Add helper function neigh_xmit") Signed-off-by: David Barroso <dbarroso@fastly.com> Signed-off-by: Lennert Buytenhek <lbuytenhek@fastly.com> Acked-by: David Ahern <dsa@cumulusnetworks.com> Acked-by: Robert Shearman <rshearma@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-28 02:16:43 -06:00
rcu_read_unlock_bh();
}
else if (index == NEIGH_LINK_TABLE) {
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
addr, NULL, skb->len);
if (err < 0)
goto out_kfree_skb;
err = dev_queue_xmit(skb);
}
out:
return err;
out_kfree_skb:
kfree_skb(skb);
goto out;
}
EXPORT_SYMBOL(neigh_xmit);
#ifdef CONFIG_PROC_FS
static struct neighbour *neigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_hash_table *nht = state->nht;
struct neighbour *n = NULL;
int bucket = state->bucket;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
n = rcu_dereference_bh(nht->hash_buckets[bucket]);
while (n) {
if (!net_eq(dev_net(n->dev), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
goto next;
if (state->neigh_sub_iter) {
loff_t fakep = 0;
void *v;
v = state->neigh_sub_iter(state, n, &fakep);
if (!v)
goto next;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
break;
if (n->nud_state & ~NUD_NOARP)
break;
next:
n = rcu_dereference_bh(n->next);
}
if (n)
break;
}
state->bucket = bucket;
return n;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
struct neighbour *n,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_hash_table *nht = state->nht;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
}
n = rcu_dereference_bh(n->next);
while (1) {
while (n) {
if (!net_eq(dev_net(n->dev), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
goto next;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
goto next;
}
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
break;
if (n->nud_state & ~NUD_NOARP)
break;
next:
n = rcu_dereference_bh(n->next);
}
if (n)
break;
if (++state->bucket >= (1 << nht->hash_shift))
break;
n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
}
if (n && pos)
--(*pos);
return n;
}
static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
{
struct neighbour *n = neigh_get_first(seq);
if (n) {
--(*pos);
while (*pos) {
n = neigh_get_next(seq, n, pos);
if (!n)
break;
}
}
return *pos ? NULL : n;
}
static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
int bucket = state->bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
pn = tbl->phash_buckets[bucket];
while (pn && !net_eq(pneigh_net(pn), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
pn = pn->next;
if (pn)
break;
}
state->bucket = bucket;
return pn;
}
static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
struct pneigh_entry *pn,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
do {
pn = pn->next;
} while (pn && !net_eq(pneigh_net(pn), net));
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
pn = tbl->phash_buckets[state->bucket];
while (pn && !net_eq(pneigh_net(pn), net))
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
pn = pn->next;
if (pn)
break;
}
if (pn && pos)
--(*pos);
return pn;
}
static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
{
struct pneigh_entry *pn = pneigh_get_first(seq);
if (pn) {
--(*pos);
while (*pos) {
pn = pneigh_get_next(seq, pn, pos);
if (!pn)
break;
}
}
return *pos ? NULL : pn;
}
static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
void *rc;
loff_t idxpos = *pos;
rc = neigh_get_idx(seq, &idxpos);
if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
rc = pneigh_get_idx(seq, &idxpos);
return rc;
}
void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
__acquires(rcu_bh)
{
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
state->bucket = 0;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
rcu_read_lock_bh();
state->nht = rcu_dereference_bh(tbl->nht);
return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(neigh_seq_start);
void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct neigh_seq_state *state;
void *rc;
if (v == SEQ_START_TOKEN) {
rc = neigh_get_first(seq);
goto out;
}
state = seq->private;
if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) {
rc = neigh_get_next(seq, v, NULL);
if (rc)
goto out;
if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY))
rc = pneigh_get_first(seq);
} else {
BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
rc = pneigh_get_next(seq, v, NULL);
}
out:
++(*pos);
return rc;
}
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
__releases(rcu_bh)
{
rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_seq_stop);
/* statistics via seq_file */
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
struct neigh_table *tbl = seq->private;
int cpu;
if (*pos == 0)
return SEQ_START_TOKEN;
for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
return NULL;
}
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct neigh_table *tbl = seq->private;
int cpu;
for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
if (!cpu_possible(cpu))
continue;
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
return NULL;
}
static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
{
}
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
struct neigh_table *tbl = seq->private;
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
"%08lx %08lx %08lx %08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
st->destroys,
st->hash_grows,
st->lookups,
st->hits,
st->res_failed,
st->rcv_probes_mcast,
st->rcv_probes_ucast,
st->periodic_gc_runs,
st->forced_gc_runs,
st->unres_discards,
st->table_fulls
);
return 0;
}
static const struct seq_operations neigh_stat_seq_ops = {
.start = neigh_stat_seq_start,
.next = neigh_stat_seq_next,
.stop = neigh_stat_seq_stop,
.show = neigh_stat_seq_show,
};
static int neigh_stat_seq_open(struct inode *inode, struct file *file)
{
int ret = seq_open(file, &neigh_stat_seq_ops);
if (!ret) {
struct seq_file *sf = file->private_data;
sf->private = PDE_DATA(inode);
}
return ret;
};
static const struct file_operations neigh_stat_seq_fops = {
.owner = THIS_MODULE,
.open = neigh_stat_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#endif /* CONFIG_PROC_FS */
static inline size_t neigh_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(struct nda_cacheinfo))
+ nla_total_size(4); /* NDA_PROBES */
}
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid)
{
struct net *net = dev_net(n->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
err = neigh_fill_info(skb, n, pid, 0, type, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
2009-02-25 00:18:28 -07:00
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
return;
errout:
if (err < 0)
[NETNS]: Modify the neighbour table code so it handles multiple network namespaces I'm actually surprised at how much was involved. At first glance it appears that the neighbour table data structures are already split by network device so all that should be needed is to modify the user interface commands to filter the set of neighbours by the network namespace of their devices. However a couple things turned up while I was reading through the code. The proxy neighbour table allows entries with no network device, and the neighbour parms are per network device (except for the defaults) so they now need a per network namespace default. So I updated the two structures (which surprised me) with their very own network namespace parameter. Updated the relevant lookup and destroy routines with a network namespace parameter and modified the code that interacts with users to filter out neighbour table entries for devices of other namespaces. I'm a little concerned that we can modify and display the global table configuration and from all network namespaces. But this appears good enough for now. I keep thinking modifying the neighbour table to have per network namespace instances of each table type would should be cleaner. The hash table is already dynamically sized so there are it is not a limiter. The default parameter would be straight forward to take care of. However when I look at the how the network table is built and used I still find some assumptions that there is only a single neighbour table for each type of table in the kernel. The netlink operations, neigh_seq_start, the non-core network users that call neigh_lookup. So while it might be doable it would require more refactoring than my current approach of just doing a little extra filtering in the code. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-01-24 01:13:18 -07:00
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
void neigh_app_ns(struct neighbour *n)
{
__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
}
EXPORT_SYMBOL(neigh_app_ns);
#ifdef CONFIG_SYSCTL
static int zero;
static int int_max = INT_MAX;
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
static int proc_unres_qlen(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
{
int size, ret;
struct ctl_table tmp = *ctl;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
tmp.extra1 = &zero;
tmp.extra2 = &unres_qlen_max;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
tmp.data = &size;
size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
if (write && !ret)
*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
return ret;
}
static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
int family)
{
switch (family) {
case AF_INET:
return __in_dev_arp_parms_get_rcu(dev);
case AF_INET6:
return __in6_dev_nd_parms_get_rcu(dev);
}
return NULL;
}
static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
int index)
{
struct net_device *dev;
int family = neigh_parms_family(p);
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
struct neigh_parms *dst_p =
neigh_get_dev_parms_rcu(dev, family);
if (dst_p && !test_bit(index, dst_p->data_state))
dst_p->data[index] = p->data[index];
}
rcu_read_unlock();
}
static void neigh_proc_update(struct ctl_table *ctl, int write)
{
struct net_device *dev = ctl->extra1;
struct neigh_parms *p = ctl->extra2;
struct net *net = neigh_parms_net(p);
int index = (int *) ctl->data - p->data;
if (!write)
return;
set_bit(index, p->data_state);
if (index == NEIGH_VAR_DELAY_PROBE_TIME)
call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
if (!dev) /* NULL dev means this is default value */
neigh_copy_dflt_parms(net, p, index);
}
static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table tmp = *ctl;
int ret;
tmp.extra1 = &zero;
tmp.extra2 = &int_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
int neigh_proc_dointvec(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
EXPORT_SYMBOL(neigh_proc_dointvec);
int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
struct neigh_parms *p = ctl->extra2;
int ret;
if (strcmp(ctl->procname, "base_reachable_time") == 0)
ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0)
ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
else
ret = -1;
if (write && ret == 0) {
/* update reachable_time as well, otherwise, the change will
* only be effective after the next time neigh_periodic_work
* decides to recompute it
*/
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
return ret;
}
#define NEIGH_PARMS_DATA_OFFSET(index) \
(&((struct neigh_parms *) 0)->data[index])
#define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \
[NEIGH_VAR_ ## attr] = { \
.procname = name, \
.data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \
.maxlen = sizeof(int), \
.mode = mval, \
.proc_handler = proc, \
}
#define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax)
#define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies)
#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
#define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen)
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
[NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
[NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
[NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
[NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &int_max,
.proc_handler = proc_dointvec_minmax,
},
{},
},
};
int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
proc_handler *handler)
{
int i;
struct neigh_sysctl_table *t;
const char *dev_name_source;
char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
char *p_name;
t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
if (!t)
goto err;
for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
t->neigh_vars[i].data += (long) p;
t->neigh_vars[i].extra1 = dev;
t->neigh_vars[i].extra2 = p;
}
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
struct neigh_table *tbl = p->tbl;
dev_name_source = "default";
t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
}
if (handler) {
/* RetransTime */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
/* ReachableTime */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
/* RetransTime (in milliseconds)*/
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
/* ReachableTime (in milliseconds) */
neigh: new unresolved queue limits Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit : > From: David Miller <davem@davemloft.net> > Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST) > > > From: Eric Dumazet <eric.dumazet@gmail.com> > > Date: Wed, 09 Nov 2011 12:14:09 +0100 > > > >> unres_qlen is the number of frames we are able to queue per unresolved > >> neighbour. Its default value (3) was never changed and is responsible > >> for strange drops, especially if IP fragments are used, or multiple > >> sessions start in parallel. Even a single tcp flow can hit this limit. > > ... > > > > Ok, I've applied this, let's see what happens :-) > > Early answer, build fails. > > Please test build this patch with DECNET enabled and resubmit. The > decnet neigh layer still refers to the removed ->queue_len member. > > Thanks. Ouch, this was fixed on one machine yesterday, but not the other one I used this morning, sorry. [PATCH V5 net-next] neigh: new unresolved queue limits unres_qlen is the number of frames we are able to queue per unresolved neighbour. Its default value (3) was never changed and is responsible for strange drops, especially if IP fragments are used, or multiple sessions start in parallel. Even a single tcp flow can hit this limit. $ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108 PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data. 8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms Signed-off-by: David S. Miller <davem@davemloft.net>
2011-11-09 05:07:14 -07:00
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
} else {
/* Those handlers will update p->reachable_time after
* base_reachable_time(_ms) is set to ensure the new timer starts being
* applied after the next neighbour update instead of waiting for
* neigh_periodic_work to update its value (can be multiple minutes)
* So any handler that replaces them should do this as well
*/
/* ReachableTime */
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler =
neigh_proc_base_reachable_time;
/* ReachableTime (in milliseconds) */
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler =
neigh_proc_base_reachable_time;
}
/* Don't export sysctls to unprivileged users */
if (neigh_parms_net(p)->user_ns != &init_user_ns)
t->neigh_vars[0].procname = NULL;
switch (neigh_parms_family(p)) {
case AF_INET:
p_name = "ipv4";
break;
case AF_INET6:
p_name = "ipv6";
break;
default:
BUG();
}
snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
p_name, dev_name_source);
t->sysctl_header =
register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
if (!t->sysctl_header)
goto free;
p->sysctl_table = t;
return 0;
free:
kfree(t);
err:
return -ENOBUFS;
}
EXPORT_SYMBOL(neigh_sysctl_register);
void neigh_sysctl_unregister(struct neigh_parms *p)
{
if (p->sysctl_table) {
struct neigh_sysctl_table *t = p->sysctl_table;
p->sysctl_table = NULL;
unregister_net_sysctl_table(t->sysctl_header);
kfree(t);
}
}
EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
static int __init neigh_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0);
rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
0);
rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);
return 0;
}
subsys_initcall(neigh_init);