1
0
Fork 0
remarkable-linux/net/netfilter/xt_recent.c

764 lines
19 KiB
C
Raw Permalink Normal View History

/*
* Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This is a replacement of the old ipt_recent module, which carried the
* following copyright notice:
*
* Author: Stephen Frost <sfrost@snowman.net>
* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/bitops.h>
#include <linux/skbuff.h>
#include <linux/inet.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 02:04:11 -06:00
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_recent.h>
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_recent");
MODULE_ALIAS("ip6t_recent");
static unsigned int ip_list_tot __read_mostly = 100;
static unsigned int ip_list_hash_size __read_mostly;
static unsigned int ip_list_perms __read_mostly = 0644;
static unsigned int ip_list_uid __read_mostly;
static unsigned int ip_list_gid __read_mostly;
module_param(ip_list_tot, uint, 0400);
module_param(ip_list_hash_size, uint, 0400);
module_param(ip_list_perms, uint, 0400);
module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR);
module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files");
MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files");
MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files");
/* retained for backwards compatibility */
static unsigned int ip_pkt_list_tot __read_mostly;
module_param(ip_pkt_list_tot, uint, 0400);
MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)");
#define XT_RECENT_MAX_NSTAMPS 256
struct recent_entry {
struct list_head list;
struct list_head lru_list;
union nf_inet_addr addr;
u_int16_t family;
u_int8_t ttl;
u_int8_t index;
u_int16_t nstamps;
unsigned long stamps[0];
};
struct recent_table {
struct list_head list;
char name[XT_RECENT_NAME_LEN];
union nf_inet_addr mask;
unsigned int refcnt;
unsigned int entries;
u8 nstamps_max_mask;
struct list_head lru_list;
struct list_head iphash[0];
};
struct recent_net {
struct list_head tables;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *xt_recent;
#endif
};
netns: make struct pernet_operations::id unsigned int Make struct pernet_operations::id unsigned. There are 2 reasons to do so: 1) This field is really an index into an zero based array and thus is unsigned entity. Using negative value is out-of-bound access by definition. 2) On x86_64 unsigned 32-bit data which are mixed with pointers via array indexing or offsets added or subtracted to pointers are preffered to signed 32-bit data. "int" being used as an array index needs to be sign-extended to 64-bit before being used. void f(long *p, int i) { g(p[i]); } roughly translates to movsx rsi, esi mov rdi, [rsi+...] call g MOVSX is 3 byte instruction which isn't necessary if the variable is unsigned because x86_64 is zero extending by default. Now, there is net_generic() function which, you guessed it right, uses "int" as an array index: static inline void *net_generic(const struct net *net, int id) { ... ptr = ng->ptr[id - 1]; ... } And this function is used a lot, so those sign extensions add up. Patch snipes ~1730 bytes on allyesconfig kernel (without all junk messing with code generation): add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) Unfortunately some functions actually grow bigger. This is a semmingly random artefact of code generation with register allocator being used differently. gcc decides that some variable needs to live in new r8+ registers and every access now requires REX prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be used which is longer than [r8] However, overall balance is in negative direction: add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) function old new delta nfsd4_lock 3886 3959 +73 tipc_link_build_proto_msg 1096 1140 +44 mac80211_hwsim_new_radio 2776 2808 +32 tipc_mon_rcv 1032 1058 +26 svcauth_gss_legacy_init 1413 1429 +16 tipc_bcbase_select_primary 379 392 +13 nfsd4_exchange_id 1247 1260 +13 nfsd4_setclientid_confirm 782 793 +11 ... put_client_renew_locked 494 480 -14 ip_set_sockfn_get 730 716 -14 geneve_sock_add 829 813 -16 nfsd4_sequence_done 721 703 -18 nlmclnt_lookup_host 708 686 -22 nfsd4_lockt 1085 1063 -22 nfs_get_client 1077 1050 -27 tcf_bpf_init 1106 1076 -30 nfsd4_encode_fattr 5997 5930 -67 Total: Before=154856051, After=154854321, chg -0.00% Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-16 18:58:21 -07:00
static unsigned int recent_net_id __read_mostly;
static inline struct recent_net *recent_pernet(struct net *net)
{
return net_generic(net, recent_net_id);
}
static DEFINE_SPINLOCK(recent_lock);
static DEFINE_MUTEX(recent_mutex);
#ifdef CONFIG_PROC_FS
static const struct file_operations recent_mt_fops;
#endif
static u_int32_t hash_rnd __read_mostly;
static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
{
return jhash_1word((__force u32)addr->ip, hash_rnd) &
(ip_list_hash_size - 1);
}
static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr)
{
return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) &
(ip_list_hash_size - 1);
}
static struct recent_entry *
recent_entry_lookup(const struct recent_table *table,
const union nf_inet_addr *addrp, u_int16_t family,
u_int8_t ttl)
{
struct recent_entry *e;
unsigned int h;
if (family == NFPROTO_IPV4)
h = recent_entry_hash4(addrp);
else
h = recent_entry_hash6(addrp);
list_for_each_entry(e, &table->iphash[h], list)
if (e->family == family &&
memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 &&
(ttl == e->ttl || ttl == 0 || e->ttl == 0))
return e;
return NULL;
}
static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
{
list_del(&e->list);
list_del(&e->lru_list);
kfree(e);
t->entries--;
}
/*
* Drop entries with timestamps older then 'time'.
*/
static void recent_entry_reap(struct recent_table *t, unsigned long time)
{
struct recent_entry *e;
/*
* The head of the LRU list is always the oldest entry.
*/
e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
/*
* The last time stamp is the most recent.
*/
if (time_after(time, e->stamps[e->index-1]))
recent_entry_remove(t, e);
}
static struct recent_entry *
recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr,
u_int16_t family, u_int8_t ttl)
{
struct recent_entry *e;
unsigned int nstamps_max = t->nstamps_max_mask;
if (t->entries >= ip_list_tot) {
e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
recent_entry_remove(t, e);
}
nstamps_max += 1;
e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * nstamps_max,
GFP_ATOMIC);
if (e == NULL)
return NULL;
memcpy(&e->addr, addr, sizeof(e->addr));
e->ttl = ttl;
e->stamps[0] = jiffies;
e->nstamps = 1;
e->index = 1;
e->family = family;
if (family == NFPROTO_IPV4)
list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]);
else
list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]);
list_add_tail(&e->lru_list, &t->lru_list);
t->entries++;
return e;
}
static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
{
e->index &= t->nstamps_max_mask;
e->stamps[e->index++] = jiffies;
if (e->index > e->nstamps)
e->nstamps = e->index;
list_move_tail(&e->lru_list, &t->lru_list);
}
static struct recent_table *recent_table_lookup(struct recent_net *recent_net,
const char *name)
{
struct recent_table *t;
list_for_each_entry(t, &recent_net->tables, list)
if (!strcmp(t->name, name))
return t;
return NULL;
}
static void recent_table_flush(struct recent_table *t)
{
struct recent_entry *e, *next;
unsigned int i;
for (i = 0; i < ip_list_hash_size; i++)
list_for_each_entry_safe(e, next, &t->iphash[i], list)
recent_entry_remove(t, e);
}
static bool
recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = xt_net(par);
struct recent_net *recent_net = recent_pernet(net);
const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
struct recent_entry *e;
union nf_inet_addr addr = {}, addr_mask;
u_int8_t ttl;
bool ret = info->invert;
if (xt_family(par) == NFPROTO_IPV4) {
const struct iphdr *iph = ip_hdr(skb);
if (info->side == XT_RECENT_DEST)
addr.ip = iph->daddr;
else
addr.ip = iph->saddr;
ttl = iph->ttl;
} else {
const struct ipv6hdr *iph = ipv6_hdr(skb);
if (info->side == XT_RECENT_DEST)
memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6));
else
memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6));
ttl = iph->hop_limit;
}
/* use TTL as seen before forwarding */
if (xt_out(par) != NULL && skb->sk == NULL)
ttl++;
spin_lock_bh(&recent_lock);
t = recent_table_lookup(recent_net, info->name);
nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
e = recent_entry_lookup(t, &addr_mask, xt_family(par),
(info->check_set & XT_RECENT_TTL) ? ttl : 0);
if (e == NULL) {
if (!(info->check_set & XT_RECENT_SET))
goto out;
e = recent_entry_init(t, &addr_mask, xt_family(par), ttl);
if (e == NULL)
par->hotdrop = true;
ret = !ret;
goto out;
}
if (info->check_set & XT_RECENT_SET)
ret = !ret;
else if (info->check_set & XT_RECENT_REMOVE) {
recent_entry_remove(t, e);
ret = !ret;
} else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) {
unsigned long time = jiffies - info->seconds * HZ;
unsigned int i, hits = 0;
for (i = 0; i < e->nstamps; i++) {
if (info->seconds && time_after(time, e->stamps[i]))
continue;
if (!info->hit_count || ++hits >= info->hit_count) {
ret = !ret;
break;
}
}
/* info->seconds must be non-zero */
if (info->check_set & XT_RECENT_REAP)
recent_entry_reap(t, time);
}
if (info->check_set & XT_RECENT_SET ||
(info->check_set & XT_RECENT_UPDATE && ret)) {
recent_entry_update(t, e);
e->ttl = ttl;
}
out:
spin_unlock_bh(&recent_lock);
return ret;
}
static void recent_table_free(void *addr)
{
kvfree(addr);
}
static int recent_mt_check(const struct xt_mtchk_param *par,
const struct xt_recent_mtinfo_v1 *info)
{
struct recent_net *recent_net = recent_pernet(par->net);
struct recent_table *t;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde;
kuid_t uid;
kgid_t gid;
#endif
unsigned int nstamp_mask;
unsigned int i;
int ret = -EINVAL;
size_t sz;
net_get_random_once(&hash_rnd, sizeof(hash_rnd));
if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
pr_info("Unsupported user space flags (%08x)\n",
info->check_set);
return -EINVAL;
}
if (hweight8(info->check_set &
(XT_RECENT_SET | XT_RECENT_REMOVE |
XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1)
return -EINVAL;
if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) &&
(info->seconds || info->hit_count ||
(info->check_set & XT_RECENT_MODIFIERS)))
return -EINVAL;
if ((info->check_set & XT_RECENT_REAP) && !info->seconds)
return -EINVAL;
if (info->hit_count >= XT_RECENT_MAX_NSTAMPS) {
pr_info("hitcount (%u) is larger than allowed maximum (%u)\n",
info->hit_count, XT_RECENT_MAX_NSTAMPS - 1);
return -EINVAL;
}
ret = xt_check_proc_name(info->name, sizeof(info->name));
if (ret)
return ret;
if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot)
nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1;
else if (info->hit_count)
nstamp_mask = roundup_pow_of_two(info->hit_count) - 1;
else
nstamp_mask = 32 - 1;
mutex_lock(&recent_mutex);
t = recent_table_lookup(recent_net, info->name);
if (t != NULL) {
if (nstamp_mask > t->nstamps_max_mask) {
spin_lock_bh(&recent_lock);
recent_table_flush(t);
t->nstamps_max_mask = nstamp_mask;
spin_unlock_bh(&recent_lock);
}
t->refcnt++;
ret = 0;
goto out;
}
sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size;
treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead. The main difference to kvmalloc is that those users are usually not considering all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) are basically never failing and invoke OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - the vmalloc. On the other hand those requests might fallback to vmalloc even when the memory allocator would succeed after several more reclaim/compaction attempts previously. There is no guarantee something like that happens though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits Acked-by: Kees Cook <keescook@chromium.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390 Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim Acked-by: David Sterba <dsterba@suse.com> # btrfs Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4 Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5 Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Colin Cross <ccross@android.com> Cc: Tony Luck <tony.luck@intel.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Santosh Raspatur <santosh@chelsio.com> Cc: Hariprasad S <hariprasad@chelsio.com> Cc: Yishai Hadas <yishaih@mellanox.com> Cc: Oleg Drokin <oleg.drokin@intel.com> Cc: "Yan, Zheng" <zyan@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: David Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-08 16:57:27 -06:00
t = kvzalloc(sz, GFP_KERNEL);
if (t == NULL) {
ret = -ENOMEM;
goto out;
}
t->refcnt = 1;
t->nstamps_max_mask = nstamp_mask;
memcpy(&t->mask, &info->mask, sizeof(t->mask));
strcpy(t->name, info->name);
INIT_LIST_HEAD(&t->lru_list);
for (i = 0; i < ip_list_hash_size; i++)
INIT_LIST_HEAD(&t->iphash[i]);
#ifdef CONFIG_PROC_FS
uid = make_kuid(&init_user_ns, ip_list_uid);
gid = make_kgid(&init_user_ns, ip_list_gid);
if (!uid_valid(uid) || !gid_valid(gid)) {
recent_table_free(t);
ret = -EINVAL;
goto out;
}
pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
&recent_mt_fops, t);
if (pde == NULL) {
recent_table_free(t);
ret = -ENOMEM;
goto out;
}
proc_set_user(pde, uid, gid);
#endif
spin_lock_bh(&recent_lock);
list_add_tail(&t->list, &recent_net->tables);
spin_unlock_bh(&recent_lock);
ret = 0;
out:
mutex_unlock(&recent_mutex);
return ret;
}
static int recent_mt_check_v0(const struct xt_mtchk_param *par)
{
const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo;
struct xt_recent_mtinfo_v1 info_v1;
/* Copy revision 0 structure to revision 1 */
memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo));
/* Set default mask to ensure backward compatible behaviour */
memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all));
return recent_mt_check(par, &info_v1);
}
static int recent_mt_check_v1(const struct xt_mtchk_param *par)
{
return recent_mt_check(par, par->matchinfo);
}
static void recent_mt_destroy(const struct xt_mtdtor_param *par)
{
struct recent_net *recent_net = recent_pernet(par->net);
const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
struct recent_table *t;
mutex_lock(&recent_mutex);
t = recent_table_lookup(recent_net, info->name);
if (--t->refcnt == 0) {
spin_lock_bh(&recent_lock);
list_del(&t->list);
spin_unlock_bh(&recent_lock);
#ifdef CONFIG_PROC_FS
if (recent_net->xt_recent != NULL)
remove_proc_entry(t->name, recent_net->xt_recent);
#endif
recent_table_flush(t);
recent_table_free(t);
}
mutex_unlock(&recent_mutex);
}
#ifdef CONFIG_PROC_FS
struct recent_iter_state {
const struct recent_table *table;
unsigned int bucket;
};
static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(recent_lock)
{
struct recent_iter_state *st = seq->private;
const struct recent_table *t = st->table;
struct recent_entry *e;
loff_t p = *pos;
spin_lock_bh(&recent_lock);
for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++)
list_for_each_entry(e, &t->iphash[st->bucket], list)
if (p-- == 0)
return e;
return NULL;
}
static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct recent_iter_state *st = seq->private;
const struct recent_table *t = st->table;
const struct recent_entry *e = v;
const struct list_head *head = e->list.next;
while (head == &t->iphash[st->bucket]) {
if (++st->bucket >= ip_list_hash_size)
return NULL;
head = t->iphash[st->bucket].next;
}
(*pos)++;
return list_entry(head, struct recent_entry, list);
}
static void recent_seq_stop(struct seq_file *s, void *v)
__releases(recent_lock)
{
spin_unlock_bh(&recent_lock);
}
static int recent_seq_show(struct seq_file *seq, void *v)
{
const struct recent_entry *e = v;
struct recent_iter_state *st = seq->private;
const struct recent_table *t = st->table;
unsigned int i;
i = (e->index - 1) & t->nstamps_max_mask;
if (e->family == NFPROTO_IPV4)
seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u",
&e->addr.ip, e->ttl, e->stamps[i], e->index);
else
seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u",
&e->addr.in6, e->ttl, e->stamps[i], e->index);
for (i = 0; i < e->nstamps; i++)
seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
seq_putc(seq, '\n');
return 0;
}
static const struct seq_operations recent_seq_ops = {
.start = recent_seq_start,
.next = recent_seq_next,
.stop = recent_seq_stop,
.show = recent_seq_show,
};
static int recent_seq_open(struct inode *inode, struct file *file)
{
struct recent_iter_state *st;
st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
if (st == NULL)
return -ENOMEM;
st->table = PDE_DATA(inode);
return 0;
}
static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
struct recent_table *t = PDE_DATA(file_inode(file));
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
netfilter: xt_recent: fix proc-file addition/removal of IPv4 addresses Fix regression introduded by commit 079aa88 (netfilter: xt_recent: IPv6 support): From http://bugzilla.kernel.org/show_bug.cgi?id=12753: Problem Description: An uninitialized buffer causes IPv4 addresses added manually (via the +IP command to the proc interface) to never match any packets. Similarly, the -IP command fails to remove IPv4 addresses. Details: In the function recent_entry_lookup, the xt_recent module does comparisons of the entire nf_inet_addr union value, both for IPv4 and IPv6 addresses. For addresses initialized from actual packets the remaining 12 bytes not occupied by the IPv4 are zeroed so this works correctly. However when setting the nf_inet_addr addr variable in the recent_mt_proc_write function, only the IPv4 bytes are initialized and the remaining 12 bytes contain garbage. Hence addresses added in this way never match any packets, unless these uninitialized 12 bytes happened to be zero by coincidence. Similarly, addresses cannot consistently be removed using the proc interface due to mismatch of the garbage bytes (although it will sometimes work to remove an address that was added manually). Reading the /proc/net/xt_recent/ entries hides this problem because this only uses the first 4 bytes when displaying IPv4 addresses. Steps to reproduce: $ iptables -I INPUT -m recent --rcheck -j LOG $ echo +169.254.156.239 > /proc/net/xt_recent/DEFAULT $ cat /proc/net/xt_recent/DEFAULT src=169.254.156.239 ttl: 0 last_seen: 119910 oldest_pkt: 1 119910 [At this point no packets from 169.254.156.239 are being logged.] $ iptables -I INPUT -s 169.254.156.239 -m recent --set $ cat /proc/net/xt_recent/DEFAULT src=169.254.156.239 ttl: 0 last_seen: 119910 oldest_pkt: 1 119910 src=169.254.156.239 ttl: 255 last_seen: 126184 oldest_pkt: 4 125434, 125684, 125934, 126184 [At this point, adding the address via an iptables rule, packets are being logged correctly.] $ echo -169.254.156.239 > /proc/net/xt_recent/DEFAULT $ cat /proc/net/xt_recent/DEFAULT src=169.254.156.239 ttl: 0 last_seen: 119910 oldest_pkt: 1 119910 src=169.254.156.239 ttl: 255 last_seen: 126992 oldest_pkt: 10 125434, 125684, 125934, 126184, 126434, 126684, 126934, 126991, 126991, 126992 $ echo -169.254.156.239 > /proc/net/xt_recent/DEFAULT $ cat /proc/net/xt_recent/DEFAULT src=169.254.156.239 ttl: 0 last_seen: 119910 oldest_pkt: 1 119910 src=169.254.156.239 ttl: 255 last_seen: 126992 oldest_pkt: 10 125434, 125684, 125934, 126184, 126434, 126684, 126934, 126991, 126991, 126992 [Removing the address via /proc interface failed evidently.] Possible solutions: - initialize the addr variable in recent_mt_proc_write - compare only 4 bytes for IPv4 addresses in recent_entry_lookup Signed-off-by: Patrick McHardy <kaber@trash.net>
2009-02-24 06:53:12 -07:00
union nf_inet_addr addr = {};
u_int16_t family;
bool add, succ;
if (size == 0)
return 0;
if (size > sizeof(buf))
size = sizeof(buf);
if (copy_from_user(buf, input, size) != 0)
return -EFAULT;
/* Strict protocol! */
if (*loff != 0)
return -ESPIPE;
switch (*c) {
case '/': /* flush table */
spin_lock_bh(&recent_lock);
recent_table_flush(t);
spin_unlock_bh(&recent_lock);
return size;
case '-': /* remove address */
add = false;
break;
case '+': /* add address */
add = true;
break;
default:
pr_info("Need \"+ip\", \"-ip\" or \"/\"\n");
return -EINVAL;
}
++c;
--size;
if (strnchr(c, size, ':') != NULL) {
family = NFPROTO_IPV6;
succ = in6_pton(c, size, (void *)&addr, '\n', NULL);
} else {
family = NFPROTO_IPV4;
succ = in4_pton(c, size, (void *)&addr, '\n', NULL);
}
if (!succ) {
pr_info("illegal address written to procfs\n");
return -EINVAL;
}
spin_lock_bh(&recent_lock);
e = recent_entry_lookup(t, &addr, family, 0);
if (e == NULL) {
if (add)
recent_entry_init(t, &addr, family, 0);
} else {
if (add)
recent_entry_update(t, e);
else
recent_entry_remove(t, e);
}
spin_unlock_bh(&recent_lock);
/* Note we removed one above */
*loff += size + 1;
return size + 1;
}
static const struct file_operations recent_mt_fops = {
.open = recent_seq_open,
.read = seq_read,
.write = recent_mt_proc_write,
.release = seq_release_private,
.owner = THIS_MODULE,
llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this. ===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... +.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann <arnd@arndb.de> Cc: Julia Lawall <julia@diku.dk> Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 10:52:59 -06:00
.llseek = seq_lseek,
};
static int __net_init recent_proc_net_init(struct net *net)
{
struct recent_net *recent_net = recent_pernet(net);
recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net);
if (!recent_net->xt_recent)
return -ENOMEM;
return 0;
}
static void __net_exit recent_proc_net_exit(struct net *net)
{
struct recent_net *recent_net = recent_pernet(net);
struct recent_table *t;
/* recent_net_exit() is called before recent_mt_destroy(). Make sure
* that the parent xt_recent proc entry is is empty before trying to
* remove it.
*/
spin_lock_bh(&recent_lock);
list_for_each_entry(t, &recent_net->tables, list)
remove_proc_entry(t->name, recent_net->xt_recent);
recent_net->xt_recent = NULL;
spin_unlock_bh(&recent_lock);
remove_proc_entry("xt_recent", net->proc_net);
}
#else
static inline int recent_proc_net_init(struct net *net)
{
return 0;
}
static inline void recent_proc_net_exit(struct net *net)
{
}
#endif /* CONFIG_PROC_FS */
static int __net_init recent_net_init(struct net *net)
{
struct recent_net *recent_net = recent_pernet(net);
INIT_LIST_HEAD(&recent_net->tables);
return recent_proc_net_init(net);
}
static void __net_exit recent_net_exit(struct net *net)
{
recent_proc_net_exit(net);
}
static struct pernet_operations recent_net_ops = {
.init = recent_net_init,
.exit = recent_net_exit,
.id = &recent_net_id,
.size = sizeof(struct recent_net),
};
static struct xt_match recent_mt_reg[] __read_mostly = {
{
.name = "recent",
.revision = 0,
.family = NFPROTO_IPV4,
.match = recent_mt,
.matchsize = sizeof(struct xt_recent_mtinfo),
.checkentry = recent_mt_check_v0,
.destroy = recent_mt_destroy,
.me = THIS_MODULE,
},
{
.name = "recent",
.revision = 0,
.family = NFPROTO_IPV6,
.match = recent_mt,
.matchsize = sizeof(struct xt_recent_mtinfo),
.checkentry = recent_mt_check_v0,
.destroy = recent_mt_destroy,
.me = THIS_MODULE,
},
{
.name = "recent",
.revision = 1,
.family = NFPROTO_IPV4,
.match = recent_mt,
.matchsize = sizeof(struct xt_recent_mtinfo_v1),
.checkentry = recent_mt_check_v1,
.destroy = recent_mt_destroy,
.me = THIS_MODULE,
},
{
.name = "recent",
.revision = 1,
.family = NFPROTO_IPV6,
.match = recent_mt,
.matchsize = sizeof(struct xt_recent_mtinfo_v1),
.checkentry = recent_mt_check_v1,
.destroy = recent_mt_destroy,
.me = THIS_MODULE,
}
};
static int __init recent_mt_init(void)
{
int err;
BUILD_BUG_ON_NOT_POWER_OF_2(XT_RECENT_MAX_NSTAMPS);
if (!ip_list_tot || ip_pkt_list_tot >= XT_RECENT_MAX_NSTAMPS)
return -EINVAL;
ip_list_hash_size = 1 << fls(ip_list_tot);
err = register_pernet_subsys(&recent_net_ops);
if (err)
return err;
err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
if (err)
unregister_pernet_subsys(&recent_net_ops);
return err;
}
static void __exit recent_mt_exit(void)
{
xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
unregister_pernet_subsys(&recent_net_ops);
}
module_init(recent_mt_init);
module_exit(recent_mt_exit);