alistair23-linux/net/netfilter/nf_synproxy_core.c

/*
 * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <asm/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
#include <linux/proc_fs.h>

#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_tcpudp.h>
#include <linux/netfilter/xt_SYNPROXY.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>

int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
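
/* Parse the TCP options of a SYN or SYN/ACK into @opts. Only the options
 * SYNPROXY cares about are recorded: MSS, window scale (capped at 14),
 * timestamps and SACK-permitted. Returns false only if the option area
 * cannot be read from the skb.
 */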
bool
synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
		       const struct tcphdr *th, struct synproxy_options *opts)
{
	int length = (th->doff * 4) - sizeof(*th);
	u8 buf[40], *ptr;

	ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
	if (ptr == NULL)
		return false;

	opts->options = 0;
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return true;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2)
				return true;
			if (opsize > length)
				return true;

			switch (opcode) {
			case TCPOPT_MSS:
				if (opsize == TCPOLEN_MSS) {
					opts->mss = get_unaligned_be16(ptr);
					opts->options |= XT_SYNPROXY_OPT_MSS;
				}
				break;
			case TCPOPT_WINDOW:
				if (opsize == TCPOLEN_WINDOW) {
					opts->wscale = *ptr;
					if (opts->wscale > 14)
						opts->wscale = 14;
					opts->options |= XT_SYNPROXY_OPT_WSCALE;
				}
				break;
			case TCPOPT_TIMESTAMP:
				if (opsize == TCPOLEN_TIMESTAMP) {
					opts->tsval = get_unaligned_be32(ptr);
					opts->tsecr = get_unaligned_be32(ptr + 4);
					opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
				}
				break;
			case TCPOPT_SACK_PERM:
				if (opsize == TCPOLEN_SACK_PERM)
					opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
				break;
			}

			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return true;
}
EXPORT_SYMBOL_GPL(synproxy_parse_options);
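
/* Return the number of bytes of TCP option space needed to encode @opts,
 * counted in aligned option lengths. When both timestamps and SACK-permitted
 * are present, SACK-permitted shares the timestamp option's padding word and
 * adds no extra space.
 */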
unsigned int synproxy_options_size(const struct synproxy_options *opts)
{
	unsigned int size = 0;

	if (opts->options & XT_SYNPROXY_OPT_MSS)
		size += TCPOLEN_MSS_ALIGNED;
	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
		size += TCPOLEN_TSTAMP_ALIGNED;
	else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
		size += TCPOLEN_SACKPERM_ALIGNED;
	if (opts->options & XT_SYNPROXY_OPT_WSCALE)
		size += TCPOLEN_WSCALE_ALIGNED;

	return size;
}
EXPORT_SYMBOL_GPL(synproxy_options_size);
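
/* Write the options described by @opts into the option area directly after
 * the TCP header, emitting whole 32-bit words and using NOP padding (or the
 * SACK-permitted option itself) to fill the unused bytes.
 */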
void
synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
{
	__be32 *ptr = (__be32 *)(th + 1);
	u8 options = opts->options;

	if (options & XT_SYNPROXY_OPT_MSS)
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);

	if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
		if (options & XT_SYNPROXY_OPT_SACK_PERM)
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		else
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);

		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	} else if (options & XT_SYNPROXY_OPT_SACK_PERM)
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);

	if (options & XT_SYNPROXY_OPT_WSCALE)
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->wscale);
}
EXPORT_SYMBOL_GPL(synproxy_build_options);
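
/* Encode the client's options into the low bits of our timestamp value (the
 * "timestamp cookie"): bits 0-3 carry the client's window scale (0xf means no
 * window scaling), bit 4 SACK-permitted and bit 5 ECN. The client's original
 * TSval is moved to tsecr so it is echoed back, and opts->wscale is replaced
 * with the scale factor configured for the rule.
 */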
void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
				    struct synproxy_options *opts)
{
	opts->tsecr = opts->tsval;
	opts->tsval = tcp_time_stamp & ~0x3f;

	if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
		opts->tsval |= opts->wscale;
		opts->wscale = info->wscale;
	} else
		opts->tsval |= 0xf;

	if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
		opts->tsval |= 1 << 4;

	if (opts->options & XT_SYNPROXY_OPT_ECN)
		opts->tsval |= 1 << 5;
}
EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
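
/* Decode the timestamp cookie echoed back by the client in TSecr: recover the
 * window scale from bits 0-3 (0xf means none), SACK-permitted from bit 4 and
 * ECN from bit 5.
 */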
void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
{
	opts->wscale = opts->tsecr & 0xf;
	if (opts->wscale != 0xf)
		opts->options |= XT_SYNPROXY_OPT_WSCALE;

	opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;

	opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
}
EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
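
/* Rewrite TCP timestamps by the per-connection offset recorded during the
 * proxied handshake: in the reply direction TSval is decreased by tsoff, in
 * the original direction TSecr is increased by it, and the TCP checksum is
 * updated incrementally. Returns 0 only on malformed options or if the skb
 * cannot be made writable.
 */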
unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
				    unsigned int protoff,
				    struct tcphdr *th,
				    struct nf_conn *ct,
				    enum ip_conntrack_info ctinfo,
				    const struct nf_conn_synproxy *synproxy)
{
	unsigned int optoff, optend;
	u32 *ptr, old;

	if (synproxy->tsoff == 0)
		return 1;

	optoff = protoff + sizeof(struct tcphdr);
	optend = protoff + th->doff * 4;

	if (!skb_make_writable(skb, optend))
		return 0;

	while (optoff < optend) {
		unsigned char *op = skb->data + optoff;

		switch (op[0]) {
		case TCPOPT_EOL:
			return 1;
		case TCPOPT_NOP:
			optoff++;
			continue;
		default:
			if (optoff + 1 == optend ||
			    optoff + op[1] > optend ||
			    op[1] < 2)
				return 0;
			if (op[0] == TCPOPT_TIMESTAMP &&
			    op[1] == TCPOLEN_TIMESTAMP) {
				if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
					ptr = (u32 *)&op[2];
					old = *ptr;
					*ptr = htonl(ntohl(*ptr) -
						     synproxy->tsoff);
				} else {
					ptr = (u32 *)&op[6];
					old = *ptr;
					*ptr = htonl(ntohl(*ptr) +
						     synproxy->tsoff);
				}
				inet_proto_csum_replace4(&th->check, skb,
							 old, *ptr, 0);
				return 1;
			}
			optoff += op[1];
		}
	}
	return 1;
}
EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
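
/* Conntrack extension holding per-connection SYNPROXY state (such as the
 * timestamp offset used above); registered at module init.
 */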
static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_synproxy),
	.align		= __alignof__(struct nf_conn_synproxy),
	.id		= NF_CT_EXT_SYNPROXY,
};
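
/* /proc/net/stat/synproxy: per-CPU statistics, dumped as one row per possible
 * CPU through the seq_file interface.
 */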
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu + 1;
		return per_cpu_ptr(snet->stats, cpu);
	}

	return NULL;
}

static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu + 1;
		return per_cpu_ptr(snet->stats, cpu);
	}

	return NULL;
}

static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
{
	return;
}

static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct synproxy_stats *stats = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries\t\tsyn_received\t"
				"cookie_invalid\tcookie_valid\t"
				"cookie_retrans\tconn_reopened\n");
		return 0;
	}

	seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
		   stats->syn_received,
		   stats->cookie_invalid,
		   stats->cookie_valid,
		   stats->cookie_retrans,
		   stats->conn_reopened);

	return 0;
}

static const struct seq_operations synproxy_cpu_seq_ops = {
	.start		= synproxy_cpu_seq_start,
	.next		= synproxy_cpu_seq_next,
	.stop		= synproxy_cpu_seq_stop,
	.show		= synproxy_cpu_seq_show,
};

static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations synproxy_cpu_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= synproxy_cpu_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static int __net_init synproxy_proc_init(struct net *net)
{
	if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
			 &synproxy_cpu_seq_fops))
		return -ENOMEM;
	return 0;
}

static void __net_exit synproxy_proc_exit(struct net *net)
{
	remove_proc_entry("synproxy", net->proc_net_stat);
}
#else
static int __net_init synproxy_proc_init(struct net *net)
{
	return 0;
}

static void __net_exit synproxy_proc_exit(struct net *net)
{
	return;
}
#endif /* CONFIG_PROC_FS */
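
/* Per-netns setup: allocate the conntrack template used to attach SYNPROXY
 * state to proxied connections, the per-CPU statistics and the /proc entry.
 * The template comes from nf_ct_tmpl_alloc() rather than the conntrack slab,
 * so it does not pin the per-netns conntrack count during netns teardown.
 */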
static int __net_init synproxy_net_init(struct net *net)
{
	struct synproxy_net *snet = synproxy_pernet(net);
	struct nf_conn *ct;
	int err = -ENOMEM;

	ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL);
	if (!ct)
		goto err1;

	if (!nfct_seqadj_ext_add(ct))
		goto err2;
	if (!nfct_synproxy_ext_add(ct))
		goto err2;

	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
	nf_conntrack_get(&ct->ct_general);
	snet->tmpl = ct;

	snet->stats = alloc_percpu(struct synproxy_stats);
	if (snet->stats == NULL)
		goto err2;

	err = synproxy_proc_init(net);
	if (err < 0)
		goto err3;

	return 0;

err3:
	free_percpu(snet->stats);
err2:
	nf_conntrack_free(ct);
err1:
	return err;
}

static void __net_exit synproxy_net_exit(struct net *net)
{
	struct synproxy_net *snet = synproxy_pernet(net);

	nf_ct_put(snet->tmpl);
	synproxy_proc_exit(net);
	free_percpu(snet->stats);
}

static struct pernet_operations synproxy_net_ops = {
	.init		= synproxy_net_init,
	.exit		= synproxy_net_exit,
	.id		= &synproxy_net_id,
	.size		= sizeof(struct synproxy_net),
};
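
/* Module init/exit: register the conntrack extension and the per-netns
 * operations, unwinding in reverse order on failure.
 */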
static int __init synproxy_core_init(void)
{
	int err;

	err = nf_ct_extend_register(&nf_ct_synproxy_extend);
	if (err < 0)
		goto err1;

	err = register_pernet_subsys(&synproxy_net_ops);
	if (err < 0)
		goto err2;

	return 0;

err2:
	nf_ct_extend_unregister(&nf_ct_synproxy_extend);
err1:
	return err;
}

static void __exit synproxy_core_exit(void)
{
	unregister_pernet_subsys(&synproxy_net_ops);
	nf_ct_extend_unregister(&nf_ct_synproxy_extend);
}

module_init(synproxy_core_init);
module_exit(synproxy_core_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");