alistair23-linux/net/netfilter/ipset/ip_set_hash_netnet.c
Jozsef Kadlecsik 18f84d41d3 netfilter: ipset: Introduce RCU locking in hash:* types
Three types of data need to be protected in the case of the hash types:

a. The hash buckets: standard RCU pointer operations are used.
b. The element blobs in the hash buckets are stored in an array and
   a bitmap is used for book-keeping to tell which elements in the array
   are used or free.
c. The number of networks per cidr value and the cidr values themselves
   are stored in fixed-size arrays and need no protection. The values are
   modified in such an order that in the worst case an element test is
   repeated once with the same cidr value.

The ipset hash approach uses arrays instead of lists and therefore is
incompatible with rhashtable.
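
For illustration only, a minimal sketch of point b. (the names below are
made up; the real definitions live in ip_set_hash_gen.h): each bucket keeps
its element blobs in a flat array, with a bitmap recording which slots are
in use, so a freed slot can be reused without moving the elements a
concurrent reader may still be looking at::

 /* Hypothetical sketch, not kernel code: an array-of-blobs bucket with a
  * bitmap for book-keeping of used/free slots.
  */
 #include <stdbool.h>
 #include <stdint.h>

 #define SLOTS_PER_BUCKET 4

 struct example_bucket {
         uint8_t  used;                   /* bit i set => slot i holds a valid element */
         uint32_t elem[SLOTS_PER_BUCKET]; /* the element "blobs" (just a key here) */
 };

 static bool bucket_test(const struct example_bucket *b, uint32_t key)
 {
         int i;

         for (i = 0; i < SLOTS_PER_BUCKET; i++)
                 if ((b->used & (1u << i)) && b->elem[i] == key)
                         return true;
         return false;
 }

 static bool bucket_add(struct example_bucket *b, uint32_t key)
 {
         int i;

         for (i = 0; i < SLOTS_PER_BUCKET; i++)
                 if (!(b->used & (1u << i))) {
                         b->elem[i] = key;   /* write the blob first ... */
                         b->used |= 1u << i; /* ... then mark the slot used */
                         return true;
                 }
         return false;                       /* bucket full: the table has to grow */
 }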

Performance is tested by Jesper Dangaard Brouer:

Simple drop in FORWARD
~~~~~~~~~~~~~~~~~~~~~~

Dropping via simple iptables net-mask match::

 iptables -t raw -N simple || iptables -t raw -F simple
 iptables -t raw -I simple  -s 198.18.0.0/15 -j DROP
 iptables -t raw -D PREROUTING -j simple
 iptables -t raw -I PREROUTING -j simple

Drop performance in "raw": 11.3Mpps

Generator: sending 12.2Mpps (tx:12264083 pps)

Drop via original ipset in RAW table
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Create a set with lots of elements::

 sudo ./ipset destroy test
 echo "create test hash:ip hashsize 65536" > test.set
 for x in `seq 0 255`; do
    for y in `seq 0 255`; do
        echo "add test 198.18.$x.$y" >> test.set
    done
 done
 sudo ./ipset restore < test.set

Dropping via ipset::

 iptables -t raw -F
 iptables -t raw -N net198 || iptables -t raw -F net198
 iptables -t raw -I net198 -m set --match-set test src -j DROP
 iptables -t raw -I PREROUTING -j net198

Drop performance in "raw" with ipset: 8Mpps

Perf report numbers ipset drop in "raw"::

 +   24.65%  ksoftirqd/1  [ip_set]           [k] ip_set_test
 -   21.42%  ksoftirqd/1  [kernel.kallsyms]  [k] _raw_read_lock_bh
    - _raw_read_lock_bh
       + 99.88% ip_set_test
 -   19.42%  ksoftirqd/1  [kernel.kallsyms]  [k] _raw_read_unlock_bh
    - _raw_read_unlock_bh
       + 99.72% ip_set_test
 +    4.31%  ksoftirqd/1  [ip_set_hash_ip]   [k] hash_ip4_kadt
 +    2.27%  ksoftirqd/1  [ixgbe]            [k] ixgbe_fetch_rx_buffer
 +    2.18%  ksoftirqd/1  [ip_tables]        [k] ipt_do_table
 +    1.81%  ksoftirqd/1  [ip_set_hash_ip]   [k] hash_ip4_test
 +    1.61%  ksoftirqd/1  [kernel.kallsyms]  [k] __netif_receive_skb_core
 +    1.44%  ksoftirqd/1  [kernel.kallsyms]  [k] build_skb
 +    1.42%  ksoftirqd/1  [kernel.kallsyms]  [k] ip_rcv
 +    1.36%  ksoftirqd/1  [kernel.kallsyms]  [k] __local_bh_enable_ip
 +    1.16%  ksoftirqd/1  [kernel.kallsyms]  [k] dev_gro_receive
 +    1.09%  ksoftirqd/1  [kernel.kallsyms]  [k] __rcu_read_unlock
 +    0.96%  ksoftirqd/1  [ixgbe]            [k] ixgbe_clean_rx_irq
 +    0.95%  ksoftirqd/1  [kernel.kallsyms]  [k] __netdev_alloc_frag
 +    0.88%  ksoftirqd/1  [kernel.kallsyms]  [k] kmem_cache_alloc
 +    0.87%  ksoftirqd/1  [xt_set]           [k] set_match_v3
 +    0.85%  ksoftirqd/1  [kernel.kallsyms]  [k] inet_gro_receive
 +    0.83%  ksoftirqd/1  [kernel.kallsyms]  [k] nf_iterate
 +    0.76%  ksoftirqd/1  [kernel.kallsyms]  [k] put_compound_page
 +    0.75%  ksoftirqd/1  [kernel.kallsyms]  [k] __rcu_read_lock

Drop via ipset in RAW table with RCU-locking
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

With RCU locking, the RW-lock is gone.
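
The perf report above shows roughly 40% of the cycles burnt in
_raw_read_lock_bh/_raw_read_unlock_bh. As a schematic of what changes in
the per-packet test path (sketch only, with made-up function names; the
real code lives in the ip_set core), the read side of the former rwlock
becomes an RCU read-side critical section, while writers still serialize
among themselves and free replaced data only after a grace period::

 /* Sketch, not the actual ip_set core code. */
 static int example_test_rwlock(struct ip_set *set, const struct sk_buff *skb,
                                const struct xt_action_param *par,
                                struct ip_set_adt_opt *opt)
 {
         int ret;

         read_lock_bh(&set->lock);       /* shared rwlock bounced by every CPU */
         ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
         read_unlock_bh(&set->lock);
         return ret;
 }

 static int example_test_rcu(struct ip_set *set, const struct sk_buff *skb,
                             const struct xt_action_param *par,
                             struct ip_set_adt_opt *opt)
 {
         int ret;

         rcu_read_lock_bh();             /* readers touch no shared lock at all */
         ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
         rcu_read_unlock_bh();
         return ret;
 }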

Drop performance in "raw" with ipset with RCU-locking: 11.3Mpps

Performance-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
2015-06-14 10:40:17 +02:00

/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
 * Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* Kernel module implementing an IP set type: the hash:net type */

#include <linux/jhash.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/random.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_hash.h>

#define IPSET_TYPE_REV_MIN      0
/*                              1          Forceadd support added */
#define IPSET_TYPE_REV_MAX      2       /* skbinfo support added */

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
IP_SET_MODULE_DESC("hash:net,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,net");

/* Type specific function prefix */
#define HTYPE           hash_netnet
#define IP_SET_HASH_WITH_NETS
#define IPSET_NET_COUNT 2

/* IPv4 variants */

/* Member elements */
struct hash_netnet4_elem {
        union {
                __be32 ip[2];
                __be64 ipcmp;
        };
        u8 nomatch;
        u8 padding;
        union {
                u8 cidr[2];
                u16 ccmp;
        };
};

/* Common functions */

static inline bool
hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
                        const struct hash_netnet4_elem *ip2,
                        u32 *multi)
{
        return ip1->ipcmp == ip2->ipcmp &&
               ip1->ccmp == ip2->ccmp;
}

static inline int
hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem)
{
        return elem->nomatch ? -ENOTEMPTY : 1;
}

static inline void
hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags)
{
        elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}

static inline void
hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
{
        swap(*flags, elem->nomatch);
}

static inline void
hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
                             struct hash_netnet4_elem *orig)
{
        elem->ip[1] = orig->ip[1];
}

static inline void
hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
{
        if (inner) {
                elem->ip[1] &= ip_set_netmask(cidr);
                elem->cidr[1] = cidr;
        } else {
                elem->ip[0] &= ip_set_netmask(cidr);
                elem->cidr[0] = cidr;
        }
}

static bool
hash_netnet4_data_list(struct sk_buff *skb,
                       const struct hash_netnet4_elem *data)
{
        u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;

        if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) ||
            nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) ||
            nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
            nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
            (flags &&
             nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
                goto nla_put_failure;
        return false;

nla_put_failure:
        return true;
}

static inline void
hash_netnet4_data_next(struct hash_netnet4_elem *next,
                       const struct hash_netnet4_elem *d)
{
        next->ipcmp = d->ipcmp;
}

#define MTYPE           hash_netnet4
#define HOST_MASK       32
#include "ip_set_hash_gen.h"

static int
hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
                  const struct xt_action_param *par,
                  enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
        const struct hash_netnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_netnet4_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);

        e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
        e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
        if (adt == IPSET_TEST)
                e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;

        ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
        ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
        e.ip[0] &= ip_set_netmask(e.cidr[0]);
        e.ip[1] &= ip_set_netmask(e.cidr[1]);

        return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}

static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
                  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
        const struct hash_netnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 ip = 0, ip_to = 0, last;
        u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
        int ret;

        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);

        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;

        ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
        if (ret)
                return ret;

        ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from);
        if (ret)
                return ret;

        ret = ip_set_get_extensions(set, tb, &ext);
        if (ret)
                return ret;

        if (tb[IPSET_ATTR_CIDR]) {
                e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
                if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
                        return -IPSET_ERR_INVALID_CIDR;
        }

        if (tb[IPSET_ATTR_CIDR2]) {
                e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
                if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
                        return -IPSET_ERR_INVALID_CIDR;
        }

        if (tb[IPSET_ATTR_CADT_FLAGS]) {
                u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);

                if (cadt_flags & IPSET_FLAG_NOMATCH)
                        flags |= (IPSET_FLAG_NOMATCH << 16);
        }

        if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
                                   tb[IPSET_ATTR_IP2_TO])) {
                e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
                e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
                ret = adtfn(set, &e, &ext, &ext, flags);
                return ip_set_enomatch(ret, flags, adt, set) ? -ret :
                       ip_set_eexist(ret, flags) ? 0 : ret;
        }

        ip_to = ip;
        if (tb[IPSET_ATTR_IP_TO]) {
                ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
                if (ret)
                        return ret;
                if (ip_to < ip)
                        swap(ip, ip_to);
                if (unlikely(ip + UINT_MAX == ip_to))
                        return -IPSET_ERR_HASH_RANGE;
        } else
                ip_set_mask_from_to(ip, ip_to, e.cidr[0]);

        ip2_to = ip2_from;
        if (tb[IPSET_ATTR_IP2_TO]) {
                ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
                if (ret)
                        return ret;
                if (ip2_to < ip2_from)
                        swap(ip2_from, ip2_to);
                if (unlikely(ip2_from + UINT_MAX == ip2_to))
                        return -IPSET_ERR_HASH_RANGE;
        } else
                ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);

        if (retried)
                ip = ntohl(h->next.ip[0]);

        while (!after(ip, ip_to)) {
                e.ip[0] = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
                ip2 = (retried &&
                       ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
                                                   : ip2_from;
                while (!after(ip2, ip2_to)) {
                        e.ip[1] = htonl(ip2);
                        last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
                        ret = adtfn(set, &e, &ext, &ext, flags);
                        if (ret && !ip_set_eexist(ret, flags))
                                return ret;
                        else
                                ret = 0;
                        ip2 = last2 + 1;
                }
                ip = last + 1;
        }
        return ret;
}

/* IPv6 variants */

struct hash_netnet6_elem {
        union nf_inet_addr ip[2];
        u8 nomatch;
        u8 padding;
        union {
                u8 cidr[2];
                u16 ccmp;
        };
};

/* Common functions */

static inline bool
hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
                        const struct hash_netnet6_elem *ip2,
                        u32 *multi)
{
        return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
               ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
               ip1->ccmp == ip2->ccmp;
}

static inline int
hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem)
{
        return elem->nomatch ? -ENOTEMPTY : 1;
}

static inline void
hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags)
{
        elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}

static inline void
hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
{
        swap(*flags, elem->nomatch);
}

static inline void
hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
                             struct hash_netnet6_elem *orig)
{
        elem->ip[1] = orig->ip[1];
}

static inline void
hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
{
        if (inner) {
                ip6_netmask(&elem->ip[1], cidr);
                elem->cidr[1] = cidr;
        } else {
                ip6_netmask(&elem->ip[0], cidr);
                elem->cidr[0] = cidr;
        }
}

static bool
hash_netnet6_data_list(struct sk_buff *skb,
                       const struct hash_netnet6_elem *data)
{
        u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;

        if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) ||
            nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) ||
            nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) ||
            nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) ||
            (flags &&
             nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
                goto nla_put_failure;
        return false;

nla_put_failure:
        return true;
}

static inline void
hash_netnet6_data_next(struct hash_netnet4_elem *next,
                       const struct hash_netnet6_elem *d)
{
}

#undef MTYPE
#undef HOST_MASK

#define MTYPE           hash_netnet6
#define HOST_MASK       128
#define IP_SET_EMIT_CREATE
#include "ip_set_hash_gen.h"

static int
hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
                  const struct xt_action_param *par,
                  enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
        const struct hash_netnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_netnet6_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);

        e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
        e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
        if (adt == IPSET_TEST)
                e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;

        ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
        ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
        ip6_netmask(&e.ip[0], e.cidr[0]);
        ip6_netmask(&e.ip[1], e.cidr[1]);

        return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}

static int
hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
                  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
        ipset_adtfn adtfn = set->variant->adt[adt];
        struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        int ret;

        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);

        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;
        if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO]))
                return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;

        ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]);
        if (ret)
                return ret;

        ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]);
        if (ret)
                return ret;

        ret = ip_set_get_extensions(set, tb, &ext);
        if (ret)
                return ret;

        if (tb[IPSET_ATTR_CIDR]) {
                e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]);
                if (!e.cidr[0] || e.cidr[0] > HOST_MASK)
                        return -IPSET_ERR_INVALID_CIDR;
        }

        if (tb[IPSET_ATTR_CIDR2]) {
                e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
                if (!e.cidr[1] || e.cidr[1] > HOST_MASK)
                        return -IPSET_ERR_INVALID_CIDR;
        }

        ip6_netmask(&e.ip[0], e.cidr[0]);
        ip6_netmask(&e.ip[1], e.cidr[1]);

        if (tb[IPSET_ATTR_CADT_FLAGS]) {
                u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);

                if (cadt_flags & IPSET_FLAG_NOMATCH)
                        flags |= (IPSET_FLAG_NOMATCH << 16);
        }

        ret = adtfn(set, &e, &ext, &ext, flags);

        return ip_set_enomatch(ret, flags, adt, set) ? -ret :
               ip_set_eexist(ret, flags) ? 0 : ret;
}

static struct ip_set_type hash_netnet_type __read_mostly = {
        .name           = "hash:net,net",
        .protocol       = IPSET_PROTOCOL,
        .features       = IPSET_TYPE_IP | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH,
        .dimension      = IPSET_DIM_TWO,
        .family         = NFPROTO_UNSPEC,
        .revision_min   = IPSET_TYPE_REV_MIN,
        .revision_max   = IPSET_TYPE_REV_MAX,
        .create         = hash_netnet_create,
        .create_policy  = {
                [IPSET_ATTR_HASHSIZE]   = { .type = NLA_U32 },
                [IPSET_ATTR_MAXELEM]    = { .type = NLA_U32 },
                [IPSET_ATTR_PROBES]     = { .type = NLA_U8 },
                [IPSET_ATTR_RESIZE]     = { .type = NLA_U8 },
                [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
                [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
        },
        .adt_policy     = {
                [IPSET_ATTR_IP]         = { .type = NLA_NESTED },
                [IPSET_ATTR_IP_TO]      = { .type = NLA_NESTED },
                [IPSET_ATTR_IP2]        = { .type = NLA_NESTED },
                [IPSET_ATTR_IP2_TO]     = { .type = NLA_NESTED },
                [IPSET_ATTR_CIDR]       = { .type = NLA_U8 },
                [IPSET_ATTR_CIDR2]      = { .type = NLA_U8 },
                [IPSET_ATTR_TIMEOUT]    = { .type = NLA_U32 },
                [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
                [IPSET_ATTR_BYTES]      = { .type = NLA_U64 },
                [IPSET_ATTR_PACKETS]    = { .type = NLA_U64 },
                [IPSET_ATTR_COMMENT]    = { .type = NLA_NUL_STRING,
                                            .len  = IPSET_MAX_COMMENT_SIZE },
                [IPSET_ATTR_SKBMARK]    = { .type = NLA_U64 },
                [IPSET_ATTR_SKBPRIO]    = { .type = NLA_U32 },
                [IPSET_ATTR_SKBQUEUE]   = { .type = NLA_U16 },
        },
        .me             = THIS_MODULE,
};

static int __init
hash_netnet_init(void)
{
        return ip_set_type_register(&hash_netnet_type);
}

static void __exit
hash_netnet_fini(void)
{
        rcu_barrier();
        ip_set_type_unregister(&hash_netnet_type);
}

module_init(hash_netnet_init);
module_exit(hash_netnet_fini);