remarkable-linux/net/netfilter/nfnetlink.c
David S. Miller a01aa920b8 Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Pablo Neira Ayuso says:

====================
Netfilter/IPVS updates for net-next

The following patchset contains Netfilter updates for your net-next
tree. A large bunch of code cleanups, simplify the conntrack extension
codebase, get rid of the fake conntrack object, speed up netns by
selective synchronize_net() calls. More specifically, they are:

1) Check for ct->status bit instead of using nfct_nat() from IPVS and
   Netfilter codebase, patch from Florian Westphal.

2) Use kcalloc() wherever possible in the IPVS code, from Varsha Rao.

3) Simplify FTP IPVS helper module registration path, from Arushi Singhal.

4) Introduce nft_is_base_chain() helper function.

5) Enforce expectation limit from userspace conntrack helper,
   from Gao Feng.

6) Add nf_ct_remove_expect() helper function, from Gao Feng.

7) NAT mangle helper function return boolean, from Gao Feng.

8) ctnetlink_alloc_expect() should only work for conntrack with
   helpers, from Gao Feng.

9) Add nfnl_msg_type() helper function to nfnetlink to build the
   netlink message type.

10) Get rid of unnecessary cast on void, from simran singhal.

11) Use seq_puts()/seq_putc() instead of seq_printf() where possible,
    also from simran singhal.

12) Use list_prev_entry() from nf_tables, from simran signhal.

13) Remove unnecessary & on pointer function in the Netfilter and IPVS
    code.

14) Remove obsolete comment on set of rules per CPU in ip6_tables,
    no longer true. From Arushi Singhal.

15) Remove duplicated nf_conntrack_l4proto_udplite4, from Gao Feng.

16) Remove unnecessary nested rcu_read_lock() in
    __nf_nat_decode_session(). Code running from hooks are already
    guaranteed to run under RCU read side.

17) Remove deadcode in nf_tables_getobj(), from Aaron Conole.

18) Remove double assignment in nf_ct_l4proto_pernet_unregister_one(),
    also from Aaron.

19) Get rid of unsed __ip_set_get_netlink(), from Aaron Conole.

20) Don't propagate NF_DROP error to userspace via ctnetlink in
    __nf_nat_alloc_null_binding() function, from Gao Feng.

21) Revisit nf_ct_deliver_cached_events() to remove unnecessary checks,
    from Gao Feng.

22) Kill the fake untracked conntrack objects, use ctinfo instead to
    annotate a conntrack object is untracked, from Florian Westphal.

23) Remove nf_ct_is_untracked(), now obsolete since we have no
    conntrack template anymore, from Florian.

24) Add event mask support to nft_ct, also from Florian.

25) Move nf_conn_help structure to
    include/net/netfilter/nf_conntrack_helper.h.

26) Add a fixed 32 bytes scratchpad area for conntrack helpers.
    Thus, we don't deal with variable conntrack extensions anymore.
    Make sure userspace conntrack helper doesn't go over that size.
    Remove variable size ct extension infrastructure now this code
    got no more clients. From Florian Westphal.

27) Restore offset and length of nf_ct_ext structure to 8 bytes now
    that wraparound is not possible any longer, also from Florian.

28) Allow to get rid of unassured flows under stress in conntrack,
    this applies to DCCP, SCTP and TCP protocols, from Florian.

29) Shrink size of nf_conntrack_ecache structure, from Florian.

30) Use TCP_MAX_WSCALE instead of hardcoded 14 in TCP tracker,
    from Gao Feng.

31) Register SYNPROXY hooks on demand, from Florian Westphal.

32) Use pernet hook whenever possible, instead of global hook
    registration, from Florian Westphal.

33) Pass hook structure to ebt_register_table() to consolidate some
    infrastructure code, from Florian Westphal.

34) Use consume_skb() and return NF_STOLEN, instead of NF_DROP in the
    SYNPROXY code, to make sure device stats are not fooled, patch
    from Gao Feng.

35) Remove NF_CT_EXT_F_PREALLOC this kills quite some code that we
    don't need anymore if we just select a fixed size instead of
    expensive runtime time calculation of this. From Florian.

36) Constify nf_ct_extend_register() and nf_ct_extend_unregister(),
    from Florian.

37) Simplify nf_ct_ext_add(), this kills nf_ct_ext_create(), from
    Florian.

38) Attach NAT extension on-demand from masquerade and pptp helper
    path, from Florian.

39) Get rid of useless ip_vs_set_state_timeout(), from Aaron Conole.

40) Speed up netns by selective calls of synchronize_net(), from
    Florian Westphal.

41) Silence stack size warning gcc in 32-bit arch in snmp helper,
    from Florian.

42) Inconditionally call nf_ct_ext_destroy(), even if we have no
    extensions, to deal with the NF_NAT_MANIP_SRC case. Patch from
    Liping Zhang.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-01 10:47:53 -04:00

585 lines
14 KiB
C

/* Netfilter messages via netlink socket. Allows for user space
* protocol helpers and general trouble making from userspace.
*
* (C) 2001 by Jay Schulist <jschlst@samba.org>,
* (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
* (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
*
* Initial netfilter messages via netlink development funded and
* generally made possible by Network Robots, Inc. (www.networkrobots.com)
*
* Further development of this code funded by Astaro AG (http://www.astaro.com)
*
* This software may be used and distributed according to the terms
* of the GNU General Public License, incorporated herein by reference.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/init.h>
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
#define nfnl_dereference_protected(id) \
rcu_dereference_protected(table[(id)].subsys, \
lockdep_nfnl_is_held((id)))
static char __initdata nfversion[] = "0.30";
static struct {
struct mutex mutex;
const struct nfnetlink_subsystem __rcu *subsys;
} table[NFNL_SUBSYS_COUNT];
static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_DESTROY] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP,
[NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP,
[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
[NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES,
[NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT,
[NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
};
void nfnl_lock(__u8 subsys_id)
{
mutex_lock(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(nfnl_lock);
void nfnl_unlock(__u8 subsys_id)
{
mutex_unlock(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(nfnl_unlock);
#ifdef CONFIG_PROVE_LOCKING
bool lockdep_nfnl_is_held(u8 subsys_id)
{
return lockdep_is_held(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
#endif
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
{
nfnl_lock(n->subsys_id);
if (table[n->subsys_id].subsys) {
nfnl_unlock(n->subsys_id);
return -EBUSY;
}
rcu_assign_pointer(table[n->subsys_id].subsys, n);
nfnl_unlock(n->subsys_id);
return 0;
}
EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
{
nfnl_lock(n->subsys_id);
table[n->subsys_id].subsys = NULL;
nfnl_unlock(n->subsys_id);
synchronize_rcu();
return 0;
}
EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
{
u8 subsys_id = NFNL_SUBSYS_ID(type);
if (subsys_id >= NFNL_SUBSYS_COUNT)
return NULL;
return rcu_dereference(table[subsys_id].subsys);
}
static inline const struct nfnl_callback *
nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
{
u8 cb_id = NFNL_MSG_TYPE(type);
if (cb_id >= ss->cb_count)
return NULL;
return &ss->cb[cb_id];
}
int nfnetlink_has_listeners(struct net *net, unsigned int group)
{
return netlink_has_listeners(net->nfnl, group);
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
unsigned int group, int echo, gfp_t flags)
{
return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
return netlink_set_err(net->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
int flags)
{
return netlink_unicast(net->nfnl, skb, portid, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
/* Process one complete nfnetlink message. */
static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
const struct nfnl_callback *nc;
const struct nfnetlink_subsystem *ss;
int type, err;
/* All the messages must at least contain nfgenmsg */
if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
return 0;
type = nlh->nlmsg_type;
replay:
rcu_read_lock();
ss = nfnetlink_get_subsys(type);
if (!ss) {
#ifdef CONFIG_MODULES
rcu_read_unlock();
request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
rcu_read_lock();
ss = nfnetlink_get_subsys(type);
if (!ss)
#endif
{
rcu_read_unlock();
return -EINVAL;
}
}
nc = nfnetlink_find_client(type, ss);
if (!nc) {
rcu_read_unlock();
return -EINVAL;
}
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
__u8 subsys_id = NFNL_SUBSYS_ID(type);
err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
ss->cb[cb_id].policy, extack);
if (err < 0) {
rcu_read_unlock();
return err;
}
if (nc->call_rcu) {
err = nc->call_rcu(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
rcu_read_unlock();
} else {
rcu_read_unlock();
nfnl_lock(subsys_id);
if (nfnl_dereference_protected(subsys_id) != ss ||
nfnetlink_find_client(type, ss) != nc)
err = -EAGAIN;
else if (nc->call)
err = nc->call(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
else
err = -EINVAL;
nfnl_unlock(subsys_id);
}
if (err == -EAGAIN)
goto replay;
return err;
}
}
struct nfnl_err {
struct list_head head;
struct nlmsghdr *nlh;
int err;
};
static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err)
{
struct nfnl_err *nfnl_err;
nfnl_err = kmalloc(sizeof(struct nfnl_err), GFP_KERNEL);
if (nfnl_err == NULL)
return -ENOMEM;
nfnl_err->nlh = nlh;
nfnl_err->err = err;
list_add_tail(&nfnl_err->head, list);
return 0;
}
static void nfnl_err_del(struct nfnl_err *nfnl_err)
{
list_del(&nfnl_err->head);
kfree(nfnl_err);
}
static void nfnl_err_reset(struct list_head *err_list)
{
struct nfnl_err *nfnl_err, *next;
list_for_each_entry_safe(nfnl_err, next, err_list, head)
nfnl_err_del(nfnl_err);
}
static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb)
{
struct nfnl_err *nfnl_err, *next;
list_for_each_entry_safe(nfnl_err, next, err_list, head) {
netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, NULL);
nfnl_err_del(nfnl_err);
}
}
enum {
NFNL_BATCH_FAILURE = (1 << 0),
NFNL_BATCH_DONE = (1 << 1),
NFNL_BATCH_REPLAY = (1 << 2),
};
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
u16 subsys_id, u32 genid)
{
struct sk_buff *oskb = skb;
struct net *net = sock_net(skb->sk);
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
LIST_HEAD(err_list);
u32 status;
int err;
if (subsys_id >= NFNL_SUBSYS_COUNT)
return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
status = 0;
skb = netlink_skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
nfnl_lock(subsys_id);
ss = nfnl_dereference_protected(subsys_id);
if (!ss) {
#ifdef CONFIG_MODULES
nfnl_unlock(subsys_id);
request_module("nfnetlink-subsys-%d", subsys_id);
nfnl_lock(subsys_id);
ss = nfnl_dereference_protected(subsys_id);
if (!ss)
#endif
{
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
return kfree_skb(skb);
}
}
if (!ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
return kfree_skb(skb);
}
if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -ERESTART, NULL);
return kfree_skb(skb);
}
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
nlh = nlmsg_hdr(skb);
err = 0;
if (nlh->nlmsg_len < NLMSG_HDRLEN ||
skb->len < nlh->nlmsg_len ||
nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
nfnl_err_reset(&err_list);
status |= NFNL_BATCH_FAILURE;
goto done;
}
/* Only requests are handled by the kernel */
if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
err = -EINVAL;
goto ack;
}
type = nlh->nlmsg_type;
if (type == NFNL_MSG_BATCH_BEGIN) {
/* Malformed: Batch begin twice */
nfnl_err_reset(&err_list);
status |= NFNL_BATCH_FAILURE;
goto done;
} else if (type == NFNL_MSG_BATCH_END) {
status |= NFNL_BATCH_DONE;
goto done;
} else if (type < NLMSG_MIN_TYPE) {
err = -EINVAL;
goto ack;
}
/* We only accept a batch with messages for the same
* subsystem.
*/
if (NFNL_SUBSYS_ID(type) != subsys_id) {
err = -EINVAL;
goto ack;
}
nc = nfnetlink_find_client(type, ss);
if (!nc) {
err = -EINVAL;
goto ack;
}
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
attrlen, ss->cb[cb_id].policy, NULL);
if (err < 0)
goto ack;
if (nc->call_batch) {
err = nc->call_batch(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
}
/* The lock was released to autoload some module, we
* have to abort and start from scratch using the
* original skb.
*/
if (err == -EAGAIN) {
status |= NFNL_BATCH_REPLAY;
goto next;
}
}
ack:
if (nlh->nlmsg_flags & NLM_F_ACK || err) {
/* Errors are delivered once the full batch has been
* processed, this avoids that the same error is
* reported several times when replaying the batch.
*/
if (nfnl_err_add(&err_list, nlh, err) < 0) {
/* We failed to enqueue an error, reset the
* list of errors and send OOM to userspace
* pointing to the batch header.
*/
nfnl_err_reset(&err_list);
netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM,
NULL);
status |= NFNL_BATCH_FAILURE;
goto done;
}
/* We don't stop processing the batch on errors, thus,
* userspace gets all the errors that the batch
* triggers.
*/
if (err)
status |= NFNL_BATCH_FAILURE;
}
next:
msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
skb_pull(skb, msglen);
}
done:
if (status & NFNL_BATCH_REPLAY) {
ss->abort(net, oskb);
nfnl_err_reset(&err_list);
nfnl_unlock(subsys_id);
kfree_skb(skb);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
ss->commit(net, oskb);
} else {
ss->abort(net, oskb);
}
nfnl_err_deliver(&err_list, oskb);
nfnl_unlock(subsys_id);
kfree_skb(skb);
}
static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
[NFNL_BATCH_GENID] = { .type = NLA_U32 },
};
static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *attr = (void *)nlh + min_len;
struct nlattr *cda[NFNL_BATCH_MAX + 1];
int attrlen = nlh->nlmsg_len - min_len;
struct nfgenmsg *nfgenmsg;
int msglen, err;
u32 gen_id = 0;
u16 res_id;
msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
if (nlh->nlmsg_len < NLMSG_HDRLEN ||
skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
return;
err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy,
NULL);
if (err < 0) {
netlink_ack(skb, nlh, err, NULL);
return;
}
if (cda[NFNL_BATCH_GENID])
gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID]));
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
/* Work around old nft using host byte order */
if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
res_id = NFNL_SUBSYS_NFTABLES;
else
res_id = ntohs(nfgenmsg->res_id);
nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
}
static void nfnetlink_rcv(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
if (nlh->nlmsg_len < NLMSG_HDRLEN ||
skb->len < nlh->nlmsg_len)
return;
if (!netlink_net_capable(skb, CAP_NET_ADMIN)) {
netlink_ack(skb, nlh, -EPERM, NULL);
return;
}
if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN)
nfnetlink_rcv_skb_batch(skb, nlh);
else
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
#ifdef CONFIG_MODULES
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
int type;
if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
return 0;
type = nfnl_group2type[group];
rcu_read_lock();
ss = nfnetlink_get_subsys(type << 8);
rcu_read_unlock();
if (!ss)
request_module("nfnetlink-subsys-%d", type);
return 0;
}
#endif
static int __net_init nfnetlink_net_init(struct net *net)
{
struct sock *nfnl;
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
#endif
};
nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
if (!nfnl)
return -ENOMEM;
net->nfnl_stash = nfnl;
rcu_assign_pointer(net->nfnl, nfnl);
return 0;
}
static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
list_for_each_entry(net, net_exit_list, exit_list)
RCU_INIT_POINTER(net->nfnl, NULL);
synchronize_net();
list_for_each_entry(net, net_exit_list, exit_list)
netlink_kernel_release(net->nfnl_stash);
}
static struct pernet_operations nfnetlink_net_ops = {
.init = nfnetlink_net_init,
.exit_batch = nfnetlink_net_exit_batch,
};
static int __init nfnetlink_init(void)
{
int i;
for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++)
BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
for (i=0; i<NFNL_SUBSYS_COUNT; i++)
mutex_init(&table[i].mutex);
pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);
return register_pernet_subsys(&nfnetlink_net_ops);
}
static void __exit nfnetlink_exit(void)
{
pr_info("Removing netfilter NETLINK layer.\n");
unregister_pernet_subsys(&nfnetlink_net_ops);
}
module_init(nfnetlink_init);
module_exit(nfnetlink_exit);