From 04b80ceadcae754d053bbb96c5dce0b8641c8fe9 Mon Sep 17 00:00:00 2001 From: linzhang Date: Fri, 12 May 2017 13:11:06 +0800 Subject: [PATCH 01/29] netfilter: ctnetlink: delete extra spaces This patch cleans up extra spaces. Signed-off-by: linzhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9799a50bc604..f08604dd1a59 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -636,11 +636,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (events & (1 << IPCT_DESTROY)) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; - } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { + } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { type = IPCTNL_MSG_CT_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_NEW; - } else if (events) { + } else if (events) { type = IPCTNL_MSG_CT_NEW; group = NFNLGRP_CONNTRACK_UPDATE; } else From 03eb7d494a6b7667a8e4076572edf5dc2ce3c675 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 May 2017 11:25:31 +0200 Subject: [PATCH 02/29] netfilter: ipt_CLUSTERIP: switch to nf_register_net_hook one of the last remaining users of the old api, hopefully followup commit can remove it soon. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 038f293c2376..f30bee8e407b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -743,14 +743,20 @@ static const struct file_operations clusterip_proc_fops = { static int clusterip_net_init(struct net *net) { struct clusterip_net *cn = net_generic(net, clusterip_net_id); + int ret; INIT_LIST_HEAD(&cn->configs); spin_lock_init(&cn->lock); + ret = nf_register_net_hook(net, &cip_arp_ops); + if (ret < 0) + return ret; + #ifdef CONFIG_PROC_FS cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); if (!cn->procdir) { + nf_unregister_net_hook(net, &cip_arp_ops); pr_err("Unable to proc dir entry\n"); return -ENOMEM; } @@ -765,6 +771,7 @@ static void clusterip_net_exit(struct net *net) struct clusterip_net *cn = net_generic(net, clusterip_net_id); proc_remove(cn->procdir); #endif + nf_unregister_net_hook(net, &cip_arp_ops); } static struct pernet_operations clusterip_net_ops = { @@ -786,17 +793,11 @@ static int __init clusterip_tg_init(void) if (ret < 0) goto cleanup_subsys; - ret = nf_register_hook(&cip_arp_ops); - if (ret < 0) - goto cleanup_target; - pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; -cleanup_target: - xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; @@ -806,7 +807,6 @@ static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); - nf_unregister_hook(&cip_arp_ops); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); From a32770b1e76bb78d31be856976586834e587ebfb Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:29:41 -0700 Subject: [PATCH 03/29] netfilter: dup: resolve warnings about missing prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing include file causes: net/netfilter/nf_dup_netdev.c:26:6: warning: no previous prototype for ‘nf_fwd_netdev_egress’ [-Wmissing-prototypes] net/netfilter/nf_dup_netdev.c:40:6: warning: no previous prototype for ‘nf_dup_netdev_egress’ [-Wmissing-prototypes] Signed-off-by: Stephen Hemminger Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_dup_netdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index c9d7f95768ab..f4a566e67213 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,6 +13,7 @@ #include #include #include +#include static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) { From cad4394453bd83ac7e82cad94c71149960a93f1a Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 19 May 2017 09:29:42 -0700 Subject: [PATCH 04/29] netfilter: nft_rt: make local functions static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves warnings: net/netfilter/nft_rt.c:26:6: warning: no previous prototype for ‘nft_rt_get_eval’ [-Wmissing-prototypes] net/netfilter/nft_rt.c:75:5: warning: no previous prototype for ‘nft_rt_get_init’ [-Wmissing-prototypes] net/netfilter/nft_rt.c:106:5: warning: no previous prototype for ‘nft_rt_get_dump’ [-Wmissing-prototypes] Signed-off-by: Stephen Hemminger Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_rt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index d3eb640bc784..c7383d8f88d0 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -23,9 +23,9 @@ struct nft_rt { enum nft_registers dreg:8; }; -void nft_rt_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_rt_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_rt *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; @@ -72,9 +72,9 @@ const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = { [NFTA_RT_KEY] = { .type = NLA_U32 }, }; -int nft_rt_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_rt_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_rt *priv = nft_expr_priv(expr); unsigned int len; @@ -103,8 +103,8 @@ int nft_rt_get_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, len); } -int nft_rt_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_rt_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_rt *priv = nft_expr_priv(expr); From 9fd6452d67fb2acda12e5914e2ad371f067f3465 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 21 May 2017 12:52:55 +0200 Subject: [PATCH 05/29] netfilter: conntrack: rename nf_ct_iterate_cleanup There are several places where we needlesly call nf_ct_iterate_cleanup, we should instead iterate the full table at module unload time. This is a leftover from back when the conntrack table got duplicated per net namespace. So rename nf_ct_iterate_cleanup to nf_ct_iterate_cleanup_net. A later patch will then add a non-net variant. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 6 +++--- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 4 ++-- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 10 +++++----- net/netfilter/nf_conntrack_core.c | 10 +++++----- net/netfilter/nf_conntrack_netlink.c | 4 ++-- net/netfilter/nf_conntrack_proto.c | 4 ++-- net/netfilter/nf_nat_core.c | 6 +++--- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8ece3612d0cd..f21180ea4558 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -225,9 +225,9 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Iterate over all conntracks: if iter returns true, it's deleted. */ -void nf_ct_iterate_cleanup(struct net *net, - int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report); +void nf_ct_iterate_cleanup_net(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data, u32 portid, int report); struct nf_conntrack_zone; diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index dc1dea15c1b4..f39037fca923 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -98,8 +98,8 @@ static int masq_device_event(struct notifier_block *this, */ NF_CT_ASSERT(dev->ifindex != 0); - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_ct_iterate_cleanup_net(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); } return NOTIFY_DONE; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 2297c9f073ba..d7b679037bae 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -75,8 +75,8 @@ static int masq_device_event(struct notifier_block *this, struct net *net = dev_net(dev); if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_ct_iterate_cleanup_net(net, device_cmp, + (void *)(long)dev->ifindex, 0, 0); return NOTIFY_DONE; } @@ -99,7 +99,7 @@ static void iterate_cleanup_work(struct work_struct *work) w = container_of(work, struct masq_dev_work, work); index = w->ifindex; - nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0); + nf_ct_iterate_cleanup_net(w->net, device_cmp, (void *)index, 0, 0); put_net(w->net); kfree(w); @@ -110,12 +110,12 @@ static void iterate_cleanup_work(struct work_struct *work) /* ipv6 inet notifier is an atomic notifier, i.e. we cannot * schedule. * - * Unfortunately, nf_ct_iterate_cleanup can run for a long + * Unfortunately, nf_ct_iterate_cleanup_net can run for a long * time if there are lots of conntracks and the system * handles high softirq load, so it frequently calls cond_resched * while iterating the conntrack table. * - * So we defer nf_ct_iterate_cleanup walk to the system workqueue. + * So we defer nf_ct_iterate_cleanup_net walk to the system workqueue. * * As we can have 'a lot' of inet_events (depending on amount * of ipv6 addresses being deleted), we also need to add an upper diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e847dbaa0c6b..2730f9df33b7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1634,9 +1634,9 @@ found: return ct; } -void nf_ct_iterate_cleanup(struct net *net, - int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report) +void nf_ct_iterate_cleanup_net(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data, u32 portid, int report) { struct nf_conn *ct; unsigned int bucket = 0; @@ -1654,7 +1654,7 @@ void nf_ct_iterate_cleanup(struct net *net, cond_resched(); } } -EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); +EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); static int kill_all(struct nf_conn *i, void *data) { @@ -1723,7 +1723,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0); + nf_ct_iterate_cleanup_net(net, kill_all, NULL, 0, 0); if (atomic_read(&net->ct.count) != 0) busy = 1; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f08604dd1a59..e1eca47105bd 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1117,8 +1117,8 @@ static int ctnetlink_flush_conntrack(struct net *net, return PTR_ERR(filter); } - nf_ct_iterate_cleanup(net, ctnetlink_filter_match, filter, - portid, report); + nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter, + portid, report); kfree(filter); return 0; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 2de6c1fe3261..b7d01f27d463 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -282,7 +282,7 @@ void nf_ct_l3proto_pernet_unregister(struct net *net, proto->net_ns_put(net); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); + nf_ct_iterate_cleanup_net(net, kill_l3proto, proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); @@ -450,7 +450,7 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(net, kill_l4proto, l4proto, 0, 0); + nf_ct_iterate_cleanup_net(net, kill_l4proto, l4proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ef0be325a0c6..daf5b22c07f8 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -586,7 +586,7 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) rtnl_lock(); for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); + nf_ct_iterate_cleanup_net(net, nf_nat_proto_remove, &clean, 0, 0); rtnl_unlock(); } @@ -600,7 +600,7 @@ static void nf_nat_l3proto_clean(u8 l3proto) rtnl_lock(); for_each_net(net) - nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0); + nf_ct_iterate_cleanup_net(net, nf_nat_proto_remove, &clean, 0, 0); rtnl_unlock(); } @@ -826,7 +826,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; - nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); + nf_ct_iterate_cleanup_net(net, nf_nat_proto_clean, &clean, 0, 0); } static struct pernet_operations nf_nat_net_ops = { From b0feacaad13a0aa9657c37ed80991575981e2e3b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 21 May 2017 12:52:56 +0200 Subject: [PATCH 06/29] netfilter: conntrack: don't call iter for non-confirmed conntracks nf_ct_iterate_cleanup_net currently calls iter() callback also for conntracks on the unconfirmed list, but this is unsafe. Acesses to nf_conn are fine, but some users access the extension area in the iter() callback, but that does only work reliably for confirmed conntracks (ct->ext can be reallocated at any time for unconfirmed conntrack). The seond issue is that there is a short window where a conntrack entry is neither on the list nor in the table: To confirm an entry, it is first removed from the unconfirmed list, then insert into the table. Fix this by iterating the unconfirmed list first and marking all entries as dying, then wait for rcu grace period. This makes sure all entries that were about to be confirmed either are in the main table, or will be dropped soon. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 45 ++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2730f9df33b7..08733685d732 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1592,7 +1592,6 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct hlist_nulls_node *n; - int cpu; spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { @@ -1614,18 +1613,6 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), cond_resched(); } - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - set_bit(IPS_DYING_BIT, &ct->status); - } - spin_unlock_bh(&pcpu->lock); - cond_resched(); - } return NULL; found: atomic_inc(&ct->ct_general.use); @@ -1634,6 +1621,34 @@ found: return ct; } +static void +__nf_ct_unconfirmed_destroy(struct net *net) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct ct_pcpu *pcpu; + + pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); + + spin_lock_bh(&pcpu->lock); + hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { + struct nf_conn *ct; + + ct = nf_ct_tuplehash_to_ctrack(h); + + /* we cannot call iter() on unconfirmed list, the + * owning cpu can reallocate ct->ext at any time. + */ + set_bit(IPS_DYING_BIT, &ct->status); + } + spin_unlock_bh(&pcpu->lock); + cond_resched(); + } +} + void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) @@ -1646,6 +1661,10 @@ void nf_ct_iterate_cleanup_net(struct net *net, if (atomic_read(&net->ct.count) == 0) return; + __nf_ct_unconfirmed_destroy(net); + + synchronize_net(); + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ From 2843fb69980b84dfa939733c91dceae533aa89e9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 21 May 2017 12:52:57 +0200 Subject: [PATCH 07/29] netfilter: conntrack: add nf_ct_iterate_destroy sledgehammer to be used on module unload (to remove affected conntracks from all namespaces). It will also flag all unconfirmed conntracks as dying, i.e. they will not be committed to main table. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 4 ++ net/netfilter/nf_conntrack_core.c | 89 +++++++++++++++++++++++----- 2 files changed, 79 insertions(+), 14 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f21180ea4558..48407569585d 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -229,6 +229,10 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report); +/* also set unconfirmed conntracks as dying. Only use in module exit path. */ +void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), + void *data); + struct nf_conntrack_zone; void nf_conntrack_free(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 08733685d732..7ecee79c78b8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1586,7 +1586,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) /* Bring out ya dead! */ static struct nf_conn * -get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), +get_next_corpse(int (*iter)(struct nf_conn *i, void *data), void *data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; @@ -1603,8 +1603,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (net_eq(nf_ct_net(ct), net) && - iter(ct, data)) + if (iter(ct, data)) goto found; } } @@ -1621,6 +1620,39 @@ found: return ct; } +static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), + void *data, u32 portid, int report) +{ + struct nf_conn *ct; + unsigned int bucket = 0; + + might_sleep(); + + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + /* Time to push up daises... */ + + nf_ct_delete(ct, portid, report); + nf_ct_put(ct); + cond_resched(); + } +} + +struct iter_data { + int (*iter)(struct nf_conn *i, void *data); + void *data; + struct net *net; +}; + +static int iter_net_only(struct nf_conn *i, void *data) +{ + struct iter_data *d = data; + + if (!net_eq(d->net, nf_ct_net(i))) + return 0; + + return d->iter(i, d->data); +} + static void __nf_ct_unconfirmed_destroy(struct net *net) { @@ -1653,8 +1685,7 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - struct nf_conn *ct; - unsigned int bucket = 0; + struct iter_data d; might_sleep(); @@ -1663,21 +1694,51 @@ void nf_ct_iterate_cleanup_net(struct net *net, __nf_ct_unconfirmed_destroy(net); + d.iter = iter; + d.data = data; + d.net = net; + synchronize_net(); - while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ - - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); - } + nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); +/** + * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table + * @iter: callback to invoke for each conntrack + * @data: data to pass to @iter + * + * Like nf_ct_iterate_cleanup, but first marks conntracks on the + * unconfirmed list as dying (so they will not be inserted into + * main table). + */ +void +nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) +{ + struct net *net; + + rtnl_lock(); + for_each_net(net) { + if (atomic_read(&net->ct.count) == 0) + continue; + __nf_ct_unconfirmed_destroy(net); + } + rtnl_unlock(); + + /* a conntrack could have been unlinked from unconfirmed list + * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). + * This makes sure its inserted into conntrack table. + */ + synchronize_net(); + + nf_ct_iterate_cleanup(iter, data, 0, 0); +} +EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); + static int kill_all(struct nf_conn *i, void *data) { - return 1; + return net_eq(nf_ct_net(i), data); } void nf_ct_free_hashtable(void *hash, unsigned int size) @@ -1742,7 +1803,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_iterate_cleanup_net(net, kill_all, NULL, 0, 0); + nf_ct_iterate_cleanup(kill_all, net, 0, 0); if (atomic_read(&net->ct.count) != 0) busy = 1; } From 0d02d5646eb84403766a11a1d3b19e670a3d45d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 21 May 2017 12:52:58 +0200 Subject: [PATCH 08/29] netfilter: conntrack: restart iteration on resize We could some conntracks when a resize occurs in parallel. Avoid this by sampling generation seqcnt and doing a restart if needed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7ecee79c78b8..c3bd9b086dcc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1623,17 +1623,25 @@ found: static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { + unsigned int bucket = 0, sequence; struct nf_conn *ct; - unsigned int bucket = 0; might_sleep(); - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ + for (;;) { + sequence = read_seqcount_begin(&nf_conntrack_generation); - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + /* Time to push up daises... */ + + nf_ct_delete(ct, portid, report); + nf_ct_put(ct); + cond_resched(); + } + + if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) + break; + bucket = 0; } } From 8f23f35f1e89656d766d6295cac23bac26f17de2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 21 May 2017 12:52:59 +0200 Subject: [PATCH 09/29] netfilter: nat: destroy nat mappings on module exit path only We don't need pernetns cleanup anymore. If the netns is being destroyed, conntrack netns exit will kill all entries in this namespace, and neither conntrack hash table nor bysource hash are per namespace. For the rmmod case, we have to make sure we remove all entries from the nat bysource table, so call the new nf_ct_iterate_destroy in module exit path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index daf5b22c07f8..d26cc2f864e6 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -582,12 +582,8 @@ static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto) .l3proto = l3proto, .l4proto = l4proto, }; - struct net *net; - rtnl_lock(); - for_each_net(net) - nf_ct_iterate_cleanup_net(net, nf_nat_proto_remove, &clean, 0, 0); - rtnl_unlock(); + nf_ct_iterate_destroy(nf_nat_proto_remove, &clean); } static void nf_nat_l3proto_clean(u8 l3proto) @@ -595,13 +591,8 @@ static void nf_nat_l3proto_clean(u8 l3proto) struct nf_nat_proto_clean clean = { .l3proto = l3proto, }; - struct net *net; - rtnl_lock(); - - for_each_net(net) - nf_ct_iterate_cleanup_net(net, nf_nat_proto_remove, &clean, 0, 0); - rtnl_unlock(); + nf_ct_iterate_destroy(nf_nat_proto_remove, &clean); } /* Protocol registration. */ @@ -822,17 +813,6 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static void __net_exit nf_nat_net_exit(struct net *net) -{ - struct nf_nat_proto_clean clean = {}; - - nf_ct_iterate_cleanup_net(net, nf_nat_proto_clean, &clean, 0, 0); -} - -static struct pernet_operations nf_nat_net_ops = { - .exit = nf_nat_net_exit, -}; - static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, @@ -853,10 +833,6 @@ static int __init nf_nat_init(void) return ret; } - ret = register_pernet_subsys(&nf_nat_net_ops); - if (ret < 0) - goto cleanup_extend; - nf_ct_helper_expectfn_register(&follow_master_nat); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); @@ -867,18 +843,15 @@ static int __init nf_nat_init(void) RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session); #endif return 0; - - cleanup_extend: - rhltable_destroy(&nf_nat_bysource_table); - nf_ct_extend_unregister(&nat_extend); - return ret; } static void __exit nf_nat_cleanup(void) { + struct nf_nat_proto_clean clean = {}; unsigned int i; - unregister_pernet_subsys(&nf_nat_net_ops); + nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); + nf_ct_extend_unregister(&nat_extend); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL); From 187388bc3d63a9a3ae66b8dec255d0426eea2236 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:40 +0100 Subject: [PATCH 10/29] netfilter: nft_set_hash: unnecessary forward declaration Replace struct rhashtable_params forward declaration by the structure definition itself. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 3d3a6df4ce70..850be3a00e62 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -40,8 +40,6 @@ struct nft_hash_cmp_arg { u8 genmask; }; -static const struct rhashtable_params nft_hash_params; - static inline u32 nft_hash_key(const void *data, u32 len, u32 seed) { const struct nft_hash_cmp_arg *arg = data; @@ -71,6 +69,14 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, return 0; } +static const struct rhashtable_params nft_hash_params = { + .head_offset = offsetof(struct nft_hash_elem, node), + .hashfn = nft_hash_key, + .obj_hashfn = nft_hash_obj, + .obj_cmpfn = nft_hash_cmp, + .automatic_shrinking = true, +}; + static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { @@ -320,14 +326,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) return sizeof(struct nft_hash); } -static const struct rhashtable_params nft_hash_params = { - .head_offset = offsetof(struct nft_hash_elem, node), - .hashfn = nft_hash_key, - .obj_hashfn = nft_hash_obj, - .obj_cmpfn = nft_hash_cmp, - .automatic_shrinking = true, -}; - static int nft_hash_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const tb[]) From 080ed636a559e960010b714dee035dddacbe73b9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:45 +0100 Subject: [PATCH 11/29] netfilter: nf_tables: no size estimation if number of set elements is unknown This size estimation is ignored by the existing set backend selection logic, since this estimation structure is stack allocated, set this to ~0 to make it easier to catch bugs in future changes. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 17 ++++------------- net/netfilter/nft_set_rbtree.c | 8 +++----- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 850be3a00e62..1f1cc33895fd 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -365,22 +365,13 @@ static void nft_hash_destroy(const struct nft_set *set) static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { - unsigned int esize; - - esize = sizeof(struct nft_hash_elem); - if (desc->size) { + if (desc->size) est->size = sizeof(struct nft_hash) + roundup_pow_of_two(desc->size * 4 / 3) * sizeof(struct nft_hash_elem *) + - desc->size * esize; - } else { - /* Resizing happens when the load drops below 30% or goes - * above 75%. The average of 52.5% load (approximated by 50%) - * is used for the size estimation of the hash buckets, - * meaning we calculate two buckets per element. - */ - est->size = esize + 2 * sizeof(struct nft_hash_elem *); - } + desc->size * sizeof(struct nft_hash_elem); + else + est->size = ~0; est->lookup = NFT_SET_CLASS_O_1; est->space = NFT_SET_CLASS_O_N; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e97e2fb53f0a..fbfb3cbb3916 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -283,13 +283,11 @@ static void nft_rbtree_destroy(const struct nft_set *set) static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { - unsigned int nsize; - - nsize = sizeof(struct nft_rbtree_elem); if (desc->size) - est->size = sizeof(struct nft_rbtree) + desc->size * nsize; + est->size = sizeof(struct nft_rbtree) + + desc->size * sizeof(struct nft_rbtree_elem); else - est->size = nsize; + est->size = ~0; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; From 5fc6ced958db70556ce50f7026cd80078bde5a8c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:48 +0100 Subject: [PATCH 12/29] netfilter: nft_set_hash: use nft_rhash prefix for resizable set backend This patch prepares the introduction of a non-resizable hashtable implementation that is significantly faster. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 212 +++++++++++++++++------------------ 1 file changed, 106 insertions(+), 106 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 1f1cc33895fd..7c21e3da0d88 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -22,43 +22,43 @@ #include /* We target a hash table size of 4, element hint is 75% of final size */ -#define NFT_HASH_ELEMENT_HINT 3 +#define NFT_RHASH_ELEMENT_HINT 3 -struct nft_hash { +struct nft_rhash { struct rhashtable ht; struct delayed_work gc_work; }; -struct nft_hash_elem { +struct nft_rhash_elem { struct rhash_head node; struct nft_set_ext ext; }; -struct nft_hash_cmp_arg { +struct nft_rhash_cmp_arg { const struct nft_set *set; const u32 *key; u8 genmask; }; -static inline u32 nft_hash_key(const void *data, u32 len, u32 seed) +static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed) { - const struct nft_hash_cmp_arg *arg = data; + const struct nft_rhash_cmp_arg *arg = data; return jhash(arg->key, len, seed); } -static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed) +static inline u32 nft_rhash_obj(const void *data, u32 len, u32 seed) { - const struct nft_hash_elem *he = data; + const struct nft_rhash_elem *he = data; return jhash(nft_set_ext_key(&he->ext), len, seed); } -static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, - const void *ptr) +static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) { - const struct nft_hash_cmp_arg *x = arg->key; - const struct nft_hash_elem *he = ptr; + const struct nft_rhash_cmp_arg *x = arg->key; + const struct nft_rhash_elem *he = ptr; if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; @@ -69,49 +69,49 @@ static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, return 0; } -static const struct rhashtable_params nft_hash_params = { - .head_offset = offsetof(struct nft_hash_elem, node), - .hashfn = nft_hash_key, - .obj_hashfn = nft_hash_obj, - .obj_cmpfn = nft_hash_cmp, +static const struct rhashtable_params nft_rhash_params = { + .head_offset = offsetof(struct nft_rhash_elem, node), + .hashfn = nft_rhash_key, + .obj_hashfn = nft_rhash_obj, + .obj_cmpfn = nft_rhash_cmp, .automatic_shrinking = true, }; -static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) { - struct nft_hash *priv = nft_set_priv(set); - const struct nft_hash_elem *he; - struct nft_hash_cmp_arg arg = { + struct nft_rhash *priv = nft_set_priv(set); + const struct nft_rhash_elem *he; + struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_cur(net), .set = set, .key = key, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); if (he != NULL) *ext = &he->ext; return !!he; } -static bool nft_hash_update(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *regs), - const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext) +static bool nft_rhash_update(struct nft_set *set, const u32 *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_regs *regs), + const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_set_ext **ext) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he, *prev; - struct nft_hash_cmp_arg arg = { + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *prev; + struct nft_rhash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, .key = key, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); if (he != NULL) goto out; @@ -120,7 +120,7 @@ static bool nft_hash_update(struct nft_set *set, const u32 *key, goto err1; prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params); + nft_rhash_params); if (IS_ERR(prev)) goto err2; @@ -140,21 +140,21 @@ err1: return false; } -static int nft_hash_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, - struct nft_set_ext **ext) +static int nft_rhash_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he = elem->priv; - struct nft_hash_cmp_arg arg = { + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he = elem->priv; + struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; - struct nft_hash_elem *prev; + struct nft_rhash_elem *prev; prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params); + nft_rhash_params); if (IS_ERR(prev)) return PTR_ERR(prev); if (prev) { @@ -164,19 +164,19 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set, return 0; } -static void nft_hash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_rhash_activate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { - struct nft_hash_elem *he = elem->priv; + struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); nft_set_elem_clear_busy(&he->ext); } -static bool nft_hash_flush(const struct net *net, - const struct nft_set *set, void *priv) +static bool nft_rhash_flush(const struct net *net, + const struct nft_set *set, void *priv) { - struct nft_hash_elem *he = priv; + struct nft_rhash_elem *he = priv; if (!nft_set_elem_mark_busy(&he->ext) || !nft_is_active(net, &he->ext)) { @@ -186,22 +186,22 @@ static bool nft_hash_flush(const struct net *net, return false; } -static void *nft_hash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void *nft_rhash_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; - struct nft_hash_cmp_arg arg = { + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he; + struct nft_rhash_cmp_arg arg = { .genmask = nft_genmask_next(net), .set = set, .key = elem->key.val.data, }; rcu_read_lock(); - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); if (he != NULL && - !nft_hash_flush(net, set, he)) + !nft_rhash_flush(net, set, he)) he = NULL; rcu_read_unlock(); @@ -209,21 +209,21 @@ static void *nft_hash_deactivate(const struct net *net, return he; } -static void nft_hash_remove(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_rhash_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he = elem->priv; + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he = elem->priv; - rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); + rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); } -static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he; struct rhashtable_iter hti; struct nft_set_elem elem; int err; @@ -272,16 +272,16 @@ out: rhashtable_walk_exit(&hti); } -static void nft_hash_gc(struct work_struct *work) +static void nft_rhash_gc(struct work_struct *work) { struct nft_set *set; - struct nft_hash_elem *he; - struct nft_hash *priv; + struct nft_rhash_elem *he; + struct nft_rhash *priv; struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; int err; - priv = container_of(work, struct nft_hash, gc_work.work); + priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); @@ -307,7 +307,7 @@ static void nft_hash_gc(struct work_struct *work) gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (gcb == NULL) goto out; - rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); + rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, he); } @@ -321,55 +321,55 @@ schedule: nft_set_gc_interval(set)); } -static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +static unsigned int nft_rhash_privsize(const struct nlattr * const nla[]) { - return sizeof(struct nft_hash); + return sizeof(struct nft_rhash); } -static int nft_hash_init(const struct nft_set *set, - const struct nft_set_desc *desc, - const struct nlattr * const tb[]) +static int nft_rhash_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const tb[]) { - struct nft_hash *priv = nft_set_priv(set); - struct rhashtable_params params = nft_hash_params; + struct nft_rhash *priv = nft_set_priv(set); + struct rhashtable_params params = nft_rhash_params; int err; - params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT; + params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT; params.key_len = set->klen; err = rhashtable_init(&priv->ht, ¶ms); if (err < 0) return err; - INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc); + INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); if (set->flags & NFT_SET_TIMEOUT) queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); return 0; } -static void nft_hash_elem_destroy(void *ptr, void *arg) +static void nft_rhash_elem_destroy(void *ptr, void *arg) { nft_set_elem_destroy(arg, ptr, true); } -static void nft_hash_destroy(const struct nft_set *set) +static void nft_rhash_destroy(const struct nft_set *set) { - struct nft_hash *priv = nft_set_priv(set); + struct nft_rhash *priv = nft_set_priv(set); cancel_delayed_work_sync(&priv->gc_work); - rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy, + rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)set); } -static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, - struct nft_set_estimate *est) +static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) { if (desc->size) - est->size = sizeof(struct nft_hash) + + est->size = sizeof(struct nft_rhash) + roundup_pow_of_two(desc->size * 4 / 3) * - sizeof(struct nft_hash_elem *) + - desc->size * sizeof(struct nft_hash_elem); + sizeof(struct nft_rhash_elem *) + + desc->size * sizeof(struct nft_rhash_elem); else est->size = ~0; @@ -379,32 +379,32 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_ops nft_hash_ops __read_mostly = { - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup, - .update = nft_hash_update, - .walk = nft_hash_walk, +static struct nft_set_ops nft_rhash_ops __read_mostly = { + .privsize = nft_rhash_privsize, + .elemsize = offsetof(struct nft_rhash_elem, ext), + .estimate = nft_rhash_estimate, + .init = nft_rhash_init, + .destroy = nft_rhash_destroy, + .insert = nft_rhash_insert, + .activate = nft_rhash_activate, + .deactivate = nft_rhash_deactivate, + .flush = nft_rhash_flush, + .remove = nft_rhash_remove, + .lookup = nft_rhash_lookup, + .update = nft_rhash_update, + .walk = nft_rhash_walk, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .owner = THIS_MODULE, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_hash_ops); + return nft_register_set(&nft_rhash_ops); } static void __exit nft_hash_module_exit(void) { - nft_unregister_set(&nft_hash_ops); + nft_unregister_set(&nft_rhash_ops); } module_init(nft_hash_module_init); From 2b664957c27fe708035b217c908edd1048be355e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:51 +0100 Subject: [PATCH 13/29] netfilter: nf_tables: select set backend flavour depending on description This patch adds the infrastructure to support several implementations of the same set type. This selection will be based on the set description and the features available for this set. This allow us to select set backend implementation that will result in better performance numbers. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 26 ++++++++++---- net/netfilter/nf_tables_api.c | 59 +++++++++++++++++++------------ net/netfilter/nft_set_bitmap.c | 10 ++++-- net/netfilter/nft_set_hash.c | 10 ++++-- net/netfilter/nft_set_rbtree.c | 10 ++++-- 5 files changed, 80 insertions(+), 35 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8a8bab8d7b15..f27012098846 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -281,6 +281,23 @@ struct nft_set_estimate { enum nft_set_class space; }; +/** + * struct nft_set_type - nf_tables set type + * + * @select_ops: function to select nft_set_ops + * @ops: default ops, used when no select_ops functions is present + * @list: used internally + * @owner: module reference + */ +struct nft_set_type { + const struct nft_set_ops *(*select_ops)(const struct nft_ctx *, + const struct nft_set_desc *desc, + u32 flags); + const struct nft_set_ops *ops; + struct list_head list; + struct module *owner; +}; + struct nft_set_ext; struct nft_expr; @@ -297,8 +314,6 @@ struct nft_expr; * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance - * @list: nf_tables_set_ops list node - * @owner: module reference * @elemsize: element private size * @features: features supported by the implementation */ @@ -345,14 +360,13 @@ struct nft_set_ops { const struct nlattr * const nla[]); void (*destroy)(const struct nft_set *set); - struct list_head list; - struct module *owner; unsigned int elemsize; u32 features; + const struct nft_set_type *type; }; -int nft_register_set(struct nft_set_ops *ops); -void nft_unregister_set(struct nft_set_ops *ops); +int nft_register_set(struct nft_set_type *type); +void nft_unregister_set(struct nft_set_type *type); /** * struct nft_set - nf_tables set instance diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index da314be0c048..c0b2b19607e1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2377,64 +2377,77 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, * Sets */ -static LIST_HEAD(nf_tables_set_ops); +static LIST_HEAD(nf_tables_set_types); -int nft_register_set(struct nft_set_ops *ops) +int nft_register_set(struct nft_set_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail_rcu(&ops->list, &nf_tables_set_ops); + list_add_tail_rcu(&type->list, &nf_tables_set_types); nfnl_unlock(NFNL_SUBSYS_NFTABLES); return 0; } EXPORT_SYMBOL_GPL(nft_register_set); -void nft_unregister_set(struct nft_set_ops *ops) +void nft_unregister_set(struct nft_set_type *type) { nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del_rcu(&ops->list); + list_del_rcu(&type->list); nfnl_unlock(NFNL_SUBSYS_NFTABLES); } EXPORT_SYMBOL_GPL(nft_unregister_set); +#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ + NFT_SET_TIMEOUT | NFT_SET_OBJECT) + +static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) +{ + return (flags & ops->features) == (flags & NFT_SET_FEATURES); +} + /* * Select a set implementation based on the data characteristics and the * given policy. The total memory use might not be known if no size is * given, in that case the amount of memory per element is used. */ static const struct nft_set_ops * -nft_select_set_ops(const struct nlattr * const nla[], +nft_select_set_ops(const struct nft_ctx *ctx, + const struct nlattr * const nla[], const struct nft_set_desc *desc, enum nft_set_policies policy) { const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; - u32 features; + const struct nft_set_type *type; + u32 flags = 0; #ifdef CONFIG_MODULES - if (list_empty(&nf_tables_set_ops)) { + if (list_empty(&nf_tables_set_types)) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); request_module("nft-set"); nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (!list_empty(&nf_tables_set_ops)) + if (!list_empty(&nf_tables_set_types)) return ERR_PTR(-EAGAIN); } #endif - features = 0; - if (nla[NFTA_SET_FLAGS] != NULL) { - features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT | - NFT_SET_OBJECT; - } + if (nla[NFTA_SET_FLAGS] != NULL) + flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); bops = NULL; best.size = ~0; best.lookup = ~0; best.space = ~0; - list_for_each_entry(ops, &nf_tables_set_ops, list) { - if ((ops->features & features) != features) + list_for_each_entry(type, &nf_tables_set_types, list) { + if (!type->select_ops) + ops = type->ops; + else + ops = type->select_ops(ctx, desc, flags); + if (!ops) continue; - if (!ops->estimate(desc, features, &est)) + + if (!nft_set_ops_candidate(ops, flags)) + continue; + if (!ops->estimate(desc, flags, &est)) continue; switch (policy) { @@ -2465,10 +2478,10 @@ nft_select_set_ops(const struct nlattr * const nla[], break; } - if (!try_module_get(ops->owner)) + if (!try_module_get(type->owner)) continue; if (bops != NULL) - module_put(bops->owner); + module_put(bops->type->owner); bops = ops; best = est; @@ -3029,7 +3042,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(nla, &desc, policy); + ops = nft_select_set_ops(&ctx, nla, &desc, policy); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -3089,14 +3102,14 @@ err3: err2: kfree(set); err1: - module_put(ops->owner); + module_put(ops->type->owner); return err; } static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); - module_put(set->ops->owner); + module_put(set->ops->type->owner); kfree(set); } diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index b988162b5b15..87d17691278f 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -278,7 +278,9 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static struct nft_set_type nft_bitmap_type; static struct nft_set_ops nft_bitmap_ops __read_mostly = { + .type = &nft_bitmap_type, .privsize = nft_bitmap_privsize, .elemsize = offsetof(struct nft_bitmap_elem, ext), .estimate = nft_bitmap_estimate, @@ -291,17 +293,21 @@ static struct nft_set_ops nft_bitmap_ops __read_mostly = { .activate = nft_bitmap_activate, .lookup = nft_bitmap_lookup, .walk = nft_bitmap_walk, +}; + +static struct nft_set_type nft_bitmap_type __read_mostly = { + .ops = &nft_bitmap_ops, .owner = THIS_MODULE, }; static int __init nft_bitmap_module_init(void) { - return nft_register_set(&nft_bitmap_ops); + return nft_register_set(&nft_bitmap_type); } static void __exit nft_bitmap_module_exit(void) { - nft_unregister_set(&nft_bitmap_ops); + nft_unregister_set(&nft_bitmap_type); } module_init(nft_bitmap_module_init); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 7c21e3da0d88..4ba0717408d9 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -379,7 +379,9 @@ static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static struct nft_set_type nft_hash_type; static struct nft_set_ops nft_rhash_ops __read_mostly = { + .type = &nft_hash_type, .privsize = nft_rhash_privsize, .elemsize = offsetof(struct nft_rhash_elem, ext), .estimate = nft_rhash_estimate, @@ -394,17 +396,21 @@ static struct nft_set_ops nft_rhash_ops __read_mostly = { .update = nft_rhash_update, .walk = nft_rhash_walk, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, +}; + +static struct nft_set_type nft_hash_type __read_mostly = { + .ops = &nft_rhash_ops, .owner = THIS_MODULE, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_rhash_ops); + return nft_register_set(&nft_hash_type); } static void __exit nft_hash_module_exit(void) { - nft_unregister_set(&nft_rhash_ops); + nft_unregister_set(&nft_hash_type); } module_init(nft_hash_module_init); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index fbfb3cbb3916..29d41d378339 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -295,7 +295,9 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static struct nft_set_type nft_rbtree_type; static struct nft_set_ops nft_rbtree_ops __read_mostly = { + .type = &nft_rbtree_type, .privsize = nft_rbtree_privsize, .elemsize = offsetof(struct nft_rbtree_elem, ext), .estimate = nft_rbtree_estimate, @@ -309,17 +311,21 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, +}; + +static struct nft_set_type nft_rbtree_type __read_mostly = { + .ops = &nft_rbtree_ops, .owner = THIS_MODULE, }; static int __init nft_rbtree_module_init(void) { - return nft_register_set(&nft_rbtree_ops); + return nft_register_set(&nft_rbtree_type); } static void __exit nft_rbtree_module_exit(void) { - nft_unregister_set(&nft_rbtree_ops); + nft_unregister_set(&nft_rbtree_type); } module_init(nft_rbtree_module_init); From 347b408d59e7eadcd09f97eba96fa4c270eb3b23 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:54 +0100 Subject: [PATCH 14/29] netfilter: nf_tables: pass set description to ->privsize The new non-resizable hashtable variant needs this to calculate the size of the bucket array. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_set_bitmap.c | 3 ++- net/netfilter/nft_set_hash.c | 3 ++- net/netfilter/nft_set_rbtree.c | 3 ++- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f27012098846..bd5be0d691d5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -351,7 +351,8 @@ struct nft_set_ops { struct nft_set *set, struct nft_set_iter *iter); - unsigned int (*privsize)(const struct nlattr * const nla[]); + unsigned int (*privsize)(const struct nlattr * const nla[], + const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c0b2b19607e1..2969016d8cad 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3052,7 +3052,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, size = 0; if (ops->privsize != NULL) - size = ops->privsize(nla); + size = ops->privsize(nla, &desc); err = -ENOMEM; set = kzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 87d17691278f..734989c40579 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -236,7 +236,8 @@ static inline u32 nft_bitmap_total_size(u32 klen) return sizeof(struct nft_bitmap) + nft_bitmap_size(klen); } -static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[]) +static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 4ba0717408d9..455a11ce8cd0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -321,7 +321,8 @@ schedule: nft_set_gc_interval(set)); } -static unsigned int nft_rhash_privsize(const struct nlattr * const nla[]) +static unsigned int nft_rhash_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { return sizeof(struct nft_rhash); } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 29d41d378339..491e805d3ca2 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -251,7 +251,8 @@ cont: read_unlock_bh(&priv->lock); } -static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) +static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { return sizeof(struct nft_rbtree); } From 2111515abc46cb3e18b22d8551067029acfd1f55 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:56 +0100 Subject: [PATCH 15/29] netfilter: nft_set_hash: add nft_hash_buckets() Add nft_hash_buckets() helper function to calculate the number of hashtable buckets based on the elements. This function can be reused from the follow up patch to add non-resizable hashtables. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 455a11ce8cd0..466cb7092dfa 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -363,12 +363,17 @@ static void nft_rhash_destroy(const struct nft_set *set) (void *)set); } +static u32 nft_hash_buckets(u32 size) +{ + return roundup_pow_of_two(size * 4 / 3); +} + static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (desc->size) est->size = sizeof(struct nft_rhash) + - roundup_pow_of_two(desc->size * 4 / 3) * + nft_hash_buckets(desc->size) * sizeof(struct nft_rhash_elem *) + desc->size * sizeof(struct nft_rhash_elem); else From 1ff75a3e9ab72368fc3ad63d9bc6c7518cbd1389 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:47:59 +0100 Subject: [PATCH 16/29] netfilter: nf_tables: allow large allocations for new sets The new fixed size hashtable backend implementation may result in a large array of buckets that would spew splats from mm. Update this code to fall back on vmalloc in case the memory allocation order is too costly. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2969016d8cad..bc8f03a53734 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3054,10 +3055,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (ops->privsize != NULL) size = ops->privsize(nla, &desc); - err = -ENOMEM; - set = kzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); - if (set == NULL) + set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); + if (!set) { + err = -ENOMEM; goto err1; + } nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); err = nf_tables_set_alloc_name(&ctx, set, name); @@ -3100,7 +3102,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err3: ops->destroy(set); err2: - kfree(set); + kvfree(set); err1: module_put(ops->type->owner); return err; @@ -3110,7 +3112,7 @@ static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); module_put(set->ops->type->owner); - kfree(set); + kvfree(set); } static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) From 6c03ae210ce306f443767d619a0390b0a541a5d8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:48:03 +0100 Subject: [PATCH 17/29] netfilter: nft_set_hash: add non-resizable hashtable implementation This patch adds a simple non-resizable hashtable implementation. If the user specifies the set size, then this new faster hashtable flavour is selected. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 210 +++++++++++++++++++++++++++++++++-- 1 file changed, 202 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 466cb7092dfa..b2eab94362d6 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -371,14 +371,181 @@ static u32 nft_hash_buckets(u32 size) static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { - if (desc->size) - est->size = sizeof(struct nft_rhash) + - nft_hash_buckets(desc->size) * - sizeof(struct nft_rhash_elem *) + - desc->size * sizeof(struct nft_rhash_elem); - else - est->size = ~0; + est->size = ~0; + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; + return true; +} + +struct nft_hash { + u32 seed; + u32 buckets; + struct hlist_head table[]; +}; + +struct nft_hash_elem { + struct hlist_node node; + struct nft_set_ext ext; +}; + +static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + const struct nft_hash_elem *he; + u32 hash; + + hash = jhash(key, set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry_rcu(he, &priv->table[hash], node) { + if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && + nft_set_elem_active(&he->ext, genmask)) { + *ext = &he->ext; + return true; + } + } + return false; +} + +static int nft_hash_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext) +{ + struct nft_hash_elem *this = elem->priv, *he; + struct nft_hash *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); + u32 hash; + + hash = jhash(nft_set_ext_key(&this->ext), set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry(he, &priv->table[hash], node) { + if (!memcmp(nft_set_ext_key(&this->ext), + nft_set_ext_key(&he->ext), set->klen) && + nft_set_elem_active(&he->ext, genmask)) { + *ext = &he->ext; + return -EEXIST; + } + } + hlist_add_head_rcu(&this->node, &priv->table[hash]); + return 0; +} + +static void nft_hash_activate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash_elem *he = elem->priv; + + nft_set_elem_change_active(net, set, &he->ext); +} + +static bool nft_hash_flush(const struct net *net, + const struct nft_set *set, void *priv) +{ + struct nft_hash_elem *he = priv; + + nft_set_elem_change_active(net, set, &he->ext); + return true; +} + +static void *nft_hash_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *this = elem->priv, *he; + u8 genmask = nft_genmask_next(net); + u32 hash; + + hash = jhash(nft_set_ext_key(&this->ext), set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry(he, &priv->table[hash], node) { + if (!memcmp(nft_set_ext_key(&this->ext), &elem->key.val, + set->klen) || + nft_set_elem_active(&he->ext, genmask)) { + nft_set_elem_change_active(net, set, &he->ext); + return he; + } + } + return NULL; +} + +static void nft_hash_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash_elem *he = elem->priv; + + hlist_del_rcu(&he->node); +} + +static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct nft_set_elem elem; + int i; + + for (i = 0; i < priv->buckets; i++) { + hlist_for_each_entry_rcu(he, &priv->table[i], node) { + if (iter->count < iter->skip) + goto cont; + if (!nft_set_elem_active(&he->ext, iter->genmask)) + goto cont; + + elem.priv = he; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + return; +cont: + iter->count++; + } + } +} + +static unsigned int nft_hash_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) +{ + return sizeof(struct nft_hash) + + nft_hash_buckets(desc->size) * sizeof(struct hlist_head); +} + +static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const tb[]) +{ + struct nft_hash *priv = nft_set_priv(set); + + priv->buckets = nft_hash_buckets(desc->size); + get_random_bytes(&priv->seed, sizeof(priv->seed)); + + return 0; +} + +static void nft_hash_destroy(const struct nft_set *set) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct hlist_node *next; + int i; + + for (i = 0; i < priv->buckets; i++) { + hlist_for_each_entry_safe(he, next, &priv->table[i], node) { + hlist_del_rcu(&he->node); + nft_set_elem_destroy(set, he, true); + } + } +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + est->size = sizeof(struct nft_hash) + + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + desc->size * sizeof(struct nft_hash_elem); est->lookup = NFT_SET_CLASS_O_1; est->space = NFT_SET_CLASS_O_N; @@ -404,8 +571,35 @@ static struct nft_set_ops nft_rhash_ops __read_mostly = { .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, }; +static struct nft_set_ops nft_hash_ops __read_mostly = { + .type = &nft_hash_type, + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .features = NFT_SET_MAP | NFT_SET_OBJECT, +}; + +static const struct nft_set_ops * +nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, + u32 flags) +{ + if (desc->size) + return &nft_hash_ops; + + return &nft_rhash_ops; +} + static struct nft_set_type nft_hash_type __read_mostly = { - .ops = &nft_rhash_ops, + .select_ops = nft_hash_select_ops, .owner = THIS_MODULE, }; From 446a8268b7f56830d8c7b562718ba9dc113ab3c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 May 2017 17:48:09 +0100 Subject: [PATCH 18/29] netfilter: nft_set_hash: add lookup variant for fixed size hashtable This patch provides a faster variant of the lookup function for 2 and 4 byte keys. Optimizing the one byte case is not worth, as the set backend selection will always select the bitmap set type for such case. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 60 ++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index b2eab94362d6..0fa01d772c5e 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -409,6 +409,38 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, return false; } +/* nft_hash_select_ops() makes sure key size can be either 2 or 4 bytes . */ +static inline u32 nft_hash_key(const u32 *key, u32 klen) +{ + if (klen == 4) + return *key; + + return *(u16 *)key; +} + +static bool nft_hash_lookup_fast(const struct net *net, + const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + const struct nft_hash_elem *he; + u32 hash, k1, k2; + + k1 = nft_hash_key(key, set->klen); + hash = jhash_1word(k1, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); + hlist_for_each_entry_rcu(he, &priv->table[hash], node) { + k2 = nft_hash_key(nft_set_ext_key(&he->ext)->data, set->klen); + if (k1 == k2 && + nft_set_elem_active(&he->ext, genmask)) { + *ext = &he->ext; + return true; + } + } + return false; +} + static int nft_hash_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) @@ -588,12 +620,36 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .features = NFT_SET_MAP | NFT_SET_OBJECT, }; +static struct nft_set_ops nft_hash_fast_ops __read_mostly = { + .type = &nft_hash_type, + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup_fast, + .walk = nft_hash_walk, + .features = NFT_SET_MAP | NFT_SET_OBJECT, +}; + static const struct nft_set_ops * nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, u32 flags) { - if (desc->size) - return &nft_hash_ops; + if (desc->size) { + switch (desc->klen) { + case 2: + case 4: + return &nft_hash_fast_ops; + default: + return &nft_hash_ops; + } + } return &nft_rhash_ops; } From ff1acc4964ccde61a4134c789314593d40dd3c93 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 28 May 2017 22:35:52 +0800 Subject: [PATCH 19/29] netfilter: nf_ct_helper: use nf_ct_iterate_destroy to unlink helper objs When we unlink the helper objects, we will iterate the nf_conntrack_hash, iterate the unconfirmed list, handle the hash resize situation, etc. Actually this logic is same as the nf_ct_iterate_destroy, so we can use it to remove these copy & paste code. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 50 +++-------------------------- 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 7f6100ca63be..9129bb3b5153 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -285,16 +285,16 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); /* appropriate ct lock protecting must be taken by caller */ -static inline int unhelp(struct nf_conntrack_tuple_hash *i, - const struct nf_conntrack_helper *me) +static int unhelp(struct nf_conn *ct, void *me) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } + + /* We are not intended to delete this conntrack. */ return 0; } @@ -437,33 +437,10 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); -static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, - struct net *net) -{ - struct nf_conntrack_tuple_hash *h; - const struct hlist_nulls_node *nn; - int cpu; - - /* Get rid of expecteds, set helpers to NULL. */ - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) - unhelp(h, me); - spin_unlock_bh(&pcpu->lock); - } -} - void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) { - struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; const struct hlist_node *next; - const struct hlist_nulls_node *nn; - unsigned int last_hsize; - spinlock_t *lock; - struct net *net; unsigned int i; mutex_lock(&nf_ct_helper_mutex); @@ -491,26 +468,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) } spin_unlock_bh(&nf_conntrack_expect_lock); - rtnl_lock(); - for_each_net(net) - __nf_conntrack_helper_unregister(me, net); - rtnl_unlock(); - - local_bh_disable(); -restart: - last_hsize = nf_conntrack_htable_size; - for (i = 0; i < last_hsize; i++) { - lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS]; - nf_conntrack_lock(lock); - if (last_hsize != nf_conntrack_htable_size) { - spin_unlock(lock); - goto restart; - } - hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) - unhelp(h, me); - spin_unlock(lock); - } - local_bh_enable(); + nf_ct_iterate_destroy(unhelp, me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); From 34158151d2aa1138983bedb59e5b711d2e25c245 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 28 May 2017 22:37:19 +0800 Subject: [PATCH 20/29] netfilter: cttimeout: use nf_ct_iterate_cleanup_net to unlink timeout objs Similar to nf_conntrack_helper, we can use nf_ct_iterare_cleanup_net to remove these copy & paste code. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 39 ++++------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index a3e7bb54d96a..49638b03ccc9 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -287,49 +287,20 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, return ret; } -static void untimeout(struct nf_conntrack_tuple_hash *i, - struct ctnl_timeout *timeout) +static int untimeout(struct nf_conn *ct, void *timeout) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) RCU_INIT_POINTER(timeout_ext->timeout, NULL); + + /* We are not intended to delete this conntrack. */ + return 0; } static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) { - struct nf_conntrack_tuple_hash *h; - const struct hlist_nulls_node *nn; - unsigned int last_hsize; - spinlock_t *lock; - int i, cpu; - - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, nn, &pcpu->unconfirmed, hnnode) - untimeout(h, timeout); - spin_unlock_bh(&pcpu->lock); - } - - local_bh_disable(); -restart: - last_hsize = nf_conntrack_htable_size; - for (i = 0; i < last_hsize; i++) { - lock = &nf_conntrack_locks[i % CONNTRACK_LOCKS]; - nf_conntrack_lock(lock); - if (last_hsize != nf_conntrack_htable_size) { - spin_unlock(lock); - goto restart; - } - - hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) - untimeout(h, timeout); - spin_unlock(lock); - } - local_bh_enable(); + nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); } /* try to delete object, fail if it is still in use. */ From 202f59afd441474cc4c3752d2417cc05dd68ffe5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 24 May 2017 21:24:37 +0800 Subject: [PATCH 21/29] netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 101 +++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f30bee8e407b..7d72decb80f9 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -47,7 +47,7 @@ struct clusterip_config { __be32 clusterip; /* the IP address */ u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ - struct net_device *dev; /* device */ + int ifindex; /* device ifindex */ u_int16_t num_total_nodes; /* total number of nodes */ unsigned long local_nodes; /* node number array */ @@ -57,6 +57,9 @@ struct clusterip_config { enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ struct rcu_head rcu; + + char ifname[IFNAMSIZ]; /* device ifname */ + struct notifier_block notifier; /* refresh c->ifindex in it */ }; #ifdef CONFIG_PROC_FS @@ -98,9 +101,8 @@ clusterip_config_put(struct clusterip_config *c) * entry(rule) is removed, remove the config from lists, but don't free it * yet, since proc-files could still be holding references */ static inline void -clusterip_config_entry_put(struct clusterip_config *c) +clusterip_config_entry_put(struct net *net, struct clusterip_config *c) { - struct net *net = dev_net(c->dev); struct clusterip_net *cn = net_generic(net, clusterip_net_id); local_bh_disable(); @@ -109,8 +111,7 @@ clusterip_config_entry_put(struct clusterip_config *c) spin_unlock(&cn->lock); local_bh_enable(); - dev_mc_del(c->dev, c->clustermac); - dev_put(c->dev); + unregister_netdevice_notifier(&c->notifier); /* In case anyone still accesses the file, the open/close * functions are also incrementing the refcount on their own, @@ -170,19 +171,55 @@ clusterip_config_init_nodelist(struct clusterip_config *c, set_bit(i->local_nodes[n] - 1, &c->local_nodes); } -static struct clusterip_config * -clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, - struct net_device *dev) +static int +clusterip_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) { - struct net *net = dev_net(dev); + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct clusterip_config *c; + + c = container_of(this, struct clusterip_config, notifier); + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } else if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; + } + + return NOTIFY_DONE; +} + +static struct clusterip_config * +clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, + __be32 ip, const char *iniface) +{ struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_config *c; + int err; c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM); - c->dev = dev; + strcpy(c->ifname, iniface); + c->ifindex = -1; c->clusterip = ip; memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); c->num_total_nodes = i->num_total_nodes; @@ -213,17 +250,27 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { - spin_lock_bh(&cn->lock); - list_del_rcu(&c->list); - spin_unlock_bh(&cn->lock); - kfree(c); - - return ERR_PTR(-ENOMEM); + err = -ENOMEM; + goto err; } } #endif - return c; + c->notifier.notifier_call = clusterip_netdev_event; + err = register_netdevice_notifier(&c->notifier); + if (!err) + return c; + +#ifdef CONFIG_PROC_FS + proc_remove(c->pde); +err: +#endif + spin_lock_bh(&cn->lock); + list_del_rcu(&c->list); + spin_unlock_bh(&cn->lock); + kfree(c); + + return ERR_PTR(err); } #ifdef CONFIG_PROC_FS @@ -425,14 +472,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) e->ip.iniface); return -ENOENT; } + dev_put(dev); - config = clusterip_config_init(cipinfo, - e->ip.dst.s_addr, dev); - if (IS_ERR(config)) { - dev_put(dev); + config = clusterip_config_init(par->net, cipinfo, + e->ip.dst.s_addr, + e->ip.iniface); + if (IS_ERR(config)) return PTR_ERR(config); - } - dev_mc_add(config->dev, config->clustermac); } } cipinfo->config = config; @@ -458,7 +504,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ - clusterip_config_entry_put(cipinfo->config); + clusterip_config_entry_put(par->net, cipinfo->config); clusterip_config_put(cipinfo->config); @@ -558,10 +604,9 @@ arp_mangle(void *priv, * addresses on different interfacs. However, in the CLUSTERIP case * this wouldn't work, since we didn't subscribe the mcast group on * other interfaces */ - if (c->dev != state->out) { - pr_debug("not mangling arp reply on different " - "interface: cip'%s'-skb'%s'\n", - c->dev->name, state->out->name); + if (c->ifindex != state->out->ifindex) { + pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", + c->ifindex, state->out->ifindex); clusterip_config_put(c); return NF_ACCEPT; } From 2c41f33c1b7030448212cdacd40e80796e347eac Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 May 2017 11:31:06 +0200 Subject: [PATCH 22/29] netfilter: move table iteration out of netns exit paths We only need to iterate & remove in case of module removal; for netns destruction all conntracks will be removed anyway. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b7d01f27d463..6a36623e897c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -265,6 +265,8 @@ void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) mutex_unlock(&nf_ct_proto_mutex); synchronize_rcu(); + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_destroy(kill_l3proto, proto); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); @@ -280,9 +282,6 @@ void nf_ct_l3proto_pernet_unregister(struct net *net, */ if (proto->net_ns_put) proto->net_ns_put(net); - - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup_net(net, kill_l3proto, proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); @@ -421,17 +420,23 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); -void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) +static void __nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) + { BUG_ON(l4proto->l3proto >= PF_MAX); - mutex_lock(&nf_ct_proto_mutex); BUG_ON(rcu_dereference_protected( nf_ct_protos[l4proto->l3proto][l4proto->l4proto], lockdep_is_held(&nf_ct_proto_mutex) ) != l4proto); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], &nf_conntrack_l4proto_generic); +} + +void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) +{ + mutex_lock(&nf_ct_proto_mutex); + __nf_ct_l4proto_unregister_one(l4proto); mutex_unlock(&nf_ct_proto_mutex); synchronize_rcu(); @@ -448,9 +453,6 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, pn->users--; nf_ct_l4proto_unregister_sysctl(net, pn, l4proto); - - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup_net(net, kill_l4proto, l4proto, 0, 0); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); @@ -500,8 +502,14 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[], unsigned int num_proto) { + mutex_lock(&nf_ct_proto_mutex); while (num_proto-- != 0) - nf_ct_l4proto_unregister_one(l4proto[num_proto]); + __nf_ct_l4proto_unregister_one(l4proto[num_proto]); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_net(); + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_destroy(kill_l4proto, l4proto); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); From 7866cc57b51c1e118e5d78d1a8f721f378eec5c4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 May 2017 11:38:12 +0200 Subject: [PATCH 23/29] netns: add and use net_ns_barrier Quoting Joe Stringer: If a user loads nf_conntrack_ftp, sends FTP traffic through a network namespace, destroys that namespace then unloads the FTP helper module, then the kernel will crash. Events that lead to the crash: 1. conntrack is created with ftp helper in netns x 2. This netns is destroyed 3. netns destruction is scheduled 4. netns destruction wq starts, removes netns from global list 5. ftp helper is unloaded, which resets all helpers of the conntracks via for_each_net() but because netns is already gone from list the for_each_net() loop doesn't include it, therefore all of these conntracks are unaffected. 6. helper module unload finishes 7. netns wq invokes destructor for rmmod'ed helper CC: "Eric W. Biederman" Reported-by: Joe Stringer Signed-off-by: Florian Westphal Acked-by: David S. Miller Acked-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- include/net/net_namespace.h | 3 +++ net/core/net_namespace.c | 17 +++++++++++++++++ net/netfilter/nf_conntrack_core.c | 9 +++++++++ 3 files changed, 29 insertions(+) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fe80bb48ab1f..a24a57593202 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -158,6 +158,7 @@ extern struct net init_net; struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); +void net_ns_barrier(void); #else /* CONFIG_NET_NS */ #include #include @@ -168,6 +169,8 @@ static inline struct net *copy_net_ns(unsigned long flags, return ERR_PTR(-EINVAL); return old_net; } + +static inline void net_ns_barrier(void) {} #endif /* CONFIG_NET_NS */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1934efd4a9d4..1f15abb1d733 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -482,6 +482,23 @@ static void cleanup_net(struct work_struct *work) net_drop_ns(net); } } + +/** + * net_ns_barrier - wait until concurrent net_cleanup_work is done + * + * cleanup_net runs from work queue and will first remove namespaces + * from the global list, then run net exit functions. + * + * Call this in module exit path to make sure that all netns + * ->exit ops have been invoked before the function is removed. + */ +void net_ns_barrier(void) +{ + mutex_lock(&net_mutex); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL(net_ns_barrier); + static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c3bd9b086dcc..9979f46c81dc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1720,6 +1720,8 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); * Like nf_ct_iterate_cleanup, but first marks conntracks on the * unconfirmed list as dying (so they will not be inserted into * main table). + * + * Can only be called in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) @@ -1734,6 +1736,13 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) } rtnl_unlock(); + /* Need to wait for netns cleanup worker to finish, if its + * running -- it might have deleted a net namespace from + * the global list, so our __nf_ct_unconfirmed_destroy() might + * not have affected all namespaces. + */ + net_ns_barrier(); + /* a conntrack could have been unlinked from unconfirmed list * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). * This makes sure its inserted into conntrack table. From e15b9c50c4555e30be3c4f26aab7aeb10aee7aa6 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 31 May 2017 16:55:43 +0800 Subject: [PATCH 24/29] netfilter: ebt: Use new helper ebt_invalid_target to check target Use the new helper function ebt_invalid_target instead of the old macro INVALID_TARGET and other duplicated codes to enhance the readability. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 2 -- net/bridge/netfilter/ebt_dnat.c | 2 +- net/bridge/netfilter/ebt_mark.c | 2 +- net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_snat.c | 2 +- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index e0cbf17af780..2c2a5514b0df 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -122,8 +122,6 @@ extern unsigned int ebt_do_table(struct sk_buff *skb, #define BASE_CHAIN (par->hook_mask & (1 << NF_BR_NUMHOOKS)) /* Clear the bit in the hook mask that tells if the rule is on a base chain */ #define CLEAR_BASE_CHAIN_BIT (par->hook_mask &= ~(1 << NF_BR_NUMHOOKS)) -/* True if the target is not a standard target */ -#define INVALID_TARGET (info->target < -NUM_STANDARD_TARGETS || info->target >= 0) static inline bool ebt_invalid_target(int target) { diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index e0bb624c3845..dfc86a0199da 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -61,7 +61,7 @@ static int ebt_dnat_tg_check(const struct xt_tgchk_param *par) (strcmp(par->table, "broute") != 0 || hook_mask & ~(1 << NF_BR_BROUTING))) return -EINVAL; - if (INVALID_TARGET) + if (ebt_invalid_target(info->target)) return -EINVAL; return 0; } diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 66697cbd0a8b..19f0f9592d32 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -44,7 +44,7 @@ static int ebt_mark_tg_check(const struct xt_tgchk_param *par) tmp = info->target | ~EBT_VERDICT_BITS; if (BASE_CHAIN && tmp == EBT_RETURN) return -EINVAL; - if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) + if (ebt_invalid_target(tmp)) return -EINVAL; tmp = info->target & ~EBT_VERDICT_BITS; if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE && diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 8d2a85e0594e..a7223eaf490b 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -47,7 +47,7 @@ static int ebt_redirect_tg_check(const struct xt_tgchk_param *par) (strcmp(par->table, "broute") != 0 || hook_mask & ~(1 << NF_BR_BROUTING))) return -EINVAL; - if (INVALID_TARGET) + if (ebt_invalid_target(info->target)) return -EINVAL; return 0; } diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index e56ccd060d26..11cf9e9e9222 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -51,7 +51,7 @@ static int ebt_snat_tg_check(const struct xt_tgchk_param *par) if (BASE_CHAIN && tmp == EBT_RETURN) return -EINVAL; - if (tmp < -NUM_STANDARD_TARGETS || tmp >= 0) + if (ebt_invalid_target(tmp)) return -EINVAL; tmp = info->target | EBT_VERDICT_BITS; if ((tmp & ~NAT_ARP_BIT) != ~NAT_ARP_BIT) From 2becbbc547aa5ae7b92b35e71bee33706fb1b826 Mon Sep 17 00:00:00 2001 From: Jike Song Date: Fri, 2 Jun 2017 17:35:37 +0800 Subject: [PATCH 25/29] netfilter, kbuild: use canonical method to specify objs. Should use ":=" instead of "+=". Signed-off-by: Jike Song Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index c9b78e7b342f..913380919301 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -70,10 +70,9 @@ obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o # nf_tables -nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o -nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o -nf_tables-objs += nft_lookup.o nft_dynset.o +nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \ + nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \ + nft_byteorder.o nft_payload.o nft_lookup.o nft_dynset.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o From d53e3fc3906c7fe82ff436e889c8416649268007 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 4 Jun 2017 23:25:05 +0800 Subject: [PATCH 26/29] netfilter: use nf_conntrack_helpers_register when possible amanda_helper, nf_conntrack_helper_ras and nf_conntrack_helper_q931 are all arrays, so we can use nf_conntrack_helpers_register to register the ct helper, this will help us to eliminate some "goto errX" statements. Also introduce h323_helper_init/exit helper function to register the ct helpers, this is prepared for the followup patch, which will add net namespace support for ct helper. Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_amanda.c | 12 ++--- net/netfilter/nf_conntrack_h323_main.c | 63 +++++++++++++++----------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 03d2ccffa9fa..20edd589fe06 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -197,8 +197,8 @@ static void __exit nf_conntrack_amanda_fini(void) { int i; - nf_conntrack_helper_unregister(&amanda_helper[0]); - nf_conntrack_helper_unregister(&amanda_helper[1]); + nf_conntrack_helpers_unregister(amanda_helper, + ARRAY_SIZE(amanda_helper)); for (i = 0; i < ARRAY_SIZE(search); i++) textsearch_destroy(search[i].ts); } @@ -218,16 +218,12 @@ static int __init nf_conntrack_amanda_init(void) goto err1; } } - ret = nf_conntrack_helper_register(&amanda_helper[0]); + ret = nf_conntrack_helpers_register(amanda_helper, + ARRAY_SIZE(amanda_helper)); if (ret < 0) goto err1; - ret = nf_conntrack_helper_register(&amanda_helper[1]); - if (ret < 0) - goto err2; return 0; -err2: - nf_conntrack_helper_unregister(&amanda_helper[0]); err1: while (--i >= 0) textsearch_destroy(search[i].ts); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 3bcdc718484e..f71f0d2558fd 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1815,14 +1815,44 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { }, }; +static int __init h323_helper_init(void) +{ + int ret; + + ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245); + if (ret < 0) + return ret; + ret = nf_conntrack_helpers_register(nf_conntrack_helper_q931, + ARRAY_SIZE(nf_conntrack_helper_q931)); + if (ret < 0) + goto err1; + ret = nf_conntrack_helpers_register(nf_conntrack_helper_ras, + ARRAY_SIZE(nf_conntrack_helper_ras)); + if (ret < 0) + goto err2; + + return 0; +err2: + nf_conntrack_helpers_unregister(nf_conntrack_helper_q931, + ARRAY_SIZE(nf_conntrack_helper_q931)); +err1: + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); + return ret; +} + +static void __exit h323_helper_exit(void) +{ + nf_conntrack_helpers_unregister(nf_conntrack_helper_ras, + ARRAY_SIZE(nf_conntrack_helper_ras)); + nf_conntrack_helpers_unregister(nf_conntrack_helper_q931, + ARRAY_SIZE(nf_conntrack_helper_q931)); + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); +} + /****************************************************************************/ static void __exit nf_conntrack_h323_fini(void) { - nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[1]); - nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); - nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); - nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); - nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); + h323_helper_exit(); kfree(h323_buffer); pr_debug("nf_ct_h323: fini\n"); } @@ -1837,32 +1867,11 @@ static int __init nf_conntrack_h323_init(void) h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) return -ENOMEM; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245); + ret = h323_helper_init(); if (ret < 0) goto err1; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]); - if (ret < 0) - goto err2; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]); - if (ret < 0) - goto err3; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]); - if (ret < 0) - goto err4; - ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]); - if (ret < 0) - goto err5; pr_debug("nf_ct_h323: init success\n"); return 0; - -err5: - nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); -err4: - nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); -err3: - nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); -err2: - nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); err1: kfree(h323_buffer); return ret; From b7b5fda4686874c9b9b8c27ba9d57a8534f48a70 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 14 Jun 2017 11:54:07 +0200 Subject: [PATCH 27/29] netfilter: conntrack: use NFPROTO_MAX to size array We don't support anything larger than NFPROTO_MAX, so we can shrink this a bit: text data dec hex filename old: 8259 1096 9355 248b net/netfilter/nf_conntrack_proto.o new: 8259 624 8883 22b3 net/netfilter/nf_conntrack_proto.o Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 4 ++-- net/netfilter/nf_conntrack_proto.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index e01559b4d781..6d14b36e3a49 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -71,7 +71,7 @@ struct nf_conntrack_l3proto { struct module *me; }; -extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX]; +extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; #ifdef CONFIG_SYSCTL /* Protocol pernet registration. */ @@ -100,7 +100,7 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; static inline struct nf_conntrack_l3proto * __nf_ct_l3proto_find(u_int16_t l3proto) { - if (unlikely(l3proto >= AF_MAX)) + if (unlikely(l3proto >= NFPROTO_NUMPROTO)) return &nf_conntrack_l3proto_generic; return rcu_dereference(nf_ct_l3protos[l3proto]); } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 6a36623e897c..1dcad229c3cc 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -28,8 +28,8 @@ #include #include -static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly; -struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly; +static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; +struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); @@ -68,7 +68,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) { - if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) + if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL)) return &nf_conntrack_l4proto_generic; return rcu_dereference(nf_ct_protos[l3proto][l4proto]); @@ -212,7 +212,7 @@ int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) int ret = 0; struct nf_conntrack_l3proto *old; - if (proto->l3proto >= AF_MAX) + if (proto->l3proto >= NFPROTO_NUMPROTO) return -EBUSY; if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size) @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) { - BUG_ON(proto->l3proto >= AF_MAX); + BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); mutex_lock(&nf_ct_proto_mutex); BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], @@ -341,7 +341,7 @@ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *l4proto) { int ret = 0; - if (l4proto->l3proto >= PF_MAX) + if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)) return -EBUSY; if ((l4proto->to_nlattr && !l4proto->nlattr_size) || @@ -423,7 +423,7 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); static void __nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) { - BUG_ON(l4proto->l3proto >= PF_MAX); + BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)); BUG_ON(rcu_dereference_protected( nf_ct_protos[l4proto->l3proto][l4proto->l4proto], @@ -556,7 +556,7 @@ void nf_conntrack_proto_pernet_fini(struct net *net) int nf_conntrack_proto_init(void) { unsigned int i; - for (i = 0; i < AF_MAX; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) rcu_assign_pointer(nf_ct_l3protos[i], &nf_conntrack_l3proto_generic); return 0; @@ -566,6 +566,6 @@ void nf_conntrack_proto_fini(void) { unsigned int i; /* free l3proto protocol tables */ - for (i = 0; i < PF_MAX; i++) + for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) kfree(nf_ct_protos[i]); } From d8297d4f3ebd7a47f443ab66276bb774749612ba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 14 Jun 2017 11:51:29 +0200 Subject: [PATCH 28/29] netfilter: nf_tables: reduce chain type table size text data bss dec hex filename old: 151590 2240 1152 154982 25d66 net/netfilter/nf_tables_api.o new: 151666 2240 416 154322 25ad2 net/netfilter/nf_tables_api.o Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bc8f03a53734..5f3339978f6b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -387,7 +387,7 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) return ++table->hgenerator; } -static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX]; +static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; static const struct nf_chain_type * __nf_tables_chain_type_lookup(int family, const struct nlattr *nla) @@ -870,6 +870,9 @@ int nft_register_chain_type(const struct nf_chain_type *ctype) { int err = 0; + if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) + return -EINVAL; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (chain_type[ctype->family][ctype->type] != NULL) { err = -EBUSY; From 04ba724b659c6808b0ca31528121bdb2f2807e00 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 19 Jun 2017 18:35:46 +0100 Subject: [PATCH 29/29] netfilter: nfnetlink: extended ACK reporting Pass down struct netlink_ext_ack as parameter to all of our nfnetlink subsystem callbacks, so we can work on follow up patches to provide finer grain error reporting using the new infrastructure that 2d4bc93368f5 ("netlink: extended ACK reporting") provides. No functional change, just pass down this new object to callbacks. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 10 +++-- net/netfilter/ipset/ip_set_core.c | 39 ++++++++++++------ net/netfilter/nf_conntrack_netlink.c | 39 ++++++++++++------ net/netfilter/nf_tables_api.c | 59 ++++++++++++++++++---------- net/netfilter/nfnetlink.c | 21 +++++++--- net/netfilter/nfnetlink_acct.c | 9 +++-- net/netfilter/nfnetlink_cthelper.c | 9 +++-- net/netfilter/nfnetlink_cttimeout.c | 15 ++++--- net/netfilter/nfnetlink_log.c | 6 ++- net/netfilter/nfnetlink_queue.c | 12 ++++-- net/netfilter/nft_compat.c | 3 +- net/netfilter/xt_osf.c | 6 ++- 12 files changed, 152 insertions(+), 76 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 996711d8a7b4..41d04e9d088a 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -1,7 +1,6 @@ #ifndef _NFNETLINK_H #define _NFNETLINK_H - #include #include #include @@ -10,13 +9,16 @@ struct nfnl_callback { int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]); + const struct nlattr * const cda[], + struct netlink_ext_ack *extack); int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]); + const struct nlattr * const cda[], + struct netlink_ext_ack *extack); int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]); + const struct nlattr * const cda[], + struct netlink_ext_ack *extack); const struct nla_policy *policy; /* netlink attribute policy */ const u_int16_t attr_count; /* number of nlattr's */ }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index ba6a5516dc7c..e495b5e484b1 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -841,14 +841,16 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static int ip_set_create(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *clash = NULL; @@ -989,7 +991,8 @@ ip_set_destroy_set(struct ip_set *set) static int ip_set_destroy(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *s; @@ -1067,7 +1070,8 @@ ip_set_flush_set(struct ip_set *set) static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *s; @@ -1106,7 +1110,8 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { static int ip_set_rename(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set, *s; @@ -1155,7 +1160,8 @@ out: static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *from, *to; @@ -1428,7 +1434,8 @@ out: static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { if (unlikely(protocol_failed(attr))) return -IPSET_ERR_PROTOCOL; @@ -1513,7 +1520,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1567,7 +1575,8 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1621,7 +1630,8 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1656,7 +1666,8 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, static int ip_set_header(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); const struct ip_set *set; @@ -1712,7 +1723,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1770,7 +1782,8 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { static int ip_set_protocol(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[]) + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct sk_buff *skb2; struct nlmsghdr *nlh2; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e1eca47105bd..573eb83d5d17 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1127,7 +1127,8 @@ static int ctnetlink_flush_conntrack(struct net *net, static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; @@ -1179,7 +1180,8 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; @@ -1340,7 +1342,8 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -1362,7 +1365,8 @@ ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -1901,7 +1905,8 @@ err1: static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; @@ -2066,7 +2071,8 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { @@ -2111,7 +2117,8 @@ nlmsg_failure: static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct sk_buff *skb2; int err; @@ -2773,7 +2780,8 @@ out: static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { int err; struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2817,7 +2825,8 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; @@ -2829,7 +2838,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) - return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda); + return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda, + extack); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, @@ -2897,7 +2907,8 @@ out: static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; @@ -3185,7 +3196,8 @@ err_ct: static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; @@ -3291,7 +3303,8 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5f3339978f6b..7843efa33c59 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -535,7 +535,8 @@ done: static int nf_tables_gettable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); @@ -678,7 +679,8 @@ err: static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -831,7 +833,8 @@ out: static int nf_tables_deltable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -1127,7 +1130,8 @@ done: static int nf_tables_getchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); @@ -1323,7 +1327,8 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr * uninitialized_var(name); @@ -1561,7 +1566,8 @@ err1: static int nf_tables_delchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -2042,7 +2048,8 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); @@ -2135,7 +2142,8 @@ static struct nft_expr_info *info; static int nf_tables_newrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -2317,7 +2325,8 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, static int nf_tables_delrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -2833,7 +2842,8 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) static int nf_tables_getset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { u8 genmask = nft_genmask_cur(net); const struct nft_set *set; @@ -2909,7 +2919,8 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, static int nf_tables_newset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -3127,7 +3138,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set static int nf_tables_delset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -3487,7 +3499,8 @@ static int nf_tables_dump_set_done(struct netlink_callback *cb) static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { u8 genmask = nft_genmask_cur(net); const struct nft_set *set; @@ -3888,7 +3901,8 @@ err1: static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { u8 genmask = nft_genmask_next(net); const struct nlattr *attr; @@ -4085,7 +4099,8 @@ err1: static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { u8 genmask = nft_genmask_next(net); const struct nlattr *attr; @@ -4295,7 +4310,8 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype) static int nf_tables_newobj(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_object_type *type; @@ -4489,7 +4505,8 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) static int nf_tables_getobj(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_cur(net); @@ -4567,8 +4584,9 @@ static void nft_obj_destroy(struct nft_object *obj) } static int nf_tables_delobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); @@ -4698,7 +4716,8 @@ err: static int nf_tables_getgen(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[]) + const struct nlattr * const nla[], + struct netlink_ext_ack *extack) { struct sk_buff *skb2; int err; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 80f5ecf2c3d7..92b05e188fd1 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -201,7 +201,8 @@ replay: if (nc->call_rcu) { err = nc->call_rcu(net, net->nfnl, skb, nlh, - (const struct nlattr **)cda); + (const struct nlattr **)cda, + extack); rcu_read_unlock(); } else { rcu_read_unlock(); @@ -211,7 +212,8 @@ replay: err = -EAGAIN; else if (nc->call) err = nc->call(net, net->nfnl, skb, nlh, - (const struct nlattr **)cda); + (const struct nlattr **)cda, + extack); else err = -EINVAL; nfnl_unlock(subsys_id); @@ -226,9 +228,11 @@ struct nfnl_err { struct list_head head; struct nlmsghdr *nlh; int err; + struct netlink_ext_ack extack; }; -static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err) +static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack) { struct nfnl_err *nfnl_err; @@ -238,6 +242,7 @@ static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err) nfnl_err->nlh = nlh; nfnl_err->err = err; + nfnl_err->extack = *extack; list_add_tail(&nfnl_err->head, list); return 0; @@ -262,7 +267,8 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) { - netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, NULL); + netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, + &nfnl_err->extack); nfnl_err_del(nfnl_err); } } @@ -280,6 +286,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; + struct netlink_ext_ack extack; LIST_HEAD(err_list); u32 status; int err; @@ -325,6 +332,7 @@ replay: while (skb->len >= nlmsg_total_size(0)) { int msglen, type; + memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; @@ -384,7 +392,8 @@ replay: if (nc->call_batch) { err = nc->call_batch(net, net->nfnl, skb, nlh, - (const struct nlattr **)cda); + (const struct nlattr **)cda, + &extack); } /* The lock was released to autoload some module, we @@ -402,7 +411,7 @@ ack: * processed, this avoids that the same error is * reported several times when replaying the batch. */ - if (nfnl_err_add(&err_list, nlh, err) < 0) { + if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) { /* We failed to enqueue an error, reset the * list of errors and send OOM to userspace * pointing to the batch header. diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 9898fb4d0512..c45e6d4358ab 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -49,7 +49,8 @@ struct nfacct_filter { static int nfnl_acct_new(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { struct nf_acct *nfacct, *matching = NULL; char *acct_name; @@ -264,7 +265,8 @@ nfacct_filter_alloc(const struct nlattr * const attr) static int nfnl_acct_get(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { int ret = -ENOENT; struct nf_acct *cur; @@ -343,7 +345,8 @@ static int nfnl_acct_try_del(struct nf_acct *cur) static int nfnl_acct_del(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { struct nf_acct *cur, *tmp; int ret = -ENOENT; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index be678a323598..41628b393673 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -398,7 +398,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[], static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; @@ -599,7 +600,8 @@ out: static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { int ret = -ENOENT; struct nf_conntrack_helper *cur; @@ -666,7 +668,8 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { char *helper_name = NULL; struct nf_conntrack_helper *cur; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 49638b03ccc9..400e9ae97153 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -69,7 +69,8 @@ ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { __u16 l3num; __u8 l4num; @@ -239,7 +240,8 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { int ret = -ENOENT; char *name; @@ -326,7 +328,8 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; @@ -357,7 +360,8 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, static int cttimeout_default_set(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { __u16 l3num; __u8 l4num; @@ -446,7 +450,8 @@ nla_put_failure: static int cttimeout_default_get(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[]) + const struct nlattr * const cda[], + struct netlink_ext_ack *extack) { __u16 l3num; __u8 l4num; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index da9704971a83..9c14892ee65f 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -795,7 +795,8 @@ static struct notifier_block nfulnl_rtnl_notifier = { static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) + const struct nlattr * const nfqa[], + struct netlink_ext_ack *extack) { return -ENOTSUPP; } @@ -818,7 +819,8 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { static int nfulnl_recv_config(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfula[]) + const struct nlattr * const nfula[], + struct netlink_ext_ack *extack) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t group_num = ntohs(nfmsg->res_id); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8a0f218b7938..12b7dc11b6b5 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1032,7 +1032,8 @@ static int nfq_id_after(unsigned int id, unsigned int max) static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) + const struct nlattr * const nfqa[], + struct netlink_ext_ack *extack) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_queue_entry *entry, *tmp; @@ -1136,7 +1137,8 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) + const struct nlattr * const nfqa[], + struct netlink_ext_ack *extack) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); @@ -1200,7 +1202,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) + const struct nlattr * const nfqa[], + struct netlink_ext_ack *extack) { return -ENOTSUPP; } @@ -1217,7 +1220,8 @@ static const struct nf_queue_handler nfqh = { static int nfqnl_recv_config(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[]) + const struct nlattr * const nfqa[], + struct netlink_ext_ack *extack) { struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f753ec69f790..f5a7cb68694e 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -530,7 +530,8 @@ nla_put_failure: static int nfnl_compat_get(struct net *net, struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[]) + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { int ret = 0, target; struct nfgenmsg *nfmsg; diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index c05fefcec238..71cfa9551d08 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -63,7 +63,8 @@ static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[]) + const struct nlattr * const osf_attrs[], + struct netlink_ext_ack *extack) { struct xt_osf_user_finger *f; struct xt_osf_finger *kf = NULL, *sf; @@ -107,7 +108,8 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl, static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[]) + const struct nlattr * const osf_attrs[], + struct netlink_ext_ack *extack) { struct xt_osf_user_finger *f; struct xt_osf_finger *sf;