1
0
Fork 0
alistair23-linux/net/netfilter/ipvs/ip_vs_ctl.c

4212 lines
106 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the NetFilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
* Changes:
*/
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. 
This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 02:04:11 -06:00
#include <linux/slab.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mutex.h>
#include <net/net_namespace.h>
#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
#include <net/route.h>
#include <net/sock.h>
#include <net/genetlink.h>
#include <linux/uaccess.h>
#include <net/ip_vs.h>
/* Mutex for IPVS sockopts; [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
/* sysctl variables */
#ifdef CONFIG_IP_VS_DEBUG
/* File-scope statics are zero-initialised, so no explicit "= 0" is needed. */
static int sysctl_ip_vs_debug_level;

/* Return the current value of sysctl_ip_vs_debug_level. */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif
/* Protos */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
static bool __ip_vs_addr_is_local_v6(struct net *net,
const struct in6_addr *addr)
{
struct flowi6 fl6 = {
.daddr = *addr,
};
struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
bool is_local;
is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
dst_release(dst);
return is_local;
}
#endif
#ifdef CONFIG_SYSCTL
/*
* update_defense_level is called from keventd and from sysctl,
* so it needs to protect itself from softirqs
*/
/*
 * Re-evaluate the three defense strategies (drop_entry, drop_packet,
 * secure_tcp) of @ipvs against the currently available memory.
 *
 * "nomem" is true when free + buffer pages fall below sysctl_amemthresh.
 * For each strategy the sysctl value selects the mode:
 *   0 - off, 1 - armed (switches itself to 2 when nomem),
 *   2 - active (switches itself back to 1 when memory recovers),
 *   3 - always on.
 * Each strategy is updated under its own spinlock with bottom halves
 * disabled for the whole sequence.
 */
static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
/* NOTE(review): function-static, so this "previous secure_tcp" value is
 * shared by every call regardless of which @ipvs is passed in - confirm
 * that this is intended when several instances exist.
 */
static int old_secure_tcp = 0;
int availmem;
int nomem;
/* -1: leave protocol timeouts alone; >= 0: timeouts must be switched */
int to_change = -1;
/* we only count free and buffered memory (in pages) */
si_meminfo(&i);
availmem = i.freeram + i.bufferram;
/* however in linux 2.5 the i.bufferram is total page cache size,
we need adjust it */
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
nomem = (availmem < ipvs->sysctl_amemthresh);
local_bh_disable();
/* drop_entry */
spin_lock(&ipvs->dropentry_lock);
switch (ipvs->sysctl_drop_entry) {
case 0:
/* off */
atomic_set(&ipvs->dropentry, 0);
break;
case 1:
/* armed: activate and move to mode 2 once memory is short */
if (nomem) {
atomic_set(&ipvs->dropentry, 1);
ipvs->sysctl_drop_entry = 2;
} else {
atomic_set(&ipvs->dropentry, 0);
}
break;
case 2:
/* active: fall back to mode 1 once memory has recovered */
if (nomem) {
atomic_set(&ipvs->dropentry, 1);
} else {
atomic_set(&ipvs->dropentry, 0);
ipvs->sysctl_drop_entry = 1;
}
break;
case 3:
/* always on */
atomic_set(&ipvs->dropentry, 1);
break;
}
spin_unlock(&ipvs->dropentry_lock);
/* drop_packet */
spin_lock(&ipvs->droppacket_lock);
switch (ipvs->sysctl_drop_packet) {
case 0:
ipvs->drop_rate = 0;
break;
case 1:
if (nomem) {
/* rate = amemthresh / (amemthresh - availmem); nomem was computed as
 * availmem < amemthresh, so the divisor is positive provided
 * sysctl_amemthresh has not changed since that test.
 */
ipvs->drop_rate = ipvs->drop_counter
= ipvs->sysctl_amemthresh /
(ipvs->sysctl_amemthresh-availmem);
ipvs->sysctl_drop_packet = 2;
} else {
ipvs->drop_rate = 0;
}
break;
case 2:
if (nomem) {
/* same rate formula as mode 1, but the mode stays at 2 */
ipvs->drop_rate = ipvs->drop_counter
= ipvs->sysctl_amemthresh /
(ipvs->sysctl_amemthresh-availmem);
} else {
ipvs->drop_rate = 0;
ipvs->sysctl_drop_packet = 1;
}
break;
case 3:
/* always on: fixed rate taken from sysctl_am_droprate */
ipvs->drop_rate = ipvs->sysctl_am_droprate;
break;
}
spin_unlock(&ipvs->droppacket_lock);
/* secure_tcp */
spin_lock(&ipvs->securetcp_lock);
/* to_change is set only when the previous value and the new state lie on
 * different sides of the "< 2" / ">= 2" boundary.
 */
switch (ipvs->sysctl_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
to_change = 0;
break;
case 1:
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
ipvs->sysctl_secure_tcp = 2;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
}
break;
case 2:
if (nomem) {
if (old_secure_tcp < 2)
to_change = 1;
} else {
if (old_secure_tcp >= 2)
to_change = 0;
ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
if (old_secure_tcp < 2)
to_change = 1;
break;
}
/* remember the (possibly just updated) mode for the next invocation */
old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(ipvs,
ipvs->sysctl_secure_tcp > 1);
spin_unlock(&ipvs->securetcp_lock);
local_bh_enable();
}
/*
* Timer for checking the defense
*/
/*
 * Delay between two runs of the defense work: 1*HZ.
 * The expansion is parenthesized so that it stays a single operand when
 * used inside a larger expression (e.g. "x % DEFENSE_TIMER_PERIOD").
 */
#define DEFENSE_TIMER_PERIOD (1 * HZ)
/*
 * Delayed-work callback: refresh the defense level of the owning
 * netns_ipvs, randomly drop entries when the dropentry flag is set,
 * then queue itself again after DEFENSE_TIMER_PERIOD.
 */
static void defense_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs;

	/* @work is the work_struct embedded in ipvs->defense_work */
	ipvs = container_of(work, struct netns_ipvs, defense_work.work);

	update_defense_level(ipvs);
	if (atomic_read(&ipvs->dropentry) != 0)
		ip_vs_random_dropentry(ipvs);

	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
#endif
/*
 * Try to take a reference on this module.
 * Returns the result of try_module_get(THIS_MODULE).
 */
int ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}
/* Drop a module reference via module_put(THIS_MODULE). */
void ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}
/*
* Hash table: for virtual service lookups
*/
/* log2 of the number of buckets in each service hash table */
#define IP_VS_SVC_TAB_BITS 8
/* number of buckets: 1 << 8 = 256 */
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
/* mask that reduces a hash value to a valid bucket index */
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
* Returns hash value for virtual service
*/
/*
 * Compute the ip_vs_svc_table bucket for <ipvs, proto, addr, port>.
 * IPv6 addresses are folded to 32 bits by XOR-ing their four words;
 * the ipvs pointer (shifted right by 8) is mixed in as well.
 */
static inline unsigned int
ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
		  const union nf_inet_addr *addr, __be16 port)
{
	unsigned int hport = ntohs(port);
	__be32 folded;
	__u32 key;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		folded = addr->ip6[0] ^ addr->ip6[1] ^
			 addr->ip6[2] ^ addr->ip6[3];
	else
#endif
		folded = addr->ip;

	key = ntohl(folded);
	key ^= ((size_t) ipvs >> 8);
	key ^= proto;
	key ^= hport >> IP_VS_SVC_TAB_BITS;
	key ^= hport;

	return key & IP_VS_SVC_TAB_MASK;
}
/*
* Returns hash value of fwmark for virtual service lookup
*/
/* Compute the ip_vs_svc_fwm_table bucket for <ipvs, fwmark>. */
static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark)
{
	size_t mixed = ((size_t)ipvs >> 8) ^ fwmark;

	return mixed & IP_VS_SVC_TAB_MASK;
}
/*
* Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
* or in the ip_vs_svc_fwm_table by fwmark.
* Should be called with locked tables.
*/
/*
 * Insert @svc into ip_vs_svc_table (fwmark == 0) or into
 * ip_vs_svc_fwm_table (fwmark != 0), mark it hashed and take a
 * reference on behalf of the table.
 * Returns 1 on success, 0 if the service was already hashed.
 */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	struct hlist_head *bucket;
	struct hlist_node *node;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		pr_err("%s(): request for already hashed, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	if (svc->fwmark != 0) {
		/* keyed by fwmark */
		node = &svc->f_list;
		bucket = &ip_vs_svc_fwm_table[ip_vs_svc_fwm_hashkey(svc->ipvs,
								    svc->fwmark)];
	} else {
		/* keyed by <netns,protocol,addr,port> */
		node = &svc->s_list;
		bucket = &ip_vs_svc_table[ip_vs_svc_hashkey(svc->ipvs, svc->af,
							    svc->protocol,
							    &svc->addr,
							    svc->port)];
	}
	hlist_add_head_rcu(node, bucket);

	svc->flags |= IP_VS_SVC_F_HASHED;
	/* the table now holds a reference to the service */
	atomic_inc(&svc->refcnt);
	return 1;
}
/*
* Unhashes a service from svc_table / svc_fwm_table.
* Should be called with locked tables.
*/
/*
 * Remove @svc from the table it was hashed into, clear the hashed flag
 * and drop the reference the table was holding.
 * Returns 1 on success, 0 if the service was not hashed.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* f_list links fwmark services, s_list links the others */
	hlist_del_rcu(svc->fwmark ? &svc->f_list : &svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	atomic_dec(&svc->refcnt);
	return 1;
}
/*
* Get service by {netns, proto,addr,port} in the service table.
*/
/*
 * Walk the ip_vs_svc_table bucket for <ipvs, af, protocol, vaddr, vport>
 * and return the first service matching all of them, or NULL.
 */
static inline struct ip_vs_service *
__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
		     const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_service *svc;
	unsigned int bucket;

	bucket = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport);

	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[bucket], s_list) {
		if (svc->af != af)
			continue;
		if (!ip_vs_addr_equal(af, &svc->addr, vaddr))
			continue;
		if (svc->port != vport)
			continue;
		if (svc->protocol != protocol)
			continue;
		if (svc->ipvs != ipvs)
			continue;
		return svc;	/* HIT */
	}

	return NULL;
}
/*
* Get service by {fwmark} in the service table.
*/
/*
 * Walk the ip_vs_svc_fwm_table bucket for <ipvs, fwmark> and return the
 * first service with the same fwmark, address family and ipvs, or NULL.
 */
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
	struct ip_vs_service *svc;
	unsigned int bucket = ip_vs_svc_fwm_hashkey(ipvs, fwmark);

	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
		if (svc->fwmark != fwmark)
			continue;
		if (svc->af != af)
			continue;
		if (svc->ipvs != ipvs)
			continue;
		return svc;	/* HIT */
	}

	return NULL;
}
/* Find service, called under RCU lock */
/*
 * Look up a virtual service. The lookup order is:
 *   1. by fwmark, when @fwmark is non-zero;
 *   2. by the exact <protocol, vaddr, vport>;
 *   3. for TCP with FTP services present, by FTPPORT when @vport is
 *      FTPDATA or not below inet_prot_sock();
 *   4. by port zero (catch-all), when such services are present.
 * Returns the service found or NULL.
 */
struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_service *svc = NULL;

	if (fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);

	if (!svc) {
		/* "full" addressed entry */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);

		/* the packet might belong to an FTP data connection */
		if (!svc && protocol == IPPROTO_TCP &&
		    atomic_read(&ipvs->ftpsvc_counter) &&
		    (vport == FTPDATA ||
		     ntohs(vport) >= inet_prot_sock(ipvs->net)))
			svc = __ip_vs_service_find(ipvs, af, protocol, vaddr,
						   FTPPORT);

		/* catch-all service on port zero */
		if (!svc && atomic_read(&ipvs->nullsvc_counter))
			svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
	}

	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");
	return svc;
}
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
atomic_inc(&svc->refcnt);
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
rcu_assign_pointer(dest->svc, svc);
}
/* Release the per-cpu statistics of @svc, then the service itself. */
static void ip_vs_service_free(struct ip_vs_service *svc)
{
	/* must go first: the pointer lives inside *svc */
	free_percpu(svc->stats.cpustats);

	kfree(svc);
}
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
static void ip_vs_service_rcu_free(struct rcu_head *head)
{
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
struct ip_vs_service *svc;
svc = container_of(head, struct ip_vs_service, rcu_head);
ip_vs_service_free(svc);
}
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
/*
 * Drop one reference on @svc. When the last reference goes away the
 * service is freed: through call_rcu() if @do_delay is true, otherwise
 * immediately.
 */
static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
{
	if (atomic_dec_and_test(&svc->refcnt)) {
		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
			      svc->fwmark,
			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
			      ntohs(svc->port));
		if (do_delay)
			call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
		else
			ip_vs_service_free(svc);
	}
}
/*
* Returns hash value for real service
*/
/*
 * Compute the rs_table bucket for <af, addr, port>.
 * IPv6 addresses are folded to 32 bits by XOR-ing their four words.
 */
static inline unsigned int ip_vs_rs_hashkey(int af,
					    const union nf_inet_addr *addr,
					    __be16 port)
{
	unsigned int hport = ntohs(port);
	__be32 folded;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		folded = addr->ip6[0] ^ addr->ip6[1] ^
			 addr->ip6[2] ^ addr->ip6[3];
	else
#endif
		folded = addr->ip;

	return (ntohl(folded) ^ (hport >> IP_VS_RTAB_BITS) ^ hport)
		& IP_VS_RTAB_MASK;
}
/* Hash ip_vs_dest in rs_table by <af,addr,port>. */
/*
 * Add @dest to ipvs->rs_table unless it is already there.
 * The port used for hashing depends on the forwarding method:
 *   MASQ            - dest->port
 *   TUNNEL (GUE)    - dest->tun_port
 *   TUNNEL (IPIP/GRE) - 0
 * Destinations with any other method or tunnel type are not hashed.
 */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
	struct hlist_head *bucket;
	__be16 port;

	if (dest->in_rs_table)
		return;

	switch (IP_VS_DFWD_METHOD(dest)) {
	case IP_VS_CONN_F_MASQ:
		port = dest->port;
		break;
	case IP_VS_CONN_F_TUNNEL:
		switch (dest->tun_type) {
		case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
			port = dest->tun_port;
			break;
		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
		case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
			port = 0;
			break;
		default:
			return;
		}
		break;
	default:
		return;
	}

	bucket = &ipvs->rs_table[ip_vs_rs_hashkey(dest->af, &dest->addr, port)];
	hlist_add_head_rcu(&dest->d_list, bucket);
	dest->in_rs_table = 1;
}
/* Unhash ip_vs_dest from rs_table. */
/* Take @dest out of rs_table; does nothing if it is not hashed. */
static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
	if (!dest->in_rs_table)
		return;

	hlist_del_rcu(&dest->d_list);
	dest->in_rs_table = 0;
}
/* Check if real service by <proto,addr,port> is present */
/*
 * Return true if rs_table holds a MASQ destination with the given
 * <af, daddr, dport> whose protocol equals @protocol or which has a
 * non-zero vfwmark.
 */
bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
			    const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *dest;
	unsigned int bucket = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[bucket], d_list) {
		if (dest->port != dport)
			continue;
		if (dest->af != af)
			continue;
		if (!ip_vs_addr_equal(af, &dest->addr, daddr))
			continue;
		if (dest->protocol != protocol && !dest->vfwmark)
			continue;
		if (IP_VS_DFWD_METHOD(dest) != IP_VS_CONN_F_MASQ)
			continue;
		return true;	/* HIT */
	}

	return false;
}
ipvs: handle connections started by real-servers When using LVS-NAT and SIP persistence-egine over UDP, the following limitations are present with current implementation: 1) To actually have load-balancing based on Call-ID header, you need to use one-packet-scheduling mode. But with one-packet-scheduling the connection is deleted just after packet is forwarded, so SIP responses coming from real-servers do not match any connection and SNAT is not applied. 2) If you do not use "-o" option, IPVS behaves as normal UDP load balancer, so different SIP calls (each one identified by a different Call-ID) coming from the same ip-address/port go to the same real-server. So basically you don’t have load-balancing based on Call-ID as intended. 3) Call-ID is not learned when a new SIP call is started by a real-server (inside-to-outside direction), but only in the outside-to-inside direction. This would be a general problem for all SIP servers acting as Back2BackUserAgent. This patch aims to solve problems 1) and 3) while keeping OPS mode mandatory for SIP-UDP, so that 2) is not a problem anymore. The basic mechanism implemented is to make packets, that do not match any existent connection but come from real-servers, create new connections instead of let them pass without any effect. When such packets pass through ip_vs_out(), if their source ip address and source port match a configured real-server, a new connection is automatically created in the same way as it would have happened if the packet had come from outside-to-inside direction. A new connection template is created too if the virtual-service is persistent and there is no matching connection template found. The new connection automatically created, if the service had "-o" option, is an OPS connection that lasts only the time to forward the packet, just like it happens on the ingress side. 
The main part of this mechanism is implemented inside a persistent-engine specific callback (at the moment only SIP persistent engine exists) and is triggered only for UDP packets, since connection oriented protocols, by using different set of ports (typically ephemeral ports) to open new outgoing connections, should not need this feature. The following requisites are needed for automatic connection creation; if any is missing the packet simply goes the same way as before. a) virtual-service is not fwmark based (this is because fwmark services do not store address and port of the virtual-service, required to build the connection data). b) virtual-service and real-servers must not have been configured with omitted port (this is again to have all data to create the connection). Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2016-04-05 10:26:29 -06:00
/* Find real service record by <proto,addr,port>.
* In case of multiple records with the same <proto,addr,port>, only
* the first found record is returned.
*
* To be called under RCU lock.
*/
/*
 * Walk the rs_table bucket for <af, daddr, dport> and return the first
 * MASQ destination with that address and port whose protocol equals
 * @protocol or which has a non-zero vfwmark; NULL if there is none.
 */
struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
					   __u16 protocol,
					   const union nf_inet_addr *daddr,
					   __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}
/* Find real service record by <af,addr,tun_port>.
* In case of multiple records with the same <af,addr,tun_port>, only
* the first found record is returned.
*
* To be called under RCU lock.
*/
/*
 * Walk the rs_table bucket for <af, daddr, tun_port> and return the
 * first TUNNEL destination with that address and tunnel port, or NULL.
 */
struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
				     const union nf_inet_addr *daddr,
				     __be16 tun_port)
{
	unsigned int bucket = ip_vs_rs_hashkey(af, daddr, tun_port);
	struct ip_vs_dest *dest;

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[bucket], d_list) {
		if (dest->tun_port != tun_port)
			continue;
		if (dest->af != af)
			continue;
		if (!ip_vs_addr_equal(af, &dest->addr, daddr))
			continue;
		if (IP_VS_DFWD_METHOD(dest) != IP_VS_CONN_F_TUNNEL)
			continue;
		return dest;	/* HIT */
	}

	return NULL;
}
/* Walk the destination list of @svc and return the entry whose address
 * family, address and port equal {dest_af,daddr,dport}, or NULL if the
 * service has no such destination.
 * Called under RCU lock.
 */
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
		  const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *cur;

	list_for_each_entry_rcu(cur, &svc->destinations, n_list) {
		if (cur->af != dest_af)
			continue;
		if (!ip_vs_addr_equal(dest_af, &cur->addr, daddr))
			continue;
		if (cur->port != dport)
			continue;

		/* HIT */
		return cur;
	}

	return NULL;
}
[IPVS]: Bind connections on stanby if the destination exists This patch fixes the problem with node overload on director fail-over. Given the scenario: 2 nodes each accepting 3 connections at a time and 2 directors, director failover occurs when the nodes are fully loaded (6 connections to the cluster) in this case the new director will assign another 6 connections to the cluster, If the same real servers exist there. The problem turned to be in not binding the inherited connections to the real servers (destinations) on the backup director. Therefore: "ipvsadm -l" reports 0 connections: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 0 -> node484.local:5999 Route 1000 0 0 while "ipvs -lnc" is right root@test2:~# ipvsadm -lnc IPVS connection entries pro expire state source virtual destination TCP 14:56 ESTABLISHED 192.168.0.10:39164 192.168.0.222:5999 192.168.0.51:5999 TCP 14:59 ESTABLISHED 192.168.0.10:39165 192.168.0.222:5999 192.168.0.52:5999 So the patch I am sending fixes the problem by binding the received connections to the appropriate service on the backup director, if it exists, else the connection will be handled the old way. So if the master and the backup directors are synchronized in terms of real services there will be no problem with server over-committing since new connections will not be created on the nonexistent real services on the backup. However if the service is created later on the backup, the binding will be performed when the next connection update is received. 
With this patch the inherited connections will show as inactive on the backup: root@test2:~# ipvsadm -l IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP test2.local:5999 wlc -> node473.local:5999 Route 1000 0 1 -> node484.local:5999 Route 1000 0 1 rumen@test2:~$ cat /proc/net/ip_vs IP Virtual Server version 1.2.1 (size=4096) Prot LocalAddress:Port Scheduler Flags -> RemoteAddress:Port Forward Weight ActiveConn InActConn TCP C0A800DE:176F wlc -> C0A80033:176F Route 1000 0 1 -> C0A80032:176F Route 1000 0 1 Regards, Rumen Bogdanovski Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Rumen G. Bogdanovski <rumen@voicecho.com> Signed-off-by: Simon Horman <horms@verge.net.au>
2007-11-07 03:35:54 -07:00
/*
 *	Find destination by {daddr,dport,vaddr,protocol}
 *	Created to be used in ip_vs_process_message() in
 *	the backup synchronization daemon. It finds the
 *	destination to be bound to the received connection
 *	on the backup.
 *
 *	The service is located first via ip_vs_service_find(); its
 *	destinations are then searched twice, once with each of the two
 *	candidate ports (@dport and 0). Port 0 is tried first only for a
 *	fwmark service whose forwarding method is not masquerading.
 *
 *	Called under RCU lock, no refcnt is returned.
 */
struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
				   const union nf_inet_addr *daddr,
				   __be16 dport,
				   const union nf_inet_addr *vaddr,
				   __be16 vport, __u16 protocol, __u32 fwmark,
				   __u32 flags)
{
	struct ip_vs_service *service;
	struct ip_vs_dest *match;
	bool zero_port_first;

	service = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
	if (!service)
		return NULL;

	zero_port_first = fwmark &&
			  (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ;

	/* First attempt: preferred port */
	match = ip_vs_lookup_dest(service, dest_af, daddr,
				  zero_port_first ? 0 : dport);
	if (match)
		return match;

	/* Second attempt: the other candidate port */
	return ip_vs_lookup_dest(service, dest_af, daddr,
				 zero_port_first ? dport : 0);
}
/* Callback taking the rcu_head embedded in a struct ip_vs_dest_dst:
 * releases the dst_cache held by that structure, then frees it.
 */
void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *entry;

	entry = container_of(head, struct ip_vs_dest_dst, rcu_head);

	dst_release(entry->dst_cache);
	kfree(entry);
}
/* Release dest_dst and dst_cache for dest in user context.
 * The pointer in @dest is cleared right away; the detached structure is
 * handed to call_rcu() so that ip_vs_dest_dst_rcu_free() disposes of it.
 */
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *stale = rcu_dereference_protected(dest->dest_dst, 1);

	if (!stale)
		return;

	RCU_INIT_POINTER(dest->dest_dst, NULL);
	call_rcu(&stale->rcu_head, ip_vs_dest_dst_rcu_free);
}
/*
 *  Lookup dest by {svc,addr,port} in the destination trash.
 *  The destination trash holds destinations that were removed from the
 *  service table but are still referenced by some conn entries.
 *  Keeping them around means that when a dest is only temporarily down
 *  (taken out by the administrator or by a monitor program) it can be
 *  picked back from the trash: its remaining connections continue, and
 *  its counting information stays useful for scheduling.
 *
 *  A matching entry is unlinked from the trash list before it is
 *  returned; NULL is returned when no entry matches.
 */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
		     const union nf_inet_addr *daddr, __be16 dport)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct ip_vs_dest *entry;
	struct ip_vs_dest *found = NULL;

	spin_lock_bh(&ipvs->dest_trash_lock);

	/* Scan the trash for a dest that belongs to this service */
	list_for_each_entry(entry, &ipvs->dest_trash, t_list) {
		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
			      "dest->refcnt=%d\n",
			      entry->vfwmark,
			      IP_VS_DBG_ADDR(entry->af, &entry->addr),
			      ntohs(entry->port),
			      refcount_read(&entry->refcnt));

		/* The dest itself and the service identity must agree */
		if (entry->af != dest_af ||
		    !ip_vs_addr_equal(dest_af, &entry->addr, daddr) ||
		    entry->port != dport ||
		    entry->vfwmark != svc->fwmark ||
		    entry->protocol != svc->protocol)
			continue;

		/* Without fwmark the virtual address and port must agree too */
		if (!svc->fwmark &&
		    !(ip_vs_addr_equal(svc->af, &entry->vaddr, &svc->addr) &&
		      entry->vport == svc->port))
			continue;

		/* HIT */
		list_del(&entry->t_list);
		found = entry;
		break;
	}

	spin_unlock_bh(&ipvs->dest_trash_lock);

	return found;
}
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
__ip_vs_dst_cache_reset(dest);
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
__ip_vs_svc_put(svc, false);
free_percpu(dest->stats.cpustats);
ip_vs_dest_put_and_free(dest);
}
/*
* Clean up all the destinations in the trash
* Called by the ip_vs_control_cleanup()
*
* When the ip_vs_control_clearup is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
* be 1, so we simply release them here.
*/
static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
struct ip_vs_dest *dest, *nxt;
del_timer_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
list_del(&dest->t_list);
ip_vs_dest_free(dest);
}
}
/* Fill @dst with the counters of @src relative to its zero point.
 * Each counter stored in @dst is src->kstats minus src->kstats0.
 * The whole copy is done while holding src->lock.
 */
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
/* dst->c = current value of counter c minus its recorded zero point */
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
spin_lock_bh(&src->lock);
IP_VS_SHOW_STATS_COUNTER(conns);
IP_VS_SHOW_STATS_COUNTER(inpkts);
IP_VS_SHOW_STATS_COUNTER(outpkts);
IP_VS_SHOW_STATS_COUNTER(inbytes);
IP_VS_SHOW_STATS_COUNTER(outbytes);
/* NOTE(review): presumably fills the rate fields of dst from the
 * estimator of src - confirm against ip_vs_read_estimator()
 */
ip_vs_read_estimator(dst, src);
spin_unlock_bh(&src->lock);
}
/* Copy the kernel stats in @src into the user-visible layout @dst.
 * Every field except the two byte totals goes through an explicit
 * (u32) cast; inbytes and outbytes are assigned as they are.
 */
static void
ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
{
	/* byte totals, assigned without a cast */
	dst->inbytes = src->inbytes;
	dst->outbytes = src->outbytes;

	/* connection and packet counters */
	dst->conns = (u32)src->conns;
	dst->inpkts = (u32)src->inpkts;
	dst->outpkts = (u32)src->outpkts;

	/* rates */
	dst->cps = (u32)src->cps;
	dst->inpps = (u32)src->inpps;
	dst->outpps = (u32)src->outpps;
	dst->inbps = (u32)src->inbps;
	dst->outbps = (u32)src->outbps;
}
/* Reset @stats as seen by readers: the current counter values are
 * recorded in kstats0 as the new zero point (kstats itself is not
 * modified here) and ip_vs_zero_estimator() is called, all while
 * holding stats->lock.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
spin_lock_bh(&stats->lock);
/* get current counters as zero point, rates are zeroed */
/* kstats0.c takes a snapshot of the running counter c */
#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
IP_VS_ZERO_STATS_COUNTER(conns);
IP_VS_ZERO_STATS_COUNTER(inpkts);
IP_VS_ZERO_STATS_COUNTER(outpkts);
IP_VS_ZERO_STATS_COUNTER(inbytes);
IP_VS_ZERO_STATS_COUNTER(outbytes);
ip_vs_zero_estimator(stats);
spin_unlock_bh(&stats->lock);
}
/*
* Update a destination in the given service
*/
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = svc->ipvs;
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
struct ip_vs_service *old_svc;
struct ip_vs_scheduler *sched;
int conn_flags;
/* We cannot modify an address and change the address family */
BUG_ON(!add && udest->af != dest->af);
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
/* keep the last_weight with latest non-0 weight */
if (add || udest->weight != 0)
atomic_set(&dest->last_weight, udest->weight);
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
/* Need to rehash? */
if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
IP_VS_DFWD_METHOD(dest) ||
udest->tun_type != dest->tun_type ||
udest->tun_port != dest->tun_port)
ip_vs_rs_unhash(dest);
/* set the tunnel info */
dest->tun_type = udest->tun_type;
dest->tun_port = udest->tun_port;
dest->tun_flags = udest->tun_flags;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/* FTP-NAT requires conntrack for mangling */
if (svc->port == FTPPORT)
ip_vs_register_conntrack(svc);
}
atomic_set(&dest->conn_flags, conn_flags);
/* Put the real service in rs_table if not present. */
ip_vs_rs_hash(ipvs, dest);
/* bind the service */
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
old_svc = rcu_dereference_protected(dest->svc, 1);
if (!old_svc) {
__ip_vs_bind_svc(dest, svc);
} else {
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
__ip_vs_svc_put(old_svc, true);
}
}
/* set the dest status flags */
dest->flags |= IP_VS_DEST_F_AVAILABLE;
if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
dest->af = udest->af;
spin_lock_bh(&dest->dst_lock);
__ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
ipvs: changes for local real server This patch deals with local real servers: - Add support for DNAT to local address (different real server port). It needs ip_vs_out hook in LOCAL_OUT for both families because skb->protocol is not set for locally generated packets and can not be used to set 'af'. - Skip packets in ip_vs_in marked with skb->ipvs_property because ip_vs_out processing can be executed in LOCAL_OUT but we still have the conn_out_get check in ip_vs_in. - Ignore packets with inet->nodefrag from local stack - Require skb_dst(skb) != NULL because we use it to get struct net - Add support for changing the route to local IPv4 stack after DNAT depending on the source address type. Local client sets output route and the remote client sets input route. It looks like IPv6 does not need such rerouting because the replies use addresses from initial incoming header, not from skb route. - All transmitters now have strict checks for the destination address type: redirect from non-local address to local real server requires NAT method, local address can not be used as source address when talking to remote real server. - Now LOCALNODE is not set explicitly as forwarding method in real server to allow the connections to provide correct forwarding method to the backup server. Not sure if this breaks tools that expect to see 'Local' real server type. If needed, this can be supported with new flag IP_VS_DEST_F_LOCAL. Now it should be possible connections in backup that lost their fwmark information during sync to be forwarded properly to their daddr, even if it is local address in the backup server. By this way backup could be used as real server for DR or TUN, for NAT there are some restrictions because tuple collisions in conntracks can create problems for the traffic. - Call ip_vs_dst_reset when destination is updated in case some real server IP type is changed between local and remote. 
[ horms@verge.net.au: removed trailing whitespace ] Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2010-10-17 07:38:15 -06:00
if (add) {
ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
}
}
/*
* Create a destination for the given service
*/
static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest **dest_p)
{
struct ip_vs_dest *dest;
net: Explicitly initialize u64_stats_sync structures for lockdep In order to enable lockdep on seqcount/seqlock structures, we must explicitly initialize any locks. The u64_stats_sync structure, uses a seqcount, and thus we need to introduce a u64_stats_init() function and use it to initialize the structure. This unfortunately adds a lot of fairly trivial initialization code to a number of drivers. But the benefit of ensuring correctness makes this worth while. Because these changes are required for lockdep to be enabled, and the changes are quite trivial, I've not yet split this patch out into 30-some separate patches, as I figured it would be better to get the various maintainers thoughts on how to best merge this change along with the seqcount lockdep enablement. Feedback would be appreciated! Signed-off-by: John Stultz <john.stultz@linaro.org> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> Cc: James Morris <jmorris@namei.org> Cc: Jesse Gross <jesse@nicira.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mirko Lindner <mlindner@marvell.com> Cc: Patrick McHardy <kaber@trash.net> Cc: Roger Luethi <rl@hellgate.ch> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Simon Horman <horms@verge.net.au> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> Cc: Wensong Zhang <wensong@linux-vs.org> Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-07 16:51:58 -06:00
unsigned int atype, i;
EnterFunction(2);
#ifdef CONFIG_IP_VS_IPV6
if (udest->af == AF_INET6) {
int ret;
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
!__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
return -EINVAL;
ret = nf_defrag_ipv6_enable(svc->ipvs->net);
if (ret)
return ret;
} else
#endif
{
atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
if (atype != RTN_LOCAL && atype != RTN_UNICAST)
return -EINVAL;
}
dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
if (dest == NULL)
return -ENOMEM;
dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!dest->stats.cpustats)
goto err_alloc;
net: Explicitly initialize u64_stats_sync structures for lockdep In order to enable lockdep on seqcount/seqlock structures, we must explicitly initialize any locks. The u64_stats_sync structure, uses a seqcount, and thus we need to introduce a u64_stats_init() function and use it to initialize the structure. This unfortunately adds a lot of fairly trivial initialization code to a number of drivers. But the benefit of ensuring correctness makes this worth while. Because these changes are required for lockdep to be enabled, and the changes are quite trivial, I've not yet split this patch out into 30-some separate patches, as I figured it would be better to get the various maintainers thoughts on how to best merge this change along with the seqcount lockdep enablement. Feedback would be appreciated! Signed-off-by: John Stultz <john.stultz@linaro.org> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> Cc: James Morris <jmorris@namei.org> Cc: Jesse Gross <jesse@nicira.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mirko Lindner <mlindner@marvell.com> Cc: Patrick McHardy <kaber@trash.net> Cc: Roger Luethi <rl@hellgate.ch> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Simon Horman <horms@verge.net.au> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> Cc: Wensong Zhang <wensong@linux-vs.org> Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-07 16:51:58 -06:00
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *ip_vs_dest_stats;
ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
u64_stats_init(&ip_vs_dest_stats->syncp);
}
dest->af = udest->af;
dest->protocol = svc->protocol;
dest->vaddr = svc->addr;
dest->vport = svc->port;
dest->vfwmark = svc->fwmark;
ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
dest->port = udest->port;
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
atomic_set(&dest->persistconns, 0);
refcount_set(&dest->refcnt, 1);
INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
*dest_p = dest;
LeaveFunction(2);
return 0;
err_alloc:
kfree(dest);
return -ENOMEM;
}
/*
* Add a destination into an existing service
*/
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
int ret;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
if (udest->tun_port == 0) {
pr_err("%s(): tunnel port is zero\n", __func__);
return -EINVAL;
}
}
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
return -EEXIST;
}
/*
* Check if the dest already exists in the trash and
* is from the same service
*/
dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
if (dest != NULL) {
IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
"dest->refcnt=%d, service %u/%s:%u\n",
IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
refcount_read(&dest->refcnt),
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
ret = ip_vs_new_dest(svc, udest, &dest);
}
LeaveFunction(2);
return ret;
}
/*
* Edit a destination in the given service
*/
static int
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
union nf_inet_addr daddr;
__be16 dport = udest->port;
EnterFunction(2);
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
}
if (udest->l_threshold > udest->u_threshold) {
pr_err("%s(): lower threshold is higher than upper threshold\n",
__func__);
return -ERANGE;
}
if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
if (udest->tun_port == 0) {
pr_err("%s(): tunnel port is zero\n", __func__);
return -EINVAL;
}
}
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
return -ENOENT;
}
__ip_vs_update_dest(svc, dest, udest, 0);
LeaveFunction(2);
return 0;
}
/*
 *	Retire a destination that has already been unlinked from its service.
 *
 *	@ipvs:    per-netns IPVS state owning the trash list and its timer
 *	@dest:    destination to retire
 *	@cleanup: when true, the trash timer is not armed
 *
 *	The destination is not freed here: its estimator is stopped, it is
 *	unhashed from the real-server table and it is parked on
 *	ipvs->dest_trash with idle_start reset to 0.
 */
static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
			     bool cleanup)
{
	bool arm_timer;

	ip_vs_stop_estimator(ipvs, &dest->stats);

	/* Drop it from the hashed list of real servers */
	ip_vs_rs_unhash(dest);

	spin_lock_bh(&ipvs->dest_trash_lock);
	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
		      refcount_read(&dest->refcnt));

	/* Only the first entry of an empty trash needs to start the timer */
	arm_timer = list_empty(&ipvs->dest_trash) && !cleanup;
	if (arm_timer) {
		unsigned long expires = jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1);

		mod_timer(&ipvs->dest_trash_timer, expires);
	}

	/* The trash keeps the dest (and its reference) from now on */
	list_add(&dest->t_list, &ipvs->dest_trash);
	dest->idle_start = 0;
	spin_unlock_bh(&ipvs->dest_trash_lock);
}
/*
 *	Detach a destination from its service.
 *
 *	@svc:    service the destination is currently linked to
 *	@dest:   destination to detach
 *	@svcupd: when non-zero, the scheduler's del_dest hook (if any) is
 *		 called for the removed destination
 *
 *	Clears IP_VS_DEST_F_AVAILABLE, removes the dest from the service's
 *	RCU destination list and adjusts the service/netns counters.
 */
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
				struct ip_vs_dest *dest,
				int svcupd)
{
	struct ip_vs_scheduler *cur_sched;

	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	/* Take it off the service's list of destinations */
	list_del_rcu(&dest->n_list);
	svc->num_dests--;

	/* A dest whose family differs from the service was counted as mixed */
	if (dest->af != svc->af)
		svc->ipvs->mixed_address_family_dests--;

	if (!svcupd)
		return;

	cur_sched = rcu_dereference_protected(svc->scheduler, 1);
	if (cur_sched && cur_sched->del_dest)
		cur_sched->del_dest(svc, dest);
}
/*
 *	Remove a destination server from the given service.
 *
 *	@svc:   service to remove the destination from
 *	@udest: family, address and port identifying the destination
 *
 *	Returns 0 on success or -ENOENT when the service has no such
 *	destination.
 */
static int
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
	struct ip_vs_dest *victim;
	__be16 port = udest->port;

	EnterFunction(2);

	/* The lookup helper has to run under the RCU read lock */
	rcu_read_lock();
	victim = ip_vs_lookup_dest(svc, udest->af, &udest->addr, port);
	rcu_read_unlock();

	if (!victim) {
		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
		return -ENOENT;
	}

	/* First detach it from the service (with scheduler update) ... */
	__ip_vs_unlink_dest(svc, victim, 1);

	/* ... then hand it over to the trash (not a cleanup run) */
	__ip_vs_del_dest(svc->ipvs, victim, false);

	LeaveFunction(2);

	return 0;
}
static void ip_vs_dest_trash_expire(struct timer_list *t)
{
struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer);
struct ip_vs_dest *dest, *next;
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
unsigned long now = jiffies;
spin_lock(&ipvs->dest_trash_lock);
list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
if (refcount_read(&dest->refcnt) > 1)
continue;
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
if (dest->idle_start) {
if (time_before(now, dest->idle_start +
IP_VS_DEST_TRASH_PERIOD))
continue;
} else {
dest->idle_start = max(1UL, now);
continue;
}
IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
dest->vfwmark,
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port));
list_del(&dest->t_list);
ip_vs_dest_free(dest);
}
if (!list_empty(&ipvs->dest_trash))
mod_timer(&ipvs->dest_trash_timer,
ipvs: make the service replacement more robust commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2013-09-12 02:21:07 -06:00
jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
spin_unlock(&ipvs->dest_trash_lock);
}
/*
* Add a service into the service hash table
*/
static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
net: Explicitly initialize u64_stats_sync structures for lockdep In order to enable lockdep on seqcount/seqlock structures, we must explicitly initialize any locks. The u64_stats_sync structure, uses a seqcount, and thus we need to introduce a u64_stats_init() function and use it to initialize the structure. This unfortunately adds a lot of fairly trivial initialization code to a number of drivers. But the benefit of ensuring correctness makes this worth while. Because these changes are required for lockdep to be enabled, and the changes are quite trivial, I've not yet split this patch out into 30-some separate patches, as I figured it would be better to get the various maintainers thoughts on how to best merge this change along with the seqcount lockdep enablement. Feedback would be appreciated! Signed-off-by: John Stultz <john.stultz@linaro.org> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> Cc: James Morris <jmorris@namei.org> Cc: Jesse Gross <jesse@nicira.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mirko Lindner <mlindner@marvell.com> Cc: Patrick McHardy <kaber@trash.net> Cc: Roger Luethi <rl@hellgate.ch> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Simon Horman <horms@verge.net.au> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> Cc: Wensong Zhang <wensong@linux-vs.org> Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-07 16:51:58 -06:00
int ret = 0, i;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
/* increase the module use count */
ip_vs_use_count_inc();
/* Lookup the scheduler by 'u->sched_name' */
if (strcmp(u->sched_name, "none")) {
sched = ip_vs_scheduler_get(u->sched_name);
if (!sched) {
pr_info("Scheduler module ip_vs_%s not found\n",
u->sched_name);
ret = -ENOENT;
goto out_err;
}
}
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out_err;
}
}
#ifdef CONFIG_IP_VS_IPV6
if (u->af == AF_INET6) {
__u32 plen = (__force __u32) u->netmask;
if (plen < 1 || plen > 128) {
ret = -EINVAL;
goto out_err;
}
ret = nf_defrag_ipv6_enable(ipvs->net);
if (ret)
goto out_err;
}
#endif
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
goto out_err;
}
svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
if (!svc->stats.cpustats) {
ret = -ENOMEM;
goto out_err;
}
net: Explicitly initialize u64_stats_sync structures for lockdep In order to enable lockdep on seqcount/seqlock structures, we must explicitly initialize any locks. The u64_stats_sync structure, uses a seqcount, and thus we need to introduce a u64_stats_init() function and use it to initialize the structure. This unfortunately adds a lot of fairly trivial initialization code to a number of drivers. But the benefit of ensuring correctness makes this worth while. Because these changes are required for lockdep to be enabled, and the changes are quite trivial, I've not yet split this patch out into 30-some separate patches, as I figured it would be better to get the various maintainers thoughts on how to best merge this change along with the seqcount lockdep enablement. Feedback would be appreciated! Signed-off-by: John Stultz <john.stultz@linaro.org> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> Cc: James Morris <jmorris@namei.org> Cc: Jesse Gross <jesse@nicira.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mirko Lindner <mlindner@marvell.com> Cc: Patrick McHardy <kaber@trash.net> Cc: Roger Luethi <rl@hellgate.ch> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Simon Horman <horms@verge.net.au> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> Cc: Wensong Zhang <wensong@linux-vs.org> Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-07 16:51:58 -06:00
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *ip_vs_stats;
ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
u64_stats_init(&ip_vs_stats->syncp);
}
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
svc->protocol = u->protocol;
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
svc->flags = u->flags;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->ipvs = ipvs;
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
ret = ip_vs_bind_scheduler(svc, sched);
if (ret)
goto out_err;
sched = NULL;
}
/* Bind the ct retriever */
RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
ipvs: handle connections started by real-servers When using LVS-NAT and SIP persistence-egine over UDP, the following limitations are present with current implementation: 1) To actually have load-balancing based on Call-ID header, you need to use one-packet-scheduling mode. But with one-packet-scheduling the connection is deleted just after packet is forwarded, so SIP responses coming from real-servers do not match any connection and SNAT is not applied. 2) If you do not use "-o" option, IPVS behaves as normal UDP load balancer, so different SIP calls (each one identified by a different Call-ID) coming from the same ip-address/port go to the same real-server. So basically you don’t have load-balancing based on Call-ID as intended. 3) Call-ID is not learned when a new SIP call is started by a real-server (inside-to-outside direction), but only in the outside-to-inside direction. This would be a general problem for all SIP servers acting as Back2BackUserAgent. This patch aims to solve problems 1) and 3) while keeping OPS mode mandatory for SIP-UDP, so that 2) is not a problem anymore. The basic mechanism implemented is to make packets, that do not match any existent connection but come from real-servers, create new connections instead of let them pass without any effect. When such packets pass through ip_vs_out(), if their source ip address and source port match a configured real-server, a new connection is automatically created in the same way as it would have happened if the packet had come from outside-to-inside direction. A new connection template is created too if the virtual-service is persistent and there is no matching connection template found. The new connection automatically created, if the service had "-o" option, is an OPS connection that lasts only the time to forward the packet, just like it happens on the ingress side. 
The main part of this mechanism is implemented inside a persistent-engine specific callback (at the moment only SIP persistent engine exists) and is triggered only for UDP packets, since connection oriented protocols, by using different set of ports (typically ephemeral ports) to open new outgoing connections, should not need this feature. The following requisites are needed for automatic connection creation; if any is missing the packet simply goes the same way as before. a) virtual-service is not fwmark based (this is because fwmark services do not store address and port of the virtual-service, required to build the connection data). b) virtual-service and real-servers must not have been configured with omitted port (this is again to have all data to create the connection). Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2016-04-05 10:26:29 -06:00
if (svc->pe && svc->pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
ip_vs_start_estimator(ipvs, &svc->stats);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
/* Now there is a service - full throttle */
ipvs->enable = 1;
return 0;
out_err:
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
/* decrease the module use count */
ip_vs_use_count_dec();
return ret;
}
/*
* Edit a service and bind it with a new scheduler
*/
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
ipvs: handle connections started by real-servers When using LVS-NAT and SIP persistence-egine over UDP, the following limitations are present with current implementation: 1) To actually have load-balancing based on Call-ID header, you need to use one-packet-scheduling mode. But with one-packet-scheduling the connection is deleted just after packet is forwarded, so SIP responses coming from real-servers do not match any connection and SNAT is not applied. 2) If you do not use "-o" option, IPVS behaves as normal UDP load balancer, so different SIP calls (each one identified by a different Call-ID) coming from the same ip-address/port go to the same real-server. So basically you don’t have load-balancing based on Call-ID as intended. 3) Call-ID is not learned when a new SIP call is started by a real-server (inside-to-outside direction), but only in the outside-to-inside direction. This would be a general problem for all SIP servers acting as Back2BackUserAgent. This patch aims to solve problems 1) and 3) while keeping OPS mode mandatory for SIP-UDP, so that 2) is not a problem anymore. The basic mechanism implemented is to make packets, that do not match any existent connection but come from real-servers, create new connections instead of let them pass without any effect. When such packets pass through ip_vs_out(), if their source ip address and source port match a configured real-server, a new connection is automatically created in the same way as it would have happened if the packet had come from outside-to-inside direction. A new connection template is created too if the virtual-service is persistent and there is no matching connection template found. The new connection automatically created, if the service had "-o" option, is an OPS connection that lasts only the time to forward the packet, just like it happens on the ingress side. 
The main part of this mechanism is implemented inside a persistent-engine specific callback (at the moment only SIP persistent engine exists) and is triggered only for UDP packets, since connection oriented protocols, by using different set of ports (typically ephemeral ports) to open new outgoing connections, should not need this feature. The following requisites are needed for automatic connection creation; if any is missing the packet simply goes the same way as before. a) virtual-service is not fwmark based (this is because fwmark services do not store address and port of the virtual-service, required to build the connection data). b) virtual-service and real-servers must not have been configured with omitted port (this is again to have all data to create the connection). Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2016-04-05 10:26:29 -06:00
bool new_pe_conn_out, old_pe_conn_out;
/*
* Lookup the scheduler, by 'u->sched_name'
*/
if (strcmp(u->sched_name, "none")) {
sched = ip_vs_scheduler_get(u->sched_name);
if (!sched) {
pr_info("Scheduler module ip_vs_%s not found\n",
u->sched_name);
return -ENOENT;
}
}
old_sched = sched;
if (u->pe_name && *u->pe_name) {
pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
ret = -ENOENT;
goto out;
}
old_pe = pe;
}
#ifdef CONFIG_IP_VS_IPV6
if (u->af == AF_INET6) {
__u32 plen = (__force __u32) u->netmask;
if (plen < 1 || plen > 128) {
ret = -EINVAL;
goto out;
}
}
#endif
old_sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched != old_sched) {
if (old_sched) {
ip_vs_unbind_scheduler(svc, old_sched);
RCU_INIT_POINTER(svc->scheduler, NULL);
/* Wait all svc->sched_data users */
synchronize_rcu();
}
/* Bind the new scheduler */
if (sched) {
ret = ip_vs_bind_scheduler(svc, sched);
if (ret) {
ip_vs_scheduler_put(sched);
goto out;
}
}
}
/*
* Set the flags and timeout value
*/
svc->flags = u->flags | IP_VS_SVC_F_HASHED;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1);
ipvs: handle connections started by real-servers When using LVS-NAT and SIP persistence-egine over UDP, the following limitations are present with current implementation: 1) To actually have load-balancing based on Call-ID header, you need to use one-packet-scheduling mode. But with one-packet-scheduling the connection is deleted just after packet is forwarded, so SIP responses coming from real-servers do not match any connection and SNAT is not applied. 2) If you do not use "-o" option, IPVS behaves as normal UDP load balancer, so different SIP calls (each one identified by a different Call-ID) coming from the same ip-address/port go to the same real-server. So basically you don’t have load-balancing based on Call-ID as intended. 3) Call-ID is not learned when a new SIP call is started by a real-server (inside-to-outside direction), but only in the outside-to-inside direction. This would be a general problem for all SIP servers acting as Back2BackUserAgent. This patch aims to solve problems 1) and 3) while keeping OPS mode mandatory for SIP-UDP, so that 2) is not a problem anymore. The basic mechanism implemented is to make packets, that do not match any existent connection but come from real-servers, create new connections instead of let them pass without any effect. When such packets pass through ip_vs_out(), if their source ip address and source port match a configured real-server, a new connection is automatically created in the same way as it would have happened if the packet had come from outside-to-inside direction. A new connection template is created too if the virtual-service is persistent and there is no matching connection template found. The new connection automatically created, if the service had "-o" option, is an OPS connection that lasts only the time to forward the packet, just like it happens on the ingress side. 
The main part of this mechanism is implemented inside a persistent-engine specific callback (at the moment only SIP persistent engine exists) and is triggered only for UDP packets, since connection oriented protocols, by using different set of ports (typically ephemeral ports) to open new outgoing connections, should not need this feature. The following requisites are needed for automatic connection creation; if any is missing the packet simply goes the same way as before. a) virtual-service is not fwmark based (this is because fwmark services do not store address and port of the virtual-service, required to build the connection data). b) virtual-service and real-servers must not have been configured with omitted port (this is again to have all data to create the connection). Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2016-04-05 10:26:29 -06:00
if (pe != old_pe) {
rcu_assign_pointer(svc->pe, pe);
ipvs: handle connections started by real-servers When using LVS-NAT and SIP persistence-egine over UDP, the following limitations are present with current implementation: 1) To actually have load-balancing based on Call-ID header, you need to use one-packet-scheduling mode. But with one-packet-scheduling the connection is deleted just after packet is forwarded, so SIP responses coming from real-servers do not match any connection and SNAT is not applied. 2) If you do not use "-o" option, IPVS behaves as normal UDP load balancer, so different SIP calls (each one identified by a different Call-ID) coming from the same ip-address/port go to the same real-server. So basically you don’t have load-balancing based on Call-ID as intended. 3) Call-ID is not learned when a new SIP call is started by a real-server (inside-to-outside direction), but only in the outside-to-inside direction. This would be a general problem for all SIP servers acting as Back2BackUserAgent. This patch aims to solve problems 1) and 3) while keeping OPS mode mandatory for SIP-UDP, so that 2) is not a problem anymore. The basic mechanism implemented is to make packets, that do not match any existent connection but come from real-servers, create new connections instead of let them pass without any effect. When such packets pass through ip_vs_out(), if their source ip address and source port match a configured real-server, a new connection is automatically created in the same way as it would have happened if the packet had come from outside-to-inside direction. A new connection template is created too if the virtual-service is persistent and there is no matching connection template found. The new connection automatically created, if the service had "-o" option, is an OPS connection that lasts only the time to forward the packet, just like it happens on the ingress side. 
The main part of this mechanism is implemented inside a persistent-engine specific callback (at the moment only SIP persistent engine exists) and is triggered only for UDP packets, since connection oriented protocols, by using different set of ports (typically ephemeral ports) to open new outgoing connections, should not need this feature. The following requisites are needed for automatic connection creation; if any is missing the packet simply goes the same way as before. a) virtual-service is not fwmark based (this is because fwmark services do not store address and port of the virtual-service, required to build the connection data). b) virtual-service and real-servers must not have been configured with omitted port (this is again to have all data to create the connection). Signed-off-by: Marco Angaroni <marcoangaroni@gmail.com> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
2016-04-05 10:26:29 -06:00
/* check for optional methods in new pe */
new_pe_conn_out = (pe && pe->conn_out) ? true : false;
old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
if (new_pe_conn_out && !old_pe_conn_out)
atomic_inc(&svc->ipvs->conn_out_counter);
if (old_pe_conn_out && !new_pe_conn_out)
atomic_dec(&svc->ipvs->conn_out_counter);
}
out:
ip_vs_scheduler_put(old_sched);
ip_vs_pe_put(old_pe);
return ret;
}
/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 *
 *	@svc:     service to tear down
 *	@cleanup: passed through to __ip_vs_del_dest() for each destination
 */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
	struct ip_vs_pe *old_pe;
	struct netns_ipvs *ipvs = svc->ipvs;

	/* Count only IPv4 services for old get/setsockopt interface */
	if (svc->af == AF_INET)
		ipvs->num_services--;

	ip_vs_stop_estimator(svc->ipvs, &svc->stats);

	/* Unbind scheduler */
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc, old_sched);
	ip_vs_scheduler_put(old_sched);

	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
	/* this service no longer contributes a conn_out capable PE */
	if (old_pe && old_pe->conn_out)
		atomic_dec(&ipvs->conn_out_counter);
	ip_vs_pe_put(old_pe);

	/*
	 *    Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ipvs->ftpsvc_counter);
	else if (svc->port == 0)
		atomic_dec(&ipvs->nullsvc_counter);

	/*
	 *    Free the service if nobody refers to it
	 *    NOTE(review): the 'true' argument presumably requests that the
	 *    free be delayed past an RCU grace period - confirm against
	 *    __ip_vs_svc_put().
	 */
	__ip_vs_svc_put(svc, true);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}
/*
* Unlink a service from list and try to delete it if its refcnt reached 0
*
* @svc: service to remove from the service table
* @cleanup: forwarded unchanged to __ip_vs_del_service()
*/
static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
ip_vs_unregister_conntrack(svc);
/* Hold svc to avoid double release from dest_trash */
atomic_inc(&svc->refcnt);
/*
* Unhash it from the service table
*/
ip_vs_svc_unhash(svc);
/* Releases scheduler, PE and destinations, then puts the service */
__ip_vs_del_service(svc, cleanup);
}
/*
 *	Delete a service from the service list
 *
 *	Returns -EEXIST when called without a service, 0 once the service
 *	has been unlinked.
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	int err = -EEXIST;

	if (svc) {
		ip_vs_unlink_service(svc, false);
		err = 0;
	}
	return err;
}
/*
 *	Flush all the virtual services
 *
 *	Unlinks every service owned by @ipvs, first from the table hashed by
 *	<netns,protocol,addr,port>, then from the table hashed by fwmark.
 *	@cleanup is handed to ip_vs_unlink_service(). Always returns 0.
 */
static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
{
	struct ip_vs_service *entry;
	struct hlist_node *next;
	int bucket;

	/* Services hashed by <netns,protocol,addr,port> */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry_safe(entry, next,
					  &ip_vs_svc_table[bucket], s_list) {
			if (entry->ipvs != ipvs)
				continue;
			ip_vs_unlink_service(entry, cleanup);
		}
	}

	/* Services hashed by fwmark */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry_safe(entry, next,
					  &ip_vs_svc_fwm_table[bucket],
					  f_list) {
			if (entry->ipvs != ipvs)
				continue;
			ip_vs_unlink_service(entry, cleanup);
		}
	}

	return 0;
}
/*
* Delete service by {netns} in the service table.
* Called by __ip_vs_cleanup()
*
* Flushes every service of @ipvs with cleanup == true while holding
* __ip_vs_mutex.
*/
void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
{
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
ip_vs_flush(ipvs, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
/*
 * Put all references for device (dst_cache)
 *
 * If the route cached in @dest goes through @dev, reset that cache.
 * The check and the reset both run under dest->dst_lock.
 */
static inline void
ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
	struct ip_vs_dest_dst *cached;

	spin_lock_bh(&dest->dst_lock);
	cached = rcu_dereference_protected(dest->dest_dst, 1);
	/* nothing cached, or cached route uses another device */
	if (!cached || cached->dst_cache->dev != dev)
		goto unlock;

	IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
		      dev->name,
		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
		      ntohs(dest->port),
		      refcount_read(&dest->refcnt));
	__ip_vs_dst_cache_reset(dest);

unlock:
	spin_unlock_bh(&dest->dst_lock);
}
/* Netdev event receiver
* Currently only NETDEV_DOWN is handled to release refs to cached dsts
*/
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
unsigned int idx;
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (svc->ipvs == ipvs) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
}
}
}
hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (svc->ipvs == ipvs) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
ip_vs_forget_dev(dest, dev);
}
}
}
}
spin_lock_bh(&ipvs->dest_trash_lock);
list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
ip_vs_forget_dev(dest, dev);
}
spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
return NOTIFY_DONE;
}
/*
* Zero counters in a service or all services
*/
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
list_for_each_entry(dest, &svc->destinations, n_list) {
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
return 0;
}
/*
 *	Zero the counters of every service owned by @ipvs (both service
 *	tables) and finally the netns-wide totals. Always returns 0.
 */
static int ip_vs_zero_all(struct netns_ipvs *ipvs)
{
	struct ip_vs_service *entry;
	int bucket;

	/* services hashed by protocol/address/port */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry(entry, &ip_vs_svc_table[bucket], s_list) {
			if (entry->ipvs != ipvs)
				continue;
			ip_vs_zero_service(entry);
		}
	}

	/* services hashed by fwmark */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry(entry, &ip_vs_svc_fwm_table[bucket],
				     f_list) {
			if (entry->ipvs != ipvs)
				continue;
			ip_vs_zero_service(entry);
		}
	}

	ip_vs_zero_stats(&ipvs->tot_stats);
	return 0;
}
#ifdef CONFIG_SYSCTL
static int three = 3;
/*
 *	sysctl handler for the defense mode entries (drop_entry, drop_packet,
 *	secure_tcp). Accepted values are 0..3.
 *
 *	The new value is parsed into a local copy through a temporary
 *	ctl_table, so an out-of-range value is never stored in the live
 *	variable, not even transiently. A rejected write returns -EINVAL
 *	instead of silently reporting success. On an accepted change the
 *	defense level is recomputed for the netns stored in table->extra2.
 *
 *	Returns the result of proc_dointvec(), or -EINVAL for an
 *	out-of-range write.
 */
static int
proc_do_defense_mode(struct ctl_table *table, int write,
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	int rc;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (val < 0 || val > 3) {
			/* leave the current setting untouched */
			rc = -EINVAL;
		} else {
			*valp = val;
			update_defense_level(ipvs);
		}
	}
	return rc;
}
/*
 *	sysctl handler for "sync_threshold", a pair of ints.
 *
 *	A written pair is accepted only if both values are non-negative and,
 *	when the second value is non-zero, the first is smaller than the
 *	second. Parsing happens on a local copy through a temporary
 *	ctl_table, so a rejected pair is never stored in the live variable;
 *	a rejected write returns -EINVAL instead of silently succeeding.
 *
 *	NOTE(review): the accepted pair is copied back with a plain memcpy();
 *	no lock visible in this handler serialises it against readers.
 */
static int
proc_do_sync_threshold(struct ctl_table *table, int write,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val[2];
	int rc;
	struct ctl_table tmp = {
		.data = val,
		.maxlen = table->maxlen,
		.mode = table->mode,
	};

	/* work on a private copy of the current pair */
	memcpy(val, valp, sizeof(val));

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write) {
		if (val[0] < 0 || val[1] < 0 ||
		    (val[0] >= val[1] && val[1]))
			rc = -EINVAL;
		else
			memcpy(valp, val, sizeof(val));
	}
	return rc;
}
/*
 *	sysctl handler for "sync_version". Accepted values are 0 and 1.
 *
 *	The value is parsed into a local copy through a temporary ctl_table
 *	so that an invalid value is never stored in the live variable; a
 *	rejected write returns -EINVAL instead of silently succeeding.
 */
static int
proc_do_sync_mode(struct ctl_table *table, int write,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (val < 0 || val > 1)
			rc = -EINVAL;
		else
			*valp = val;
	}
	return rc;
}
/*
 *	sysctl handler for "sync_ports". The value must be at least 1 and a
 *	power of two.
 *
 *	The value is parsed into a local copy through a temporary ctl_table
 *	so that an invalid value is never stored in the live variable; a
 *	rejected write returns -EINVAL instead of silently succeeding.
 */
static int
proc_do_sync_ports(struct ctl_table *table, int write,
		   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if (val < 1 || !is_power_of_2(val))
			rc = -EINVAL;
		else
			*valp = val;
	}
	return rc;
}
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
* align with netns init in ip_vs_control_net_init()
*/
static struct ctl_table vs_vars[] = {
{
.procname = "amemthresh",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "am_droprate",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "drop_entry",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "drop_packet",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
ipvs: netfilter connection tracking changes Add more code to IPVS to work with Netfilter connection tracking and fix some problems. - Allow IPVS to be compiled without connection tracking as in 2.6.35 and before. This can avoid keeping conntracks for all IPVS connections because this costs memory. ip_vs_ftp still depends on connection tracking and NAT as implemented for 2.6.36. - Add sysctl var "conntrack" to enable connection tracking for all IPVS connections. For loaded IPVS directors it needs tuning of nf_conntrack_max limit. - Add IP_VS_CONN_F_NFCT connection flag to request the connection to use connection tracking. This allows user space to provide this flag, for example, in dest->conn_flags. This can be useful to request connection tracking per real server instead of forcing it for all connections with the "conntrack" sysctl. This flag is set currently only by ip_vs_ftp and of course by "conntrack" sysctl. - Add ip_vs_nfct.c file to hold all connection tracking code, by this way main code should not depend of netfilter conntrack support. - Return back the ip_vs_post_routing handler as in 2.6.35 and use skb->ipvs_property=1 to allow IPVS to work without connection tracking Connection tracking: - most of the code is already in 2.6.36-rc - alter conntrack reply tuple for LVS-NAT connections when first packet from client is forwarded and conntrack state is NEW or RELATED. Additionally, alter reply for RELATED connections from real server, again for packet in original direction. - add IP_VS_XMIT_TUNNEL to confirm conntrack (without altering reply) for LVS-TUN early because we want to call nf_reset. It is needed because we add IPIP header and the original conntrack should be preserved, not destroyed. The transmitted IPIP packets can reuse same conntrack, so we do not set skb->ipvs_property. - try to destroy conntrack when the IPVS connection is destroyed. It is not fatal if conntrack disappears before that, it depends on the used timers. 
Fix problems from long time: - add skb->ip_summed = CHECKSUM_NONE for the LVS-TUN transmitters Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Patrick McHardy <kaber@trash.net>
2010-09-21 09:35:41 -06:00
#ifdef CONFIG_IP_VS_NFCT
{
.procname = "conntrack",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
{
.procname = "secure_tcp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
{
.procname = "snat_reroute",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.procname = "sync_version",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_sync_mode,
},
{
.procname = "sync_ports",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_sync_ports,
},
{
.procname = "sync_persist_mode",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sync_qlen_max",
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "sync_sock_size",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "cache_bypass",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "expire_nodest_conn",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sloppy_tcp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sloppy_sctp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "expire_quiescent_template",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sync_threshold",
.maxlen =
sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
.mode = 0644,
.proc_handler = proc_do_sync_threshold,
},
{
.procname = "sync_refresh_period",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "sync_retries",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
proc/sysctl: add shared variables for range check In the sysctl code the proc_dointvec_minmax() function is often used to validate the user supplied value between an allowed range. This function uses the extra1 and extra2 members from struct ctl_table as minimum and maximum allowed value. On sysctl handler declaration, in every source file there are some readonly variables containing just an integer which address is assigned to the extra1 and extra2 members, so the sysctl range is enforced. The special values 0, 1 and INT_MAX are very often used as range boundary, leading duplication of variables like zero=0, one=1, int_max=INT_MAX in different source files: $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l 248 Add a const int array containing the most commonly used values, some macros to refer more easily to the correct array member, and use them instead of creating a local one for every object file. This is the bloat-o-meter output comparing the old and new binary compiled with the default Fedora config: # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164) Data old new delta sysctl_vals - 12 +12 __kstrtab_sysctl_vals - 12 +12 max 14 10 -4 int_max 16 - -16 one 68 - -68 zero 128 28 -100 Total: Before=20583249, After=20583085, chg -0.00% [mcroce@redhat.com: tipc: remove two unused variables] Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com [akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c] [arnd@arndb.de: proc/sysctl: make firmware loader table conditional] Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de [akpm@linux-foundation.org: fix fs/eventpoll.c] Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com Signed-off-by: Matteo Croce <mcroce@redhat.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Acked-by: Kees Cook <keescook@chromium.org> Reviewed-by: Aaron Tomlin <atomlin@redhat.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Stephen 
Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-07-18 16:58:50 -06:00
.extra1 = SYSCTL_ZERO,
.extra2 = &three,
},
{
.procname = "nat_icmp_send",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "pmtu_disc",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "backup_only",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "conn_reuse_mode",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "schedule_icmp",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ignore_tunneled",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
.data = &sysctl_ip_vs_debug_level,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
{ }
};
#endif
#ifdef CONFIG_PROC_FS
/*
* seq_file iterator state used by the ip_vs_info_seq_* callbacks
* (reached through seq->private).
*/
struct ip_vs_iter {
struct seq_net_private p; /* Do not move this, netns depends upon it*/
struct hlist_head *table; /* service table being walked: ip_vs_svc_table or ip_vs_svc_fwm_table */
int bucket; /* index of the current bucket within 'table' */
};
/*
 *	Write the contents of the VS rule table to a PROCfs file.
 *	(It is kept just for backward compatibility)
 */

/*
 *	Map the forwarding method bits of @flags to a display name.
 *	Any method other than local node, tunnel or direct route is
 *	reported as "Masq".
 */
static inline const char *ip_vs_fwd_name(unsigned int flags)
{
	const unsigned int method = flags & IP_VS_CONN_F_FWD_MASK;

	if (method == IP_VS_CONN_F_LOCALNODE)
		return "Local";
	if (method == IP_VS_CONN_F_TUNNEL)
		return "Tunnel";
	if (method == IP_VS_CONN_F_DROUTE)
		return "Route";
	return "Masq";
}
/*
 * Get the Nth entry in the two lists
 *
 * Counts only services that belong to the netns of @seq, walking the
 * protocol-hashed table first and the fwmark-hashed table second. On a
 * hit the iterator in seq->private records the table and bucket where the
 * service was found. Returns NULL when fewer than @pos + 1 services exist.
 */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
	struct ip_vs_iter *it = seq->private;
	struct ip_vs_service *entry;
	int bucket;

	/* look in hash by protocol */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry_rcu(entry, &ip_vs_svc_table[bucket],
					 s_list) {
			if (entry->ipvs != ipvs)
				continue;
			/* skip matching services until pos is used up */
			if (pos-- != 0)
				continue;
			it->table = ip_vs_svc_table;
			it->bucket = bucket;
			return entry;
		}
	}

	/* keep looking in fwmark */
	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry_rcu(entry, &ip_vs_svc_fwm_table[bucket],
					 f_list) {
			if (entry->ipvs != ipvs)
				continue;
			if (pos-- != 0)
				continue;
			it->table = ip_vs_svc_fwm_table;
			it->bucket = bucket;
			return entry;
		}
	}

	return NULL;
}
/*
 * seq_file ->start: take the RCU read lock and return either the header
 * token (position 0) or the service at index *pos - 1.
 */
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();

	if (*pos == 0)
		return SEQ_START_TOKEN;

	return ip_vs_info_array(seq, *pos - 1);
}
/*
* seq_file ->next: advance *pos and return the entry following @v, or NULL
* at the end. Uses and updates the table/bucket state kept in seq->private.
*/
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct hlist_node *e;
struct ip_vs_iter *iter;
struct ip_vs_service *svc;
++*pos;
/* after the header token comes the first service */
if (v == SEQ_START_TOKEN)
return ip_vs_info_array(seq,0);
svc = v;
iter = seq->private;
if (iter->table == ip_vs_svc_table) {
/* next service in table hashed by protocol */
e = rcu_dereference(hlist_next_rcu(&svc->s_list));
if (e)
return hlist_entry(e, struct ip_vs_service, s_list);
/* chain exhausted: return the first entry of the next non-empty bucket */
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
hlist_for_each_entry_rcu(svc,
&ip_vs_svc_table[iter->bucket],
s_list) {
return svc;
}
}
/* protocol table done: continue with the fwmark table from bucket 0 */
iter->table = ip_vs_svc_fwm_table;
iter->bucket = -1;
goto scan_fwmark;
}
/* next service in hashed by fwmark */
e = rcu_dereference(hlist_next_rcu(&svc->f_list));
if (e)
return hlist_entry(e, struct ip_vs_service, f_list);
scan_fwmark:
/* return the first entry of the next non-empty fwmark bucket */
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
hlist_for_each_entry_rcu(svc,
&ip_vs_svc_fwm_table[iter->bucket],
f_list)
return svc;
}
return NULL;
}
/* seq_file ->stop: release the RCU read lock taken in ip_vs_info_seq_start() */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
rcu_read_unlock();
}
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN) {
seq_printf(seq,
"IP Virtual Server version %d.%d.%d (size=%d)\n",
IPVS: Allow boot time change of hash size I was very frustrated about the fact that I have to recompile the kernel to change the hash size. So, I created this patch. If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to kernel command line, or, if you built IPVS as modules, you can add options ip_vs conn_tab_bits=??. To keep everything backward compatible, you still can select the size at compile time, and that will be used as default. It has been about a year since this patch was originally posted and subsequently dropped on the basis of insufficient test data. Mark Bergsma has provided the following test results which seem to strongly support the need for larger hash table sizes: We do however run into the same problem with the default setting (212 = 4096 entries), as most of our LVS balancers handle around a million connections/SLAB entries at any point in time (around 100-150 kpps load). With only 4096 hash table entries this implies that each entry consists of a linked list of 256 connections *on average*. To provide some statistics, I did an oprofile run on an 2.6.31 kernel, with both the default 4096 table size, and the same kernel recompiled with IP_VS_CONN_TAB_BITS set to 18 (218 = 262144 entries). I built a quick test setup with a part of Wikimedia/Wikipedia's live traffic mirrored by the switch to the test host. 
With the default setting, at ~ 120 kpps packet load we saw a typical %si CPU usage of around 30-35%, and oprofile reported a hot spot in ip_vs_conn_in_get: samples % image name app name symbol name 1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 302577 7.4554 bnx2 bnx2 /bnx2 181984 4.4840 vmlinux vmlinux __ticket_spin_lock 128636 3.1695 vmlinux vmlinux ip_route_input 74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get 68482 1.6874 vmlinux vmlinux mwait_idle After loading the recompiled kernel with 218 entries, %si CPU usage dropped in half to around 12-18%, and oprofile looks much healthier, with only 7% spent in ip_vs_conn_in_get: samples % image name app name symbol name 265641 14.4616 bnx2 bnx2 /bnx2 143251 7.7986 vmlinux vmlinux __ticket_spin_lock 140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 94364 5.1372 vmlinux vmlinux mwait_idle 86267 4.6964 vmlinux vmlinux ip_route_input [ horms@verge.net.au: trivial up-port and minor style fixes ] Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> Cc: Mark Bergsma <mark@wikimedia.org> Signed-off-by: Simon Horman <horms@verge.net.au> Signed-off-by: Patrick McHardy <kaber@trash.net>
2010-01-04 21:50:24 -07:00
NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
seq_puts(seq,
"Prot LocalAddress:Port Scheduler Flags\n");
seq_puts(seq,
" -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
} else {
2017-10-15 05:54:10 -06:00
struct net *net = seq_file_net(seq);
struct netns_ipvs *ipvs = net_ipvs(net);
const struct ip_vs_service *svc = v;
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
char *sched_name = sched ? sched->name : "none";
2017-10-15 05:54:10 -06:00
if (svc->ipvs != ipvs)
return 0;
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
seq_printf(seq, "%s [%pI6]:%04X %s ",
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
sched_name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
sched_name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
svc->fwmark, sched_name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
seq_printf(seq, "persistent %d %08X\n",
svc->timeout,
ntohl(svc->netmask));
else
seq_putc(seq, '\n');
list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
if (dest->af == AF_INET6)
seq_printf(seq,
" -> [%pI6]:%04X"
" %-7s %-6d %-10d %-10d\n",
&dest->addr.in6,
ntohs(dest->port),
ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
atomic_read(&dest->weight),
atomic_read(&dest->activeconns),
atomic_read(&dest->inactconns));
else
#endif
seq_printf(seq,
" -> %08X:%04X "
"%-7s %-6d %-10d %-10d\n",
ntohl(dest->addr.ip),
ntohs(dest->port),
ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
atomic_read(&dest->weight),
atomic_read(&dest->activeconns),
atomic_read(&dest->inactconns));
}
}
return 0;
}
/*
 * seq_file iterator for the service listing.  ->start takes the RCU read
 * lock and ->stop releases it, so ->next and ->show run inside that
 * section.
 */
static const struct seq_operations ip_vs_info_seq_ops = {
.start = ip_vs_info_seq_start,
.next = ip_vs_info_seq_next,
.stop = ip_vs_info_seq_stop,
.show = ip_vs_info_seq_show,
};
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
struct ip_vs_kstats show;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Total Incoming Outgoing Incoming Outgoing\n");
seq_puts(seq,
" Conns Packets Packets Bytes Bytes\n");
ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
(unsigned long long)show.conns,
(unsigned long long)show.inpkts,
(unsigned long long)show.outpkts,
(unsigned long long)show.inbytes,
(unsigned long long)show.outbytes);
/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
(unsigned long long)show.cps,
(unsigned long long)show.inpps,
(unsigned long long)show.outpps,
(unsigned long long)show.inbps,
(unsigned long long)show.outbps);
return 0;
}
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_kstats kstats;
int i;
/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Total Incoming Outgoing Incoming Outgoing\n");
seq_puts(seq,
"CPU Conns Packets Packets Bytes Bytes\n");
for_each_possible_cpu(i) {
struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
unsigned int start;
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
start = u64_stats_fetch_begin_irq(&u->syncp);
conns = u->cnt.conns;
inpkts = u->cnt.inpkts;
outpkts = u->cnt.outpkts;
inbytes = u->cnt.inbytes;
outbytes = u->cnt.outbytes;
} while (u64_stats_fetch_retry_irq(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
(u64)outpkts, (u64)inbytes,
(u64)outbytes);
}
ip_vs_copy_stats(&kstats, tot_stats);
seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n",
(unsigned long long)kstats.conns,
(unsigned long long)kstats.inpkts,
(unsigned long long)kstats.outpkts,
(unsigned long long)kstats.inbytes,
(unsigned long long)kstats.outbytes);
/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
seq_puts(seq,
" Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n",
kstats.cps,
kstats.inpps,
kstats.outpps,
kstats.inbps,
kstats.outbps);
return 0;
}
#endif
/*
 *	Set timeout values for tcp tcpfin udp in the timeout_table.
 *
 *	@u carries the timeouts in seconds.  Each value of a compiled-in
 *	protocol must lie in [0, INT_MAX / HZ] so that the conversion to
 *	jiffies cannot overflow an int; otherwise -EINVAL is returned and
 *	nothing is modified.  A value of 0 leaves the corresponding timeout
 *	unchanged.  Returns 0 on success.
 */
static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
	struct ip_vs_proto_data *pd;
#endif

	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
		  u->tcp_timeout,
		  u->tcp_fin_timeout,
		  u->udp_timeout);

	/* Pass 1: validate all values before any table is touched */
#ifdef CONFIG_IP_VS_PROTO_TCP
	if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ))
		return -EINVAL;
	if (u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ))
		return -EINVAL;
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
		return -EINVAL;
#endif

	/* Pass 2: store the non-zero values, converted to jiffies */
#ifdef CONFIG_IP_VS_PROTO_TCP
	if (u->tcp_timeout) {
		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] =
			u->tcp_timeout * HZ;
	}

	if (u->tcp_fin_timeout) {
		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] =
			u->tcp_fin_timeout * HZ;
	}
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	if (u->udp_timeout) {
		pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
		pd->timeout_table[IP_VS_UDP_S_NORMAL] =
			u->udp_timeout * HZ;
	}
#endif

	return 0;
}
/*
 * Map a sockopt command number to a zero-based index for the per-command
 * argument length tables.  The argument is parenthesized so that passing
 * an expression (for example a conditional) still yields the intended
 * subtraction.
 */
#define CMDID(cmd)	((cmd) - IP_VS_BASE_CTL)
/*
 * Argument of the ADDDEST/DELDEST/EDITDEST set commands (see set_arglen):
 * a service description followed by a destination description.
 * do_ip_vs_set_ctl() expects the destination right after the service and
 * locates it at (usvc_compat + 1).
 */
struct ip_vs_svcdest_user {
struct ip_vs_service_user s;
struct ip_vs_dest_user d;
};
/*
 * Exact argument length required for each set command, indexed by
 * CMDID(cmd).  Commands without an initializer here have length 0.
 * do_ip_vs_set_ctl() rejects a request whose optlen differs from the
 * entry.  Entries are unsigned char, so every size must stay below 256
 * (see the BUILD_BUG_ON in do_ip_vs_set_ctl()).
 */
static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
[CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user),
[CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user),
[CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user),
[CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user),
[CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user),
[CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user),
[CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
[CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
[CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user),
[CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user),
};
/*
 * One member per set command that takes an argument, mirroring the
 * entries of set_arglen[].  The union is never instantiated for its
 * members; its size is the largest argument any set command can carry.
 */
union ip_vs_set_arglen {
struct ip_vs_service_user field_IP_VS_SO_SET_ADD;
struct ip_vs_service_user field_IP_VS_SO_SET_EDIT;
struct ip_vs_service_user field_IP_VS_SO_SET_DEL;
struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST;
struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST;
struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST;
struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT;
struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON;
struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON;
struct ip_vs_service_user field_IP_VS_SO_SET_ZERO;
};
/* Size of the on-stack buffer do_ip_vs_set_ctl() copies the argument into */
#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen)
/*
 * Convert the sockopt service description @usvc_compat into the internal
 * representation @usvc.
 *
 * @usvc is zeroed first, so members not set below stay 0.  The address
 * family is always AF_INET.  sched_name is not duplicated: @usvc keeps a
 * pointer into @usvc_compat, which therefore has to stay valid for as
 * long as @usvc is used.
 */
static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
				   struct ip_vs_service_user *usvc_compat)
{
	memset(usvc, 0, sizeof(*usvc));

	/* the compat structure has no family member: IPv4 only */
	usvc->af		= AF_INET;

	/* service identity */
	usvc->protocol		= usvc_compat->protocol;
	usvc->addr.ip		= usvc_compat->addr;
	usvc->port		= usvc_compat->port;
	usvc->fwmark		= usvc_compat->fwmark;

	/* service settings */
	usvc->flags		= usvc_compat->flags;
	usvc->timeout		= usvc_compat->timeout;
	usvc->netmask		= usvc_compat->netmask;

	/* Deep copy of sched_name is not needed here */
	usvc->sched_name	= usvc_compat->sched_name;
}
/*
 * Convert the sockopt destination description @udest_compat into the
 * internal representation @udest.
 *
 * @udest is zeroed first.  Two members have no counterpart in the compat
 * structure and get fixed values: the address family is AF_INET and the
 * tunnel type is IP_VS_CONN_F_TUNNEL_TYPE_IPIP.
 */
static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
				    struct ip_vs_dest_user *udest_compat)
{
	memset(udest, 0, sizeof(*udest));

	/* fixed values for members the compat structure does not carry */
	udest->af		= AF_INET;
	udest->tun_type		= IP_VS_CONN_F_TUNNEL_TYPE_IPIP;

	/* destination address */
	udest->addr.ip		= udest_compat->addr;
	udest->port		= udest_compat->port;

	/* forwarding and scheduling parameters */
	udest->conn_flags	= udest_compat->conn_flags;
	udest->weight		= udest_compat->weight;
	udest->u_threshold	= udest_compat->u_threshold;
	udest->l_threshold	= udest_compat->l_threshold;
}
/*
 * Handler for the IPVS "set" socket options.
 *
 * @sk:   socket the option was set on; selects the netns
 * @cmd:  command, IP_VS_BASE_CTL .. IP_VS_SO_SET_MAX
 * @user: user space argument
 * @len:  length of @user; must equal set_arglen[CMDID(cmd)]
 *
 * The caller needs CAP_NET_ADMIN in the user namespace owning the netns.
 * The argument is copied to the stack and then dispatched:
 *  - STARTDAEMON/STOPDAEMON are handled without __ip_vs_mutex,
 *  - FLUSH and TIMEOUT are handled under __ip_vs_mutex without a service
 *    lookup,
 *  - every other command converts the compat structures, looks up the
 *    service by <protocol, addr, port> or by fwmark and calls the
 *    matching service/destination helper under __ip_vs_mutex.
 *
 * The module use count is held from after the copy until return.
 *
 * Returns 0 on success or a negative errno:
 *  -EPERM   missing capability
 *  -EINVAL  unknown command, wrong length, unterminated interface or
 *           scheduler name, or a command without an argument that is not
 *           handled here
 *  -EFAULT  argument could not be copied, or the protocol is not
 *           TCP/UDP/SCTP (historic return value, kept for user space)
 *  -ESRCH   service not found for a command other than ADD
 *  -EEXIST  ADD of an existing service
 *  or whatever the invoked helper returns.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_SET_ARGLEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* set_arglen[] stores the lengths as unsigned char */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	if (len != set_arglen[CMDID(cmd)]) {
		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
			  len, set_arglen[CMDID(cmd)]);
		return -EINVAL;
	}

	/* len <= sizeof(arg) is guaranteed by the table check above */
	if (copy_from_user(arg, user, len) != 0)
		return -EFAULT;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
			struct ipvs_sync_daemon_cfg cfg;

			memset(&cfg, 0, sizeof(cfg));
			/* reject an empty or unterminated interface name */
			ret = -EINVAL;
			if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
				    sizeof(cfg.mcast_ifn)) <= 0)
				goto out_dec;
			cfg.syncid = dm->syncid;
			ret = start_sync_thread(ipvs, &cfg, dm->state);
		} else {
			ret = stop_sync_thread(ipvs, dm->state);
		}
		goto out_dec;
	}

	mutex_lock(&__ip_vs_mutex);
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(ipvs, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (!len) {
		/* All commands below parse a service out of arg[].  A
		 * command without a set_arglen[] entry passes the length
		 * check with len == 0 and would leave arg[] uninitialized,
		 * so refuse it here.
		 */
		ret = -EINVAL;
		goto out_unlock;
	}

	usvc_compat = (struct ip_vs_service_user *)arg;
	/* only meaningful for the *DEST commands, which copy both parts */
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(ipvs);
			goto out_unlock;
		}
	}

	/* sched_name points into arg[]; it must be NUL-terminated */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port));
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* everything except ADD needs an existing, matching service */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		ret = -EINVAL;
	}

  out_unlock:
	mutex_unlock(&__ip_vs_mutex);
  out_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
/*
 * Fill the sockopt service entry @dst from the kernel service @src.
 *
 * Only the IPv4 address member is exported.  The scheduler name is "none"
 * when no scheduler is attached.  The timeout is converted from jiffies
 * to seconds.  Statistics are snapshotted with ip_vs_copy_stats() and
 * converted with ip_vs_export_stats_user().
 *
 * The scheduler pointer is read with rcu_dereference_protected(..., 1),
 * i.e. without a lockdep condition.
 * NOTE(review): this presumably relies on the caller holding
 * __ip_vs_mutex; confirm against the callers.
 */
static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
	struct ip_vs_scheduler *sched;
	struct ip_vs_kstats kstats;
	char *sched_name;

	sched = rcu_dereference_protected(src->scheduler, 1);
	sched_name = sched ? sched->name : "none";
	dst->protocol = src->protocol;
	dst->addr = src->addr.ip;
	dst->port = src->port;
	dst->fwmark = src->fwmark;
	/* strscpy() as elsewhere in this file; always NUL-terminates and a
	 * too long name is truncated, so the return value is not needed
	 */
	strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
	dst->flags = src->flags;
	dst->timeout = src->timeout / HZ;
	dst->netmask = src->netmask;
	dst->num_dests = src->num_dests;
	ip_vs_copy_stats(&kstats, &src->stats);
	ip_vs_export_stats_user(&dst->stats, &kstats);
}
/*
 * Export @svc into slot @slot of the user space entry table.
 * Returns 0 on success or -EFAULT when the copy to user space fails.
 */
static int ip_vs_export_service_entry(struct ip_vs_service *svc,
				      struct ip_vs_get_services __user *uptr,
				      int slot)
{
	struct ip_vs_service_entry entry;

	/* zero the whole entry so no stack bytes reach user space */
	memset(&entry, 0, sizeof(entry));
	ip_vs_copy_service(&entry, svc);
	if (copy_to_user(&uptr->entrytable[slot], &entry, sizeof(entry)))
		return -EFAULT;

	return 0;
}

/*
 * Copy up to get->num_services service entries of @ipvs to the entry
 * table behind @uptr.
 *
 * Services hashed by protocol are exported first, then those hashed by
 * fwmark.  Only AF_INET services owned by @ipvs are exported.  Reaching
 * the requested number of entries ends the walk successfully.
 *
 * Returns 0 on success or -EFAULT when a copy to user space fails.
 *
 * The plain (non-RCU) list walker is used.
 * NOTE(review): presumably the caller holds __ip_vs_mutex; confirm.
 */
static inline int
__ip_vs_get_service_entries(struct netns_ipvs *ipvs,
			    const struct ip_vs_get_services *get,
			    struct ip_vs_get_services __user *uptr)
{
	struct ip_vs_service *svc;
	int bucket;
	int count = 0;
	int err;

	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
			/* Only expose IPv4 entries to old interface */
			if (svc->af != AF_INET || (svc->ipvs != ipvs))
				continue;

			if (count >= get->num_services)
				return 0;

			err = ip_vs_export_service_entry(svc, uptr, count);
			if (err)
				return err;
			count++;
		}
	}

	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
			/* Only expose IPv4 entries to old interface */
			if (svc->af != AF_INET || (svc->ipvs != ipvs))
				continue;

			if (count >= get->num_services)
				return 0;

			err = ip_vs_export_service_entry(svc, uptr, count);
			if (err)
				return err;
			count++;
		}
	}

	return 0;
}
/*
 * Copy up to get->num_dests destination entries of one service to the
 * entry table behind @uptr.
 *
 * The service is looked up as AF_INET, by fwmark when get->fwmark is set
 * and by <protocol, addr, port> otherwise.  Destinations whose address
 * family differs from the service's are skipped and do not use up a slot.
 *
 * Returns 0 on success, -ESRCH when the service does not exist and
 * -EFAULT when a copy to user space fails.
 *
 * The service is used after rcu_read_unlock() and its destination list is
 * walked with the plain list iterator.
 * NOTE(review): presumably the caller holds __ip_vs_mutex; confirm.
 */
static inline int
__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
			 struct ip_vs_get_dests __user *uptr)
{
	union nf_inet_addr addr = { .ip = get->addr };
	struct ip_vs_dest_entry entry;
	struct ip_vs_kstats kstats;
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	int count = 0;

	rcu_read_lock();
	svc = get->fwmark ?
	      __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark) :
	      __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
				   get->port);
	rcu_read_unlock();

	if (!svc)
		return -ESRCH;

	/* zeroed once; every member set below is rewritten per entry */
	memset(&entry, 0, sizeof(entry));

	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (count >= get->num_dests)
			break;

		/* Cannot expose heterogeneous members via sockopt
		 * interface
		 */
		if (dest->af != svc->af)
			continue;

		entry.addr = dest->addr.ip;
		entry.port = dest->port;
		entry.conn_flags = atomic_read(&dest->conn_flags);
		entry.weight = atomic_read(&dest->weight);
		entry.u_threshold = dest->u_threshold;
		entry.l_threshold = dest->l_threshold;
		entry.activeconns = atomic_read(&dest->activeconns);
		entry.inactconns = atomic_read(&dest->inactconns);
		entry.persistconns = atomic_read(&dest->persistconns);

		ip_vs_copy_stats(&kstats, &dest->stats);
		ip_vs_export_stats_user(&entry.stats, &kstats);

		if (copy_to_user(&uptr->entrytable[count],
				 &entry, sizeof(entry)))
			return -EFAULT;
		count++;
	}

	return 0;
}
/*
 * Report the current TCP/UDP timeouts of @ipvs in seconds.
 *
 * @u is zeroed first, so the fields of a protocol that is not compiled in
 * stay 0.  The per-protocol timeout tables are divided by HZ to produce
 * the values stored in @u.
 */
static inline void
__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
	struct ip_vs_proto_data *pdata;
#endif

	memset(u, 0, sizeof(*u));

#ifdef CONFIG_IP_VS_PROTO_TCP
	pdata = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
	u->tcp_timeout = pdata->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
	u->tcp_fin_timeout = pdata->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	pdata = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
	u->udp_timeout = pdata->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
/*
 * Per-command argument length for the IP_VS_SO_GET_* sockopts, indexed by
 * CMDID(cmd).  do_ip_vs_get_ctl() copies exactly this many bytes from user
 * space and rejects requests whose user length is smaller.  Values are
 * stored in an unsigned char, so every entry must stay <= 255.
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
[CMDID(IP_VS_SO_GET_VERSION)] = 64,
[CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo),
[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
[CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry),
[CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests),
[CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
[CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user),
};
/*
 * Never instantiated for its fields: one member per IP_VS_SO_GET_* argument
 * type, mirroring get_arglen[], so that sizeof() of the union is large
 * enough to hold any of them.
 */
union ip_vs_get_arglen {
char field_IP_VS_SO_GET_VERSION[64];
struct ip_vs_getinfo field_IP_VS_SO_GET_INFO;
struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES;
struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE;
struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS;
struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT;
struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2];
};
/* Size of the on-stack argument buffer used by do_ip_vs_get_ctl() */
#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen)
/*
 * getsockopt() handler of the legacy IPVS sockopt interface.
 *
 * @sk:   socket the request was issued on; selects the network namespace
 * @cmd:  one of the IP_VS_SO_GET_* commands
 * @user: user buffer that carries the request and receives the reply
 * @len:  size of @user as passed by the caller; only IP_VS_SO_GET_VERSION
 *        writes it back (length of the version string including the NUL)
 *
 * The caller needs CAP_NET_ADMIN in the user namespace that owns the
 * socket's netns.  The fixed-size head of the request (get_arglen[] bytes)
 * is copied onto the stack before the command is dispatched.
 * IP_VS_SO_GET_DAEMON runs under ipvs->sync_mutex, all other commands run
 * under __ip_vs_mutex.
 *
 * Returns 0 on success or -EPERM, -EINVAL, -EFAULT or -ESRCH.
 */
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	unsigned char arg[MAX_GET_ARGLEN];
	int ret = 0;
	unsigned int copylen;
	struct net *net = sock_net(sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	BUG_ON(!net);
	/* get_arglen[] keeps its lengths in an unsigned char */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
		return -EINVAL;

	copylen = get_arglen[CMDID(cmd)];
	if (*len < (int) copylen) {
		IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, copylen) != 0)
		return -EFAULT;
	/*
	 * Handle daemons first since it has its own locking
	 */
	if (cmd == IP_VS_SO_GET_DAEMON) {
		struct ip_vs_daemon_user d[2];

		/* d[0] describes the master daemon, d[1] the backup daemon */
		memset(&d, 0, sizeof(d));
		mutex_lock(&ipvs->sync_mutex);
		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
			d[0].state = IP_VS_STATE_MASTER;
			strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
				sizeof(d[0].mcast_ifn));
			d[0].syncid = ipvs->mcfg.syncid;
		}
		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
			d[1].state = IP_VS_STATE_BACKUP;
			strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
				sizeof(d[1].mcast_ifn));
			d[1].syncid = ipvs->bcfg.syncid;
		}
		if (copy_to_user(user, &d, sizeof(d)) != 0)
			ret = -EFAULT;
		mutex_unlock(&ipvs->sync_mutex);
		return ret;
	}

	mutex_lock(&__ip_vs_mutex);
	switch (cmd) {
	case IP_VS_SO_GET_VERSION:
	{
		char buf[64];

		/*
		 * "size" is ip_vs_conn_tab_size: the connection hash size is
		 * a run-time value (conn_tab_bits), not only a compile-time
		 * constant.  Bounded formatting keeps the write inside buf.
		 */
		snprintf(buf, sizeof(buf),
			 "IP Virtual Server version %d.%d.%d (size=%d)",
			 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
			ret = -EFAULT;
			goto out;
		}
		*len = strlen(buf)+1;
	}
	break;

	case IP_VS_SO_GET_INFO:
	{
		struct ip_vs_getinfo info;

		info.version = IP_VS_VERSION_CODE;
		info.size = ip_vs_conn_tab_size;
		info.num_services = ipvs->num_services;
		if (copy_to_user(user, &info, sizeof(info)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_SERVICES:
	{
		struct ip_vs_get_services *get;
		size_t size;

		get = (struct ip_vs_get_services *)arg;
		/*
		 * num_services comes from user space.  Keep the result of
		 * struct_size() in a size_t: truncating it to int could make
		 * a huge count alias a small, matching length.  *len is
		 * known to be >= copylen here, so the cast is safe.
		 */
		size = struct_size(get, entrytable, get->num_services);
		if ((size_t)*len != size) {
			pr_err("length: %d != %zu\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_service_entries(ipvs, get, user);
	}
	break;

	case IP_VS_SO_GET_SERVICE:
	{
		struct ip_vs_service_entry *entry;
		struct ip_vs_service *svc;
		union nf_inet_addr addr;

		entry = (struct ip_vs_service_entry *)arg;
		addr.ip = entry->addr;
		rcu_read_lock();
		if (entry->fwmark)
			svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
		else
			svc = __ip_vs_service_find(ipvs, AF_INET,
						   entry->protocol, &addr,
						   entry->port);
		rcu_read_unlock();
		if (svc) {
			/* The request buffer is reused for the reply */
			ip_vs_copy_service(entry, svc);
			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
				ret = -EFAULT;
		} else
			ret = -ESRCH;
	}
	break;

	case IP_VS_SO_GET_DESTS:
	{
		struct ip_vs_get_dests *get;
		size_t size;

		get = (struct ip_vs_get_dests *)arg;
		/* Same reasoning as for IP_VS_SO_GET_SERVICES: no int truncation */
		size = struct_size(get, entrytable, get->num_dests);
		if ((size_t)*len != size) {
			pr_err("length: %d != %zu\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_dest_entries(ipvs, get, user);
	}
	break;

	case IP_VS_SO_GET_TIMEOUT:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
		if (copy_to_user(user, &t, sizeof(t)) != 0)
			ret = -EFAULT;
	}
	break;

	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&__ip_vs_mutex);
	return ret;
}
static struct nf_sockopt_ops ip_vs_sockopts = {
.pf = PF_INET,
.set_optmin = IP_VS_BASE_CTL,
.set_optmax = IP_VS_SO_SET_MAX+1,
.set = do_ip_vs_set_ctl,
.get_optmin = IP_VS_BASE_CTL,
.get_optmax = IP_VS_SO_GET_MAX+1,
.get = do_ip_vs_get_ctl,
[NETFILTER]: Fix/improve deadlock condition on module removal netfilter So I've had a deadlock reported to me. I've found that the sequence of events goes like this: 1) process A (modprobe) runs to remove ip_tables.ko 2) process B (iptables-restore) runs and calls setsockopt on a netfilter socket, increasing the ip_tables socket_ops use count 3) process A acquires a file lock on the file ip_tables.ko, calls remove_module in the kernel, which in turn executes the ip_tables module cleanup routine, which calls nf_unregister_sockopt 4) nf_unregister_sockopt, seeing that the use count is non-zero, puts the calling process into uninterruptible sleep, expecting the process using the socket option code to wake it up when it exits the kernel 4) the user of the socket option code (process B) in do_ipt_get_ctl, calls ipt_find_table_lock, which in this case calls request_module to load ip_tables_nat.ko 5) request_module forks a copy of modprobe (process C) to load the module and blocks until modprobe exits. 6) Process C. forked by request_module process the dependencies of ip_tables_nat.ko, of which ip_tables.ko is one. 7) Process C attempts to lock the request module and all its dependencies, it blocks when it attempts to lock ip_tables.ko (which was previously locked in step 3) Theres not really any great permanent solution to this that I can see, but I've developed a two part solution that corrects the problem Part 1) Modifies the nf_sockopt registration code so that, instead of using a use counter internal to the nf_sockopt_ops structure, we instead use a pointer to the registering modules owner to do module reference counting when nf_sockopt calls a modules set/get routine. This prevents the deadlock by preventing set 4 from happening. Part 2) Enhances the modprobe utilty so that by default it preforms non-blocking remove operations (the same way rmmod does), and add an option to explicity request blocking operation. 
So if you select blocking operation in modprobe you can still cause the above deadlock, but only if you explicity try (and since root can do any old stupid thing it would like.... :) ). Signed-off-by: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: Patrick McHardy <kaber@trash.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-09-11 03:28:26 -06:00
.owner = THIS_MODULE,
};
/*
 * Generic Netlink interface
 */
/*
 * IPVS genetlink family.  Declared here without an initializer so that the
 * message-building code can pass its address to genlmsg_put().
 */
static struct genl_family ip_vs_genl_family;
/*
 * Policy used for first-level command attributes: SERVICE, DEST and DAEMON
 * are nested containers, the three timeout attributes are plain u32 values.
 */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
[IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
[IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
[IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
[IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
[IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
[IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
[IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
ipvs: fix buffer overflow with sync daemon and service syzkaller reports for buffer overflow for interface name when starting sync daemons [1] What we do is that we copy user structure into larger stack buffer but later we search NUL past the stack buffer. The same happens for sched_name when adding/editing virtual server. We are restricted by IP_VS_SCHEDNAME_MAXLEN and IP_VS_IFNAME_MAXLEN being used as size in include/uapi/linux/ip_vs.h, so they include the space for NUL. As using strlcpy is wrong for unsafe source, replace it with strscpy and add checks to return EINVAL if source string is not NUL-terminated. The incomplete strlcpy fix comes from 2.6.13. For the netlink interface reduce the len parameter for IPVS_DAEMON_ATTR_MCAST_IFN and IPVS_SVC_ATTR_SCHED_NAME, so that we get proper EINVAL. [1] kernel BUG at lib/string.c:1052! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 373 Comm: syz-executor936 Not tainted 4.17.0-rc4+ #45 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:fortify_panic+0x13/0x20 lib/string.c:1051 RSP: 0018:ffff8801c976f800 EFLAGS: 00010282 RAX: 0000000000000022 RBX: 0000000000000040 RCX: 0000000000000000 RDX: 0000000000000022 RSI: ffffffff8160f6f1 RDI: ffffed00392edef6 RBP: ffff8801c976f800 R08: ffff8801cf4c62c0 R09: ffffed003b5e4fb0 R10: ffffed003b5e4fb0 R11: ffff8801daf27d87 R12: ffff8801c976fa20 R13: ffff8801c976fae4 R14: ffff8801c976fae0 R15: 000000000000048b FS: 00007fd99f75e700(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200001c0 CR3: 00000001d6843000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: strlen include/linux/string.h:270 [inline] strlcpy include/linux/string.h:293 [inline] do_ip_vs_set_ctl+0x31c/0x1d00 
net/netfilter/ipvs/ip_vs_ctl.c:2388 nf_sockopt net/netfilter/nf_sockopt.c:106 [inline] nf_setsockopt+0x7d/0xd0 net/netfilter/nf_sockopt.c:115 ip_setsockopt+0xd8/0xf0 net/ipv4/ip_sockglue.c:1253 udp_setsockopt+0x62/0xa0 net/ipv4/udp.c:2487 ipv6_setsockopt+0x149/0x170 net/ipv6/ipv6_sockglue.c:917 tcp_setsockopt+0x93/0xe0 net/ipv4/tcp.c:3057 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3046 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903 __do_sys_setsockopt net/socket.c:1914 [inline] __se_sys_setsockopt net/socket.c:1911 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x447369 RSP: 002b:00007fd99f75dda8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00000000006e39e4 RCX: 0000000000447369 RDX: 000000000000048b RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000018 R09: 0000000000000000 R10: 00000000200001c0 R11: 0000000000000246 R12: 00000000006e39e0 R13: 75a1ff93f0896195 R14: 6f745f3168746576 R15: 0000000000000001 Code: 08 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b 48 89 df e8 d2 8f 48 fa eb de 55 48 89 fe 48 c7 c7 60 65 64 88 48 89 e5 e8 91 dd f3 f9 <0f> 0b 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 41 57 41 56 RIP: fortify_panic+0x13/0x20 lib/string.c:1051 RSP: ffff8801c976f800 Reported-and-tested-by: syzbot+aac887f77319868646df@syzkaller.appspotmail.com Fixes: e4ff67513096 ("ipvs: add sync_maxlen parameter for the sync daemon") Fixes: 4da62fc70d7c ("[IPVS]: Fix for overflows") Signed-off-by: Julian Anastasov <ja@ssi.bg> Acked-by: Simon Horman <horms+renesas@verge.net.au> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-05-19 09:22:35 -06:00
.len = IP_VS_IFNAME_MAXLEN - 1 },
[IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
[IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 },
[IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 },
[IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) },
[IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 },
[IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 },
};
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
[IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
[IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
[IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
.len = sizeof(union nf_inet_addr) },
[IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
[IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
ipvs: fix buffer overflow with sync daemon and service syzkaller reports for buffer overflow for interface name when starting sync daemons [1] What we do is that we copy user structure into larger stack buffer but later we search NUL past the stack buffer. The same happens for sched_name when adding/editing virtual server. We are restricted by IP_VS_SCHEDNAME_MAXLEN and IP_VS_IFNAME_MAXLEN being used as size in include/uapi/linux/ip_vs.h, so they include the space for NUL. As using strlcpy is wrong for unsafe source, replace it with strscpy and add checks to return EINVAL if source string is not NUL-terminated. The incomplete strlcpy fix comes from 2.6.13. For the netlink interface reduce the len parameter for IPVS_DAEMON_ATTR_MCAST_IFN and IPVS_SVC_ATTR_SCHED_NAME, so that we get proper EINVAL. [1] kernel BUG at lib/string.c:1052! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 373 Comm: syz-executor936 Not tainted 4.17.0-rc4+ #45 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:fortify_panic+0x13/0x20 lib/string.c:1051 RSP: 0018:ffff8801c976f800 EFLAGS: 00010282 RAX: 0000000000000022 RBX: 0000000000000040 RCX: 0000000000000000 RDX: 0000000000000022 RSI: ffffffff8160f6f1 RDI: ffffed00392edef6 RBP: ffff8801c976f800 R08: ffff8801cf4c62c0 R09: ffffed003b5e4fb0 R10: ffffed003b5e4fb0 R11: ffff8801daf27d87 R12: ffff8801c976fa20 R13: ffff8801c976fae4 R14: ffff8801c976fae0 R15: 000000000000048b FS: 00007fd99f75e700(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200001c0 CR3: 00000001d6843000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: strlen include/linux/string.h:270 [inline] strlcpy include/linux/string.h:293 [inline] do_ip_vs_set_ctl+0x31c/0x1d00 
net/netfilter/ipvs/ip_vs_ctl.c:2388 nf_sockopt net/netfilter/nf_sockopt.c:106 [inline] nf_setsockopt+0x7d/0xd0 net/netfilter/nf_sockopt.c:115 ip_setsockopt+0xd8/0xf0 net/ipv4/ip_sockglue.c:1253 udp_setsockopt+0x62/0xa0 net/ipv4/udp.c:2487 ipv6_setsockopt+0x149/0x170 net/ipv6/ipv6_sockglue.c:917 tcp_setsockopt+0x93/0xe0 net/ipv4/tcp.c:3057 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3046 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903 __do_sys_setsockopt net/socket.c:1914 [inline] __se_sys_setsockopt net/socket.c:1911 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x447369 RSP: 002b:00007fd99f75dda8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00000000006e39e4 RCX: 0000000000447369 RDX: 000000000000048b RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000018 R09: 0000000000000000 R10: 00000000200001c0 R11: 0000000000000246 R12: 00000000006e39e0 R13: 75a1ff93f0896195 R14: 6f745f3168746576 R15: 0000000000000001 Code: 08 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b 48 89 df e8 d2 8f 48 fa eb de 55 48 89 fe 48 c7 c7 60 65 64 88 48 89 e5 e8 91 dd f3 f9 <0f> 0b 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 41 57 41 56 RIP: fortify_panic+0x13/0x20 lib/string.c:1051 RSP: ffff8801c976f800 Reported-and-tested-by: syzbot+aac887f77319868646df@syzkaller.appspotmail.com Fixes: e4ff67513096 ("ipvs: add sync_maxlen parameter for the sync daemon") Fixes: 4da62fc70d7c ("[IPVS]: Fix for overflows") Signed-off-by: Julian Anastasov <ja@ssi.bg> Acked-by: Simon Horman <horms+renesas@verge.net.au> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-05-19 09:22:35 -06:00
.len = IP_VS_SCHEDNAME_MAXLEN - 1 },
[IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
.len = IP_VS_PENAME_MAXLEN },
[IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
.len = sizeof(struct ip_vs_flags) },
[IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
};
/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST.
 * The address is a binary blob of at most sizeof(union nf_inet_addr)
 * bytes, STATS is a nested container, everything else is a fixed-width
 * integer.
 */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
.len = sizeof(union nf_inet_addr) },
[IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
[IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
[IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
[IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
[IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },
};
/*
 * Append a nested attribute of type @container_type that carries the
 * legacy statistics layout: every counter and rate is cast to u32, only
 * the two byte counters are emitted as 64-bit values.
 *
 * Returns 0 on success.  On -EMSGSIZE the partially built nest has been
 * cancelled.
 */
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
				 struct ip_vs_kstats *kstats)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, container_type);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) {
		/* Out of room: drop whatever was added to the nest */
		nla_nest_cancel(skb, nest);
		return -EMSGSIZE;
	}

	nla_nest_end(skb, nest);
	return 0;
}
/*
 * Append a nested attribute of type @container_type in which every
 * counter and rate from @kstats is emitted as a 64-bit value, using
 * IPVS_STATS_ATTR_PAD as the padding attribute.
 *
 * Returns 0 on success.  On -EMSGSIZE the partially built nest has been
 * cancelled.
 */
static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
				   struct ip_vs_kstats *kstats)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, container_type);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
			      IPVS_STATS_ATTR_PAD)) {
		/* Out of room: drop whatever was added to the nest */
		nla_nest_cancel(skb, nest);
		return -EMSGSIZE;
	}

	nla_nest_end(skb, nest);
	return 0;
}
/*
 * Append a nested IPVS_CMD_ATTR_SERVICE attribute describing @svc.
 *
 * A fwmark service is identified by IPVS_SVC_ATTR_FWMARK alone; any other
 * service by protocol, address and port.  Scheduler name ("none" when no
 * scheduler is bound), optional persistence-engine name, flags, timeout in
 * seconds, netmask and both statistics layouts follow.
 *
 * Returns 0 on success.  On -EMSGSIZE the partially built nest has been
 * cancelled.
 */
static int ip_vs_genl_fill_service(struct sk_buff *skb,
				   struct ip_vs_service *svc)
{
	struct ip_vs_flags flags = { .flags = svc->flags,
				     .mask = ~0 };
	struct ip_vs_kstats kstats;
	struct ip_vs_scheduler *sched;
	struct ip_vs_pe *pe;
	struct nlattr *nest;
	char *sched_name;

	nest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
		goto cancel;

	/* Identifying fields: either the fwmark or the <proto, addr, port> triple */
	if (svc->fwmark) {
		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
			goto cancel;
	} else if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
		   nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
		   nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) {
		goto cancel;
	}

	/*
	 * The lockdep condition is hard-wired to 1; the dump path in this
	 * file calls in with __ip_vs_mutex held.
	 */
	sched = rcu_dereference_protected(svc->scheduler, 1);
	pe = rcu_dereference_protected(svc->pe, 1);
	if (sched)
		sched_name = sched->name;
	else
		sched_name = "none";

	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
		goto cancel;

	/* One snapshot of the counters feeds both statistics layouts */
	ip_vs_copy_stats(&kstats, &svc->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
		goto cancel;
	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
		goto cancel;

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
/*
 * Emit one IPVS_CMD_NEW_SERVICE message for @svc as part of a multi-part
 * dump.  Port id and sequence number are taken from the request described
 * by @cb.
 *
 * Returns 0 on success or -EMSGSIZE when @skb has no room left; in that
 * case the partially built message is cancelled.
 */
static int ip_vs_genl_dump_service(struct sk_buff *skb,
				   struct ip_vs_service *svc,
				   struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_SERVICE);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_service(skb, svc) < 0)
		goto nla_put_failure;

	/* genlmsg_end() returns void, so there is no result to check */
	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}
/*
 * Netlink dump callback that lists the services of the socket's netns.
 *
 * cb->args[0] holds the number of table entries already handled by
 * earlier invocations.  Both service tables are walked under
 * __ip_vs_mutex; entries up to that position and entries of other
 * namespaces are skipped.  When a message no longer fits, the position
 * is stepped back by one so the same service is retried on the next
 * call.
 *
 * Always returns skb->len.
 */
static int ip_vs_genl_dump_services(struct sk_buff *skb,
				    struct netlink_callback *cb)
{
	int resume_at = cb->args[0];
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_service *svc;
	int seen = 0;
	int bucket;

	mutex_lock(&__ip_vs_mutex);

	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
			/* The position advances for every entry, matching or not */
			if (++seen <= resume_at || svc->ipvs != ipvs)
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				seen--;
				goto done;
			}
		}
	}

	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
			if (++seen <= resume_at || svc->ipvs != ipvs)
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				seen--;
				goto done;
			}
		}
	}

done:
	mutex_unlock(&__ip_vs_mutex);
	cb->args[0] = seen;

	return skb->len;
}
/*
 * Tell whether @af is an address family this build can handle: AF_INET
 * always, AF_INET6 only when CONFIG_IP_VS_IPV6 is set and
 * ipv6_mod_enabled() reports true.
 */
static bool ip_vs_is_af_valid(int af)
{
	switch (af) {
	case AF_INET:
		return true;
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		return ipv6_mod_enabled();
#endif
	default:
		return false;
	}
}
/*
 * Parse a nested service attribute into @usvc and look the service up.
 *
 * @ipvs:       namespace state used for the service lookup
 * @usvc:       output; zeroed and then filled from the attributes
 * @nla:        nested attribute holding IPVS_SVC_ATTR_* entries (callers in
 *              this file pass the IPVS_CMD_ATTR_SERVICE attribute)
 * @full_entry: when true, scheduler name, flags, timeout and netmask are
 *              required as well and are copied into @usvc
 * @ret_svc:    output; receives the lookup result, which is NULL when no
 *              matching service exists
 *
 * The address family is mandatory.  In addition either a firewall mark or
 * the triple protocol/address/port must be present.
 *
 * Returns 0 on success, -EINVAL when @nla is NULL, fails to parse or lacks
 * a required attribute, and -EAFNOSUPPORT when ip_vs_is_af_valid() rejects
 * the address family.  *@ret_svc is only written once the identifying
 * fields have been accepted; it is left untouched on the earlier error
 * returns.
 */
static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
				    struct ip_vs_service_user_kern *usvc,
				    struct nlattr *nla, bool full_entry,
				    struct ip_vs_service **ret_svc)
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
	struct ip_vs_service *svc;

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

	memset(usvc, 0, sizeof(*usvc));

	usvc->af = nla_get_u16(nla_af);
	if (!ip_vs_is_af_valid(usvc->af))
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		/* fwmark services carry a fixed protocol value */
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
		usvc->port = nla_get_be16(nla_port);
		usvc->fwmark = 0;
	}

	/* A non-zero fwmark selects the fwmark lookup, else the tuple lookup */
	rcu_read_lock();
	if (usvc->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
					   &usvc->addr, usvc->port);
	rcu_read_unlock();
	*ret_svc = svc;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		/* the persistence engine name (nla_pe) is optional */
		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
		if (svc)
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
		/* both names point into the attribute payload, not copies */
		usvc->sched_name = nla_data(nla_sched);
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
		usvc->timeout = nla_get_u32(nla_timeout);
		usvc->netmask = nla_get_be32(nla_netmask);
	}

	return 0;
}
static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
struct ip_vs_service *svc;
int ret;
ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
return ret ? ERR_PTR(ret) : svc;
}
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
struct nlattr *nl_dest;
struct ip_vs_kstats kstats;
nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
if (!nl_dest)
return -EMSGSIZE;
if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
(atomic_read(&dest->conn_flags) &
IP_VS_CONN_F_FWD_MASK)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
atomic_read(&dest->weight)) ||
nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
dest->tun_type) ||
nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
dest->tun_port) ||
nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
dest->tun_flags) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
atomic_read(&dest->activeconns)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
atomic_read(&dest->inactconns)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
atomic_read(&dest->persistconns)) ||
nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
goto nla_put_failure;
ip_vs_copy_stats(&kstats, &dest->stats);
if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
goto nla_put_failure;
if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
goto nla_put_failure;
nla_nest_end(skb, nl_dest);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nl_dest);
return -EMSGSIZE;
}
/*
 * Emit one multipart IPVS_CMD_NEW_DEST message describing @dest.
 *
 * The message header is addressed using the port id and sequence number
 * of the dump request held in @cb.
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header cannot be
 * added or when filling the destination fails; in the latter case the
 * partially built message is cancelled.
 */
static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
				struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DEST);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_dest(skb, dest) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}
static int ip_vs_genl_dump_dests(struct sk_buff *skb,
struct netlink_callback *cb)
{
int idx = 0;
int start = cb->args[0];
struct ip_vs_service *svc;
struct ip_vs_dest *dest;
struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
struct net *net = sock_net(skb->sk);
struct netns_ipvs *ipvs = net_ipvs(net);
mutex_lock(&__ip_vs_mutex);
/* Try to find the service for which to dump destinations */
netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. 
We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 06:07:28 -06:00
if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
goto out_err;
svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
if (IS_ERR_OR_NULL(svc))
goto out_err;
/* Dump the destinations */
list_for_each_entry(dest, &svc->destinations, n_list) {
if (++idx <= start)
continue;
if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
idx--;
goto nla_put_failure;
}
}
nla_put_failure:
cb->args[0] = idx;
out_err:
mutex_unlock(&__ip_vs_mutex);
return skb->len;
}
/*
 * Parse a nested destination attribute into @udest.
 *
 * @udest:      output; zeroed and then filled from the attributes
 * @nla:        nested attribute holding IPVS_DEST_ATTR_* entries
 * @full_entry: when true, forwarding method, weight and both thresholds
 *              are required as well; tunnel type/port/flags are copied
 *              only if present
 *
 * Address and port are always required.  The address family is optional;
 * @udest->af is set to 0 when it is absent.
 *
 * Returns 0 on success and -EINVAL when @nla is NULL, fails to parse or
 * lacks a required attribute.
 */
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
				 struct nlattr *nla, bool full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *nla_addr, *nla_port;
	struct nlattr *nla_addr_family;

	/* Parse mandatory identifying destination fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
		return -EINVAL;

	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
	nla_addr_family	= attrs[IPVS_DEST_ATTR_ADDR_FAMILY];

	if (!(nla_addr && nla_port))
		return -EINVAL;

	memset(udest, 0, sizeof(*udest));

	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
	udest->port = nla_get_be16(nla_port);

	/* 0 marks "family not supplied" for the caller to resolve */
	if (nla_addr_family)
		udest->af = nla_get_u16(nla_addr_family);
	else
		udest->af = 0;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
			      *nla_l_thresh, *nla_tun_type, *nla_tun_port,
			      *nla_tun_flags;

		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
		nla_tun_type	= attrs[IPVS_DEST_ATTR_TUN_TYPE];
		nla_tun_port	= attrs[IPVS_DEST_ATTR_TUN_PORT];
		nla_tun_flags	= attrs[IPVS_DEST_ATTR_TUN_FLAGS];

		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
			return -EINVAL;

		/* keep only the forwarding-method bits */
		udest->conn_flags = nla_get_u32(nla_fwd)
				    & IP_VS_CONN_F_FWD_MASK;
		udest->weight = nla_get_u32(nla_weight);
		udest->u_threshold = nla_get_u32(nla_u_thresh);
		udest->l_threshold = nla_get_u32(nla_l_thresh);

		/* tunnel settings are optional; they stay zero when absent */
		if (nla_tun_type)
			udest->tun_type = nla_get_u8(nla_tun_type);

		if (nla_tun_port)
			udest->tun_port = nla_get_be16(nla_tun_port);

		if (nla_tun_flags)
			udest->tun_flags = nla_get_u16(nla_tun_flags);
	}

	return 0;
}
/*
 * Append a nested IPVS_CMD_ATTR_DAEMON attribute describing the sync
 * daemon configuration @c, reported with the given @state, to @skb.
 *
 * The multicast group is emitted as IPVS_DAEMON_ATTR_MCAST_GROUP6 for
 * AF_INET6 (only when CONFIG_IP_VS_IPV6 is set) or as
 * IPVS_DAEMON_ATTR_MCAST_GROUP for AF_INET; for any other family no group
 * attribute is written.
 *
 * Returns 0 on success.  If the nest cannot be opened or an attribute
 * cannot be added, the partial nest is cancelled and -EMSGSIZE returned.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nest)
		return -EMSGSIZE;

	/* State, interface name and sync id */
	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid))
		goto cancel;

	/* Remaining scalar settings */
	if (nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
		goto cancel;

	/* Multicast group, depending on its address family */
	switch (c->mcast_af) {
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
				     &c->mcast_group.in6))
			goto cancel;
		break;
#endif
	case AF_INET:
		if (nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
				    c->mcast_group.ip))
			goto cancel;
		break;
	default:
		break;
	}

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
/*
 * Emit one multipart IPVS_CMD_NEW_DAEMON message describing the sync
 * daemon configuration @c, reported with the given @state.
 *
 * The message header is addressed using the port id and sequence number
 * of the dump request held in @cb.
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header cannot be
 * added or when filling the daemon attributes fails; in the latter case
 * the partially built message is cancelled.
 */
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c,
				  struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DAEMON);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_daemon(skb, state, c))
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}
/*
 * Netlink dump callback for the sync daemons of the requesting socket's
 * namespace.
 *
 * Under ipvs->sync_mutex, emits the master configuration (ipvs->mcfg) if
 * the IP_VS_STATE_MASTER bit is set in sync_state, then the backup
 * configuration (ipvs->bcfg) if IP_VS_STATE_BACKUP is set.  cb->args[0]
 * and cb->args[1] record that the respective entry has already been
 * emitted, so each is sent at most once per dump.  If an entry does not
 * fit into @skb the walk stops and the entry is retried next time.
 *
 * Always returns skb->len.
 */
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct netns_ipvs *ipvs = net_ipvs(sock_net(skb->sk));

	mutex_lock(&ipvs->sync_mutex);

	if (!cb->args[0] && (ipvs->sync_state & IP_VS_STATE_MASTER)) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
					   &ipvs->mcfg, cb) < 0)
			goto unlock;
		cb->args[0] = 1;
	}

	if (!cb->args[1] && (ipvs->sync_state & IP_VS_STATE_BACKUP)) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
					   &ipvs->bcfg, cb) < 0)
			goto unlock;
		cb->args[1] = 1;
	}

unlock:
	mutex_unlock(&ipvs->sync_mutex);

	return skb->len;
}
static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
struct ipvs_sync_daemon_cfg c;
struct nlattr *a;
int ret;
memset(&c, 0, sizeof(c));
if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
sizeof(c.mcast_ifn));
c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
if (a)
c.sync_maxlen = nla_get_u16(a);
a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
if (a) {
c.mcast_af = AF_INET;
c.mcast_group.ip = nla_get_in_addr(a);
if (!ipv4_is_multicast(c.mcast_group.ip))
return -EINVAL;
} else {
a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
if (a) {
#ifdef CONFIG_IP_VS_IPV6
int addr_type;
c.mcast_af = AF_INET6;
c.mcast_group.in6 = nla_get_in6_addr(a);
addr_type = ipv6_addr_type(&c.mcast_group.in6);
if (!(addr_type & IPV6_ADDR_MULTICAST))
return -EINVAL;
#else
return -EAFNOSUPPORT;
#endif
}
}
a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
if (a)
c.mcast_port = nla_get_u16(a);
a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
if (a)
c.mcast_ttl = nla_get_u8(a);
/* The synchronization protocol is incompatible with mixed family
* services
*/
if (ipvs->mixed_address_family_dests > 0)
return -EINVAL;
ret = start_sync_thread(ipvs, &c,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
return ret;
}
/*
 * Stop the sync thread selected by the IPVS_DAEMON_ATTR_STATE entry of
 * the parsed attribute table @attrs.
 *
 * Returns -EINVAL when the state attribute is missing, otherwise the
 * result of stop_sync_thread().
 */
static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct nlattr *state = attrs[IPVS_DAEMON_ATTR_STATE];

	if (!state)
		return -EINVAL;

	return stop_sync_thread(ipvs, nla_get_u32(state));
}
static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
struct ip_vs_timeout_user t;
__ip_vs_get_timeouts(ipvs, &t);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
t.tcp_fin_timeout =
nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
return ip_vs_set_timeout(ipvs, &t);
}
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
int ret = -EINVAL, cmd;
struct net *net = sock_net(skb->sk);
struct netns_ipvs *ipvs = net_ipvs(net);
cmd = info->genlhdr->cmd;
if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. 
We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 06:07:28 -06:00
nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
goto out;
if (cmd == IPVS_CMD_NEW_DAEMON)
ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
else
ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
}
out:
return ret;
}
/*
 * Generic netlink handler for the modifying service/destination commands.
 *
 * Runs entirely under __ip_vs_mutex.  IPVS_CMD_FLUSH, IPVS_CMD_SET_CONFIG
 * and IPVS_CMD_ZERO without a service attribute are handled up front.
 * Every other command needs a service attribute; NEW_SERVICE and
 * SET_SERVICE need the full service description.  NEW_DEST, SET_DEST and
 * DEL_DEST additionally need a destination attribute, with the full
 * description required for all but DEL_DEST.
 *
 * Returns 0 or a negative errno: parse errors are passed through, -ESRCH
 * is returned when the service does not exist (except for NEW_SERVICE),
 * -EEXIST when NEW_SERVICE names an existing service, -EAFNOSUPPORT for
 * an unsupported destination family, and -EINVAL for an unknown command
 * or a mixed-family destination that is not allowed.
 */
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct netns_ipvs *ipvs = net_ipvs(sock_net(skb->sk));
	int cmd = info->genlhdr->cmd;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	struct ip_vs_service *svc = NULL;
	bool full_svc, full_dest;
	int ret = 0;

	mutex_lock(&__ip_vs_mutex);

	/* Commands that do not operate on a single service */
	switch (cmd) {
	case IPVS_CMD_FLUSH:
		ret = ip_vs_flush(ipvs, false);
		goto unlock;
	case IPVS_CMD_SET_CONFIG:
		ret = ip_vs_genl_set_config(ipvs, info->attrs);
		goto unlock;
	case IPVS_CMD_ZERO:
		if (!info->attrs[IPVS_CMD_ATTR_SERVICE]) {
			ret = ip_vs_zero_all(ipvs);
			goto unlock;
		}
		break;
	default:
		break;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	full_svc = cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE;

	ret = ip_vs_genl_parse_service(ipvs, &usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       full_svc, &svc);
	if (ret)
		goto unlock;

	/* Unless we're adding a new service, the service must already exist */
	if (!svc && cmd != IPVS_CMD_NEW_SERVICE) {
		ret = -ESRCH;
		goto unlock;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		full_dest = cmd != IPVS_CMD_DEL_DEST;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    full_dest);
		if (ret)
			goto unlock;

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead.  We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		if (udest.af == 0)
			udest.af = svc->af;

		if (!ip_vs_is_af_valid(udest.af)) {
			ret = -EAFNOSUPPORT;
			goto unlock;
		}

		if (cmd != IPVS_CMD_DEL_DEST && udest.af != svc->af) {
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (ipvs->sync_state) {
				ret = -EINVAL;
				goto unlock;
			}

			/* Only tunnel forwarding is supported across
			 * address families
			 */
			if (udest.conn_flags != IP_VS_CONN_F_TUNNEL) {
				ret = -EINVAL;
				goto unlock;
			}
		}
	}

	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		ret = svc ? -EEXIST : ip_vs_add_service(ipvs, &usvc, &svc);
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		/* do not use svc, it can be freed */
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}
/*
 * Generic Netlink "doit" handler for the single-reply GET commands:
 * IPVS_CMD_GET_SERVICE, IPVS_CMD_GET_INFO and IPVS_CMD_GET_CONFIG.
 *
 * A reply message is allocated, filled according to the request and sent
 * back with genlmsg_reply().  The reply uses the matching NEW/SET command
 * code (GET_SERVICE -> NEW_SERVICE, GET_INFO -> SET_INFO,
 * GET_CONFIG -> SET_CONFIG).
 *
 * Returns the result of genlmsg_reply() on success, or:
 *   -EINVAL    the command is not one of the three handled here
 *   -ENOMEM    the reply skb could not be allocated
 *   -EMSGSIZE  the reply header or an attribute did not fit in the message
 *   -ESRCH     GET_SERVICE: no service matches the request
 *   PTR_ERR()  GET_SERVICE: error reported by ip_vs_genl_find_service()
 */
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		pr_err("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	/* The reply is built and sent with __ip_vs_mutex held. */
	mutex_lock(&__ip_vs_mutex);

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(ipvs,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			/* any fill error is reported as -EMSGSIZE below */
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
		/* Only timeouts of the compiled-in protocols are reported. */
#ifdef CONFIG_IP_VS_PROTO_TCP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				ip_vs_conn_tab_size))
			goto nla_put_failure;
		break;
	}

	genlmsg_end(msg, reply);
	/*
	 * msg is not freed on this path; presumably genlmsg_reply() takes
	 * ownership of it -- confirm against the genetlink API.
	 */
	ret = genlmsg_reply(msg, info);
	goto out;

nla_put_failure:
	pr_err("not enough space in Netlink message\n");
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}
/*
 * Generic Netlink operations of the IPVS family.
 *
 * Every command requires GENL_ADMIN_PERM and opts out of strict/dump
 * attribute validation.  Modifying commands are routed to
 * ip_vs_genl_set_cmd() or ip_vs_genl_set_daemon(), single-object queries
 * to ip_vs_genl_get_cmd(), and list queries to the dump callbacks.
 */
static const struct genl_ops ip_vs_genl_ops[] = {
	/* virtual services */
	{
		.cmd		= IPVS_CMD_NEW_SERVICE,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_SET_SERVICE,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_DEL_SERVICE,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_GET_SERVICE,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_get_cmd,
		.dumpit		= ip_vs_genl_dump_services,
	},
	/* destinations (real servers) */
	{
		.cmd		= IPVS_CMD_NEW_DEST,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_SET_DEST,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_DEL_DEST,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_GET_DEST,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.dumpit		= ip_vs_genl_dump_dests,
	},
	/* sync daemons */
	{
		.cmd		= IPVS_CMD_NEW_DAEMON,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_daemon,
	},
	{
		.cmd		= IPVS_CMD_DEL_DAEMON,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_daemon,
	},
	{
		.cmd		= IPVS_CMD_GET_DAEMON,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.dumpit		= ip_vs_genl_dump_daemons,
	},
	/* configuration and info */
	{
		.cmd		= IPVS_CMD_SET_CONFIG,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_GET_CONFIG,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_get_cmd,
	},
	{
		.cmd		= IPVS_CMD_GET_INFO,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_get_cmd,
	},
	/* zero and flush */
	{
		.cmd		= IPVS_CMD_ZERO,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
	{
		.cmd		= IPVS_CMD_FLUSH,
		.flags		= GENL_ADMIN_PERM,
		.validate	= GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit		= ip_vs_genl_set_cmd,
	},
};
/*
 * Description of the IPVS Generic Netlink family: identity (name/version),
 * attribute policy and the operations table defined above.
 */
static struct genl_family ip_vs_genl_family __ro_after_init = {
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.hdrsize	= 0,
	.maxattr	= IPVS_CMD_ATTR_MAX,
	.policy		= ip_vs_cmd_policy,
	.ops		= ip_vs_genl_ops,
	.n_ops		= ARRAY_SIZE(ip_vs_genl_ops),
	.module		= THIS_MODULE,
	/* allow ipvsadm to be used from within network namespaces */
	.netnsok	= true,
};
/*
 * Register the IPVS Generic Netlink family.
 * Returns whatever genl_register_family() returns.
 */
static int __init ip_vs_genl_register(void)
{
	int err = genl_register_family(&ip_vs_genl_family);

	return err;
}
/* Counterpart of ip_vs_genl_register(): drop the IPVS Generic Netlink family. */
static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}
/* End of Generic Netlink interface definitions */
/*
* Per-netns init/exit functions.
*/
#ifdef CONFIG_SYSCTL
/*
 * Set up the per-netns sysctl state of IPVS.
 *
 * Initializes the defense-related atomics/locks, picks the sysctl table
 * (the static vs_vars for init_net, a kmemdup() copy for any other netns),
 * points every table entry at the matching field of @ipvs, applies the
 * default values, registers the table under "net/ipv4/vs", starts the
 * estimator for the total stats and schedules the periodic defense work.
 *
 * Returns 0 on success, or -ENOMEM if the table copy cannot be allocated
 * or register_net_sysctl() fails.
 */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
int idx;
struct ctl_table *tbl;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
spin_lock_init(&ipvs->droppacket_lock);
spin_lock_init(&ipvs->securetcp_lock);
/* Every netns other than init_net works on its own copy of vs_vars. */
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
/* Don't export sysctls to unprivileged users */
/* NOTE(review): presumably a NULL procname in entry 0 makes the
 * table appear empty to the sysctl core -- confirm.
 */
if (net->user_ns != &init_user_ns)
tbl[0].procname = NULL;
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
/* Entries handled by proc_do_defense_mode get this netns' ipvs in extra2. */
for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
if (tbl[idx].proc_handler == proc_do_defense_mode)
tbl[idx].extra2 = ipvs;
}
/*
 * Entries below are addressed purely by position (idx++), so this
 * sequence has to stay in step with the layout of vs_vars, which is
 * defined outside this function.  Fields without an explicit
 * assignment here keep whatever value @ipvs already holds.
 */
idx = 0;
ipvs->sysctl_amemthresh = 1024;
tbl[idx++].data = &ipvs->sysctl_amemthresh;
ipvs->sysctl_am_droprate = 10;
tbl[idx++].data = &ipvs->sysctl_am_droprate;
tbl[idx++].data = &ipvs->sysctl_drop_entry;
tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
tbl[idx++].data = &ipvs->sysctl_secure_tcp;
ipvs->sysctl_snat_reroute = 1;
tbl[idx++].data = &ipvs->sysctl_snat_reroute;
ipvs->sysctl_sync_ver = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ver;
ipvs->sysctl_sync_ports = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ports;
tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
ipvs->sysctl_sync_sock_size = 0;
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
/* sync_threshold is a two-element array: set both data and maxlen. */
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
tbl[idx++].data = &ipvs->sysctl_sync_retries;
tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
ipvs->sysctl_pmtu_disc = 1;
tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
tbl[idx++].data = &ipvs->sysctl_backup_only;
ipvs->sysctl_conn_reuse_mode = 1;
tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
if (ipvs->sysctl_hdr == NULL) {
/* Only the kmemdup() copy is ours to free; vs_vars is static. */
if (!net_eq(net, &init_net))
kfree(tbl);
return -ENOMEM;
}
ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
/* Remember the table so the cleanup path can free the copy. */
ipvs->sysctl_tbl = tbl;
/* Schedule defense work */
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
return 0;
}
/*
 * Undo ip_vs_control_net_init_sysctl() for one netns: stop the defense
 * work, unregister the sysctl table, stop the total-stats estimator and
 * release the table copy if one was allocated.
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);

	/* init_net uses the static vs_vars table, which must not be freed */
	if (net_eq(ipvs->net, &init_net))
		return;

	kfree(ipvs->sysctl_tbl);
}
#else
/* Without CONFIG_SYSCTL there is nothing to set up; always succeeds. */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	return 0;
}
/* Without CONFIG_SYSCTL there is nothing to tear down. */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
}
#endif
/*
 * Netdevice notifier that dispatches to ip_vs_dst_event().
 * With IPv6 support built in, its priority is raised slightly above
 * ADDRCONF_NOTIFY_PRIORITY (presumably so that it is ordered relative to
 * the IPv6 addrconf notifier -- confirm against the notifier chain rules).
 */
static struct notifier_block ip_vs_dst_notifier = {
#ifdef CONFIG_IP_VS_IPV6
	.priority	= ADDRCONF_NOTIFY_PRIORITY + 5,
#endif
	.notifier_call	= ip_vs_dst_event,
};
/*
 * Per-netns initialization of the IPVS control plane.
 *
 * Sets up the real-server hash table, the destination trash list with its
 * lock and timer, the service counters, the per-CPU total statistics, the
 * /proc entries ("ip_vs", "ip_vs_stats", "ip_vs_stats_percpu") and finally
 * the sysctl state.
 *
 * Returns 0 on success.  On failure everything created here is undone
 * again: the /proc entries are removed before the per-CPU statistics
 * they read from are freed, and a negative errno is returned (-ENOMEM
 * for allocation or /proc creation failures, otherwise the error from
 * ip_vs_control_net_init_sysctl()).
 */
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	int ret = -ENOMEM;
	int i, idx;

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
	atomic_set(&ipvs->ftpsvc_counter, 0);
	atomic_set(&ipvs->nullsvc_counter, 0);
	atomic_set(&ipvs->conn_out_counter, 0);

	/* procfs stats */
	ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
	if (!ipvs->tot_stats.cpustats)
		return -ENOMEM;

	/* each per-CPU u64_stats_sync has to be initialized explicitly */
	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *ipvs_tot_stats;

		ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
		u64_stats_init(&ipvs_tot_stats->syncp);
	}

	spin_lock_init(&ipvs->tot_stats.lock);

	/*
	 * The creation helpers are only checked when procfs is built in;
	 * without CONFIG_PROC_FS there is nothing to create or unwind.
	 */
#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
		goto err_vs;
	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
				    ip_vs_stats_show, NULL))
		goto err_stats;
	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
				    ipvs->net->proc_net,
				    ip_vs_stats_percpu_show, NULL))
		goto err_percpu;
#endif

	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret)
		goto err;

	return 0;

	/* unwind in reverse order of creation */
err:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);

err_percpu:
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);

err_stats:
	remove_proc_entry("ip_vs", ipvs->net->proc_net);

err_vs:
#endif
	free_percpu(ipvs->tot_stats.cpustats);
	return ret;
}
/*
 * Per-netns teardown of the IPVS control plane; counterpart of
 * ip_vs_control_net_init().
 */
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
/* /proc entries are removed in the reverse order of their creation */
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
/* the per-CPU stats go last, after the /proc entries are gone */
free_percpu(ipvs->tot_stats.cpustats);
}
/*
 * Register both user-space control interfaces: the sockopt interface
 * first, then the Generic Netlink family.  If the netlink registration
 * fails the sockopt registration is rolled back.
 *
 * Returns 0 on success or the error of the step that failed.
 */
int __init ip_vs_register_nl_ioctl(void)
{
	int err;

	err = nf_register_sockopt(&ip_vs_sockopts);
	if (err) {
		pr_err("cannot register sockopt.\n");
		return err;
	}

	err = ip_vs_genl_register();
	if (err) {
		pr_err("cannot register Generic Netlink interface.\n");
		nf_unregister_sockopt(&ip_vs_sockopts);
		return err;
	}

	return 0;
}
/*
 * Remove the user-space control interfaces in the reverse order of
 * ip_vs_register_nl_ioctl(): Generic Netlink first, then sockopt.
 */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}
/*
 * Module-wide initialization of the control code: reset both service
 * hash tables and register the netdevice notifier.
 *
 * Returns 0 on success or the negative error reported by
 * register_netdevice_notifier().
 */
int __init ip_vs_control_init(void)
{
	int bucket = 0;
	int err;

	EnterFunction(2);

	/* Initialize svc_table, ip_vs_svc_fwm_table */
	while (bucket < IP_VS_SVC_TAB_SIZE) {
		INIT_HLIST_HEAD(&ip_vs_svc_table[bucket]);
		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[bucket]);
		bucket++;
	}

	smp_wmb();	/* Do we really need it now ? */

	err = register_netdevice_notifier(&ip_vs_dst_notifier);
	if (err < 0)
		return err;

	LeaveFunction(2);
	return 0;
}
/* Module-wide teardown: drop the netdevice notifier set up at init time. */
void ip_vs_control_cleanup(void)
{
	EnterFunction(2);

	unregister_netdevice_notifier(&ip_vs_dst_notifier);

	LeaveFunction(2);
}