Merge branch 'tcp-more-perns-sysctls'

Eric Dumazet says:

====================
tcp: move 12 sysctls to namespaces

Ideally all TCP sysctls should be per netns.
This patch series takes care of 12 sysctls.

Remains the ones that need discussion :

sysctl_tcp_mem, sysctl_tcp_rmem, sysctl_tcp_wmem, and sysctl_tcp_max_orphans
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2017-10-28 19:24:39 +09:00
commit 871da0a761
11 changed files with 149 additions and 164 deletions

View file

@ -142,6 +142,18 @@ struct netns_ipv4 {
int sysctl_tcp_app_win;
int sysctl_tcp_adv_win_scale;
int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save;
int sysctl_tcp_moderate_rcvbuf;
int sysctl_tcp_tso_win_divisor;
int sysctl_tcp_workaround_signed_windows;
int sysctl_tcp_limit_output_bytes;
int sysctl_tcp_challenge_ack_limit;
int sysctl_tcp_min_tso_segs;
int sysctl_tcp_min_rtt_wlen;
int sysctl_tcp_autocorking;
int sysctl_tcp_invalid_ratelimit;
int sysctl_tcp_pacing_ss_ratio;
int sysctl_tcp_pacing_ca_ratio;
struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen;

View file

@ -247,22 +247,9 @@ extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];
extern int sysctl_tcp_wmem[3];
extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_nometrics_save;
extern int sysctl_tcp_moderate_rcvbuf;
extern int sysctl_tcp_tso_win_divisor;
extern int sysctl_tcp_workaround_signed_windows;
#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_min_rtt_wlen;
extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio;
extern int sysctl_tcp_pacing_ca_ratio;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
extern unsigned long tcp_memory_pressure;
@ -1305,7 +1292,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
}
/* Determine a window scaling and initial window to offer. */
void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
void tcp_select_initial_window(const struct sock *sk, int __space,
__u32 mss, __u32 *rcv_wnd,
__u32 *window_clamp, int wscale_ok,
__u8 *rcv_wscale, __u32 init_rcv_wnd);

View file

@ -385,7 +385,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* Try to redo what tcp_v4_send_synack did. */
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));

View file

@ -437,13 +437,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
{
.procname = "tcp_min_rtt_wlen",
.data = &sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
@ -451,54 +444,12 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_moderate_rcvbuf",
.data = &sysctl_tcp_moderate_rcvbuf,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_tso_win_divisor",
.data = &sysctl_tcp_tso_win_divisor,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_congestion_control",
.mode = 0644,
.maxlen = TCP_CA_NAME_MAX,
.proc_handler = proc_tcp_congestion_control,
},
{
.procname = "tcp_workaround_signed_windows",
.data = &sysctl_tcp_workaround_signed_windows,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_limit_output_bytes",
.data = &sysctl_tcp_limit_output_bytes,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_challenge_ack_limit",
.data = &sysctl_tcp_challenge_ack_limit,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#ifdef CONFIG_NETLABEL
{
.procname = "cipso_cache_enable",
@ -541,49 +492,6 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_allowed_congestion_control,
},
{
.procname = "tcp_min_tso_segs",
.data = &sysctl_tcp_min_tso_segs,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
.extra2 = &gso_max_segs,
},
{
.procname = "tcp_pacing_ss_ratio",
.data = &sysctl_tcp_pacing_ss_ratio,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &thousand,
},
{
.procname = "tcp_pacing_ca_ratio",
.data = &sysctl_tcp_pacing_ca_ratio,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &thousand,
},
{
.procname = "tcp_autocorking",
.data = &sysctl_tcp_autocorking,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
{
.procname = "tcp_invalid_ratelimit",
.data = &sysctl_tcp_invalid_ratelimit,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
.procname = "tcp_available_ulp",
.maxlen = TCP_ULP_BUF_MAX,
@ -1145,6 +1053,98 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_no_metrics_save",
.data = &init_net.ipv4.sysctl_tcp_nometrics_save,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_moderate_rcvbuf",
.data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_tso_win_divisor",
.data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "tcp_workaround_signed_windows",
.data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_limit_output_bytes",
.data = &init_net.ipv4.sysctl_tcp_limit_output_bytes,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_challenge_ack_limit",
.data = &init_net.ipv4.sysctl_tcp_challenge_ack_limit,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_min_tso_segs",
.data = &init_net.ipv4.sysctl_tcp_min_tso_segs,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
.extra2 = &gso_max_segs,
},
{
.procname = "tcp_min_rtt_wlen",
.data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "tcp_autocorking",
.data = &init_net.ipv4.sysctl_tcp_autocorking,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
{
.procname = "tcp_invalid_ratelimit",
.data = &init_net.ipv4.sysctl_tcp_invalid_ratelimit,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
.procname = "tcp_pacing_ss_ratio",
.data = &init_net.ipv4.sysctl_tcp_pacing_ss_ratio,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &thousand,
},
{
.procname = "tcp_pacing_ca_ratio",
.data = &init_net.ipv4.sysctl_tcp_pacing_ca_ratio,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &thousand,
},
{ }
};

View file

@ -285,10 +285,6 @@
#include <trace/events/tcp.h>
int sysctl_tcp_min_tso_segs __read_mostly = 2;
int sysctl_tcp_autocorking __read_mostly = 1;
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@ -699,7 +695,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
sysctl_tcp_autocorking &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
skb != tcp_write_queue_head(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

View file

@ -79,13 +79,7 @@
#include <linux/unaligned/access_ok.h>
#include <linux/static_key.h>
/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 1000;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@ -411,7 +405,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
* Allow enough cushion so that sender is not limited by our window
*/
if (sysctl_tcp_moderate_rcvbuf)
if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
rcvmem <<= 2;
if (sk->sk_rcvbuf < rcvmem)
@ -602,7 +596,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
* <prev RTT . ><current RTT .. ><next RTT .... >
*/
if (sysctl_tcp_moderate_rcvbuf &&
if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvwin, rcvmem, rcvbuf;
@ -773,15 +767,6 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->srtt_us = max(1U, srtt);
}
/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
* Note: TCP stack does not yet implement pacing.
* FQ packet scheduler can be used to implement cheap but effective
* TCP pacing, to smooth the burst on large writes when packets
* in flight is significantly lower than cwnd (or rwin)
*/
int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@ -799,9 +784,9 @@ static void tcp_update_pacing_rate(struct sock *sk)
* end of slow start and should slow down.
*/
if (tp->snd_cwnd < tp->snd_ssthresh / 2)
rate *= sysctl_tcp_pacing_ss_ratio;
rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
else
rate *= sysctl_tcp_pacing_ca_ratio;
rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out);
@ -2919,8 +2904,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
struct tcp_sock *tp = tcp_sk(sk);
u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
rtt_us ? : jiffies_to_usecs(1));
@ -3408,7 +3393,7 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
if (*last_oow_ack_time) {
s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
@ -3444,10 +3429,11 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
static u32 challenge_timestamp;
static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
u32 count, now;
/* First check our per-socket dupack rate limit. */
if (__tcp_oow_rate_limited(sock_net(sk),
if (__tcp_oow_rate_limited(net,
LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
&tp->last_oow_ack_time))
return;
@ -3455,16 +3441,16 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
/* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
if (now != challenge_timestamp) {
u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
u32 half = (ack_limit + 1) >> 1;
challenge_timestamp = now;
WRITE_ONCE(challenge_count, half +
prandom_u32_max(sysctl_tcp_challenge_ack_limit));
WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
}
count = READ_ONCE(challenge_count);
if (count > 0) {
WRITE_ONCE(challenge_count, count - 1);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}

View file

@ -2493,6 +2493,22 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_app_win = 31;
net->ipv4.sysctl_tcp_adv_win_scale = 1;
net->ipv4.sysctl_tcp_frto = 2;
net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
/* Default TSQ limit of four TSO segments */
net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
net->ipv4.sysctl_tcp_autocorking = 1;
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);

View file

@ -20,8 +20,6 @@
#include <net/tcp.h>
#include <net/genetlink.h>
int sysctl_tcp_nometrics_save __read_mostly;
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
const struct inetpeer_addr *daddr,
struct net *net, unsigned int hash);
@ -330,7 +328,7 @@ void tcp_update_metrics(struct sock *sk)
int m;
sk_dst_confirm(sk);
if (sysctl_tcp_nometrics_save || !dst)
if (net->ipv4.sysctl_tcp_nometrics_save || !dst)
return;
rcu_read_lock();

View file

@ -369,7 +369,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
full_space = rcv_wnd * mss;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
tcp_select_initial_window(sk_listener, full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,

View file

@ -45,20 +45,6 @@
#include <trace/events/tcp.h>
/* People can turn this on to work with those rare, broken TCPs that
* interpret the window field as a signed quantity.
*/
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
/* Default TSQ limit of four TSO segments */
int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
int sysctl_tcp_tso_win_divisor __read_mostly = 3;
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@ -202,7 +188,7 @@ u32 tcp_default_init_rwnd(u32 mss)
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(int __space, __u32 mss,
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
int wscale_ok, __u8 *rcv_wscale,
__u32 init_rcv_wnd)
@ -226,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
* which we interpret as a sign the remote TCP is not
* misinterpreting the window field as a signed quantity.
*/
if (sysctl_tcp_workaround_signed_windows)
if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = space;
@ -286,7 +272,8 @@ static u16 tcp_select_window(struct sock *sk)
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
if (!tp->rx_opt.rcv_wscale &&
sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@ -1771,7 +1758,8 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
return tso_segs ? :
tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
tcp_tso_autosize(sk, mss_now,
sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
}
/* Returns the portion of skb which can be sent right away */
@ -1988,7 +1976,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
win_divisor = ACCESS_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@ -2225,7 +2213,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int limit;
limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
limit = min_t(u32, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
@ -3355,7 +3344,7 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
tcp_select_initial_window(tcp_full_space(sk),
tcp_select_initial_window(sk, tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,

View file

@ -244,7 +244,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
}
req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));