Merge branch 'tcp_get_info-locking'

Eric Dumazet says:

====================
tcp: tcp_get_info() locking changes

This short series prepares tcp_get_info() for reporting more detailed
information.

In order not to slow down the fast path, our goal is to use the normal
socket spinlock instead of custom synchronization.

All we need to ensure is that tcp_get_info() is not called with the
ehash lock held, which might deadlock, since packet processing would
acquire the spinlocks in the reverse order.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller on 2016-11-09 13:02:28 -05:00
commit c68d7f1b63
4 changed files with 65 additions and 48 deletions
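To make the lock-ordering concern concrete before the diffs, here is a minimal userspace sketch (plain C with pthread mutexes; struct fake_sock, dump_bucket(), and the sizes are invented for illustration and are not kernel APIs). It mirrors the pattern the inet_diag change below adopts: grab socket references in chunks of SKARR_SZ while holding the bucket lock, drop that lock, and only then touch per-socket state, so the two locks are never nested in the order opposite to the packet-processing path.

/* lockorder-sketch.c - userspace analogy, NOT kernel code.
 * Build with: cc lockorder-sketch.c -pthread
 */
#include <pthread.h>
#include <stdio.h>

#define SKARR_SZ	16		/* same chunk size the patch uses */
#define BUCKET_SZ	64

struct fake_sock {			/* stand-in for struct sock */
	pthread_mutex_t lock;		/* stand-in for the socket lock */
	int refcnt;
	long bytes_acked;
};

static pthread_mutex_t ehash_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fake_sock bucket[BUCKET_SZ];

/* Dump path: collect references under ehash_lock, then drop it before
 * touching per-socket state, so the per-socket lock is never taken
 * while ehash_lock is held (avoids an ABBA deadlock with the
 * packet-processing lock order).
 */
static void dump_bucket(void)
{
	struct fake_sock *sk_arr[SKARR_SZ];
	int accum = 0, idx;

	pthread_mutex_lock(&ehash_lock);
	for (idx = 0; idx < BUCKET_SZ && accum < SKARR_SZ; idx++) {
		bucket[idx].refcnt++;		/* like sock_hold() */
		sk_arr[accum++] = &bucket[idx];
	}
	pthread_mutex_unlock(&ehash_lock);	/* bucket lock no longer held */

	for (idx = 0; idx < accum; idx++) {
		struct fake_sock *sk = sk_arr[idx];

		pthread_mutex_lock(&sk->lock);	/* like lock_sock_fast() */
		printf("bytes_acked=%ld\n", sk->bytes_acked);
		pthread_mutex_unlock(&sk->lock);
		sk->refcnt--;			/* like sock_gen_put() */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < BUCKET_SZ; i++)
		pthread_mutex_init(&bucket[i].lock, NULL);
	dump_bucket();
	return 0;
}

Holding a reference instead of the bucket lock is what lets sk_diag_fill(), and ultimately tcp_get_info(), take the socket lock safely in the patches below.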

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
@@ -176,8 +176,6 @@ struct tcp_sock {
 	 * sum(delta(snd_una)), or how many bytes
 	 * were acked.
 	 */
-	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
-
 	u32	snd_una;	/* First byte we want an ack for	*/
 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
@@ -861,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			      struct netlink_callback *cb,
 			      const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
-	struct net *net = sock_net(skb->sk);
-	int i, num, s_i, s_num;
-	u32 idiag_states = r->idiag_states;
 	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	struct net *net = sock_net(skb->sk);
+	u32 idiag_states = r->idiag_states;
+	int i, num, s_i, s_num;
+	struct sock *sk;

 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
@@ -877,7 +878,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 	for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 		struct inet_listen_hashbucket *ilb;
-		struct sock *sk;

 		num = 0;
 		ilb = &hashinfo->listening_hash[i];
@@ -922,13 +922,14 @@ skip_listen_ht:
 	if (!(idiag_states & ~TCPF_LISTEN))
 		goto out;

+#define SKARR_SZ 16
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct hlist_nulls_node *node;
-		struct sock *sk;
-
-		num = 0;
+		struct sock *sk_arr[SKARR_SZ];
+		int num_arr[SKARR_SZ];
+		int idx, accum, res;

 		if (hlist_nulls_empty(&head->chain))
 			continue;
@@ -936,9 +937,12 @@ skip_listen_ht:
 		if (i > s_i)
 			s_num = 0;

+next_chunk:
+		num = 0;
+		accum = 0;
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &head->chain) {
-			int state, res;
+			int state;

 			if (!net_eq(sock_net(sk), net))
 				continue;
@@ -962,21 +966,35 @@
 			if (!inet_diag_bc_sk(bc, sk))
 				goto next_normal;

-			res = sk_diag_fill(sk, skb, r,
+			sock_hold(sk);
+			num_arr[accum] = num;
+			sk_arr[accum] = sk;
+			if (++accum == SKARR_SZ)
+				break;
+next_normal:
+			++num;
+		}
+		spin_unlock_bh(lock);
+		res = 0;
+		for (idx = 0; idx < accum; idx++) {
+			if (res >= 0) {
+				res = sk_diag_fill(sk_arr[idx], skb, r,
 					   sk_user_ns(NETLINK_CB(cb->skb).sk),
 					   NETLINK_CB(cb->skb).portid,
 					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					   cb->nlh, net_admin);
-			if (res < 0) {
-				spin_unlock_bh(lock);
-				goto done;
+				if (res < 0)
+					num = num_arr[idx];
 			}
-next_normal:
-			++num;
+			sock_gen_put(sk_arr[idx]);
 		}
-		spin_unlock_bh(lock);
+		if (res < 0)
+			break;
 		cond_resched();
+		if (accum == SKARR_SZ) {
+			s_num = num + 1;
+			goto next_chunk;
+		}
 	}

done:

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
@@ -405,7 +405,6 @@ void tcp_init_sock(struct sock *sk)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_clamp = ~0;
 	tp->mss_cache = TCP_MSS_DEFAULT;
-	u64_stats_init(&tp->syncp);

 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
@@ -2710,9 +2709,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp, intv;
-	unsigned int start;
-	int notsent_bytes;
 	u64 rate64;
+	bool slow;
 	u32 rate;

 	memset(info, 0, sizeof(*info));
@@ -2721,6 +2719,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_state = sk_state_load(sk);

+	/* Report meaningful fields for all TCP states, including listeners */
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	rate64 = rate != ~0U ? rate : ~0ULL;
+	put_unaligned(rate64, &info->tcpi_pacing_rate);
+
+	rate = READ_ONCE(sk->sk_max_pacing_rate);
+	rate64 = rate != ~0U ? rate : ~0ULL;
+	put_unaligned(rate64, &info->tcpi_max_pacing_rate);
+
+	info->tcpi_reordering = tp->reordering;
+	info->tcpi_snd_cwnd = tp->snd_cwnd;
+
+	if (info->tcpi_state == TCP_LISTEN) {
+		/* listeners aliased fields :
+		 * tcpi_unacked -> Number of children ready for accept()
+		 * tcpi_sacked  -> max backlog
+		 */
+		info->tcpi_unacked = sk->sk_ack_backlog;
+		info->tcpi_sacked = sk->sk_max_ack_backlog;
+		return;
+	}
+
 	info->tcpi_ca_state = icsk->icsk_ca_state;
 	info->tcpi_retransmits = icsk->icsk_retransmits;
 	info->tcpi_probes = icsk->icsk_probes_out;
@@ -2748,13 +2767,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

-	if (info->tcpi_state == TCP_LISTEN) {
-		info->tcpi_unacked = sk->sk_ack_backlog;
-		info->tcpi_sacked = sk->sk_max_ack_backlog;
-	} else {
-		info->tcpi_unacked = tp->packets_out;
-		info->tcpi_sacked = tp->sacked_out;
-	}
+	info->tcpi_unacked = tp->packets_out;
+	info->tcpi_sacked = tp->sacked_out;
 	info->tcpi_lost = tp->lost_out;
 	info->tcpi_retrans = tp->retrans_out;
 	info->tcpi_fackets = tp->fackets_out;
@@ -2768,34 +2783,24 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_rtt = tp->srtt_us >> 3;
 	info->tcpi_rttvar = tp->mdev_us >> 2;
 	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
-	info->tcpi_snd_cwnd = tp->snd_cwnd;
 	info->tcpi_advmss = tp->advmss;
-	info->tcpi_reordering = tp->reordering;

 	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
 	info->tcpi_rcv_space = tp->rcvq_space.space;

 	info->tcpi_total_retrans = tp->total_retrans;

-	rate = READ_ONCE(sk->sk_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
-	put_unaligned(rate64, &info->tcpi_pacing_rate);
+	slow = lock_sock_fast(sk);

-	rate = READ_ONCE(sk->sk_max_pacing_rate);
-	rate64 = rate != ~0U ? rate : ~0ULL;
-	put_unaligned(rate64, &info->tcpi_max_pacing_rate);
+	put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+	put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
+	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+
+	unlock_sock_fast(sk, slow);

-	do {
-		start = u64_stats_fetch_begin_irq(&tp->syncp);
-		put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
-		put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
-	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;

-	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
-	info->tcpi_notsent_bytes = max(0, notsent_bytes);
-
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
 	info->tcpi_data_segs_in = tp->data_segs_in;
 	info->tcpi_data_segs_out = tp->data_segs_out;

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
@@ -3351,9 +3351,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 	u32 delta = ack - tp->snd_una;

-	u64_stats_update_begin_raw(&tp->syncp);
+	sock_owned_by_me((struct sock *)tp);
 	tp->bytes_acked += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->snd_una = ack;
 }
@@ -3363,9 +3361,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 	u32 delta = seq - tp->rcv_nxt;

-	u64_stats_update_begin_raw(&tp->syncp);
+	sock_owned_by_me((struct sock *)tp);
 	tp->bytes_received += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->rcv_nxt = seq;
 }