alistair23-linux/fs/afs/vl_probe.c
David Howells c410bf0193 rxrpc: Fix the excessive initial retransmission timeout
rxrpc currently uses a fixed 4s retransmission timeout until the RTT is
sufficiently sampled.  This can cause problems with some fileservers with
calls to the cache manager in the afs filesystem being dropped from the
fileserver because a packet goes missing and the retransmission timeout is
greater than the call expiry timeout.

Fix this by:

 (1) Copying the RTT/RTO calculation code from Linux's TCP implementation
     and altering it to fit rxrpc.

 (2) Altering the various users of the RTT to make use of the new SRTT
     value.

 (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO
     value instead (which is needed in jiffies), along with a backoff.

Notes:

 (1) rxrpc provides RTT samples by matching the serial numbers on outgoing
     DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets
     against the reference serial number in incoming REQUESTED ACK and
     PING-RESPONSE ACK packets.

 (2) Each packet that is transmitted on an rxrpc connection gets a new
     per-connection serial number, even for retransmissions, so an ACK can
     be cross-referenced to a specific trigger packet.  This allows RTT
     information to be drawn from retransmitted DATA packets also.

 (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than
     on an rxrpc_call because many RPC calls won't live long enough to
     generate more than one sample.

 (4) The calculated SRTT value is in units of 8ths of a microsecond rather
     than nanoseconds.

The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers.

Fixes: 17926a7932 ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"")
Signed-off-by: David Howells <dhowells@redhat.com>
2020-05-11 16:42:28 +01:00

273 lines
6.7 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/* AFS vlserver probing
*
* Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include "afs_fs.h"
#include "internal.h"
#include "protocol_yfs.h"
static bool afs_vl_probe_done(struct afs_vlserver *server)
{
if (!atomic_dec_and_test(&server->probe_outstanding))
return false;
wake_up_var(&server->probe_outstanding);
clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
return true;
}
/*
* Process the result of probing a vlserver. This is called after successful
* or failed delivery of an VL.GetCapabilities operation.
*/
void afs_vlserver_probe_result(struct afs_call *call)
{
struct afs_addr_list *alist = call->alist;
struct afs_vlserver *server = call->vlserver;
unsigned int server_index = call->server_index;
unsigned int rtt_us = 0;
unsigned int index = call->addr_ix;
bool have_result = false;
int ret = call->error;
_enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
spin_lock(&server->probe_lock);
switch (ret) {
case 0:
server->probe.error = 0;
goto responded;
case -ECONNABORTED:
if (!server->probe.responded) {
server->probe.abort_code = call->abort_code;
server->probe.error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
server->probe.local_failure = true;
afs_io_error(call, afs_io_error_vl_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
case -ERFKILL:
case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
default:
clear_bit(index, &alist->responded);
set_bit(index, &alist->failed);
if (!server->probe.responded &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
server->probe.error == -ETIME))
server->probe.error = ret;
afs_io_error(call, afs_io_error_vl_probe_fail);
goto out;
}
responded:
set_bit(index, &alist->responded);
clear_bit(index, &alist->failed);
if (call->service_id == YFS_VL_SERVICE) {
server->probe.is_yfs = true;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
} else {
server->probe.not_yfs = true;
if (!server->probe.is_yfs) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
}
}
rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
}
smp_wmb(); /* Set rtt before responded. */
server->probe.responded = true;
set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
server_index, index, &alist->addrs[index].transport, rtt_us, ret);
have_result |= afs_vl_probe_done(server);
if (have_result) {
server->probe.have_result = true;
wake_up_var(&server->probe.have_result);
wake_up_all(&server->probe_wq);
}
}
/*
* Probe all of a vlserver's addresses to find out the best route and to
* query its capabilities.
*/
static bool afs_do_probe_vlserver(struct afs_net *net,
struct afs_vlserver *server,
struct key *key,
unsigned int server_index,
struct afs_error *_e)
{
struct afs_addr_cursor ac = {
.index = 0,
};
struct afs_call *call;
bool in_progress = false;
_enter("%s", server->name);
read_lock(&server->lock);
ac.alist = rcu_dereference_protected(server->addresses,
lockdep_is_held(&server->lock));
read_unlock(&server->lock);
atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
memset(&server->probe, 0, sizeof(server->probe));
server->probe.rtt = UINT_MAX;
for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
call = afs_vl_get_capabilities(net, &ac, key, server,
server_index);
if (!IS_ERR(call)) {
afs_put_call(call);
in_progress = true;
} else {
afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
}
}
if (!in_progress)
afs_vl_probe_done(server);
return in_progress;
}
/*
* Send off probes to all unprobed servers.
*/
int afs_send_vl_probes(struct afs_net *net, struct key *key,
struct afs_vlserver_list *vllist)
{
struct afs_vlserver *server;
struct afs_error e;
bool in_progress = false;
int i;
e.error = 0;
e.responded = false;
for (i = 0; i < vllist->nr_servers; i++) {
server = vllist->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
continue;
if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) &&
afs_do_probe_vlserver(net, server, key, i, &e))
in_progress = true;
}
return in_progress ? 0 : e.error;
}
/*
* Wait for the first as-yet untried server to respond.
*/
int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
unsigned long untried)
{
struct wait_queue_entry *waits;
struct afs_vlserver *server;
unsigned int rtt = UINT_MAX;
bool have_responders = false;
int pref = -1, i;
_enter("%u,%lx", vllist->nr_servers, untried);
/* Only wait for servers that have a probe outstanding. */
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
__clear_bit(i, &untried);
if (server->probe.responded)
have_responders = true;
}
}
if (have_responders || !untried)
return 0;
waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL);
if (!waits)
return -ENOMEM;
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
init_waitqueue_entry(&waits[i], current);
add_wait_queue(&server->probe_wq, &waits[i]);
}
}
for (;;) {
bool still_probing = false;
set_current_state(TASK_INTERRUPTIBLE);
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
if (server->probe.responded)
goto stop;
if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
still_probing = true;
}
}
if (!still_probing || signal_pending(current))
goto stop;
schedule();
}
stop:
set_current_state(TASK_RUNNING);
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
if (server->probe.responded &&
server->probe.rtt < rtt) {
pref = i;
rtt = server->probe.rtt;
}
remove_wait_queue(&server->probe_wq, &waits[i]);
}
}
kfree(waits);
if (pref == -1 && signal_pending(current))
return -ERESTARTSYS;
if (pref >= 0)
vllist->preferred = pref;
_leave(" = 0 [%u]", pref);
return 0;
}