diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt new file mode 100644 index 000000000000..0d5dfbc89ec9 --- /dev/null +++ b/Documentation/networking/dctcp.txt @@ -0,0 +1,43 @@ +DCTCP (DataCenter TCP) +---------------------- + +DCTCP is an enhancement to the TCP congestion control algorithm for data +center networks and leverages Explicit Congestion Notification (ECN) in +the data center network to provide multi-bit feedback to the end hosts. + +To enable it on end hosts: + + sysctl -w net.ipv4.tcp_congestion_control=dctcp + +All switches in the data center network running DCTCP must support ECN +marking and be configured for marking when reaching defined switch buffer +thresholds. The default ECN marking threshold heuristic for DCTCP on +switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, +but might need further careful tweaking. + +For more details, see below documents: + +Paper: + +The algorithm is further described in detail in the following two +SIGCOMM/SIGMETRICS papers: + + i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + "Data Center TCP (DCTCP)", Data Center Networks session + Proc. ACM SIGCOMM, New Delhi, 2010. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 + +ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + "Analysis of DCTCP: Stability, Convergence, and Fairness" + Proc. ACM SIGMETRICS, San Jose, 2011. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + +IETF informational draft: + + http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 + +DCTCP site: + + http://simula.stanford.edu/~alizade/Site/DCTCP.html diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbde90fa5838..d65c0a09efd3 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -110,10 +110,10 @@ enum { INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, + INET_DIAG_DCTCPINFO, }; -#define INET_DIAG_MAX INET_DIAG_SHUTDOWN - +#define INET_DIAG_MAX INET_DIAG_DCTCPINFO /* INET_DIAG_MEM */ @@ -133,5 +133,14 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; +/* INET_DIAG_DCTCPINFO */ + +struct tcp_dctcp_info { + __u16 dctcp_enabled; + __u16 dctcp_ce_state; + __u32 dctcp_alpha; + __u32 dctcp_ab_ecn; + __u32 dctcp_ab_tot; +}; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 84f710b7472a..69fb37854449 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +config TCP_CONG_DCTCP + tristate "DataCenter TCP (DCTCP)" + default n + ---help--- + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: + + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. + + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. + + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -598,9 +619,11 @@ choice config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y + config DEFAULT_DCTCP + bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_RENO bool "Reno" - endchoice endif @@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO + default "dctcp" if DEFAULT_DCTCP default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d78d404c596f..d8105787c199 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c new file mode 100644 index 000000000000..b504371af742 --- /dev/null +++ b/net/ipv4/tcp_dctcp.c @@ -0,0 +1,344 @@ +/* DataCenter TCP (DCTCP) congestion control. + * + * http://simula.stanford.edu/~alizade/Site/DCTCP.html + * + * This is an implementation of DCTCP over Reno, an enhancement to the + * TCP congestion control algorithm designed for data centers. DCTCP + * leverages Explicit Congestion Notification (ECN) in the network to + * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet + * the following three data center transport requirements: + * + * - High burst tolerance (incast due to partition/aggregate) + * - Low latency (short flows, queries) + * - High throughput (continuous data updates, large file transfers) + * with commodity shallow buffered switches + * + * The algorithm is described in detail in the following two papers: + * + * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + * "Data Center TCP (DCTCP)", Data Center Networks session + * Proc. ACM SIGCOMM, New Delhi, 2010. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + * + * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + * "Analysis of DCTCP: Stability, Convergence, and Fairness" + * Proc. ACM SIGMETRICS, San Jose, 2011. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + * + * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. + * + * Authors: + * + * Daniel Borkmann + * Florian Westphal + * Glenn Judd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include + +#define DCTCP_MAX_ALPHA 1024U + +struct dctcp { + u32 acked_bytes_ecn; + u32 acked_bytes_total; + u32 prior_snd_una; + u32 prior_rcv_nxt; + u32 dctcp_alpha; + u32 next_seq; + u32 ce_state; + u32 delayed_ack_reserved; +}; + +static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ +module_param(dctcp_shift_g, uint, 0644); +MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); + +static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; +module_param(dctcp_alpha_on_init, uint, 0644); +MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); + +static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; +module_param(dctcp_clamp_alpha_on_loss, uint, 0644); +MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, + "parameter for clamping alpha on loss"); + +static struct tcp_congestion_ops dctcp_reno; + +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +{ + ca->next_seq = tp->snd_nxt; + + ca->acked_bytes_ecn = 0; + ca->acked_bytes_total = 0; +} + +static void dctcp_init(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if ((tp->ecn_flags & TCP_ECN_OK) || + (sk->sk_state == TCP_LISTEN || + sk->sk_state == TCP_CLOSE)) { + struct dctcp *ca = inet_csk_ca(sk); + + ca->prior_snd_una = tp->snd_una; + ca->prior_rcv_nxt = tp->rcv_nxt; + + ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); + + ca->delayed_ack_reserved = 0; + ca->ce_state = 0; + + dctcp_reset(tp, ca); + return; + } + + /* No ECN support? Fall back to Reno. Also need to clear + * ECT from sk since it is set during 3WHS for DCTCP. + */ + inet_csk(sk)->icsk_ca_ops = &dctcp_reno; + INET_ECN_dontxmit(sk); +} + +static u32 dctcp_ssthresh(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ + +static void dctcp_ce_state_0_to_1(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ + if (!ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=0. */ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; + + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +} + +static void dctcp_ce_state_1_to_0(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ + if (ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=1. */ + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; + + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static void dctcp_update_alpha(struct sock *sk, u32 flags) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + u32 acked_bytes = tp->snd_una - ca->prior_snd_una; + + /* If ack did not advance snd_una, count dupack as MSS size. + * If ack did update window, do not count it at all. + */ + if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) + acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; + if (acked_bytes) { + ca->acked_bytes_total += acked_bytes; + ca->prior_snd_una = tp->snd_una; + + if (flags & CA_ACK_ECE) + ca->acked_bytes_ecn += acked_bytes; + } + + /* Expired RTT */ + if (!before(tp->snd_una, ca->next_seq)) { + /* For avoiding denominator == 1. */ + if (ca->acked_bytes_total == 0) + ca->acked_bytes_total = 1; + + /* alpha = (1 - g) * alpha + g * F */ + ca->dctcp_alpha = ca->dctcp_alpha - + (ca->dctcp_alpha >> dctcp_shift_g) + + (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / + ca->acked_bytes_total; + + if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) + /* Clamp dctcp_alpha to max. */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + + dctcp_reset(tp, ca); + } +} + +static void dctcp_state(struct sock *sk, u8 new_state) +{ + if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { + struct dctcp *ca = inet_csk_ca(sk); + + /* If this extension is enabled, we clamp dctcp_alpha to + * max on packet loss; the motivation is that dctcp_alpha + * is an indicator to the extend of congestion and packet + * loss is an indicator of extreme congestion; setting + * this in practice turned out to be beneficial, and + * effectively assumes total congestion which reduces the + * window by half. + */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + } +} + +static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) +{ + struct dctcp *ca = inet_csk_ca(sk); + + switch (ev) { + case CA_EVENT_DELAYED_ACK: + if (!ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 1; + break; + case CA_EVENT_NON_DELAYED_ACK: + if (ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 0; + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ + switch (ev) { + case CA_EVENT_ECN_IS_CE: + dctcp_ce_state_0_to_1(sk); + break; + case CA_EVENT_ECN_NO_CE: + dctcp_ce_state_1_to_0(sk); + break; + case CA_EVENT_DELAYED_ACK: + case CA_EVENT_NON_DELAYED_ACK: + dctcp_update_ack_reserved(sk, ev); + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + /* Fill it also in case of VEGASINFO due to req struct limits. + * We can still correctly retrieve it later. + */ + if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_dctcp_info info; + + memset(&info, 0, sizeof(info)); + if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { + info.dctcp_enabled = 1; + info.dctcp_ce_state = (u16) ca->ce_state; + info.dctcp_alpha = ca->dctcp_alpha; + info.dctcp_ab_ecn = ca->acked_bytes_ecn; + info.dctcp_ab_tot = ca->acked_bytes_total; + } + + nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops dctcp __read_mostly = { + .init = dctcp_init, + .in_ack_event = dctcp_update_alpha, + .cwnd_event = dctcp_cwnd_event, + .ssthresh = dctcp_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .set_state = dctcp_state, + .get_info = dctcp_get_info, + .flags = TCP_CONG_NEEDS_ECN, + .owner = THIS_MODULE, + .name = "dctcp", +}; + +static struct tcp_congestion_ops dctcp_reno __read_mostly = { + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .get_info = dctcp_get_info, + .owner = THIS_MODULE, + .name = "dctcp-reno", +}; + +static int __init dctcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&dctcp); +} + +static void __exit dctcp_unregister(void) +{ + tcp_unregister_congestion_control(&dctcp); +} + +module_init(dctcp_register); +module_exit(dctcp_unregister); + +MODULE_AUTHOR("Daniel Borkmann "); +MODULE_AUTHOR("Florian Westphal "); +MODULE_AUTHOR("Glenn Judd "); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 124f9e4e4594..86a0216fcaa1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3211,6 +3211,7 @@ void tcp_send_ack(struct sock *sk) skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it.