From eeb84aa0d0aff3177c93397cdc62be87e54af486 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 May 2019 16:48:53 -0700 Subject: [PATCH 1/2] net_sched: sch_fq: do not assume EDT packets are ordered TCP stack makes sure packets for a given flow are monotically increasing, but we want to allow UDP packets to use EDT as well, so that QUIC servers can use in-kernel pacing. This patch adds a per-flow rb-tree on which packets might be stored. We still try to use the linear list for the typical cases where packets are queued with monotically increasing skb->tstamp, since queue/dequeue packets on a standard list is O(1). Note that the ability to store packets in arbitrary EDT order will allow us to implement later a per TCP socket mechanism adding delays (with jitter eventually) and reorders, to implement convenient network emulators. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 95 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index d107c74767cd..ee138365ec45 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -54,10 +54,23 @@ #include #include +struct fq_skb_cb { + u64 time_to_send; +}; + +static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb)); + return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data; +} + /* - * Per flow structure, dynamically allocated + * Per flow structure, dynamically allocated. + * If packets have monotically increasing time_to_send, they are placed in O(1) + * in linear list (head,tail), otherwise are placed in a rbtree (t_root). */ struct fq_flow { + struct rb_root t_root; struct sk_buff *head; /* list of skbs for this flow : first skb */ union { struct sk_buff *tail; /* last skb in the list */ @@ -298,6 +311,8 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) q->stat_allocation_errors++; return &q->internal; } + /* f->t_root is already zeroed after kmem_cache_zalloc() */ + fq_flow_set_detached(f); f->sk = sk; if (skb->sk) @@ -312,14 +327,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) return f; } +static struct sk_buff *fq_peek(struct fq_flow *flow) +{ + struct sk_buff *skb = skb_rb_first(&flow->t_root); + struct sk_buff *head = flow->head; + + if (!skb) + return head; + + if (!head) + return skb; + + if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send) + return skb; + return head; +} + +static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, + struct sk_buff *skb) +{ + if (skb == flow->head) { + flow->head = skb->next; + } else { + rb_erase(&skb->rbnode, &flow->t_root); + skb->dev = qdisc_dev(sch); + } +} /* remove one skb from head of flow queue */ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) { - struct sk_buff *skb = flow->head; + struct sk_buff *skb = fq_peek(flow); if (skb) { - flow->head = skb->next; + fq_erase_head(sch, flow, skb); skb_mark_not_on_list(skb); flow->qlen--; qdisc_qstats_backlog_dec(sch, skb); @@ -330,15 +371,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) { - struct sk_buff *head = flow->head; + struct rb_node **p, *parent; + struct sk_buff *head, *aux; - skb->next = NULL; - if (!head) - flow->head = skb; - else - flow->tail->next = skb; + fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); - flow->tail = skb; + head = flow->head; + if (!head || + fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { + if (!head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; + return; + } + + p = &flow->t_root.rb_node; + parent = NULL; + + while (*p) { + parent = *p; + aux = rb_to_skb(parent); + if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &flow->t_root); } static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -450,9 +512,9 @@ begin: goto begin; } - skb = f->head; + skb = fq_peek(f); if (skb) { - u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp), + u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); if (now < time_next_packet) { @@ -533,6 +595,15 @@ out: static void fq_flow_purge(struct fq_flow *flow) { + struct rb_node *p = rb_first(&flow->t_root); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + rb_erase(&skb->rbnode, &flow->t_root); + rtnl_kfree_skbs(skb, skb); + } rtnl_kfree_skbs(flow->head, flow->tail); flow->head = NULL; flow->qlen = 0; From 37c0aead7902b1ddf1b668e1ab74c80b9a7fd183 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 May 2019 16:48:54 -0700 Subject: [PATCH 2/2] net_sched: sch_fq: handle non connected flows FQ packet scheduler assumed that packets could be classified based on their owning socket. This means that if a UDP server uses one UDP socket to send packets to different destinations, packets all land in one FQ flow. This is unfair, since each TCP flow has a unique bucket, meaning that in case of pressure (fully utilised uplink), TCP flows have more share of the bandwidth. If we instead detect unconnected sockets, we can use a stochastic hash based on the 4-tuple hash. This also means a QUIC server using one UDP socket will properly spread the outgoing packets to different buckets, and in-kernel pacing based on EDT model will no longer risk having big rb-tree on one flow. Note that UDP application might provide the skb->hash in an ancillary message at sendmsg() time to avoid the cost of a dissection in fq packet scheduler. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index ee138365ec45..26a94e5cd5df 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -270,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) */ sk = (struct sock *)((hash << 1) | 1UL); skb_orphan(skb); + } else if (sk->sk_state == TCP_CLOSE) { + unsigned long hash = skb_get_hash(skb) & q->orphan_mask; + /* + * Sockets in TCP_CLOSE are non connected. + * Typical use case is UDP sockets, they can send packets + * with sendto() to many different destinations. + * We probably could use a generic bit advertising + * non connected sockets, instead of sk_state == TCP_CLOSE, + * if we care enough. + */ + sk = (struct sock *)((hash << 1) | 1UL); } root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; @@ -290,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) * It not, we need to refill credit with * initial quantum */ - if (unlikely(skb->sk && + if (unlikely(skb->sk == sk && f->socket_hash != sk->sk_hash)) { f->credit = q->initial_quantum; f->socket_hash = sk->sk_hash; @@ -315,7 +326,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) fq_flow_set_detached(f); f->sk = sk; - if (skb->sk) + if (skb->sk == sk) f->socket_hash = sk->sk_hash; f->credit = q->initial_quantum;