diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index e9b7244ac381..89ee47c2f17d 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1084,4 +1084,50 @@ enum {
 	CAKE_ATM_MAX
 };
 
+
+/* TAPRIO */
+enum {
+	TC_TAPRIO_CMD_SET_GATES = 0x00,
+	TC_TAPRIO_CMD_SET_AND_HOLD = 0x01,
+	TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02,
+};
+
+enum {
+	TCA_TAPRIO_SCHED_ENTRY_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */
+	TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */
+	TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */
+	__TCA_TAPRIO_SCHED_ENTRY_MAX,
+};
+#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1)
+
+/* The format for schedule entry list is:
+ * [TCA_TAPRIO_SCHED_ENTRY_LIST]
+ *   [TCA_TAPRIO_SCHED_ENTRY]
+ *     [TCA_TAPRIO_SCHED_ENTRY_CMD]
+ *     [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]
+ *     [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]
+ */
+enum {
+	TCA_TAPRIO_SCHED_UNSPEC,
+	TCA_TAPRIO_SCHED_ENTRY,
+	__TCA_TAPRIO_SCHED_MAX,
+};
+
+#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
+
+enum {
+	TCA_TAPRIO_ATTR_UNSPEC,
+	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
+	TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */
+	TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
+	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
+	TCA_TAPRIO_PAD,
+	__TCA_TAPRIO_ATTR_MAX,
+};
+
+#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1)
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e95741388311..1b9afdee5ba9 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -194,6 +194,17 @@ config NET_SCH_ETF
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_etf.
 
+config NET_SCH_TAPRIO
+	tristate "Time Aware Priority (taprio) Scheduler"
+	help
+	  Say Y here if you want to use the Time Aware Priority (taprio) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_taprio.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_taprio.
+
 config NET_SCH_GRED
 	tristate "Generic Random Early Detection (GRED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f0403f49edcb..8a40431d7b5c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
+obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
new file mode 100644
index 000000000000..206e4dbed12f
--- /dev/null
+++ b/net/sched/sch_taprio.c
@@ -0,0 +1,962 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_taprio.c	Time Aware Priority Scheduler
+ *
+ * Authors:	Vinicius Costa Gomes
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/sch_generic.h>
+
+#define TAPRIO_ALL_GATES_OPEN -1
+
+struct sched_entry {
+	struct list_head list;
+
+	/* The instant that this entry "closes" and the next one
+	 * should open, the qdisc will make some effort so that no
+	 * packet leaves after this time.
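+	 * Transmission of a packet that would still be in flight at
+	 * close_time is held back by the guard band check in
+	 * taprio_dequeue().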
+	 */
+	ktime_t close_time;
+	atomic_t budget;
+	int index;
+	u32 gate_mask;
+	u32 interval;
+	u8 command;
+};
+
+struct taprio_sched {
+	struct Qdisc **qdiscs;
+	struct Qdisc *root;
+	s64 base_time;
+	int clockid;
+	int picos_per_byte; /* Using picoseconds because for 10Gbps+
+			     * speeds it's sub-nanoseconds per byte
+			     */
+	size_t num_entries;
+
+	/* Protects the update side of the RCU protected current_entry */
+	spinlock_t current_entry_lock;
+	struct sched_entry __rcu *current_entry;
+	struct list_head entries;
+	ktime_t (*get_time)(void);
+	struct hrtimer advance_timer;
+};
+
+static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			  struct sk_buff **to_free)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct Qdisc *child;
+	int queue;
+
+	queue = skb_get_queue_mapping(skb);
+
+	child = q->qdiscs[queue];
+	if (unlikely(!child))
+		return qdisc_drop(skb, sch, to_free);
+
+	qdisc_qstats_backlog_inc(sch, skb);
+	sch->q.qlen++;
+
+	return qdisc_enqueue(skb, child, to_free);
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct sched_entry *entry;
+	struct sk_buff *skb;
+	u32 gate_mask;
+	int i;
+
+	rcu_read_lock();
+	entry = rcu_dereference(q->current_entry);
+	gate_mask = entry ? entry->gate_mask : -1;
+	rcu_read_unlock();
+
+	if (!gate_mask)
+		return NULL;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct Qdisc *child = q->qdiscs[i];
+		int prio;
+		u8 tc;
+
+		if (unlikely(!child))
+			continue;
+
+		skb = child->ops->peek(child);
+		if (!skb)
+			continue;
+
+		prio = skb->priority;
+		tc = netdev_get_prio_tc_map(dev, prio);
+
+		if (!(gate_mask & BIT(tc)))
+			return NULL;
+
+		return skb;
+	}
+
+	return NULL;
+}
+
+static inline int length_to_duration(struct taprio_sched *q, int len)
+{
+	return (len * q->picos_per_byte) / 1000;
+}
+
+static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct sched_entry *entry;
+	struct sk_buff *skb;
+	u32 gate_mask;
+	int i;
+
+	rcu_read_lock();
+	entry = rcu_dereference(q->current_entry);
+	/* if there's no entry, it means that the schedule didn't
+	 * start yet, so force all gates to be open, this is in
+	 * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5
+	 * "AdminGateStates"
+	 */
+	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
+	rcu_read_unlock();
+
+	if (!gate_mask)
+		return NULL;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct Qdisc *child = q->qdiscs[i];
+		ktime_t guard;
+		int prio;
+		int len;
+		u8 tc;
+
+		if (unlikely(!child))
+			continue;
+
+		skb = child->ops->peek(child);
+		if (!skb)
+			continue;
+
+		prio = skb->priority;
+		tc = netdev_get_prio_tc_map(dev, prio);
+
+		if (!(gate_mask & BIT(tc)))
+			continue;
+
+		len = qdisc_pkt_len(skb);
+		guard = ktime_add_ns(q->get_time(),
+				     length_to_duration(q, len));
+
+		/* In the case that there's no gate entry, there's no
+		 * guard band ...
+		 */
+		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+		    ktime_after(guard, entry->close_time))
+			return NULL;
+
+		/* ... and no budget.
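+		 * The budget is the number of bytes that fit into the
+		 * entry's interval at the current link speed; it is
+		 * (re)computed in advance_sched() and decremented by
+		 * the packet length below.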
+		 */
+		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+		    atomic_sub_return(len, &entry->budget) < 0)
+			return NULL;
+
+		skb = child->ops->dequeue(child);
+		if (unlikely(!skb))
+			return NULL;
+
+		qdisc_bstats_update(sch, skb);
+		qdisc_qstats_backlog_dec(sch, skb);
+		sch->q.qlen--;
+
+		return skb;
+	}
+
+	return NULL;
+}
+
+static bool should_restart_cycle(const struct taprio_sched *q,
+				 const struct sched_entry *entry)
+{
+	WARN_ON(!entry);
+
+	return list_is_last(&entry->list, &q->entries);
+}
+
+static enum hrtimer_restart advance_sched(struct hrtimer *timer)
+{
+	struct taprio_sched *q = container_of(timer, struct taprio_sched,
+					      advance_timer);
+	struct sched_entry *entry, *next;
+	struct Qdisc *sch = q->root;
+	ktime_t close_time;
+
+	spin_lock(&q->current_entry_lock);
+	entry = rcu_dereference_protected(q->current_entry,
+					  lockdep_is_held(&q->current_entry_lock));
+
+	/* This is the case that it's the first time that the schedule
+	 * runs, so it only happens once per schedule. The first entry
+	 * is pre-calculated during the schedule initialization.
+	 */
+	if (unlikely(!entry)) {
+		next = list_first_entry(&q->entries, struct sched_entry,
+					list);
+		close_time = next->close_time;
+		goto first_run;
+	}
+
+	if (should_restart_cycle(q, entry))
+		next = list_first_entry(&q->entries, struct sched_entry,
+					list);
+	else
+		next = list_next_entry(entry, list);
+
+	close_time = ktime_add_ns(entry->close_time, next->interval);
+
+	next->close_time = close_time;
+	atomic_set(&next->budget,
+		   (next->interval * 1000) / q->picos_per_byte);
+
+first_run:
+	rcu_assign_pointer(q->current_entry, next);
+	spin_unlock(&q->current_entry_lock);
+
+	hrtimer_set_expires(&q->advance_timer, close_time);
+
+	rcu_read_lock();
+	__netif_schedule(sch);
+	rcu_read_unlock();
+
+	return HRTIMER_RESTART;
+}
+
+static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
+	[TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 },
+	[TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 },
+	[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
+	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
+	[TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
+	[TCA_TAPRIO_ATTR_PRIOMAP] = {
+		.len = sizeof(struct tc_mqprio_qopt)
+	},
+	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
+	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
+	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
+	[TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
+};
+
+static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
+			    struct netlink_ext_ack *extack)
+{
+	u32 interval = 0;
+
+	if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
+		entry->command = nla_get_u8(
+			tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
+
+	if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
+		entry->gate_mask = nla_get_u32(
+			tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
+
+	if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
+		interval = nla_get_u32(
+			tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
+
+	if (interval == 0) {
+		NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
+		return -EINVAL;
+	}
+
+	entry->interval = interval;
+
+	return 0;
+}
+
+static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
+			     int index, struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+	int err;
+
+	err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
+			       entry_policy, NULL);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	entry->index = index;
+
+	return fill_sched_entry(tb, entry, extack);
+}
+
+/* Returns the number of entries in case of success */
+static int parse_sched_single_entry(struct nlattr *n,
+				    struct taprio_sched *q,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+	struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { };
+	struct sched_entry *entry;
+	bool found = false;
+	u32 index;
+	int err;
+
+	err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX,
+			       n, entry_list_policy, NULL);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) {
+		NL_SET_ERR_MSG(extack, "Single-entry must include an entry");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX,
+			       tb_list[TCA_TAPRIO_SCHED_ENTRY],
+			       entry_policy, NULL);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+		return -EINVAL;
+	}
+
+	if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) {
+		NL_SET_ERR_MSG(extack, "Entry must specify an index");
+		return -EINVAL;
+	}
+
+	index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]);
+	if (index >= q->num_entries) {
+		NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule");
+		return -EINVAL;
+	}
+
+	list_for_each_entry(entry, &q->entries, list) {
+		if (entry->index == index) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		NL_SET_ERR_MSG(extack, "Could not find entry");
+		return -ENOENT;
+	}
+
+	err = fill_sched_entry(tb_entry, entry, extack);
+	if (err < 0)
+		return err;
+
+	return q->num_entries;
+}
+
+static int parse_sched_list(struct nlattr *list,
+			    struct taprio_sched *q,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *n;
+	int err, rem;
+	int i = 0;
+
+	if (!list)
+		return -EINVAL;
+
+	nla_for_each_nested(n, list, rem) {
+		struct sched_entry *entry;
+
+		if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
+			NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
+			continue;
+		}
+
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+			return -ENOMEM;
+		}
+
+		err = parse_sched_entry(n, entry, i, extack);
+		if (err < 0) {
+			kfree(entry);
+			return err;
+		}
+
+		list_add_tail(&entry->list, &q->entries);
+		i++;
+	}
+
+	q->num_entries = i;
+
+	return i;
+}
+
+/* Returns the number of entries in case of success */
+static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q,
+			    struct netlink_ext_ack *extack)
+{
+	int err = 0;
+	int clockid;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] &&
+	    tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
+		return -EINVAL;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0)
+		return -EINVAL;
+
+	if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID])
+		return -EINVAL;
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
+		q->base_time = nla_get_s64(
+			tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+		clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+
+		/* We only support static clockids and we don't allow
+		 * for it to be modified after the first init.
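+		 * Dynamic clocks passed in as file descriptors map to
+		 * negative clockids, which the check below also
+		 * rejects.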
+		 */
+		if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid))
+			return -EINVAL;
+
+		q->clockid = clockid;
+	}
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
+		err = parse_sched_list(
+			tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack);
+	else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
+		err = parse_sched_single_entry(
+			tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack);
+
+	/* parse_sched_* return the number of entries in the schedule,
+	 * a schedule with zero entries is an error.
+	 */
+	if (err == 0) {
+		NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry");
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+static int taprio_parse_mqprio_opt(struct net_device *dev,
+				   struct tc_mqprio_qopt *qopt,
+				   struct netlink_ext_ack *extack)
+{
+	int i, j;
+
+	if (!qopt) {
+		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
+		return -EINVAL;
+	}
+
+	/* Verify num_tc is not out of max range */
+	if (qopt->num_tc > TC_MAX_QUEUE) {
+		NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
+		return -EINVAL;
+	}
+
+	/* taprio imposes that traffic classes map 1:n to tx queues */
+	if (qopt->num_tc > dev->num_tx_queues) {
+		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
+		return -EINVAL;
+	}
+
+	/* Verify priority mapping uses valid tcs */
+	for (i = 0; i < TC_BITMASK + 1; i++) {
+		if (qopt->prio_tc_map[i] >= qopt->num_tc) {
+			NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < qopt->num_tc; i++) {
+		unsigned int last = qopt->offset[i] + qopt->count[i];
+
+		/* Verify the queue count is in tx range; being equal to
+		 * real_num_tx_queues indicates the last queue is in use.
+		 */
+		if (qopt->offset[i] >= dev->num_tx_queues ||
+		    !qopt->count[i] ||
+		    last > dev->real_num_tx_queues) {
+			NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
+			return -EINVAL;
+		}
+
+		/* Verify that the offset and counts do not overlap */
+		for (j = i + 1; j < qopt->num_tc; j++) {
+			if (last > qopt->offset[j]) {
+				NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
+				return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static ktime_t taprio_get_start_time(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct sched_entry *entry;
+	ktime_t now, base, cycle;
+	s64 n;
+
+	base = ns_to_ktime(q->base_time);
+	cycle = 0;
+
+	/* Calculate the cycle time by summing all the intervals.
+	 */
+	list_for_each_entry(entry, &q->entries, list)
+		cycle = ktime_add_ns(cycle, entry->interval);
+
+	if (!cycle)
+		return base;
+
+	now = q->get_time();
+
+	if (ktime_after(base, now))
+		return base;
+
+	/* Schedule the start time for the beginning of the next
+	 * cycle.
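+	 * That is: start = base + (n + 1) * cycle, where n is the
+	 * number of whole cycles that have elapsed between base and
+	 * now.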
+	 */
+	n = div64_s64(ktime_sub_ns(now, base), cycle);
+
+	return ktime_add_ns(base, (n + 1) * cycle);
+}
+
+static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct sched_entry *first;
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->current_entry_lock, flags);
+
+	first = list_first_entry(&q->entries, struct sched_entry,
+				 list);
+
+	first->close_time = ktime_add_ns(start, first->interval);
+	atomic_set(&first->budget,
+		   (first->interval * 1000) / q->picos_per_byte);
+	rcu_assign_pointer(q->current_entry, NULL);
+
+	spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
+}
+
+static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
+			 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mqprio_qopt *mqprio = NULL;
+	struct ethtool_link_ksettings ecmd;
+	int i, err, size;
+	s64 link_speed;
+	ktime_t start;
+
+	err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
+			       taprio_policy, extack);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
+		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
+
+	err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+	if (err < 0)
+		return err;
+
+	/* A schedule with less than one entry is an error */
+	size = parse_taprio_opt(tb, q, extack);
+	if (size < 0)
+		return size;
+
+	hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
+	q->advance_timer.function = advance_sched;
+
+	switch (q->clockid) {
+	case CLOCK_REALTIME:
+		q->get_time = ktime_get_real;
+		break;
+	case CLOCK_MONOTONIC:
+		q->get_time = ktime_get;
+		break;
+	case CLOCK_BOOTTIME:
+		q->get_time = ktime_get_boottime;
+		break;
+	case CLOCK_TAI:
+		q->get_time = ktime_get_clocktai;
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct netdev_queue *dev_queue;
+		struct Qdisc *qdisc;
+
+		dev_queue = netdev_get_tx_queue(dev, i);
+		qdisc = qdisc_create_dflt(dev_queue,
+					  &pfifo_qdisc_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)),
+					  extack);
+		if (!qdisc)
+			return -ENOMEM;
+
+		if (i < dev->real_num_tx_queues)
+			qdisc_hash_add(qdisc, false);
+
+		q->qdiscs[i] = qdisc;
+	}
+
+	if (mqprio) {
+		netdev_set_num_tc(dev, mqprio->num_tc);
+		for (i = 0; i < mqprio->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    mqprio->count[i],
+					    mqprio->offset[i]);
+
+		/* Always use supplied priority mappings */
+		for (i = 0; i < TC_BITMASK + 1; i++)
+			netdev_set_prio_tc_map(dev, i,
+					       mqprio->prio_tc_map[i]);
+	}
+
+	if (!__ethtool_get_link_ksettings(dev, &ecmd))
+		link_speed = ecmd.base.speed;
+	else
+		link_speed = SPEED_1000;
+
+	q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
+				      link_speed * 1000 * 1000);
+
+	start = taprio_get_start_time(sch);
+	if (!start)
+		return 0;
+
+	taprio_start_sched(sch, start);
+
+	return 0;
+}
+
+static void taprio_destroy(struct Qdisc *sch)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct sched_entry *entry, *n;
+	unsigned int i;
+
+	hrtimer_cancel(&q->advance_timer);
+
+	if (q->qdiscs) {
+		for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
+			qdisc_put(q->qdiscs[i]);
+
+		kfree(q->qdiscs);
+	}
+	q->qdiscs = NULL;
+
+	netdev_set_num_tc(dev, 0);
+
+	list_for_each_entry_safe(entry, n, &q->entries, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+
+	INIT_LIST_HEAD(&q->entries);
+	spin_lock_init(&q->current_entry_lock);
+
+	/* We may overwrite the configuration later */
+	hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
+
+	q->root = sch;
+
+	/* We only support static clockids. Use an invalid value as default
+	 * and get the valid one on taprio_change().
+	 */
+	q->clockid = -1;
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	/* pre-allocate qdisc, attachment can't fail */
+	q->qdiscs = kcalloc(dev->num_tx_queues,
+			    sizeof(q->qdiscs[0]),
+			    GFP_KERNEL);
+
+	if (!q->qdiscs)
+		return -ENOMEM;
+
+	if (!opt)
+		return -EINVAL;
+
+	return taprio_change(sch, opt, extack);
+}
+
+static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
+					     unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1;
+
+	if (ntx >= dev->num_tx_queues)
+		return NULL;
+
+	return netdev_get_tx_queue(dev, ntx);
+}
+
+static int taprio_graft(struct Qdisc *sch, unsigned long cl,
+			struct Qdisc *new, struct Qdisc **old,
+			struct netlink_ext_ack *extack)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return -EINVAL;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	*old = q->qdiscs[cl - 1];
+	q->qdiscs[cl - 1] = new;
+
+	if (new)
+		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return 0;
+}
+
+static int dump_entry(struct sk_buff *msg,
+		      const struct sched_entry *entry)
+{
+	struct nlattr *item;
+
+	item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY);
+	if (!item)
+		return -ENOSPC;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
+			entry->gate_mask))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
+			entry->interval))
+		goto nla_put_failure;
+
+	return nla_nest_end(msg, item);
+
+nla_put_failure:
+	nla_nest_cancel(msg, item);
+	return -1;
+}
+
+static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mqprio_qopt opt = { 0 };
+	struct nlattr *nest, *entry_list;
+	struct sched_entry *entry;
+	unsigned int i;
+
+	opt.num_tc = netdev_get_num_tc(dev);
+	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+
+	for (i = 0; i < netdev_get_num_tc(dev); i++) {
+		opt.count[i] = dev->tc_to_txq[i].count;
+		opt.offset[i] = dev->tc_to_txq[i].offset;
+	}
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		return -ENOSPC;
+
+	if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
+		goto options_error;
+
+	if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
+			q->base_time, TCA_TAPRIO_PAD))
+		goto options_error;
+
+	if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+		goto options_error;
+
+	entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
+	if (!entry_list)
+		goto options_error;
+
+	list_for_each_entry(entry, &q->entries, list) {
+		if (dump_entry(skb, entry) < 0)
+			goto options_error;
+	}
+
+	nla_nest_end(skb, entry_list);
+
+	return nla_nest_end(skb, nest);
+
+options_error:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return NULL;
+
+	return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
+{
+	unsigned int ntx = TC_H_MIN(classid);
+
+	if (!taprio_queue_get(sch, ntx))
+		return 0;
+	return ntx;
+}
+
+static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+
+	return 0;
+}
+
+static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+	__releases(d->lock)
+	__acquires(d->lock)
+{
+	struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+	sch = dev_queue->qdisc_sleeping;
+	if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+		return -1;
+	return 0;
+}
+
+static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx;
+
+	if (arg->stop)
+		return;
+
+	arg->count = arg->skip;
+	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
+						struct tcmsg *tcm)
+{
+	return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
+static const struct Qdisc_class_ops taprio_class_ops = {
+	.graft = taprio_graft,
+	.leaf = taprio_leaf,
+	.find = taprio_find,
+	.walk = taprio_walk,
+	.dump = taprio_dump_class,
+	.dump_stats = taprio_dump_class_stats,
+	.select_queue = taprio_select_queue,
+};
+
+static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
+	.cl_ops = &taprio_class_ops,
+	.id = "taprio",
+	.priv_size = sizeof(struct taprio_sched),
+	.init = taprio_init,
+	.destroy = taprio_destroy,
+	.peek = taprio_peek,
+	.dequeue = taprio_dequeue,
+	.enqueue = taprio_enqueue,
+	.dump = taprio_dump,
+	.owner = THIS_MODULE,
+};
+
+static int __init taprio_module_init(void)
+{
+	return register_qdisc(&taprio_qdisc_ops);
+}
+
+static void __exit taprio_module_exit(void)
+{
+	unregister_qdisc(&taprio_qdisc_ops);
+}
+
+module_init(taprio_module_init);
+module_exit(taprio_module_exit);
+MODULE_LICENSE("GPL");
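
For reference, a schedule matching the netlink attributes above could be set up with an iproute2 build carrying the corresponding taprio support (illustrative only; the device name, times and exact tc syntax are assumptions, not part of this patch):

    # tc qdisc replace dev eth0 parent root handle 100 taprio \
          num_tc 3 \
          map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
          queues 1@0 1@1 2@2 \
          base-time 1528743495910289987 \
          sched-entry S 01 300000 \
          sched-entry S 02 300000 \
          sched-entry S 04 400000 \
          clockid CLOCK_TAI

Here "map" and "queues" fill TCA_TAPRIO_ATTR_PRIOMAP (struct tc_mqprio_qopt), "base-time" and "clockid" fill TCA_TAPRIO_ATTR_SCHED_BASE_TIME and TCA_TAPRIO_ATTR_SCHED_CLOCKID, and each "sched-entry <cmd> <gate-mask> <interval>" becomes one TCA_TAPRIO_SCHED_ENTRY carrying its CMD, GATE_MASK and INTERVAL attributes.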