diff --git a/include/linux/socket.h b/include/linux/socket.h index 5c19cba34dce..fab4d0ddf4ed 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -181,6 +181,7 @@ struct ucred { #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ #define AF_IB 27 /* Native InfiniBand address */ +#define AF_MPLS 28 /* MPLS */ #define AF_CAN 29 /* Controller Area Network */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ @@ -226,6 +227,7 @@ struct ucred { #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC #define PF_IB AF_IB +#define PF_MPLS AF_MPLS #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 36faf4990c4b..2cb9acb618e9 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -26,6 +26,7 @@ #endif #include #include +#include #include struct user_namespace; @@ -129,6 +130,9 @@ struct net { #endif #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; +#endif +#if IS_ENABLED(CONFIG_MPLS) + struct netns_mpls mpls; #endif struct sock *diag_nlsk; atomic_t fnhe_genid; diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h new file mode 100644 index 000000000000..f90aaf8d4f89 --- /dev/null +++ b/include/net/netns/mpls.h @@ -0,0 +1,15 @@ +/* + * mpls in net namespaces + */ + +#ifndef __NETNS_MPLS_H__ +#define __NETNS_MPLS_H__ + +struct mpls_route; + +struct netns_mpls { + size_t platform_labels; + struct mpls_route __rcu * __rcu *platform_label; +}; + +#endif /* __NETNS_MPLS_H__ */ diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index a77fbcdd04ee..f4286ee7e2b0 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -22,4 +22,9 @@ config NET_MPLS_GSO that have had MPLS stack entries pushed onto them and thus become MPLS GSO packets. +config MPLS_ROUTING + bool "MPLS: routing support" + help + Add support for forwarding of mpls packets. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 6dec088c2d0f..60af15f1960e 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -2,3 +2,4 @@ # Makefile for MPLS. # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o +obj-$(CONFIG_MPLS_ROUTING) += af_mpls.o diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c new file mode 100644 index 000000000000..924377736b2a --- /dev/null +++ b/net/mpls/af_mpls.c @@ -0,0 +1,349 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#define MAX_NEW_LABELS 2 + +/* This maximum ha length copied from the definition of struct neighbour */ +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) + +struct mpls_route { /* next hop label forwarding entry */ + struct net_device *rt_dev; + struct rcu_head rt_rcu; + u32 rt_label[MAX_NEW_LABELS]; + u8 rt_protocol; /* routing protocol that set this entry */ + u8 rt_labels:2, + rt_via_alen:6; + unsigned short rt_via_family; + u8 rt_via[0]; +}; + +static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) +{ + struct mpls_route *rt = NULL; + + if (index < net->mpls.platform_labels) { + struct mpls_route __rcu **platform_label = + rcu_dereference(net->mpls.platform_label); + rt = rcu_dereference(platform_label[index]); + } + return rt; +} + +static bool mpls_output_possible(const struct net_device *dev) +{ + return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); +} + +static unsigned int mpls_rt_header_size(const struct mpls_route *rt) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return rt->rt_labels * sizeof(struct mpls_shim_hdr); +} + +static unsigned int mpls_dev_mtu(const struct net_device *dev) +{ + /* The amount of data the layer 2 frame can hold */ + return dev->mtu; +} + +static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, + struct mpls_entry_decoded dec) +{ + /* RFC4385 and RFC5586 encode other packets in mpls such that + * they don't conflict with the ip version number, making + * decoding by examining the ip version correct in everything + * except for the strangest cases. + * + * The strange cases if we choose to support them will require + * manual configuration. + */ + struct iphdr *hdr4 = ip_hdr(skb); + bool success = true; + + if (hdr4->version == 4) { + skb->protocol = htons(ETH_P_IP); + csum_replace2(&hdr4->check, + htons(hdr4->ttl << 8), + htons(dec.ttl << 8)); + hdr4->ttl = dec.ttl; + } + else if (hdr4->version == 6) { + struct ipv6hdr *hdr6 = ipv6_hdr(skb); + skb->protocol = htons(ETH_P_IPV6); + hdr6->hop_limit = dec.ttl; + } + else + /* version 0 and version 1 are used by pseudo wires */ + success = false; + return success; +} + +static int mpls_forward(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct mpls_shim_hdr *hdr; + struct mpls_route *rt; + struct mpls_entry_decoded dec; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + int err; + + /* Careful this entire function runs inside of an rcu critical section */ + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto drop; + + if (!pskb_may_pull(skb, sizeof(*hdr))) + goto drop; + + /* Read and decode the label */ + hdr = mpls_hdr(skb); + dec = mpls_entry_decode(hdr); + + /* Pop the label */ + skb_pull(skb, sizeof(*hdr)); + skb_reset_network_header(skb); + + skb_orphan(skb); + + rt = mpls_route_input_rcu(net, dec.label); + if (!rt) + goto drop; + + /* Find the output device */ + out_dev = rt->rt_dev; + if (!mpls_output_possible(out_dev)) + goto drop; + + if (skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + /* Verify ttl is valid */ + if (dec.ttl <= 2) + goto drop; + dec.ttl -= 1; + + /* Verify the destination can hold the packet */ + new_header_size = mpls_rt_header_size(rt); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + if (unlikely(!new_header_size && dec.bos)) { + /* Penultimate hop popping */ + if (!mpls_egress(rt, skb, dec)) + goto drop; + } else { + bool bos; + int i; + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = dec.bos; + for (i = rt->rt_labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos); + bos = false; + } + } + + err = neigh_xmit(rt->rt_via_family, out_dev, rt->rt_via, skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + return 0; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type mpls_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_MPLS_UC), + .func = mpls_forward, +}; + +static struct mpls_route *mpls_rt_alloc(size_t alen) +{ + struct mpls_route *rt; + + rt = kzalloc(GFP_KERNEL, sizeof(*rt) + alen); + if (rt) + rt->rt_via_alen = alen; + return rt; +} + +static void mpls_rt_free(struct mpls_route *rt) +{ + if (rt) + kfree_rcu(rt, rt_rcu); +} + +static void mpls_route_update(struct net *net, unsigned index, + struct net_device *dev, struct mpls_route *new, + const struct nl_info *info) +{ + struct mpls_route *rt, *old = NULL; + + ASSERT_RTNL(); + + rt = net->mpls.platform_label[index]; + if (!dev || (rt && (rt->rt_dev == dev))) { + rcu_assign_pointer(net->mpls.platform_label[index], new); + old = rt; + } + + /* If we removed a route free it now */ + mpls_rt_free(old); +} + +static void mpls_ifdown(struct net_device *dev) +{ + struct net *net = dev_net(dev); + unsigned index; + + for (index = 0; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt = net->mpls.platform_label[index]; + if (!rt) + continue; + if (rt->rt_dev != dev) + continue; + rt->rt_dev = NULL; + } +} + +static int mpls_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch(event) { + case NETDEV_UNREGISTER: + mpls_ifdown(dev); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mpls_dev_notifier = { + .notifier_call = mpls_dev_notify, +}; + +static int mpls_net_init(struct net *net) +{ + net->mpls.platform_labels = 0; + net->mpls.platform_label = NULL; + + return 0; +} + +static void mpls_net_exit(struct net *net) +{ + unsigned int index; + + /* An rcu grace period haselapsed since there was a device in + * the network namespace (and thus the last in fqlight packet) + * left this network namespace. This is because + * unregister_netdevice_many and netdev_run_todo has completed + * for each network device that was in this network namespace. + * + * As such no additional rcu synchronization is necessary when + * freeing the platform_label table. + */ + rtnl_lock(); + for (index = 0; index < net->mpls.platform_labels; index++) { + struct mpls_route *rt = net->mpls.platform_label[index]; + rcu_assign_pointer(net->mpls.platform_label[index], NULL); + mpls_rt_free(rt); + } + rtnl_unlock(); + + kvfree(net->mpls.platform_label); +} + +static struct pernet_operations mpls_net_ops = { + .init = mpls_net_init, + .exit = mpls_net_exit, +}; + +static int __init mpls_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4); + + err = register_pernet_subsys(&mpls_net_ops); + if (err) + goto out; + + err = register_netdevice_notifier(&mpls_dev_notifier); + if (err) + goto out_unregister_pernet; + + dev_add_pack(&mpls_packet_type); + + err = 0; +out: + return err; + +out_unregister_pernet: + unregister_pernet_subsys(&mpls_net_ops); + goto out; +} +module_init(mpls_init); + +static void __exit mpls_exit(void) +{ + dev_remove_pack(&mpls_packet_type); + unregister_netdevice_notifier(&mpls_dev_notifier); + unregister_pernet_subsys(&mpls_net_ops); +} +module_exit(mpls_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_NETPROTO(PF_MPLS); diff --git a/net/mpls/internal.h b/net/mpls/internal.h new file mode 100644 index 000000000000..c2944cb84d48 --- /dev/null +++ b/net/mpls/internal.h @@ -0,0 +1,56 @@ +#ifndef MPLS_INTERNAL_H +#define MPLS_INTERNAL_H + +#define LABEL_IPV4_EXPLICIT_NULL 0 /* RFC3032 */ +#define LABEL_ROUTER_ALERT_LABEL 1 /* RFC3032 */ +#define LABEL_IPV6_EXPLICIT_NULL 2 /* RFC3032 */ +#define LABEL_IMPLICIT_NULL 3 /* RFC3032 */ +#define LABEL_ENTROPY_INDICATOR 7 /* RFC6790 */ +#define LABEL_GAL 13 /* RFC5586 */ +#define LABEL_OAM_ALERT 14 /* RFC3429 */ +#define LABEL_EXTENSION 15 /* RFC7274 */ + + +struct mpls_shim_hdr { + __be32 label_stack_entry; +}; + +struct mpls_entry_decoded { + u32 label; + u8 ttl; + u8 tc; + u8 bos; +}; + +struct sk_buff; + +static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) +{ + return (struct mpls_shim_hdr *)skb_network_header(skb); +} + +static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) +{ + struct mpls_shim_hdr result; + result.label_stack_entry = + cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | + (tc << MPLS_LS_TC_SHIFT) | + (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | + (ttl << MPLS_LS_TTL_SHIFT)); + return result; +} + +static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) +{ + struct mpls_entry_decoded result; + unsigned entry = be32_to_cpu(hdr->label_stack_entry); + + result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; + + return result; +} + +#endif /* MPLS_INTERNAL_H */