
High Performance UML Vector Network Driver

1. Provides infrastructure for vector IO using recvmmsg/sendmmsg (a
host-side usage sketch follows this list).
    1.1. Multi-message read.
    1.2. Multi-message write.
    1.3. Optimized queue support for multi-packet enqueue/dequeue.
    1.4. BQL/DQL support.
2. Implements several transports, as well as support for direct
wiring of PWEs to the NIC. Allows direct connection of VMs to the
host, other VMs and network devices with no switch in use.
    2.1. Raw socket: >4 times higher PPS and 10 times higher TCP RX
    throughput than the existing pcap based transport (>4 Gbit).
    2.2. New tap transport using socket RX and tap xmit, with similar
    performance improvements (>4 Gbit).
    2.3. GRE transport - direct wiring to GRE PWE
    2.4. L2TPv3 transport - direct wiring to L2TPv3 PWE
3. Supports tuning, performance and offload-related settings via ethtool.
4. Initial BPF support - used in tap/raw to avoid software looping.
5. Scatter-gather support.
6. VNET and checksum offload support for the raw socket transport.
7. TSO/GSO support where applicable or available.
8. Migrates all error messages to netdev_*() and rate-limits them
where needed.
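
Point 1 above relies on the host's multi-message socket calls. As a host-side
illustration of the pattern (a minimal sketch using the standard glibc API, not
code from this patch; batch depth and buffer size are arbitrary), a receive
batch looks roughly like this:

#define _GNU_SOURCE
#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define BATCH 32		/* arbitrary batch depth, for illustration */
#define MAX_FRAME 1536		/* arbitrary per-frame buffer size */

/* Read up to BATCH datagrams from fd with a single recvmmsg() call.
 * Returns the number of messages received, 0 on EAGAIN, or -errno,
 * mirroring the convention of the uml_vector_* wrappers in this patch. */
static int recv_batch(int fd, char bufs[BATCH][MAX_FRAME])
{
	struct mmsghdr msgvec[BATCH];
	struct iovec iov[BATCH];
	int i, n;

	memset(msgvec, 0, sizeof(msgvec));
	for (i = 0; i < BATCH; i++) {
		iov[i].iov_base = bufs[i];
		iov[i].iov_len = MAX_FRAME;
		msgvec[i].msg_hdr.msg_iov = &iov[i];
		msgvec[i].msg_hdr.msg_iovlen = 1;
	}
	n = recvmmsg(fd, msgvec, BATCH, MSG_DONTWAIT, NULL);
	if (n < 0)
		return (errno == EAGAIN) ? 0 : -errno;
	/* msgvec[i].msg_len now holds the length of each received frame */
	return n;
}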

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
Anton Ivanov 2017-11-20 21:17:59 +00:00 committed by Richard Weinberger
parent ff6a17989c
commit 49da7e64f3
10 changed files with 2932 additions and 3 deletions


@@ -109,6 +109,17 @@ config UML_NET_DAEMON
more than one without conflict. If you don't need UML networking,
say N.
config UML_NET_VECTOR
bool "Vector I/O high performance network devices"
depends on UML_NET
help
This User-Mode Linux network driver uses multi-message send
and receive functions. The host running the UML guest must have
a Linux kernel version above 3.0 and a libc version > 2.13.
This driver provides tap, raw, gre and l2tpv3 network transports
with up to 4 times higher network throughput than the existing
UML network drivers.
config UML_NET_VDE
bool "VDE transport"
depends on UML_NET


@@ -9,6 +9,7 @@
slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
vector-objs := vector_kern.o vector_user.o vector_transports.o
umcast-objs := umcast_kern.o umcast_user.o
net-objs := net_kern.o net_user.o
mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o
obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
obj-$(CONFIG_UML_NET_DAEMON) += daemon.o
obj-$(CONFIG_UML_NET_VECTOR) += vector.o
obj-$(CONFIG_UML_NET_VDE) += vde.o
obj-$(CONFIG_UML_NET_MCAST) += umcast.o
obj-$(CONFIG_UML_NET_PCAP) += pcap.o
@@ -61,7 +63,7 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o
obj-$(CONFIG_UML_RANDOM) += random.o
# pcap_user.o must be added explicitly.
USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o
USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o
CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH)
include arch/um/scripts/Makefile.rules


@@ -288,7 +288,7 @@ static void uml_net_user_timer_expire(struct timer_list *t)
#endif
}
static void setup_etheraddr(struct net_device *dev, char *str)
void uml_net_setup_etheraddr(struct net_device *dev, char *str)
{
unsigned char *addr = dev->dev_addr;
char *end;
@@ -412,7 +412,7 @@ static void eth_configure(int n, void *init, char *mac,
*/
snprintf(dev->name, sizeof(dev->name), "eth%d", n);
setup_etheraddr(dev, mac);
uml_net_setup_etheraddr(dev, mac);
printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr);
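
This hunk drops the static qualifier and renames setup_etheraddr() to
uml_net_setup_etheraddr() so the MAC address parsing helper can be shared
with the new vector driver; the matching extern declaration is added to
net_kern.h at the end of this patch.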

File diff suppressed because it is too large.


@@ -0,0 +1,129 @@
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
#ifndef __UM_VECTOR_KERN_H
#define __UM_VECTOR_KERN_H
#include <linux/netdevice.h>
#include <linux/platform_device.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include "vector_user.h"
/* Queue structure specially adapted for multiple enqueue/dequeue
* in a recvmmsg/sendmmsg context
*/
/* Dequeue method */
#define QUEUE_SENDMSG 0
#define QUEUE_SENDMMSG 1
#define VECTOR_RX 1
#define VECTOR_TX (1 << 1)
#define VECTOR_BPF (1 << 2)
#define ETH_MAX_PACKET 1500
#define ETH_HEADER_OTHER 32 /* just in case someone decides to go mad on QinQ */
struct vector_queue {
struct mmsghdr *mmsg_vector;
void **skbuff_vector;
/* backlink to device which owns us */
struct net_device *dev;
spinlock_t head_lock;
spinlock_t tail_lock;
int queue_depth, head, tail, max_depth, max_iov_frags;
short options;
};
struct vector_estats {
uint64_t rx_queue_max;
uint64_t rx_queue_running_average;
uint64_t tx_queue_max;
uint64_t tx_queue_running_average;
uint64_t rx_encaps_errors;
uint64_t tx_timeout_count;
uint64_t tx_restart_queue;
uint64_t tx_kicks;
uint64_t tx_flow_control_xon;
uint64_t tx_flow_control_xoff;
uint64_t rx_csum_offload_good;
uint64_t rx_csum_offload_errors;
uint64_t sg_ok;
uint64_t sg_linearized;
};
#define VERIFY_HEADER_NOK -1
#define VERIFY_HEADER_OK 0
#define VERIFY_CSUM_OK 1
struct vector_private {
struct list_head list;
spinlock_t lock;
struct net_device *dev;
int unit;
/* Timeout timer in TX */
struct timer_list tl;
/* Scheduled "remove device" work */
struct work_struct reset_tx;
struct vector_fds *fds;
struct vector_queue *rx_queue;
struct vector_queue *tx_queue;
int rx_irq;
int tx_irq;
struct arglist *parsed;
void *transport_data; /* transport specific params if needed */
int max_packet;
int req_size; /* different from max packet - used for TSO */
int headroom;
int options;
/* remote address if any - some transports will leave this as null */
int header_size;
int rx_header_size;
int coalesce;
void *header_rxbuffer;
void *header_txbuffer;
int (*form_header)(uint8_t *header,
struct sk_buff *skb, struct vector_private *vp);
int (*verify_header)(uint8_t *header,
struct sk_buff *skb, struct vector_private *vp);
spinlock_t stats_lock;
struct tasklet_struct tx_poll;
bool rexmit_scheduled;
bool opened;
bool in_write_poll;
/* ethtool stats */
struct vector_estats estats;
void *bpf;
char user[0];
};
extern int build_transport_data(struct vector_private *vp);
#endif
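
The BQL/DQL support listed in the commit message is implemented in
vector_kern.c, whose diff is suppressed above. As a generic reminder of the
pattern only (standard netdevice API, not code taken from this patch), byte
queue limits are driven from the transmit and completion paths roughly like
this:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Generic BQL pattern - illustration only, not from vector_kern.c */
static void example_xmit_accounting(struct net_device *dev, struct sk_buff *skb)
{
	/* tell BQL how many bytes were handed to the device for transmit */
	netdev_sent_queue(dev, skb->len);
}

static void example_tx_completion(struct net_device *dev,
				  unsigned int pkts, unsigned int bytes)
{
	/* report completed work so DQL can adapt the in-flight byte limit */
	netdev_completed_queue(dev, pkts, bytes);
}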


@@ -0,0 +1,458 @@
/*
* Copyright (C) 2017 - Cambridge Greys Limited
* Copyright (C) 2011 - 2014 Cisco Systems Inc
* Licensed under the GPL.
*/
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <asm/byteorder.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/virtio_net.h>
#include <linux/virtio_net.h>
#include <linux/virtio_byteorder.h>
#include <linux/netdev_features.h>
#include "vector_user.h"
#include "vector_kern.h"
#define GOOD_LINEAR 512
#define GSO_ERROR "Incoming GSO frames and GRO disabled on the interface"
struct gre_minimal_header {
uint16_t header;
uint16_t arptype;
};
struct uml_gre_data {
uint32_t rx_key;
uint32_t tx_key;
uint32_t sequence;
bool ipv6;
bool has_sequence;
bool pin_sequence;
bool checksum;
bool key;
struct gre_minimal_header expected_header;
uint32_t checksum_offset;
uint32_t key_offset;
uint32_t sequence_offset;
};
struct uml_l2tpv3_data {
uint64_t rx_cookie;
uint64_t tx_cookie;
uint64_t rx_session;
uint64_t tx_session;
uint32_t counter;
bool udp;
bool ipv6;
bool has_counter;
bool pin_counter;
bool cookie;
bool cookie_is_64;
uint32_t cookie_offset;
uint32_t session_offset;
uint32_t counter_offset;
};
static int l2tpv3_form_header(uint8_t *header,
struct sk_buff *skb, struct vector_private *vp)
{
struct uml_l2tpv3_data *td = vp->transport_data;
uint32_t *counter;
if (td->udp)
*(uint32_t *) header = cpu_to_be32(L2TPV3_DATA_PACKET);
(*(uint32_t *) (header + td->session_offset)) = td->tx_session;
if (td->cookie) {
if (td->cookie_is_64)
(*(uint64_t *)(header + td->cookie_offset)) =
td->tx_cookie;
else
(*(uint32_t *)(header + td->cookie_offset)) =
td->tx_cookie;
}
if (td->has_counter) {
counter = (uint32_t *)(header + td->counter_offset);
if (td->pin_counter) {
*counter = 0;
} else {
td->counter++;
*counter = cpu_to_be32(td->counter);
}
}
return 0;
}
static int gre_form_header(uint8_t *header,
struct sk_buff *skb, struct vector_private *vp)
{
struct uml_gre_data *td = vp->transport_data;
uint32_t *sequence;
*((uint32_t *) header) = *((uint32_t *) &td->expected_header);
if (td->key)
(*(uint32_t *) (header + td->key_offset)) = td->tx_key;
if (td->has_sequence) {
sequence = (uint32_t *)(header + td->sequence_offset);
if (td->pin_sequence)
*sequence = 0;
else
*sequence = cpu_to_be32(++td->sequence);
}
return 0;
}
static int raw_form_header(uint8_t *header,
struct sk_buff *skb, struct vector_private *vp)
{
struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header;
virtio_net_hdr_from_skb(
skb,
vheader,
virtio_legacy_is_little_endian(),
false
);
return 0;
}
static int l2tpv3_verify_header(
uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
{
struct uml_l2tpv3_data *td = vp->transport_data;
uint32_t *session;
uint64_t cookie;
if ((!td->udp) && (!td->ipv6))
header += sizeof(struct iphdr) /* fix for ipv4 raw */;
/* we do not do a strict check for "data" packets as per
* the RFC spec because the pure IP spec does not have
* that anyway.
*/
if (td->cookie) {
if (td->cookie_is_64)
cookie = *(uint64_t *)(header + td->cookie_offset);
else
cookie = *(uint32_t *)(header + td->cookie_offset);
if (cookie != td->rx_cookie) {
if (net_ratelimit())
netdev_err(vp->dev, "uml_l2tpv3: unknown cookie id");
return -1;
}
}
session = (uint32_t *) (header + td->session_offset);
if (*session != td->rx_session) {
if (net_ratelimit())
netdev_err(vp->dev, "uml_l2tpv3: session mismatch");
return -1;
}
return 0;
}
static int gre_verify_header(
uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
{
uint32_t key;
struct uml_gre_data *td = vp->transport_data;
if (!td->ipv6)
header += sizeof(struct iphdr) /* fix for ipv4 raw */;
if (*((uint32_t *) header) != *((uint32_t *) &td->expected_header)) {
if (net_ratelimit())
netdev_err(vp->dev, "header type disagreement, expecting %0x, got %0x",
*((uint32_t *) &td->expected_header),
*((uint32_t *) header)
);
return -1;
}
if (td->key) {
key = (*(uint32_t *)(header + td->key_offset));
if (key != td->rx_key) {
if (net_ratelimit())
netdev_err(vp->dev, "unknown key id %0x, expecting %0x",
key, td->rx_key);
return -1;
}
}
return 0;
}
static int raw_verify_header(
uint8_t *header, struct sk_buff *skb, struct vector_private *vp)
{
struct virtio_net_hdr *vheader = (struct virtio_net_hdr *) header;
if ((vheader->gso_type != VIRTIO_NET_HDR_GSO_NONE) &&
(vp->req_size != 65536)) {
if (net_ratelimit())
netdev_err(
vp->dev,
GSO_ERROR
);
}
if ((vheader->flags & VIRTIO_NET_HDR_F_DATA_VALID) > 0)
return 1;
virtio_net_hdr_to_skb(skb, vheader, virtio_legacy_is_little_endian());
return 0;
}
static bool get_uint_param(
struct arglist *def, char *param, unsigned int *result)
{
char *arg = uml_vector_fetch_arg(def, param);
if (arg != NULL) {
if (kstrtoint(arg, 0, result) == 0)
return true;
}
return false;
}
static bool get_ulong_param(
struct arglist *def, char *param, unsigned long *result)
{
char *arg = uml_vector_fetch_arg(def, param);
if (arg != NULL) {
if (kstrtoul(arg, 0, result) == 0)
return true;
}
return false;
}
static int build_gre_transport_data(struct vector_private *vp)
{
struct uml_gre_data *td;
int temp_int;
int temp_rx;
int temp_tx;
vp->transport_data = kzalloc(sizeof(struct uml_gre_data), GFP_KERNEL);
if (vp->transport_data == NULL)
return -ENOMEM;
td = vp->transport_data;
td->sequence = 0;
td->expected_header.arptype = GRE_IRB;
td->expected_header.header = 0;
vp->form_header = &gre_form_header;
vp->verify_header = &gre_verify_header;
vp->header_size = 4;
td->key_offset = 4;
td->sequence_offset = 4;
td->checksum_offset = 4;
td->ipv6 = false;
if (get_uint_param(vp->parsed, "v6", &temp_int)) {
if (temp_int > 0)
td->ipv6 = true;
}
td->key = false;
if (get_uint_param(vp->parsed, "rx_key", &temp_rx)) {
if (get_uint_param(vp->parsed, "tx_key", &temp_tx)) {
td->key = true;
td->expected_header.header |= GRE_MODE_KEY;
td->rx_key = cpu_to_be32(temp_rx);
td->tx_key = cpu_to_be32(temp_tx);
vp->header_size += 4;
td->sequence_offset += 4;
} else {
return -EINVAL;
}
}
td->has_sequence = false;
if (get_uint_param(vp->parsed, "sequence", &temp_int)) {
if (temp_int > 0) {
vp->header_size += 4;
td->has_sequence = true;
td->expected_header.header |= GRE_MODE_SEQUENCE;
if (get_uint_param(
vp->parsed, "pin_sequence", &temp_int)) {
if (temp_int > 0)
td->pin_sequence = true;
}
}
}
vp->rx_header_size = vp->header_size;
if (!td->ipv6)
vp->rx_header_size += sizeof(struct iphdr);
return 0;
}
static int build_l2tpv3_transport_data(struct vector_private *vp)
{
struct uml_l2tpv3_data *td;
int temp_int, temp_rxs, temp_txs;
unsigned long temp_rx;
unsigned long temp_tx;
vp->transport_data = kzalloc(
sizeof(struct uml_l2tpv3_data), GFP_KERNEL);
if (vp->transport_data == NULL)
return -ENOMEM;
td = vp->transport_data;
vp->form_header = &l2tpv3_form_header;
vp->verify_header = &l2tpv3_verify_header;
td->counter = 0;
vp->header_size = 4;
td->session_offset = 0;
td->cookie_offset = 4;
td->counter_offset = 4;
td->ipv6 = false;
if (get_uint_param(vp->parsed, "v6", &temp_int)) {
if (temp_int > 0)
td->ipv6 = true;
}
if (get_uint_param(vp->parsed, "rx_session", &temp_rxs)) {
if (get_uint_param(vp->parsed, "tx_session", &temp_txs)) {
td->tx_session = cpu_to_be32(temp_txs);
td->rx_session = cpu_to_be32(temp_rxs);
} else {
return -EINVAL;
}
} else {
return -EINVAL;
}
td->cookie_is_64 = false;
if (get_uint_param(vp->parsed, "cookie64", &temp_int)) {
if (temp_int > 0)
td->cookie_is_64 = true;
}
td->cookie = false;
if (get_ulong_param(vp->parsed, "rx_cookie", &temp_rx)) {
if (get_ulong_param(vp->parsed, "tx_cookie", &temp_tx)) {
td->cookie = true;
if (td->cookie_is_64) {
td->rx_cookie = cpu_to_be64(temp_rx);
td->tx_cookie = cpu_to_be64(temp_tx);
vp->header_size += 8;
td->counter_offset += 8;
} else {
td->rx_cookie = cpu_to_be32(temp_rx);
td->tx_cookie = cpu_to_be32(temp_tx);
vp->header_size += 4;
td->counter_offset += 4;
}
} else {
return -EINVAL;
}
}
td->has_counter = false;
if (get_uint_param(vp->parsed, "counter", &temp_int)) {
if (temp_int > 0) {
td->has_counter = true;
vp->header_size += 4;
if (get_uint_param(
vp->parsed, "pin_counter", &temp_int)) {
if (temp_int > 0)
td->pin_counter = true;
}
}
}
if (get_uint_param(vp->parsed, "udp", &temp_int)) {
if (temp_int > 0) {
td->udp = true;
vp->header_size += 4;
td->counter_offset += 4;
td->session_offset += 4;
td->cookie_offset += 4;
}
}
vp->rx_header_size = vp->header_size;
if ((!td->ipv6) && (!td->udp))
vp->rx_header_size += sizeof(struct iphdr);
return 0;
}
static int build_raw_transport_data(struct vector_private *vp)
{
if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) {
if (!uml_raw_enable_vnet_headers(vp->fds->tx_fd))
return -1;
vp->form_header = &raw_form_header;
vp->verify_header = &raw_verify_header;
vp->header_size = sizeof(struct virtio_net_hdr);
vp->rx_header_size = sizeof(struct virtio_net_hdr);
vp->dev->hw_features |= (NETIF_F_TSO | NETIF_F_GRO);
vp->dev->features |=
(NETIF_F_RXCSUM | NETIF_F_HW_CSUM |
NETIF_F_TSO | NETIF_F_GRO);
netdev_info(
vp->dev,
"raw: using vnet headers for tso and tx/rx checksum"
);
}
return 0;
}
static int build_tap_transport_data(struct vector_private *vp)
{
if (uml_raw_enable_vnet_headers(vp->fds->rx_fd)) {
vp->form_header = &raw_form_header;
vp->verify_header = &raw_verify_header;
vp->header_size = sizeof(struct virtio_net_hdr);
vp->rx_header_size = sizeof(struct virtio_net_hdr);
vp->dev->hw_features |=
(NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO);
vp->dev->features |=
(NETIF_F_RXCSUM | NETIF_F_HW_CSUM |
NETIF_F_TSO | NETIF_F_GSO | NETIF_F_GRO);
netdev_info(
vp->dev,
"tap/raw: using vnet headers for tso and tx/rx checksum"
);
} else {
return 0; /* do not try to enable tap too if raw failed */
}
if (uml_tap_enable_vnet_headers(vp->fds->tx_fd))
return 0;
return -1;
}
int build_transport_data(struct vector_private *vp)
{
char *transport = uml_vector_fetch_arg(vp->parsed, "transport");
if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0)
return build_gre_transport_data(vp);
if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0)
return build_l2tpv3_transport_data(vp);
if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0)
return build_raw_transport_data(vp);
if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0)
return build_tap_transport_data(vp);
return 0;
}
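
To make the offset arithmetic in build_gre_transport_data() concrete: with
rx_key/tx_key and sequence=1 configured, the transmit header written by
gre_form_header() is laid out as follows (derived from the code above):

    bytes 0-3    GRE flags + GRE_IRB arptype    (base, header_size = 4)
    bytes 4-7    key                            (key_offset = 4)
    bytes 8-11   sequence number                (sequence_offset = 4 + 4)

so vp->header_size ends up as 12; for IPv4 raw sockets vp->rx_header_size is
12 + sizeof(struct iphdr), since the kernel delivers the IP header on receive.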


@@ -0,0 +1,586 @@
/*
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <errno.h>
#include <stddef.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
#include <netinet/ether.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <linux/virtio_net.h>
#include <netdb.h>
#include <stdlib.h>
#include <os.h>
#include <um_malloc.h>
#include "vector_user.h"
#define ID_GRE 0
#define ID_L2TPV3 1
#define ID_MAX 1
#define TOKEN_IFNAME "ifname"
#define TRANS_RAW "raw"
#define TRANS_RAW_LEN strlen(TRANS_RAW)
#define QDISC_FAIL "user_init_raw: could not disable qdisc on interface"
#define VNET_HDR_FAIL "could not enable vnet headers on fd %d"
#define TUN_GET_F_FAIL "tapraw: TUNGETFEATURES failed: %s"
#define L2TPV3_BIND_FAIL "l2tpv3_open : could not bind socket err=%i"
#define BPF_ATTACH_FAIL "Failed to attach filter size %d to %d, err %d\n"
/* This is a very ugly, brute-force lookup, but it is done
* only once at initialization so it is not worth doing hashes or
* anything more intelligent
*/
char *uml_vector_fetch_arg(struct arglist *ifspec, char *token)
{
int i;
for (i = 0; i < ifspec->numargs; i++) {
if (strcmp(ifspec->tokens[i], token) == 0)
return ifspec->values[i];
}
return NULL;
}
struct arglist *uml_parse_vector_ifspec(char *arg)
{
struct arglist *result;
int pos, len;
bool parsing_token = true, next_starts = true;
if (arg == NULL)
return NULL;
result = uml_kmalloc(sizeof(struct arglist), UM_GFP_KERNEL);
if (result == NULL)
return NULL;
result->numargs = 0;
len = strlen(arg);
for (pos = 0; pos < len; pos++) {
if (next_starts) {
if (parsing_token) {
result->tokens[result->numargs] = arg + pos;
} else {
result->values[result->numargs] = arg + pos;
result->numargs++;
}
next_starts = false;
}
if (*(arg + pos) == '=') {
if (parsing_token)
parsing_token = false;
else
goto cleanup;
next_starts = true;
(*(arg + pos)) = '\0';
}
if (*(arg + pos) == ',') {
parsing_token = true;
next_starts = true;
(*(arg + pos)) = '\0';
}
}
return result;
cleanup:
printk(UM_KERN_ERR "vector_setup - Couldn't parse '%s'\n", arg);
kfree(result);
return NULL;
}
/*
* Socket/FD configuration functions. These return a structure
* of rx and tx descriptors to cover cases where these are not
* the same (e.g. read via raw socket and write via tap).
*/
#define PATH_NET_TUN "/dev/net/tun"
static struct vector_fds *user_init_tap_fds(struct arglist *ifspec)
{
struct ifreq ifr;
int fd = -1;
struct sockaddr_ll sock;
int err = -ENOMEM, offload;
char *iface;
struct vector_fds *result = NULL;
iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME);
if (iface == NULL) {
printk(UM_KERN_ERR "uml_tap: failed to parse interface spec\n");
goto tap_cleanup;
}
result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL);
if (result == NULL) {
printk(UM_KERN_ERR "uml_tap: failed to allocate file descriptors\n");
goto tap_cleanup;
}
result->rx_fd = -1;
result->tx_fd = -1;
result->remote_addr = NULL;
result->remote_addr_size = 0;
/* TAP */
fd = open(PATH_NET_TUN, O_RDWR);
if (fd < 0) {
printk(UM_KERN_ERR "uml_tap: failed to open tun device\n");
goto tap_cleanup;
}
result->tx_fd = fd;
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
err = ioctl(fd, TUNSETIFF, (void *) &ifr);
if (err != 0) {
printk(UM_KERN_ERR "uml_tap: failed to select tap interface\n");
goto tap_cleanup;
}
offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
ioctl(fd, TUNSETOFFLOAD, offload);
/* RAW */
fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (fd == -1) {
printk(UM_KERN_ERR
"uml_tap: failed to create socket: %i\n", -errno);
goto tap_cleanup;
}
result->rx_fd = fd;
memset(&ifr, 0, sizeof(ifr));
strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
printk(UM_KERN_ERR
"uml_tap: failed to set interface: %i\n", -errno);
goto tap_cleanup;
}
sock.sll_family = AF_PACKET;
sock.sll_protocol = htons(ETH_P_ALL);
sock.sll_ifindex = ifr.ifr_ifindex;
if (bind(fd,
(struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
printk(UM_KERN_ERR
"user_init_tap: failed to bind raw pair, err %d\n",
-errno);
goto tap_cleanup;
}
return result;
tap_cleanup:
printk(UM_KERN_ERR "user_init_tap: init failed, error %d", err);
if (result != NULL) {
if (result->rx_fd >= 0)
os_close_file(result->rx_fd);
if (result->tx_fd >= 0)
os_close_file(result->tx_fd);
kfree(result);
}
return NULL;
}
static struct vector_fds *user_init_raw_fds(struct arglist *ifspec)
{
struct ifreq ifr;
int rxfd = -1, txfd = -1;
struct sockaddr_ll sock;
int err = -ENOMEM;
char *iface;
struct vector_fds *result = NULL;
int optval = 1;
iface = uml_vector_fetch_arg(ifspec, TOKEN_IFNAME);
if (iface == NULL)
goto cleanup;
rxfd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (rxfd == -1) {
err = -errno;
goto cleanup;
}
txfd = socket(AF_PACKET, SOCK_RAW, 0); /* Turn off RX on this fd */
if (txfd == -1) {
err = -errno;
goto cleanup;
}
memset(&ifr, 0, sizeof(ifr));
strncpy((char *)&ifr.ifr_name, iface, sizeof(ifr.ifr_name) - 1);
if (ioctl(rxfd, SIOCGIFINDEX, (void *) &ifr) < 0) {
err = -errno;
goto cleanup;
}
sock.sll_family = AF_PACKET;
sock.sll_protocol = htons(ETH_P_ALL);
sock.sll_ifindex = ifr.ifr_ifindex;
if (bind(rxfd,
(struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
err = -errno;
goto cleanup;
}
sock.sll_family = AF_PACKET;
sock.sll_protocol = htons(ETH_P_IP);
sock.sll_ifindex = ifr.ifr_ifindex;
if (bind(txfd,
(struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
err = -errno;
goto cleanup;
}
if (setsockopt(txfd,
SOL_PACKET, PACKET_QDISC_BYPASS,
&optval, sizeof(optval)) != 0) {
printk(UM_KERN_INFO QDISC_FAIL);
}
result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL);
if (result != NULL) {
result->rx_fd = rxfd;
result->tx_fd = txfd;
result->remote_addr = NULL;
result->remote_addr_size = 0;
}
return result;
cleanup:
printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err);
if (rxfd >= 0)
os_close_file(rxfd);
if (txfd >= 0)
os_close_file(txfd);
if (result != NULL)
kfree(result);
return NULL;
}
bool uml_raw_enable_vnet_headers(int fd)
{
int optval = 1;
if (setsockopt(fd,
SOL_PACKET, PACKET_VNET_HDR,
&optval, sizeof(optval)) != 0) {
printk(UM_KERN_INFO VNET_HDR_FAIL, fd);
return false;
}
return true;
}
bool uml_tap_enable_vnet_headers(int fd)
{
unsigned int features;
int len = sizeof(struct virtio_net_hdr);
if (ioctl(fd, TUNGETFEATURES, &features) == -1) {
printk(UM_KERN_INFO TUN_GET_F_FAIL, strerror(errno));
return false;
}
if ((features & IFF_VNET_HDR) == 0) {
printk(UM_KERN_INFO "tapraw: No VNET HEADER support");
return false;
}
ioctl(fd, TUNSETVNETHDRSZ, &len);
return true;
}
static struct vector_fds *user_init_socket_fds(struct arglist *ifspec, int id)
{
int err = -ENOMEM;
int fd = -1, gairet;
struct addrinfo srchints;
struct addrinfo dsthints;
bool v6, udp;
char *value;
char *src, *dst, *srcport, *dstport;
struct addrinfo *gairesult = NULL;
struct vector_fds *result = NULL;
value = uml_vector_fetch_arg(ifspec, "v6");
v6 = false;
udp = false;
if (value != NULL) {
if (strtol((const char *) value, NULL, 10) > 0)
v6 = true;
}
value = uml_vector_fetch_arg(ifspec, "udp");
if (value != NULL) {
if (strtol((const char *) value, NULL, 10) > 0)
udp = true;
}
src = uml_vector_fetch_arg(ifspec, "src");
dst = uml_vector_fetch_arg(ifspec, "dst");
srcport = uml_vector_fetch_arg(ifspec, "srcport");
dstport = uml_vector_fetch_arg(ifspec, "dstport");
memset(&dsthints, 0, sizeof(dsthints));
if (v6)
dsthints.ai_family = AF_INET6;
else
dsthints.ai_family = AF_INET;
switch (id) {
case ID_GRE:
dsthints.ai_socktype = SOCK_RAW;
dsthints.ai_protocol = IPPROTO_GRE;
break;
case ID_L2TPV3:
if (udp) {
dsthints.ai_socktype = SOCK_DGRAM;
dsthints.ai_protocol = 0;
} else {
dsthints.ai_socktype = SOCK_RAW;
dsthints.ai_protocol = IPPROTO_L2TP;
}
break;
default:
printk(KERN_ERR "Unsupported socket type\n");
return NULL;
}
memcpy(&srchints, &dsthints, sizeof(struct addrinfo));
gairet = getaddrinfo(src, srcport, &srchints, &gairesult);
if ((gairet != 0) || (gairesult == NULL)) {
printk(UM_KERN_ERR
"socket_open : could not resolve src, error = %s",
gai_strerror(gairet)
);
return NULL;
}
fd = socket(gairesult->ai_family,
gairesult->ai_socktype, gairesult->ai_protocol);
if (fd == -1) {
printk(UM_KERN_ERR
"socket_open : could not open socket, error = %d",
-errno
);
goto cleanup;
}
if (bind(fd,
(struct sockaddr *) gairesult->ai_addr,
gairesult->ai_addrlen)) {
printk(UM_KERN_ERR L2TPV3_BIND_FAIL, errno);
goto cleanup;
}
if (gairesult != NULL)
freeaddrinfo(gairesult);
gairesult = NULL;
gairet = getaddrinfo(dst, dstport, &dsthints, &gairesult);
if ((gairet != 0) || (gairesult == NULL)) {
printk(UM_KERN_ERR
"socket_open : could not resolve dst, error = %s",
gai_strerror(gairet)
);
goto cleanup;
}
result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL);
if (result != NULL) {
result->rx_fd = fd;
result->tx_fd = fd;
result->remote_addr = uml_kmalloc(
gairesult->ai_addrlen, UM_GFP_KERNEL);
if (result->remote_addr == NULL)
goto cleanup;
result->remote_addr_size = gairesult->ai_addrlen;
memcpy(
result->remote_addr,
gairesult->ai_addr,
gairesult->ai_addrlen
);
}
freeaddrinfo(gairesult);
return result;
cleanup:
if (gairesult != NULL)
freeaddrinfo(gairesult);
printk(UM_KERN_ERR "user_init_socket: init failed, error %d", err);
if (fd >= 0)
os_close_file(fd);
if (result != NULL) {
if (result->remote_addr != NULL)
kfree(result->remote_addr);
kfree(result);
}
return NULL;
}
struct vector_fds *uml_vector_user_open(
int unit,
struct arglist *parsed
)
{
char *transport;
if (parsed == NULL) {
printk(UM_KERN_ERR "no parsed config for unit %d\n", unit);
return NULL;
}
transport = uml_vector_fetch_arg(parsed, "transport");
if (transport == NULL) {
printk(UM_KERN_ERR "missing transport for unit %d\n", unit);
return NULL;
}
if (strncmp(transport, TRANS_RAW, TRANS_RAW_LEN) == 0)
return user_init_raw_fds(parsed);
if (strncmp(transport, TRANS_TAP, TRANS_TAP_LEN) == 0)
return user_init_tap_fds(parsed);
if (strncmp(transport, TRANS_GRE, TRANS_GRE_LEN) == 0)
return user_init_socket_fds(parsed, ID_GRE);
if (strncmp(transport, TRANS_L2TPV3, TRANS_L2TPV3_LEN) == 0)
return user_init_socket_fds(parsed, ID_L2TPV3);
return NULL;
}
int uml_vector_sendmsg(int fd, void *hdr, int flags)
{
int n;
CATCH_EINTR(n = sendmsg(fd, (struct msghdr *) hdr, flags));
if ((n < 0) && (errno == EAGAIN))
return 0;
if (n >= 0)
return n;
else
return -errno;
}
int uml_vector_recvmsg(int fd, void *hdr, int flags)
{
int n;
CATCH_EINTR(n = recvmsg(fd, (struct msghdr *) hdr, flags));
if ((n < 0) && (errno == EAGAIN))
return 0;
if (n >= 0)
return n;
else
return -errno;
}
int uml_vector_writev(int fd, void *hdr, int iovcount)
{
int n;
CATCH_EINTR(n = writev(fd, (struct iovec *) hdr, iovcount));
if ((n < 0) && (errno == EAGAIN))
return 0;
if (n >= 0)
return n;
else
return -errno;
}
int uml_vector_sendmmsg(
int fd,
void *msgvec,
unsigned int vlen,
unsigned int flags)
{
int n;
CATCH_EINTR(n = sendmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags));
if ((n < 0) && (errno == EAGAIN))
return 0;
if (n >= 0)
return n;
else
return -errno;
}
int uml_vector_recvmmsg(
int fd,
void *msgvec,
unsigned int vlen,
unsigned int flags)
{
int n;
CATCH_EINTR(
n = recvmmsg(fd, (struct mmsghdr *) msgvec, vlen, flags, 0));
if ((n < 0) && (errno == EAGAIN))
return 0;
if (n >= 0)
return n;
else
return -errno;
}
int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len)
{
int err = setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, bpf, bpf_len);
if (err < 0)
printk(KERN_ERR BPF_ATTACH_FAIL, bpf_len, fd, -errno);
return err;
}
#define DEFAULT_BPF_LEN 6
void *uml_vector_default_bpf(int fd, void *mac)
{
struct sock_filter *bpf;
uint32_t *mac1 = (uint32_t *)(mac + 2);
uint16_t *mac2 = (uint16_t *) mac;
struct sock_fprog bpf_prog = {
.len = 6,
.filter = NULL,
};
bpf = uml_kmalloc(
sizeof(struct sock_filter) * DEFAULT_BPF_LEN, UM_GFP_KERNEL);
if (bpf != NULL) {
bpf_prog.filter = bpf;
/* ld [8] */
bpf[0] = (struct sock_filter){ 0x20, 0, 0, 0x00000008 };
/* jeq #0xMAC[2-6] jt 2 jf 5*/
bpf[1] = (struct sock_filter){ 0x15, 0, 3, ntohl(*mac1)};
/* ldh [6] */
bpf[2] = (struct sock_filter){ 0x28, 0, 0, 0x00000006 };
/* jeq #0xMAC[0-1] jt 4 jf 5 */
bpf[3] = (struct sock_filter){ 0x15, 0, 1, ntohs(*mac2)};
/* ret #0 */
bpf[4] = (struct sock_filter){ 0x6, 0, 0, 0x00000000 };
/* ret #0x40000 */
bpf[5] = (struct sock_filter){ 0x6, 0, 0, 0x00040000 };
if (uml_vector_attach_bpf(
fd, &bpf_prog, sizeof(struct sock_fprog)) < 0) {
kfree(bpf);
bpf = NULL;
}
}
return bpf;
}
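
Decoding the classic BPF program built by uml_vector_default_bpf(): the first
load fetches bytes 8-11 of the frame (the last four bytes of the Ethernet
source MAC), the second fetches bytes 6-7 (the first two bytes), and the two
jeq tests compare them against the MAC passed in. A frame whose source MAC
matches the interface's own address falls through to ret #0 and is dropped;
everything else reaches ret #0x40000 and is accepted. This is the "avoid
software looping" use mentioned in the commit message: the guest never sees
its own transmitted frames echoed back on the raw/tap receive socket.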


@@ -0,0 +1,99 @@
/*
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
#ifndef __UM_VECTOR_USER_H
#define __UM_VECTOR_USER_H
#define MAXVARGS 20
#define TOKEN_IFNAME "ifname"
#define TRANS_RAW "raw"
#define TRANS_RAW_LEN strlen(TRANS_RAW)
#define TRANS_TAP "tap"
#define TRANS_TAP_LEN strlen(TRANS_TAP)
#define TRANS_GRE "gre"
#define TRANS_GRE_LEN strlen(TRANS_GRE)
#define TRANS_L2TPV3 "l2tpv3"
#define TRANS_L2TPV3_LEN strlen(TRANS_L2TPV3)
#ifndef IPPROTO_GRE
#define IPPROTO_GRE 0x2F
#endif
#define GRE_MODE_CHECKSUM cpu_to_be16(8 << 12) /* checksum */
#define GRE_MODE_RESERVED cpu_to_be16(4 << 12) /* unused */
#define GRE_MODE_KEY cpu_to_be16(2 << 12) /* KEY present */
#define GRE_MODE_SEQUENCE cpu_to_be16(1 << 12) /* sequence */
#define GRE_IRB cpu_to_be16(0x6558)
#define L2TPV3_DATA_PACKET 0x30000
/* IANA-assigned IP protocol ID for L2TPv3 */
#ifndef IPPROTO_L2TP
#define IPPROTO_L2TP 0x73
#endif
struct arglist {
int numargs;
char *tokens[MAXVARGS];
char *values[MAXVARGS];
};
/* Separating read and write FDs allows us to have different
* rx and tx methods. Example - read tap via raw socket using
* recvmmsg, write using legacy tap write calls
*/
struct vector_fds {
int rx_fd;
int tx_fd;
void *remote_addr;
int remote_addr_size;
};
#define VECTOR_READ 1
#define VECTOR_WRITE (1 << 1)
#define VECTOR_HEADERS (1 << 2)
extern struct arglist *uml_parse_vector_ifspec(char *arg);
extern struct vector_fds *uml_vector_user_open(
int unit,
struct arglist *parsed
);
extern char *uml_vector_fetch_arg(
struct arglist *ifspec,
char *token
);
extern int uml_vector_recvmsg(int fd, void *hdr, int flags);
extern int uml_vector_sendmsg(int fd, void *hdr, int flags);
extern int uml_vector_writev(int fd, void *hdr, int iovcount);
extern int uml_vector_sendmmsg(
int fd, void *msgvec,
unsigned int vlen,
unsigned int flags
);
extern int uml_vector_recvmmsg(
int fd,
void *msgvec,
unsigned int vlen,
unsigned int flags
);
extern void *uml_vector_default_bpf(int fd, void *mac);
extern int uml_vector_attach_bpf(int fd, void *bpf, int bpf_len);
extern bool uml_raw_enable_vnet_headers(int fd);
extern bool uml_tap_enable_vnet_headers(int fd);
#endif
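
Putting the userspace API declared in this header together, a raw transport is
brought up roughly as follows. This is a minimal sketch: the interface
specification string and MAC address are illustrative, and error handling is
reduced to early returns.

#include "vector_user.h"

static struct vector_fds *bring_up_raw_sketch(void)
{
	/* "transport" and "ifname" tokens come from this patch;
	 * the interface name and MAC below are hypothetical */
	static char spec[] = "transport=raw,ifname=eth0";
	unsigned char mac[6] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
	struct arglist *parsed;
	struct vector_fds *fds;

	parsed = uml_parse_vector_ifspec(spec);
	if (parsed == NULL)
		return NULL;
	fds = uml_vector_user_open(0, parsed);
	if (fds == NULL)
		return NULL;
	/* attach the default "drop our own frames" filter */
	uml_vector_default_bpf(fds->rx_fd, mac);
	/* enable virtio_net_hdr handling for checksum/TSO offloads */
	uml_raw_enable_vnet_headers(fds->rx_fd);
	uml_raw_enable_vnet_headers(fds->tx_fd);
	return fds;
}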


@@ -18,7 +18,19 @@
#define XTERM_IRQ 13
#define RANDOM_IRQ 14
#ifdef CONFIG_UML_NET_VECTOR
#define VECTOR_BASE_IRQ 15
#define VECTOR_IRQ_SPACE 8
#define LAST_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ)
#else
#define LAST_IRQ RANDOM_IRQ
#endif
#define NR_IRQS (LAST_IRQ + 1)
#endif
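
With CONFIG_UML_NET_VECTOR enabled, LAST_IRQ becomes 15 + 8 = 23 and NR_IRQS
grows from 15 to 24, reserving a block of IRQ numbers above the fixed ones for
the per-device RX/TX interrupts the vector driver allocates.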


@@ -65,5 +65,7 @@ extern int tap_setup_common(char *str, char *type, char **dev_name,
char **mac_out, char **gate_addr);
extern void register_transport(struct transport *new);
extern unsigned short eth_protocol(struct sk_buff *skb);
extern void uml_net_setup_etheraddr(struct net_device *dev, char *str);
#endif