remarkable-linux/net/openvswitch/flow_table.c
Jesse Gross ae5f2fb1d5 openvswitch: Zero flows on allocation.
When support for megaflows was introduced, OVS needed to start
installing flows with a mask applied to them. Since masking is an
expensive operation, OVS also had an optimization that would only
take the parts of the flow keys that were covered by a non-zero
mask. The values stored in the remaining pieces should not matter
because they are masked out.

While this works fine for the purposes of matching (which must always
look at the mask), serialization to netlink can be problematic. Since
the flow and the mask are serialized separately, the uninitialized
portions of the flow can be encoded with whatever values happen to be
present.

In terms of functionality, this has little effect since these fields
will be masked out by definition. However, it leaks kernel memory to
userspace, which is a potential security vulnerability. It is also
possible that other code paths could look at the masked key and get
uninitialized data, although this does not currently appear to be an
issue in practice.

This removes the mask optimization for flows that are being installed.
This was always intended to be the case as the mask optimizations were
really targetting per-packet flow operations.

Fixes: 03f0d916 ("openvswitch: Mega flow implementation")
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-09-22 17:33:41 -07:00

782 lines
19 KiB
C

/*
* Copyright (c) 2007-2014 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA
*/
#include "flow.h"
#include "datapath.h"
#include "flow_netlink.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#define TBL_MIN_BUCKETS 1024
#define REHASH_INTERVAL (10 * 60 * HZ)
static struct kmem_cache *flow_cache;
struct kmem_cache *flow_stats_cache __read_mostly;
static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
return range->end - range->start;
}
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
bool full, const struct sw_flow_mask *mask)
{
int start = full ? 0 : mask->range.start;
int len = full ? sizeof *dst : range_n_bytes(&mask->range);
const long *m = (const long *)((const u8 *)&mask->key + start);
const long *s = (const long *)((const u8 *)src + start);
long *d = (long *)((u8 *)dst + start);
int i;
/* If 'full' is true then all of 'dst' is fully initialized. Otherwise,
* if 'full' is false the memory outside of the 'mask->range' is left
* uninitialized. This can be used as an optimization when further
* operations on 'dst' only use contents within 'mask->range'.
*/
for (i = 0; i < len; i += sizeof(long))
*d++ = *s++ & *m++;
}
struct sw_flow *ovs_flow_alloc(void)
{
struct sw_flow *flow;
struct flow_stats *stats;
int node;
flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
if (!flow)
return ERR_PTR(-ENOMEM);
flow->sf_acts = NULL;
flow->mask = NULL;
flow->id.unmasked_key = NULL;
flow->id.ufid_len = 0;
flow->stats_last_writer = NUMA_NO_NODE;
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
GFP_KERNEL | __GFP_ZERO, 0);
if (!stats)
goto err;
spin_lock_init(&stats->lock);
RCU_INIT_POINTER(flow->stats[0], stats);
for_each_node(node)
if (node != 0)
RCU_INIT_POINTER(flow->stats[node], NULL);
return flow;
err:
kmem_cache_free(flow_cache, flow);
return ERR_PTR(-ENOMEM);
}
int ovs_flow_tbl_count(const struct flow_table *table)
{
return table->count;
}
static struct flex_array *alloc_buckets(unsigned int n_buckets)
{
struct flex_array *buckets;
int i, err;
buckets = flex_array_alloc(sizeof(struct hlist_head),
n_buckets, GFP_KERNEL);
if (!buckets)
return NULL;
err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
if (err) {
flex_array_free(buckets);
return NULL;
}
for (i = 0; i < n_buckets; i++)
INIT_HLIST_HEAD((struct hlist_head *)
flex_array_get(buckets, i));
return buckets;
}
static void flow_free(struct sw_flow *flow)
{
int node;
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
for_each_node(node)
if (flow->stats[node])
kmem_cache_free(flow_stats_cache,
(struct flow_stats __force *)flow->stats[node]);
kmem_cache_free(flow_cache, flow);
}
static void rcu_free_flow_callback(struct rcu_head *rcu)
{
struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
flow_free(flow);
}
void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
if (!flow)
return;
if (deferred)
call_rcu(&flow->rcu, rcu_free_flow_callback);
else
flow_free(flow);
}
static void free_buckets(struct flex_array *buckets)
{
flex_array_free(buckets);
}
static void __table_instance_destroy(struct table_instance *ti)
{
free_buckets(ti->buckets);
kfree(ti);
}
static struct table_instance *table_instance_alloc(int new_size)
{
struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
if (!ti)
return NULL;
ti->buckets = alloc_buckets(new_size);
if (!ti->buckets) {
kfree(ti);
return NULL;
}
ti->n_buckets = new_size;
ti->node_ver = 0;
ti->keep_flows = false;
get_random_bytes(&ti->hash_seed, sizeof(u32));
return ti;
}
int ovs_flow_tbl_init(struct flow_table *table)
{
struct table_instance *ti, *ufid_ti;
ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!ti)
return -ENOMEM;
ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!ufid_ti)
goto free_ti;
rcu_assign_pointer(table->ti, ti);
rcu_assign_pointer(table->ufid_ti, ufid_ti);
INIT_LIST_HEAD(&table->mask_list);
table->last_rehash = jiffies;
table->count = 0;
table->ufid_count = 0;
return 0;
free_ti:
__table_instance_destroy(ti);
return -ENOMEM;
}
static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
struct table_instance *ti = container_of(rcu, struct table_instance, rcu);
__table_instance_destroy(ti);
}
static void table_instance_destroy(struct table_instance *ti,
struct table_instance *ufid_ti,
bool deferred)
{
int i;
if (!ti)
return;
BUG_ON(!ufid_ti);
if (ti->keep_flows)
goto skip_flows;
for (i = 0; i < ti->n_buckets; i++) {
struct sw_flow *flow;
struct hlist_head *head = flex_array_get(ti->buckets, i);
struct hlist_node *n;
int ver = ti->node_ver;
int ufid_ver = ufid_ti->node_ver;
hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) {
hlist_del_rcu(&flow->flow_table.node[ver]);
if (ovs_identifier_is_ufid(&flow->id))
hlist_del_rcu(&flow->ufid_table.node[ufid_ver]);
ovs_flow_free(flow, deferred);
}
}
skip_flows:
if (deferred) {
call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
} else {
__table_instance_destroy(ti);
__table_instance_destroy(ufid_ti);
}
}
/* No need for locking this function is called from RCU callback or
* error path.
*/
void ovs_flow_tbl_destroy(struct flow_table *table)
{
struct table_instance *ti = rcu_dereference_raw(table->ti);
struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
table_instance_destroy(ti, ufid_ti, false);
}
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
u32 *bucket, u32 *last)
{
struct sw_flow *flow;
struct hlist_head *head;
int ver;
int i;
ver = ti->node_ver;
while (*bucket < ti->n_buckets) {
i = 0;
head = flex_array_get(ti->buckets, *bucket);
hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
if (i < *last) {
i++;
continue;
}
*last = i + 1;
return flow;
}
(*bucket)++;
*last = 0;
}
return NULL;
}
static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
{
hash = jhash_1word(hash, ti->hash_seed);
return flex_array_get(ti->buckets,
(hash & (ti->n_buckets - 1)));
}
static void table_instance_insert(struct table_instance *ti,
struct sw_flow *flow)
{
struct hlist_head *head;
head = find_bucket(ti, flow->flow_table.hash);
hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head);
}
static void ufid_table_instance_insert(struct table_instance *ti,
struct sw_flow *flow)
{
struct hlist_head *head;
head = find_bucket(ti, flow->ufid_table.hash);
hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
}
static void flow_table_copy_flows(struct table_instance *old,
struct table_instance *new, bool ufid)
{
int old_ver;
int i;
old_ver = old->node_ver;
new->node_ver = !old_ver;
/* Insert in new table. */
for (i = 0; i < old->n_buckets; i++) {
struct sw_flow *flow;
struct hlist_head *head;
head = flex_array_get(old->buckets, i);
if (ufid)
hlist_for_each_entry(flow, head,
ufid_table.node[old_ver])
ufid_table_instance_insert(new, flow);
else
hlist_for_each_entry(flow, head,
flow_table.node[old_ver])
table_instance_insert(new, flow);
}
old->keep_flows = true;
}
static struct table_instance *table_instance_rehash(struct table_instance *ti,
int n_buckets, bool ufid)
{
struct table_instance *new_ti;
new_ti = table_instance_alloc(n_buckets);
if (!new_ti)
return NULL;
flow_table_copy_flows(ti, new_ti, ufid);
return new_ti;
}
int ovs_flow_tbl_flush(struct flow_table *flow_table)
{
struct table_instance *old_ti, *new_ti;
struct table_instance *old_ufid_ti, *new_ufid_ti;
new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!new_ti)
return -ENOMEM;
new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!new_ufid_ti)
goto err_free_ti;
old_ti = ovsl_dereference(flow_table->ti);
old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);
rcu_assign_pointer(flow_table->ti, new_ti);
rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
flow_table->last_rehash = jiffies;
flow_table->count = 0;
flow_table->ufid_count = 0;
table_instance_destroy(old_ti, old_ufid_ti, true);
return 0;
err_free_ti:
__table_instance_destroy(new_ti);
return -ENOMEM;
}
static u32 flow_hash(const struct sw_flow_key *key,
const struct sw_flow_key_range *range)
{
int key_start = range->start;
int key_end = range->end;
const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
int hash_u32s = (key_end - key_start) >> 2;
/* Make sure number of hash bytes are multiple of u32. */
BUILD_BUG_ON(sizeof(long) % sizeof(u32));
return jhash2(hash_key, hash_u32s, 0);
}
static int flow_key_start(const struct sw_flow_key *key)
{
if (key->tun_key.u.ipv4.dst)
return 0;
else
return rounddown(offsetof(struct sw_flow_key, phy),
sizeof(long));
}
static bool cmp_key(const struct sw_flow_key *key1,
const struct sw_flow_key *key2,
int key_start, int key_end)
{
const long *cp1 = (const long *)((const u8 *)key1 + key_start);
const long *cp2 = (const long *)((const u8 *)key2 + key_start);
long diffs = 0;
int i;
for (i = key_start; i < key_end; i += sizeof(long))
diffs |= *cp1++ ^ *cp2++;
return diffs == 0;
}
static bool flow_cmp_masked_key(const struct sw_flow *flow,
const struct sw_flow_key *key,
const struct sw_flow_key_range *range)
{
return cmp_key(&flow->key, key, range->start, range->end);
}
static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
const struct sw_flow_match *match)
{
struct sw_flow_key *key = match->key;
int key_start = flow_key_start(key);
int key_end = match->range.end;
BUG_ON(ovs_identifier_is_ufid(&flow->id));
return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
}
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
const struct sw_flow_key *unmasked,
const struct sw_flow_mask *mask)
{
struct sw_flow *flow;
struct hlist_head *head;
u32 hash;
struct sw_flow_key masked_key;
ovs_flow_mask_key(&masked_key, unmasked, false, mask);
hash = flow_hash(&masked_key, &mask->range);
head = find_bucket(ti, hash);
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
}
return NULL;
}
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
const struct sw_flow_key *key,
u32 *n_mask_hit)
{
struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
struct sw_flow_mask *mask;
struct sw_flow *flow;
*n_mask_hit = 0;
list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
(*n_mask_hit)++;
flow = masked_flow_lookup(ti, key, mask);
if (flow) /* Found */
return flow;
}
return NULL;
}
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
const struct sw_flow_key *key)
{
u32 __always_unused n_mask_hit;
return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
const struct sw_flow_match *match)
{
struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
struct sw_flow_mask *mask;
struct sw_flow *flow;
/* Always called under ovs-mutex. */
list_for_each_entry(mask, &tbl->mask_list, list) {
flow = masked_flow_lookup(ti, match->key, mask);
if (flow && ovs_identifier_is_key(&flow->id) &&
ovs_flow_cmp_unmasked_key(flow, match))
return flow;
}
return NULL;
}
static u32 ufid_hash(const struct sw_flow_id *sfid)
{
return jhash(sfid->ufid, sfid->ufid_len, 0);
}
static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
const struct sw_flow_id *sfid)
{
if (flow->id.ufid_len != sfid->ufid_len)
return false;
return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
}
bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
{
if (ovs_identifier_is_ufid(&flow->id))
return flow_cmp_masked_key(flow, match->key, &match->range);
return ovs_flow_cmp_unmasked_key(flow, match);
}
struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
const struct sw_flow_id *ufid)
{
struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
struct sw_flow *flow;
struct hlist_head *head;
u32 hash;
hash = ufid_hash(ufid);
head = find_bucket(ti, hash);
hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) {
if (flow->ufid_table.hash == hash &&
ovs_flow_cmp_ufid(flow, ufid))
return flow;
}
return NULL;
}
int ovs_flow_tbl_num_masks(const struct flow_table *table)
{
struct sw_flow_mask *mask;
int num = 0;
list_for_each_entry(mask, &table->mask_list, list)
num++;
return num;
}
static struct table_instance *table_instance_expand(struct table_instance *ti,
bool ufid)
{
return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
}
/* Remove 'mask' from the mask list, if it is not needed any more. */
static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
{
if (mask) {
/* ovs-lock is required to protect mask-refcount and
* mask list.
*/
ASSERT_OVSL();
BUG_ON(!mask->ref_count);
mask->ref_count--;
if (!mask->ref_count) {
list_del_rcu(&mask->list);
kfree_rcu(mask, rcu);
}
}
}
/* Must be called with OVS mutex held. */
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
struct table_instance *ti = ovsl_dereference(table->ti);
struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
BUG_ON(table->count == 0);
hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
table->count--;
if (ovs_identifier_is_ufid(&flow->id)) {
hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
table->ufid_count--;
}
/* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
* accessible as long as the RCU read lock is held.
*/
flow_mask_remove(table, flow->mask);
}
static struct sw_flow_mask *mask_alloc(void)
{
struct sw_flow_mask *mask;
mask = kmalloc(sizeof(*mask), GFP_KERNEL);
if (mask)
mask->ref_count = 1;
return mask;
}
static bool mask_equal(const struct sw_flow_mask *a,
const struct sw_flow_mask *b)
{
const u8 *a_ = (const u8 *)&a->key + a->range.start;
const u8 *b_ = (const u8 *)&b->key + b->range.start;
return (a->range.end == b->range.end)
&& (a->range.start == b->range.start)
&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
}
static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
const struct sw_flow_mask *mask)
{
struct list_head *ml;
list_for_each(ml, &tbl->mask_list) {
struct sw_flow_mask *m;
m = container_of(ml, struct sw_flow_mask, list);
if (mask_equal(mask, m))
return m;
}
return NULL;
}
/* Add 'mask' into the mask list, if it is not already there. */
static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
const struct sw_flow_mask *new)
{
struct sw_flow_mask *mask;
mask = flow_mask_find(tbl, new);
if (!mask) {
/* Allocate a new mask if none exsits. */
mask = mask_alloc();
if (!mask)
return -ENOMEM;
mask->key = new->key;
mask->range = new->range;
list_add_rcu(&mask->list, &tbl->mask_list);
} else {
BUG_ON(!mask->ref_count);
mask->ref_count++;
}
flow->mask = mask;
return 0;
}
/* Must be called with OVS mutex held. */
static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
{
struct table_instance *new_ti = NULL;
struct table_instance *ti;
flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
ti = ovsl_dereference(table->ti);
table_instance_insert(ti, flow);
table->count++;
/* Expand table, if necessary, to make room. */
if (table->count > ti->n_buckets)
new_ti = table_instance_expand(ti, false);
else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
new_ti = table_instance_rehash(ti, ti->n_buckets, false);
if (new_ti) {
rcu_assign_pointer(table->ti, new_ti);
call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
table->last_rehash = jiffies;
}
}
/* Must be called with OVS mutex held. */
static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
{
struct table_instance *ti;
flow->ufid_table.hash = ufid_hash(&flow->id);
ti = ovsl_dereference(table->ufid_ti);
ufid_table_instance_insert(ti, flow);
table->ufid_count++;
/* Expand table, if necessary, to make room. */
if (table->ufid_count > ti->n_buckets) {
struct table_instance *new_ti;
new_ti = table_instance_expand(ti, true);
if (new_ti) {
rcu_assign_pointer(table->ufid_ti, new_ti);
call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
}
}
}
/* Must be called with OVS mutex held. */
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
const struct sw_flow_mask *mask)
{
int err;
err = flow_mask_insert(table, flow, mask);
if (err)
return err;
flow_key_insert(table, flow);
if (ovs_identifier_is_ufid(&flow->id))
flow_ufid_insert(table, flow);
return 0;
}
/* Initializes the flow module.
* Returns zero if successful or a negative error code. */
int ovs_flow_init(void)
{
BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+ (nr_node_ids
* sizeof(struct flow_stats *)),
0, 0, NULL);
if (flow_cache == NULL)
return -ENOMEM;
flow_stats_cache
= kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
0, SLAB_HWCACHE_ALIGN, NULL);
if (flow_stats_cache == NULL) {
kmem_cache_destroy(flow_cache);
flow_cache = NULL;
return -ENOMEM;
}
return 0;
}
/* Uninitializes the flow module. */
void ovs_flow_exit(void)
{
kmem_cache_destroy(flow_stats_cache);
kmem_cache_destroy(flow_cache);
}