From b121d1e74d1f24654bdc3165d3db1ca149501356 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:13 -0800 Subject: [PATCH 1/9] bpf: prevent kprobe+bpf deadlocks if kprobe is placed within update or delete hash map helpers that hold bucket spin lock and triggered bpf program is trying to grab the spinlock for the same bucket on the same cpu, it will deadlock. Fix it by extending existing recursion prevention mechanism. Note, map_lookup and other tracing helpers don't have this problem, since they don't hold any locks and don't modify global data. bpf_trace_printk has its own recursive check and ok as well. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 2 -- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 51e498e5470e..4b070827200d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bpf_map; @@ -163,6 +164,8 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f const struct bpf_func_proto *bpf_get_trace_printk_proto(void); #ifdef CONFIG_BPF_SYSCALL +DECLARE_PER_CPU(int, bpf_prog_active); + void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c95a753c2007..dc99f6a000f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include #include +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -347,6 +349,11 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { @@ -356,6 +363,8 @@ static int map_update_elem(union bpf_attr *attr) err = map->ops->map_update_elem(map, key, value, attr->flags); rcu_read_unlock(); } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -394,9 +403,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4b8caa392b86..3e4ffb3ace5f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,8 +13,6 @@ #include #include "trace.h" -static DEFINE_PER_CPU(int, bpf_prog_active); - /** * trace_call_bpf - invoke BPF program * @prog: BPF program From e19494edab82f55a633911f25094581891bdc351 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:14 -0800 Subject: [PATCH 2/9] bpf: introduce percpu_freelist Introduce simple percpu_freelist to keep single list of elements spread across per-cpu singly linked lists. 
/* push element into the list */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); /* pop element from the list */ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); The object is pushed to the current cpu list. Pop first trying to get the object from the current cpu list, if it's empty goes to the neigbour cpu list. For bpf program usage pattern the collision rate is very low, since programs push and pop the objects typically on the same cpu. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/percpu_freelist.c | 100 +++++++++++++++++++++++++++++++++++ kernel/bpf/percpu_freelist.h | 31 +++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/percpu_freelist.c create mode 100644 kernel/bpf/percpu_freelist.h diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8a932d079c24..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + 
raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include +#include + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:15 -0800 Subject: [PATCH 3/9] bpf: pre-allocate hash map elements If a kprobe is placed on spin_unlock then calling kmalloc/kfree from bpf programs is not safe, since the following deadlock is possible: kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe-> bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock) and deadlocks. The following solutions were considered and some implemented, but eventually discarded: - kmem_cache_create for every map - add recursion check to slow-path of slub - use reserved memory in bpf_map_update for in_irq or in preempt_disabled - kmalloc via irq_work In the end, pre-allocation of all map elements turned out to be the simplest solution, and since the user is charged upfront for all the memory, such pre-allocation doesn't affect the user space visible behavior. Since it's impossible to tell whether a kprobe is triggered in a safe location from the kmalloc point of view, use pre-allocation by default and introduce the new BPF_F_NO_PREALLOC flag. While testing per-cpu hash maps it was discovered that alloc_percpu(GFP_ATOMIC) has odd corner cases and often fails to allocate memory even when 90% of it is free. The pre-allocation of per-cpu hash elements solves this problem as well. It turned out that bpf_map_update() quickly followed by bpf_map_lookup()+bpf_map_delete() is a very common pattern used in many of iovisor/bcc/tools, so there is an additional benefit of pre-allocation, since such use cases are much faster. Since all hash map elements are now pre-allocated we can remove the atomic increment of htab->count and save a few more cycles. Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid large malloc/free done by users who don't have sufficient limits. Pre-allocation is done with vmalloc and alloc/free is done via percpu_freelist.
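A minimal sketch of that pre-allocation pattern (for illustration only; not part of this patch), assuming nothing beyond the pcpu_freelist_* API introduced in the previous patch; struct my_map, struct my_elem and the helper names here are hypothetical, not the actual hashtab.c code:

/* sketch: pre-allocate every element with vmalloc at map creation time,
 * then recycle elements through the per-cpu freelist instead of
 * calling kmalloc/kfree on the update/delete paths
 */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include "percpu_freelist.h"

struct my_elem {
	struct pcpu_freelist_node fnode; /* first member, so the cast in the pop helper is valid */
	char data[64];
};

struct my_map {
	struct pcpu_freelist freelist;
	void *elems;
	u32 max_entries;
};

static int my_map_prealloc(struct my_map *m)
{
	int err;

	m->elems = vzalloc(sizeof(struct my_elem) * m->max_entries);
	if (!m->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&m->freelist);
	if (err) {
		vfree(m->elems);
		return err;
	}

	/* hand all pre-allocated elements to the per-cpu freelist */
	pcpu_freelist_populate(&m->freelist, m->elems,
			       sizeof(struct my_elem), m->max_entries);
	return 0;
}

/* update path: no kmalloc, so safe even if a kprobe fires inside slub */
static struct my_elem *my_map_get_elem(struct my_map *m)
{
	return (struct my_elem *)pcpu_freelist_pop(&m->freelist);
}

/* delete path: recycle the element instead of kfree */
static void my_map_put_elem(struct my_map *m, struct my_elem *e)
{
	pcpu_freelist_push(&m->freelist, &e->fnode);
}

pcpu_freelist_pop() returns NULL once all max_entries elements are in use, which is what lets the real alloc_htab_elem() below return -E2BIG instead of allocating more memory.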
Here are performance numbers for different pre-allocation algorithms that were implemented, but discarded in favor of percpu_freelist:
1 cpu:
pcpu_ida          2.1M
pcpu_ida nolock   2.3M
bt                2.4M
kmalloc           1.8M
hlist+spinlock    2.3M
pcpu_freelist     2.6M
4 cpu:
pcpu_ida          1.5M
pcpu_ida nolock   1.8M
bt w/smp_align    1.7M
bt no/smp_align   1.1M
kmalloc           0.7M
hlist+spinlock    0.2M
pcpu_freelist     2.0M
8 cpu:
pcpu_ida          0.7M
bt w/smp_align    0.8M
kmalloc           0.4M
pcpu_freelist     1.5M
32 cpu:
kmalloc           0.13M
pcpu_freelist     0.49M
pcpu_ida nolock is a modified percpu_ida algorithm without percpu_ida_cpu locks and without cross-cpu tag stealing. It's faster than the existing percpu_ida, but not as fast as pcpu_freelist. bt is a variant of block/blk-mq-tag.c simplified and customized for the bpf use case. bt w/smp_align uses a cache line for every 'long' (similar to blk-mq-tag). bt no/smp_align allocates 'long' bitmasks contiguously to save memory. It's comparable to percpu_ida and in some cases faster, but slower than percpu_freelist. hlist+spinlock is the simplest free list with a single spinlock. As expected it has very bad scaling in SMP. kmalloc is the existing implementation, which is still available via the BPF_F_NO_PREALLOC flag. It's significantly slower on a single cpu, and in the 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist, but it saves memory, so in cases where map->max_entries can be large and the number of map updates/deletes per second is low, it may make sense to use it. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 3 + kernel/bpf/hashtab.c | 240 +++++++++++++++++++++++++++------------ kernel/bpf/syscall.c | 15 ++- 4 files changed, 186 insertions(+), 74 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4b070827200d..efd1d4ca95c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; + u32 map_flags; u32 pages; struct user_struct *user; const struct bpf_map_ops *ops; @@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); +int bpf_map_precharge_memlock(u32 pages); extern int sysctl_unprivileged_bpf_disabled; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9221f653fee3..0e30b19012a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,12 +101,15 @@ enum bpf_prog_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_NO_PREALLOC (1U << 0) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* prealloc or not */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a68e95133fcd..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include #include #include +#include "percpu_freelist.h" struct
bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,15 +33,86 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; - struct rcu_head rcu; union { - u32 hash; - u32 key_size; + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; }; + struct rcu_head rcu; + u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. 
* value_size == 0 may be allowed in the future to use map as a set @@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) htab->elem_size += sizeof(void *); else - htab->elem_size += htab->map.value_size; + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || @@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); @@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -249,42 +340,42 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } - -static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, - void __percpu *pptr) +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - *(void __percpu **)(l->key + key_size) = pptr; -} - -static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) -{ - return *(void __percpu **)(l->key + key_size); -} - -static void htab_percpu_elem_free(struct htab_elem *l) -{ - free_percpu(htab_elem_get_ptr(l, l->key_size)); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); + } -static void htab_percpu_elem_free_rcu(struct rcu_head *head) +static void htab_elem_free_rcu(struct rcu_head *head) { struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; - htab_percpu_elem_free(l); + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); } -static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (percpu) { - l->key_size = key_size; - call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - kfree_rcu(l, rcu); + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); } } @@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, bool percpu, bool onallcpus) { u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return NULL; + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + 
atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } memcpy(l_new->key, key, key_size); if (percpu) { /* round up value_size to 8 bytes */ size = round_up(size, 8); - /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); - if (!pptr) { - kfree(l_new); - return NULL; + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } } if (!onallcpus) { @@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, off += size; } } - htab_elem_set_ptr(l_new, key_size, pptr); + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); } @@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - return -E2BIG; - if (l_old && map_flags == BPF_NOEXIST) /* elem already exists */ return -EEXIST; @@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hash = htab_map_hash(key, key_size); - /* allocate new element outside of the lock, since - * we're most likley going to insert it - */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); - if (!l_new) - return -ENOMEM; - b = __select_bucket(htab, hash); head = &b->head; @@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); + goto err; + } + /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } - raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } @@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus); - if (!l_new) { - ret = -ENOMEM; + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); goto err; } hlist_add_head_rcu(&l_new->hash_node, head); - atomic_inc(&htab->count); } ret = 0; err: @@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - free_htab_elem(l, percpu, key_size); + free_htab_elem(htab, l); ret = 0; } @@ -531,17 +626,10 @@ 
static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { - l->key_size = htab->map.key_size; - htab_percpu_elem_free(l); - } else { - kfree(l); - } + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc99f6a000f5..cbd94b2144ff 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { From 823707b68d6e6c4b1be619b039c7045fef1740e6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:16 -0800 Subject: [PATCH 4/9] bpf: check for reserved flag bits in array and stack maps Suggested-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/stackmap.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd3bdf2486a7..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,7 +53,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8a60ee14a977..f0a02c344358 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -35,6 +35,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + if (attr->map_flags) + return ERR_PTR(-EINVAL); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || From 557c0c6e7df8e14a46bd7560d193fa5bbc00a858 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:17 -0800 Subject: [PATCH 5/9] bpf: convert stackmap to pre-allocation It was observed that calling bpf_get_stackid() from a kprobe inside slub or from spin_unlock causes a similar deadlock as with the hashmap, therefore convert stackmap to use pre-allocated memory. call_rcu is no longer a feasible mechanism, since delayed freeing causes bpf_get_stackid() to fail unpredictably when the number of actual stacks is significantly less than the user-requested max_entries. Since elements are no longer freed into slub, we can push elements into the freelist immediately and let them be recycled. However, a very unlikely race between user space map_lookup() and program-side recycling is possible:
cpu0                                 cpu1
----                                 ----
user does lookup(stackidX)
starts copying ips into buffer
                                     delete(stackidX)
                                     calls bpf_get_stackid()
                                     which recycles the element
                                     and overwrites with new stack trace
To avoid user space seeing a partial stack trace consisting of two merged stack traces, do bucket = xchg(, NULL); copy; xchg(,bucket); to preserve consistent stack trace delivery to user space. Now we can move the memset(,0) of the left-over element value from the critical path of bpf_get_stackid() into the slow path of the user space lookup. Also disallow lookup() from a bpf program, since it's useless and the program shouldn't be messing with the collected stack trace. Note that a similar race between user space lookup and kernel-side updates is also present in the hashmap, but it's not a new race: bpf programs were always allowed to modify hash and array map elements while user space is copying them. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/bpf.h | 1 + kernel/bpf/stackmap.c | 86 ++++++++++++++++++++++++++++++++++--------- kernel/bpf/syscall.c | 2 + 3 files changed, 71 insertions(+), 18 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efd1d4ca95c6..21ee41b92e8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f0a02c344358..499d9e933f8e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -10,9 +10,10 @@ #include #include #include +#include "percpu_freelist.h" struct stack_map_bucket { - struct rcu_head rcu; + struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 ip[]; @@ -20,10 +21,34 @@ struct stack_map_bucket { struct bpf_stack_map { struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; u32 n_buckets; - struct stack_map_bucket __rcu *buckets[]; + struct stack_map_bucket *buckets[]; }; +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { @@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + err = get_callchain_buffers(); if (err) goto free_smap; + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + return &smap->map; +put_buffers: + put_callchain_buffers(); free_smap: kvfree(smap); return ERR_PTR(err); @@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) ips = trace->ip + skip + init_nr; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); - bucket = rcu_dereference(smap->buckets[id]); + bucket = READ_ONCE(smap->buckets[id]); if (bucket && bucket->hash == hash) { if (flags & BPF_F_FAST_STACK_CMP) @@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; - new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, - GFP_ATOMIC | __GFP_NOWARN); + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->ip, ips, trace_len); - memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) - kfree_rcu(old_bucket, rcu); + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } @@ -160,17 +194,34 @@ const struct bpf_func_proto 
bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -/* Called from syscall or from eBPF program */ +/* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct stack_map_bucket *bucket; - u32 id = *(u32 *)key; + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; if (unlikely(id >= smap->n_buckets)) - return NULL; - bucket = rcu_dereference(smap->buckets[id]); - return bucket ? bucket->ip : NULL; + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { - kfree_rcu(old_bucket, rcu); + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; @@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - int i; + /* wait for bpf programs to complete before freeing stack map */ synchronize_rcu(); - for (i = 0; i < smap->n_buckets; i++) - if (smap->buckets[i]) - kfree_rcu(smap->buckets[i], rcu); + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); kvfree(smap); put_callchain_buffers(); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cbd94b2144ff..2978d0d08869 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); From 618ec9a7b1fda46f8dd4a630ded983aeb51218f3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:18 -0800 Subject: [PATCH 6/9] samples/bpf: make map creation more verbose map creation is typically the first one to fail when rlimits are too low, not enough memory, etc Make this failure scenario more verbose Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- samples/bpf/bpf_load.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index da86a8e0a95a..816bca5760a0 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -158,8 +158,11 @@ static int load_maps(struct bpf_map_def *maps, int len) maps[i].key_size, maps[i].value_size, maps[i].max_entries); - if (map_fd[i] < 0) + if (map_fd[i] < 0) { + printf("failed to create a map: %d %s\n", + errno, strerror(errno)); return 1; + } if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) prog_array_fd = map_fd[i]; From 3622e7e4935105991dc648bca650c858576aecda Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:19 -0800 Subject: [PATCH 7/9] samples/bpf: move ksym_search() into library move ksym search from offwaketime into library to be reused in other tests Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/bpf_load.c | 62 +++++++++++++++++++++++++++++++ samples/bpf/bpf_load.h | 6 +++ samples/bpf/offwaketime_user.c | 67 +--------------------------------- 3 files changed, 69 insertions(+), 66 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 816bca5760a0..d16864293c00 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -346,3 +346,65 @@ void read_trace_pipe(void) } } } + +#define MAX_SYMS 300000 +static struct ksym syms[MAX_SYMS]; +static int sym_cnt; + +static int ksym_cmp(const void *p1, const void *p2) +{ + return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; +} + +int load_kallsyms(void) +{ + FILE *f = fopen("/proc/kallsyms", "r"); + char func[256], buf[256]; + char symbol; + void *addr; + int i = 0; + + if (!f) + return -ENOENT; + + while (!feof(f)) { + if (!fgets(buf, sizeof(buf), f)) + break; + if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) + break; + if (!addr) + continue; + syms[i].addr = (long) addr; + syms[i].name = strdup(func); + i++; + } + sym_cnt = i; + qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); + return 0; +} + +struct ksym *ksym_search(long key) +{ + int start = 0, end = sym_cnt; + int result; + + while (start < end) { + size_t mid = start + (end - start) / 2; + + result = key - syms[mid].addr; + if (result < 0) + end = mid; + else if (result > 0) + start = mid + 1; + else + return &syms[mid]; + } + + if (start >= 1 && syms[start - 1].addr < key && + key < syms[start].addr) + /* valid ksym */ + return &syms[start - 1]; + + /* out of range. 
return _stext */ + return &syms[0]; +} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index cbd7c2b532b9..dfa57fe65c8e 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -23,5 +23,11 @@ extern int event_fd[MAX_PROGS]; int load_bpf_file(char *path); void read_trace_pipe(void); +struct ksym { + long addr; + char *name; +}; +int load_kallsyms(void); +struct ksym *ksym_search(long key); #endif diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c index 17cf3024e22c..6f002a9c24fa 100644 --- a/samples/bpf/offwaketime_user.c +++ b/samples/bpf/offwaketime_user.c @@ -18,80 +18,15 @@ #include "libbpf.h" #include "bpf_load.h" -#define MAX_SYMS 300000 #define PRINT_RAW_ADDR 0 -static struct ksym { - long addr; - char *name; -} syms[MAX_SYMS]; -static int sym_cnt; - -static int ksym_cmp(const void *p1, const void *p2) -{ - return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; -} - -static int load_kallsyms(void) -{ - FILE *f = fopen("/proc/kallsyms", "r"); - char func[256], buf[256]; - char symbol; - void *addr; - int i = 0; - - if (!f) - return -ENOENT; - - while (!feof(f)) { - if (!fgets(buf, sizeof(buf), f)) - break; - if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) - break; - if (!addr) - continue; - syms[i].addr = (long) addr; - syms[i].name = strdup(func); - i++; - } - sym_cnt = i; - qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); - return 0; -} - -static void *search(long key) -{ - int start = 0, end = sym_cnt; - int result; - - while (start < end) { - size_t mid = start + (end - start) / 2; - - result = key - syms[mid].addr; - if (result < 0) - end = mid; - else if (result > 0) - start = mid + 1; - else - return &syms[mid]; - } - - if (start >= 1 && syms[start - 1].addr < key && - key < syms[start].addr) - /* valid ksym */ - return &syms[start - 1]; - - /* out of range. return _stext */ - return &syms[0]; -} - static void print_ksym(__u64 addr) { struct ksym *sym; if (!addr) return; - sym = search(addr); + sym = ksym_search(addr); if (PRINT_RAW_ADDR) printf("%s/%llx;", sym->name, addr); else From 89b976070190eb9dd14943c0d6ca4b7209f61405 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:20 -0800 Subject: [PATCH 8/9] samples/bpf: add map_flags to bpf loader note old loader is compatible with new kernel. map_flags are optional Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- samples/bpf/bpf_helpers.h | 1 + samples/bpf/bpf_load.c | 3 ++- samples/bpf/fds_example.c | 2 +- samples/bpf/libbpf.c | 5 +++-- samples/bpf/libbpf.h | 2 +- samples/bpf/sock_example.c | 2 +- samples/bpf/test_maps.c | 19 ++++++++++++------- samples/bpf/test_verifier.c | 4 ++-- 8 files changed, 23 insertions(+), 15 deletions(-) diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index 811bcca0f29d..9363500131a7 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -61,6 +61,7 @@ struct bpf_map_def { unsigned int key_size; unsigned int value_size; unsigned int max_entries; + unsigned int map_flags; }; static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index d16864293c00..58f86bd11b3d 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -157,7 +157,8 @@ static int load_maps(struct bpf_map_def *maps, int len) map_fd[i] = bpf_create_map(maps[i].type, maps[i].key_size, maps[i].value_size, - maps[i].max_entries); + maps[i].max_entries, + maps[i].map_flags); if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", errno, strerror(errno)); diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index e2fd16c3d0f0..625e797be6ef 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -44,7 +44,7 @@ static void usage(void) static int bpf_map_create(void) { return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), - sizeof(uint32_t), 1024); + sizeof(uint32_t), 1024, 0); } static int bpf_prog_create(const char *object) diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c index 65a8d48d2799..9969e35550c3 100644 --- a/samples/bpf/libbpf.c +++ b/samples/bpf/libbpf.c @@ -19,13 +19,14 @@ static __u64 ptr_to_u64(void *ptr) } int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, - int max_entries) + int max_entries, int map_flags) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, - .max_entries = max_entries + .max_entries = max_entries, + .map_flags = map_flags, }; return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h index 014aacf916e4..364582b77888 100644 --- a/samples/bpf/libbpf.h +++ b/samples/bpf/libbpf.h @@ -5,7 +5,7 @@ struct bpf_insn; int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, - int max_entries); + int max_entries, int map_flags); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index a0ce251c5390..28b60baa9fa8 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -34,7 +34,7 @@ static int test_sock(void) long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), - 256); + 256, 0); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); goto cleanup; diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c index ad466ed33093..7bd9edd02d9b 100644 --- a/samples/bpf/test_maps.c +++ b/samples/bpf/test_maps.c @@ -2,6 +2,7 @@ * Testsuite for eBPF maps * * Copyright (c) 2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU 
General Public @@ -17,13 +18,16 @@ #include #include "libbpf.h" +static int map_flags; + /* sanity tests for map API */ static void test_hashmap_sanity(int i, void *data) { long long key, next_key, value; int map_fd; - map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2); + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), + 2, map_flags); if (map_fd < 0) { printf("failed to create hashmap '%s'\n", strerror(errno)); exit(1); @@ -99,7 +103,7 @@ static void test_percpu_hashmap_sanity(int task, void *data) int map_fd, i; map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key), - sizeof(value[0]), 2); + sizeof(value[0]), 2, map_flags); if (map_fd < 0) { printf("failed to create hashmap '%s'\n", strerror(errno)); exit(1); @@ -188,7 +192,8 @@ static void test_arraymap_sanity(int i, void *data) int key, next_key, map_fd; long long value; - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 2); + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), + 2, 0); if (map_fd < 0) { printf("failed to create arraymap '%s'\n", strerror(errno)); exit(1); @@ -244,7 +249,7 @@ static void test_percpu_arraymap_many_keys(void) int key, map_fd, i; map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), - sizeof(values[0]), nr_keys); + sizeof(values[0]), nr_keys, 0); if (map_fd < 0) { printf("failed to create per-cpu arraymap '%s'\n", strerror(errno)); @@ -275,7 +280,7 @@ static void test_percpu_arraymap_sanity(int i, void *data) int key, next_key, map_fd; map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), - sizeof(values[0]), 2); + sizeof(values[0]), 2, 0); if (map_fd < 0) { printf("failed to create arraymap '%s'\n", strerror(errno)); exit(1); @@ -336,7 +341,7 @@ static void test_map_large(void) /* allocate 4Mbyte of memory */ map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - MAP_SIZE); + MAP_SIZE, map_flags); if (map_fd < 0) { printf("failed to create large map '%s'\n", strerror(errno)); exit(1); @@ -421,7 +426,7 @@ static void test_map_parallel(void) int data[2]; map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - MAP_SIZE); + MAP_SIZE, map_flags); if (map_fd < 0) { printf("failed to create map for parallel test '%s'\n", strerror(errno)); diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 563c507c0a09..4b51a9039c0d 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -1198,7 +1198,7 @@ static int create_map(void) int map_fd; map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, - sizeof(long long), sizeof(long long), 1024); + sizeof(long long), sizeof(long long), 1024, 0); if (map_fd < 0) printf("failed to create map '%s'\n", strerror(errno)); @@ -1210,7 +1210,7 @@ static int create_prog_array(void) int map_fd; map_fd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, - sizeof(int), sizeof(int), 4); + sizeof(int), sizeof(int), 4, 0); if (map_fd < 0) printf("failed to create prog_array '%s'\n", strerror(errno)); From c3f85cffc50d2f259903555979581a632b945ec2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:21 -0800 Subject: [PATCH 9/9] samples/bpf: test both pre-alloc and normal maps extend test coveraged to include pre-allocated and run-time alloc maps Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- samples/bpf/test_maps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c index 7bd9edd02d9b..47bf0858f9e4 100644 --- a/samples/bpf/test_maps.c +++ b/samples/bpf/test_maps.c @@ -468,7 +468,7 @@ static void test_map_parallel(void) assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT); } -int main(void) +static void run_all_tests(void) { test_hashmap_sanity(0, NULL); test_percpu_hashmap_sanity(0, NULL); @@ -479,6 +479,14 @@ int main(void) test_map_large(); test_map_parallel(); test_map_stress(); +} + +int main(void) +{ + map_flags = 0; + run_all_tests(); + map_flags = BPF_F_NO_PREALLOC; + run_all_tests(); printf("test_maps: OK\n"); return 0; }
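For completeness, a minimal user-space sketch of how the extended samples/bpf API from patches 8 and 9 would be exercised. It assumes the libbpf.c/libbpf.h wrappers from this series and the updated uapi header; the key/value sizes and max_entries are arbitrary example values, not values taken from the patches:

/* create one pre-allocated and one run-time allocated hash map and
 * perform a basic update/lookup/delete round trip on the latter
 */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <linux/bpf.h>	/* BPF_MAP_TYPE_HASH, BPF_NOEXIST, BPF_F_NO_PREALLOC */
#include "libbpf.h"	/* samples/bpf wrappers: bpf_create_map() etc. */

int main(void)
{
	long long key = 1, value = 42, out = 0;
	int prealloc_fd, noprealloc_fd;

	/* map_flags == 0: elements are pre-allocated (the new default) */
	prealloc_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key),
				     sizeof(value), 1024, 0);
	/* BPF_F_NO_PREALLOC: keep the old kmalloc-per-element behavior */
	noprealloc_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key),
				       sizeof(value), 1024, BPF_F_NO_PREALLOC);
	if (prealloc_fd < 0 || noprealloc_fd < 0) {
		printf("failed to create map: %s\n", strerror(errno));
		return 1;
	}

	if (bpf_update_elem(noprealloc_fd, &key, &value, BPF_NOEXIST) ||
	    bpf_lookup_elem(noprealloc_fd, &key, &out) ||
	    bpf_delete_elem(noprealloc_fd, &key)) {
		printf("map operation failed: %s\n", strerror(errno));
		return 1;
	}
	printf("looked up value %lld\n", out);
	return 0;
}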