From 53d6eb08e9f185834231d5c32499b10310cce3aa Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Tue, 25 Sep 2018 15:20:37 -0700
Subject: [PATCH 01/64] bpftool: Fix bpftool net output

Print `bpftool net` output to stdout instead of stderr. Only errors
should be printed to stderr. Regular output should go to stdout and this
is what all other subcommands of bpftool do, including --json and
--pretty formats of `bpftool net` itself.

Fixes: commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/netlink_dumper.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h
index 0788cfbbed0e..e3516b586a34 100644
--- a/tools/bpf/bpftool/netlink_dumper.h
+++ b/tools/bpf/bpftool/netlink_dumper.h
@@ -16,7 +16,7 @@
 		jsonw_name(json_wtr, name);		\
 		jsonw_start_object(json_wtr);		\
 	} else {					\
-		fprintf(stderr, "%s {", name);		\
+		fprintf(stdout, "%s {", name);		\
 	}						\
 }
 
@@ -25,7 +25,7 @@
 	if (json_output)				\
 		jsonw_start_object(json_wtr);		\
 	else						\
-		fprintf(stderr, "{");			\
+		fprintf(stdout, "{");			\
 }
 
 #define NET_END_OBJECT_NESTED				\
@@ -33,7 +33,7 @@
 	if (json_output)				\
 		jsonw_end_object(json_wtr);		\
 	else						\
-		fprintf(stderr, "}");			\
+		fprintf(stdout, "}");			\
 }
 
 #define NET_END_OBJECT					\
@@ -47,7 +47,7 @@
 	if (json_output)				\
 		jsonw_end_object(json_wtr);		\
 	else						\
-		fprintf(stderr, "\n");			\
+		fprintf(stdout, "\n");			\
 }
 
 #define NET_START_ARRAY(name, fmt_str)			\
@@ -56,7 +56,7 @@
 		jsonw_name(json_wtr, name);		\
 		jsonw_start_array(json_wtr);		\
 	} else {					\
-		fprintf(stderr, fmt_str, name);		\
+		fprintf(stdout, fmt_str, name);		\
 	}						\
 }
 
@@ -65,7 +65,7 @@
 	if (json_output)				\
 		jsonw_end_array(json_wtr);		\
 	else						\
-		fprintf(stderr, "%s", endstr);		\
+		fprintf(stdout, "%s", endstr);		\
 }
 
 #define NET_DUMP_UINT(name, fmt_str, val)		\
@@ -73,7 +73,7 @@
 	if (json_output)				\
 		jsonw_uint_field(json_wtr, name, val);	\
 	else						\
-		fprintf(stderr, fmt_str, val);		\
+		fprintf(stdout, fmt_str, val);		\
 }
 
 #define NET_DUMP_STR(name, fmt_str, str)		\
@@ -81,7 +81,7 @@
 	if (json_output)				\
 		jsonw_string_field(json_wtr, name, str);\
 	else						\
-		fprintf(stderr, fmt_str, str);		\
+		fprintf(stdout, fmt_str, str);		\
 }
 
 #define NET_DUMP_STR_ONLY(str)				\
@@ -89,7 +89,7 @@
 	if (json_output)				\
 		jsonw_string(json_wtr, str);		\
 	else						\
-		fprintf(stderr, "%s ", str);		\
+		fprintf(stdout, "%s ", str);		\
 }
 
 #endif

From 100811936f89fd455eda1984810c09003550555b Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 27 Sep 2018 09:34:41 -0700
Subject: [PATCH 02/64] bpf: test_bpf: add init_net to dev for flow_dissector

Latest changes in __skb_flow_dissect() assume skb->dev has valid nd_net.
However, this is not true for test_bpf. As a result, test_bpf.ko crashes
the system with the following stack trace:

[ 1133.716622] BUG: unable to handle kernel paging request at 0000000000001030
[ 1133.716623] PGD 8000001fbf7ee067
[ 1133.716624] P4D 8000001fbf7ee067
[ 1133.716624] PUD 1f6c1cf067
[ 1133.716625] PMD 0
[ 1133.716628] Oops: 0000 [#1] SMP PTI
[ 1133.716630] CPU: 7 PID: 40473 Comm: modprobe Kdump: loaded Not tainted 4.19.0-rc5-00805-gca11cc92ccd2 #1167
[ 1133.716631] Hardware name: Wiwynn Leopard-Orv2/Leopard-DDR BW, BIOS LBM12.5 12/06/2017
[ 1133.716638] RIP: 0010:__skb_flow_dissect+0x83/0x1680
[ 1133.716639] Code: 04 00 00 41 0f b7 44 24 04 48 85 db 4d 8d 14 07 0f 84 01 02 00 00 48 8b 43 10 48 85 c0 0f 84 e5 01 00 00 48 8b 80 a8 04 00 00 <48> 8b 90 30 10 00 00 48 85 d2 0f 84 dd 01 00 00 31 c0 b9 05 00 00
[ 1133.716640] RSP: 0018:ffffc900303c7a80 EFLAGS: 00010282
[ 1133.716642] RAX: 0000000000000000 RBX: ffff881fea0b7400 RCX: 0000000000000000
[ 1133.716643] RDX: ffffc900303c7bb4 RSI: ffffffff8235c3e0 RDI: ffff881fea0b7400
[ 1133.716643] RBP: ffffc900303c7b80 R08: 0000000000000000 R09: 000000000000000e
[ 1133.716644] R10: ffffc900303c7bb4 R11: ffff881fb6840400 R12: ffffffff8235c3e0
[ 1133.716645] R13: 0000000000000008 R14: 000000000000001e R15: ffffc900303c7bb4
[ 1133.716646] FS:  00007f54e75d3740(0000) GS:ffff881fff5c0000(0000) knlGS:0000000000000000
[ 1133.716648] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1133.716649] CR2: 0000000000001030 CR3: 0000001f6c226005 CR4: 00000000003606e0
[ 1133.716649] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1133.716650] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1133.716651] Call Trace:
[ 1133.716660]  ? sched_clock_cpu+0xc/0xa0
[ 1133.716662]  ? sched_clock_cpu+0xc/0xa0
[ 1133.716665]  ? log_store+0x1b5/0x260
[ 1133.716667]  ? up+0x12/0x60
[ 1133.716669]  ? skb_get_poff+0x4b/0xa0
[ 1133.716674]  ? __kmalloc_reserve.isra.47+0x2e/0x80
[ 1133.716675]  skb_get_poff+0x4b/0xa0
[ 1133.716680]  bpf_skb_get_pay_offset+0xa/0x10
[ 1133.716686]  ? test_bpf_init+0x578/0x1000 [test_bpf]
[ 1133.716690]  ? netlink_broadcast_filtered+0x153/0x3d0
[ 1133.716695]  ? free_pcppages_bulk+0x324/0x600
[ 1133.716696]  ? 0xffffffffa0279000
[ 1133.716699]  ? do_one_initcall+0x46/0x1bd
[ 1133.716704]  ? kmem_cache_alloc_trace+0x144/0x1a0
[ 1133.716709]  ? do_init_module+0x5b/0x209
[ 1133.716712]  ? load_module+0x2136/0x25d0
[ 1133.716715]  ? __do_sys_finit_module+0xba/0xe0
[ 1133.716717]  ? __do_sys_finit_module+0xba/0xe0
[ 1133.716719]  ? do_syscall_64+0x48/0x100
[ 1133.716724]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

This patch fixes tes_bpf by using init_net in the dummy dev.

Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
Reported-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 lib/test_bpf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 08d3d59dca17..aa22bcaec1dc 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6494,6 +6494,7 @@ static struct sk_buff *populate_skb(char *buf, int size)
 	skb->queue_mapping = SKB_QUEUE_MAP;
 	skb->vlan_tci = SKB_VLAN_TCI;
 	skb->vlan_proto = htons(ETH_P_IP);
+	dev_net_set(&dev, &init_net);
 	skb->dev = &dev;
 	skb->dev->ifindex = SKB_DEV_IFINDEX;
 	skb->dev->type = SKB_DEV_TYPE;

From 956b620fcf0b64de403cd26a56bc41e6e4826ea6 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 26 Sep 2018 15:24:53 -0700
Subject: [PATCH 03/64] libbpf: Introduce libbpf_attach_type_by_name

There is a common use-case when ELF object contains multiple BPF
programs and every program has its own section name. If it's cgroup-bpf
then programs have to be 1) loaded and 2) attached to a cgroup.

It's convenient to have information necessary to load BPF program
together with program itself. This is where section name works fine in
conjunction with libbpf_prog_type_by_name that identifies prog_type and
expected_attach_type and these can be used with BPF_PROG_LOAD.

But there is currently no way to identify attach_type by section name
and it leads to messy code in user space that reinvents guessing logic
every time it has to identify attach type to use with BPF_PROG_ATTACH.

The patch introduces libbpf_attach_type_by_name that guesses attach type
by section name if a program can be attached.

The difference between expected_attach_type provided by
libbpf_prog_type_by_name and attach_type provided by
libbpf_attach_type_by_name is the former is used at BPF_PROG_LOAD time
and can be zero if a program of prog_type X has only one corresponding
attach type Y whether the latter provides specific attach type to use
with BPF_PROG_ATTACH.

No new section names were added to section_names array. Only existing
ones were reorganized and attach_type was added where appropriate.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 121 ++++++++++++++++++++++++++++-------------
 tools/lib/bpf/libbpf.h |   2 +
 2 files changed, 84 insertions(+), 39 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4f8d43ae20d2..59e589a64d5c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2085,58 +2085,82 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 	prog->expected_attach_type = type;
 }
 
-#define BPF_PROG_SEC_FULL(string, ptype, atype) \
-	{ string, sizeof(string) - 1, ptype, atype }
+#define BPF_PROG_SEC_IMPL(string, ptype, eatype, atype) \
+	{ string, sizeof(string) - 1, ptype, eatype, atype }
 
-#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_FULL(string, ptype, 0)
+/* Programs that can NOT be attached. */
+#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_IMPL(string, ptype, 0, -EINVAL)
 
-#define BPF_S_PROG_SEC(string, ptype) \
-	BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK, ptype)
+/* Programs that can be attached. */
+#define BPF_APROG_SEC(string, ptype, atype) \
+	BPF_PROG_SEC_IMPL(string, ptype, 0, atype)
 
-#define BPF_SA_PROG_SEC(string, ptype) \
-	BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, ptype)
+/* Programs that must specify expected attach type at load time. */
+#define BPF_EAPROG_SEC(string, ptype, eatype) \
+	BPF_PROG_SEC_IMPL(string, ptype, eatype, eatype)
+
+/* Programs that can be attached but attach type can't be identified by section
+ * name. Kept for backward compatibility.
+ */
+#define BPF_APROG_COMPAT(string, ptype) BPF_PROG_SEC(string, ptype)
 
 static const struct {
 	const char *sec;
 	size_t len;
 	enum bpf_prog_type prog_type;
 	enum bpf_attach_type expected_attach_type;
+	enum bpf_attach_type attach_type;
 } section_names[] = {
-	BPF_PROG_SEC("socket",		BPF_PROG_TYPE_SOCKET_FILTER),
-	BPF_PROG_SEC("kprobe/",		BPF_PROG_TYPE_KPROBE),
-	BPF_PROG_SEC("kretprobe/",	BPF_PROG_TYPE_KPROBE),
-	BPF_PROG_SEC("classifier",	BPF_PROG_TYPE_SCHED_CLS),
-	BPF_PROG_SEC("action",		BPF_PROG_TYPE_SCHED_ACT),
-	BPF_PROG_SEC("tracepoint/",	BPF_PROG_TYPE_TRACEPOINT),
-	BPF_PROG_SEC("raw_tracepoint/",	BPF_PROG_TYPE_RAW_TRACEPOINT),
-	BPF_PROG_SEC("xdp",		BPF_PROG_TYPE_XDP),
-	BPF_PROG_SEC("perf_event",	BPF_PROG_TYPE_PERF_EVENT),
-	BPF_PROG_SEC("cgroup/skb",	BPF_PROG_TYPE_CGROUP_SKB),
-	BPF_PROG_SEC("cgroup/sock",	BPF_PROG_TYPE_CGROUP_SOCK),
-	BPF_PROG_SEC("cgroup/dev",	BPF_PROG_TYPE_CGROUP_DEVICE),
-	BPF_PROG_SEC("lwt_in",		BPF_PROG_TYPE_LWT_IN),
-	BPF_PROG_SEC("lwt_out",		BPF_PROG_TYPE_LWT_OUT),
-	BPF_PROG_SEC("lwt_xmit",	BPF_PROG_TYPE_LWT_XMIT),
-	BPF_PROG_SEC("lwt_seg6local",	BPF_PROG_TYPE_LWT_SEG6LOCAL),
-	BPF_PROG_SEC("sockops",		BPF_PROG_TYPE_SOCK_OPS),
-	BPF_PROG_SEC("sk_skb",		BPF_PROG_TYPE_SK_SKB),
-	BPF_PROG_SEC("sk_msg",		BPF_PROG_TYPE_SK_MSG),
-	BPF_PROG_SEC("lirc_mode2",	BPF_PROG_TYPE_LIRC_MODE2),
-	BPF_PROG_SEC("flow_dissector",	BPF_PROG_TYPE_FLOW_DISSECTOR),
-	BPF_SA_PROG_SEC("cgroup/bind4",	BPF_CGROUP_INET4_BIND),
-	BPF_SA_PROG_SEC("cgroup/bind6",	BPF_CGROUP_INET6_BIND),
-	BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
-	BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT),
-	BPF_SA_PROG_SEC("cgroup/sendmsg4", BPF_CGROUP_UDP4_SENDMSG),
-	BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG),
-	BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
-	BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
+	BPF_PROG_SEC("socket",			BPF_PROG_TYPE_SOCKET_FILTER),
+	BPF_PROG_SEC("kprobe/",			BPF_PROG_TYPE_KPROBE),
+	BPF_PROG_SEC("kretprobe/",		BPF_PROG_TYPE_KPROBE),
+	BPF_PROG_SEC("classifier",		BPF_PROG_TYPE_SCHED_CLS),
+	BPF_PROG_SEC("action",			BPF_PROG_TYPE_SCHED_ACT),
+	BPF_PROG_SEC("tracepoint/",		BPF_PROG_TYPE_TRACEPOINT),
+	BPF_PROG_SEC("raw_tracepoint/",		BPF_PROG_TYPE_RAW_TRACEPOINT),
+	BPF_PROG_SEC("xdp",			BPF_PROG_TYPE_XDP),
+	BPF_PROG_SEC("perf_event",		BPF_PROG_TYPE_PERF_EVENT),
+	BPF_PROG_SEC("lwt_in",			BPF_PROG_TYPE_LWT_IN),
+	BPF_PROG_SEC("lwt_out",			BPF_PROG_TYPE_LWT_OUT),
+	BPF_PROG_SEC("lwt_xmit",		BPF_PROG_TYPE_LWT_XMIT),
+	BPF_PROG_SEC("lwt_seg6local",		BPF_PROG_TYPE_LWT_SEG6LOCAL),
+	BPF_APROG_COMPAT("cgroup/skb",		BPF_PROG_TYPE_CGROUP_SKB),
+	BPF_APROG_SEC("cgroup/sock",		BPF_PROG_TYPE_CGROUP_SOCK,
+						BPF_CGROUP_INET_SOCK_CREATE),
+	BPF_EAPROG_SEC("cgroup/post_bind4",	BPF_PROG_TYPE_CGROUP_SOCK,
+						BPF_CGROUP_INET4_POST_BIND),
+	BPF_EAPROG_SEC("cgroup/post_bind6",	BPF_PROG_TYPE_CGROUP_SOCK,
+						BPF_CGROUP_INET6_POST_BIND),
+	BPF_APROG_SEC("cgroup/dev",		BPF_PROG_TYPE_CGROUP_DEVICE,
+						BPF_CGROUP_DEVICE),
+	BPF_APROG_SEC("sockops",		BPF_PROG_TYPE_SOCK_OPS,
+						BPF_CGROUP_SOCK_OPS),
+	BPF_APROG_COMPAT("sk_skb",		BPF_PROG_TYPE_SK_SKB),
+	BPF_APROG_SEC("sk_msg",			BPF_PROG_TYPE_SK_MSG,
+						BPF_SK_MSG_VERDICT),
+	BPF_APROG_SEC("lirc_mode2",		BPF_PROG_TYPE_LIRC_MODE2,
+						BPF_LIRC_MODE2),
+	BPF_APROG_SEC("flow_dissector",		BPF_PROG_TYPE_FLOW_DISSECTOR,
+						BPF_FLOW_DISSECTOR),
+	BPF_EAPROG_SEC("cgroup/bind4",		BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						BPF_CGROUP_INET4_BIND),
+	BPF_EAPROG_SEC("cgroup/bind6",		BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						BPF_CGROUP_INET6_BIND),
+	BPF_EAPROG_SEC("cgroup/connect4",	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						BPF_CGROUP_INET4_CONNECT),
+	BPF_EAPROG_SEC("cgroup/connect6",	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						BPF_CGROUP_INET6_CONNECT),
+	BPF_EAPROG_SEC("cgroup/sendmsg4",	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						BPF_CGROUP_UDP4_SENDMSG),
+	BPF_EAPROG_SEC("cgroup/sendmsg6",	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						BPF_CGROUP_UDP6_SENDMSG),
 };
 
+#undef BPF_PROG_SEC_IMPL
 #undef BPF_PROG_SEC
-#undef BPF_PROG_SEC_FULL
-#undef BPF_S_PROG_SEC
-#undef BPF_SA_PROG_SEC
+#undef BPF_APROG_SEC
+#undef BPF_EAPROG_SEC
+#undef BPF_APROG_COMPAT
 
 int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
 			     enum bpf_attach_type *expected_attach_type)
@@ -2156,6 +2180,25 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
 	return -EINVAL;
 }
 
+int libbpf_attach_type_by_name(const char *name,
+			       enum bpf_attach_type *attach_type)
+{
+	int i;
+
+	if (!name)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(section_names); i++) {
+		if (strncmp(name, section_names[i].sec, section_names[i].len))
+			continue;
+		if (section_names[i].attach_type == -EINVAL)
+			return -EINVAL;
+		*attach_type = section_names[i].attach_type;
+		return 0;
+	}
+	return -EINVAL;
+}
+
 static int
 bpf_program__identify_section(struct bpf_program *prog,
 			      enum bpf_prog_type *prog_type,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e3b00e23e181..511c1294dcbf 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -104,6 +104,8 @@ void *bpf_object__priv(struct bpf_object *prog);
 
 int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
 			     enum bpf_attach_type *expected_attach_type);
+int libbpf_attach_type_by_name(const char *name,
+			       enum bpf_attach_type *attach_type);
 
 /* Accessors of bpf_program */
 struct bpf_program;

From bafa7afe63391419c7cb8e42f1ba9f8dddd7c8fb Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 26 Sep 2018 15:24:54 -0700
Subject: [PATCH 04/64] libbpf: Support cgroup_skb/{e,in}gress section names

Add section names for BPF_CGROUP_INET_INGRESS and BPF_CGROUP_INET_EGRESS
attach types to be able to identify them in libbpf_attach_type_by_name.

"cgroup_skb" is used instead of "cgroup/skb" mostly to easy possible
unifying of how libbpf and bpftool works with section names:
* bpftool uses "cgroup_skb" to in "prog list" sub-command;
* bpftool uses "ingress" and "egress" in "cgroup list" sub-command;
* having two parts instead of three in a string like "cgroup_skb/ingress"
  can be leveraged to split it to prog_type part and attach_type part,
  or vise versa: use two parts to make a section name.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 59e589a64d5c..51edf6cd390e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2124,6 +2124,10 @@ static const struct {
 	BPF_PROG_SEC("lwt_out",			BPF_PROG_TYPE_LWT_OUT),
 	BPF_PROG_SEC("lwt_xmit",		BPF_PROG_TYPE_LWT_XMIT),
 	BPF_PROG_SEC("lwt_seg6local",		BPF_PROG_TYPE_LWT_SEG6LOCAL),
+	BPF_APROG_SEC("cgroup_skb/ingress",	BPF_PROG_TYPE_CGROUP_SKB,
+						BPF_CGROUP_INET_INGRESS),
+	BPF_APROG_SEC("cgroup_skb/egress",	BPF_PROG_TYPE_CGROUP_SKB,
+						BPF_CGROUP_INET_EGRESS),
 	BPF_APROG_COMPAT("cgroup/skb",		BPF_PROG_TYPE_CGROUP_SKB),
 	BPF_APROG_SEC("cgroup/sock",		BPF_PROG_TYPE_CGROUP_SOCK,
 						BPF_CGROUP_INET_SOCK_CREATE),

From c6f6851b28ae26000352598f01968b3ff7dcf588 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 26 Sep 2018 15:24:55 -0700
Subject: [PATCH 05/64] libbpf: Support sk_skb/stream_{parser, verdict} section
 names

Add section names for BPF_SK_SKB_STREAM_PARSER and
BPF_SK_SKB_STREAM_VERDICT attach types to be able to identify them in
libbpf_attach_type_by_name.

"stream_parser" and "stream_verdict" are used instead of simple "parser"
and "verdict" just to avoid possible confusion in a place where attach
type is used alone (e.g. in bpftool's show sub-commands) since there is
another attach point that can be named as "verdict": BPF_SK_MSG_VERDICT.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 51edf6cd390e..425d5ca45c97 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2139,6 +2139,10 @@ static const struct {
 						BPF_CGROUP_DEVICE),
 	BPF_APROG_SEC("sockops",		BPF_PROG_TYPE_SOCK_OPS,
 						BPF_CGROUP_SOCK_OPS),
+	BPF_APROG_SEC("sk_skb/stream_parser",	BPF_PROG_TYPE_SK_SKB,
+						BPF_SK_SKB_STREAM_PARSER),
+	BPF_APROG_SEC("sk_skb/stream_verdict",	BPF_PROG_TYPE_SK_SKB,
+						BPF_SK_SKB_STREAM_VERDICT),
 	BPF_APROG_COMPAT("sk_skb",		BPF_PROG_TYPE_SK_SKB),
 	BPF_APROG_SEC("sk_msg",			BPF_PROG_TYPE_SK_MSG,
 						BPF_SK_MSG_VERDICT),

From c9bf507d0acba950e5d85ef27727ce61458ef5aa Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 26 Sep 2018 15:24:56 -0700
Subject: [PATCH 06/64] selftests/bpf: Use libbpf_attach_type_by_name in
 test_socket_cookie

Use newly introduced libbpf_attach_type_by_name in test_socket_cookie
selftest.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_socket_cookie.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c
index 68e108e4687a..b6c2c605d8c0 100644
--- a/tools/testing/selftests/bpf/test_socket_cookie.c
+++ b/tools/testing/selftests/bpf/test_socket_cookie.c
@@ -158,11 +158,7 @@ static int run_test(int cgfd)
 	bpf_object__for_each_program(prog, pobj) {
 		prog_name = bpf_program__title(prog, /*needs_copy*/ false);
 
-		if (strcmp(prog_name, "cgroup/connect6") == 0) {
-			attach_type = BPF_CGROUP_INET6_CONNECT;
-		} else if (strcmp(prog_name, "sockops") == 0) {
-			attach_type = BPF_CGROUP_SOCK_OPS;
-		} else {
+		if (libbpf_attach_type_by_name(prog_name, &attach_type)) {
 			log_err("Unexpected prog: %s", prog_name);
 			goto err;
 		}

From 370920c47b26ae3a837f7c46f0795804ae2940f7 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 26 Sep 2018 15:24:57 -0700
Subject: [PATCH 07/64] selftests/bpf: Test libbpf_{prog,attach}_type_by_name

Add selftest for libbpf functions libbpf_prog_type_by_name and
libbpf_attach_type_by_name.

Example of output:
  % ./tools/testing/selftests/bpf/test_section_names
  Summary: 35 PASSED, 0 FAILED

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 .../selftests/bpf/test_section_names.c        | 208 ++++++++++++++++++
 2 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_section_names.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fd3851d5c079..059d64a0f897 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
 	test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
-	test_socket_cookie test_cgroup_storage test_select_reuseport
+	test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
diff --git a/tools/testing/selftests/bpf/test_section_names.c b/tools/testing/selftests/bpf/test_section_names.c
new file mode 100644
index 000000000000..7c4f41572b1c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_section_names.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <err.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+
+struct sec_name_test {
+	const char sec_name[32];
+	struct {
+		int rc;
+		enum bpf_prog_type prog_type;
+		enum bpf_attach_type expected_attach_type;
+	} expected_load;
+	struct {
+		int rc;
+		enum bpf_attach_type attach_type;
+	} expected_attach;
+};
+
+static struct sec_name_test tests[] = {
+	{"InvAliD", {-EINVAL, 0, 0}, {-EINVAL, 0} },
+	{"cgroup", {-EINVAL, 0, 0}, {-EINVAL, 0} },
+	{"socket", {0, BPF_PROG_TYPE_SOCKET_FILTER, 0}, {-EINVAL, 0} },
+	{"kprobe/", {0, BPF_PROG_TYPE_KPROBE, 0}, {-EINVAL, 0} },
+	{"kretprobe/", {0, BPF_PROG_TYPE_KPROBE, 0}, {-EINVAL, 0} },
+	{"classifier", {0, BPF_PROG_TYPE_SCHED_CLS, 0}, {-EINVAL, 0} },
+	{"action", {0, BPF_PROG_TYPE_SCHED_ACT, 0}, {-EINVAL, 0} },
+	{"tracepoint/", {0, BPF_PROG_TYPE_TRACEPOINT, 0}, {-EINVAL, 0} },
+	{
+		"raw_tracepoint/",
+		{0, BPF_PROG_TYPE_RAW_TRACEPOINT, 0},
+		{-EINVAL, 0},
+	},
+	{"xdp", {0, BPF_PROG_TYPE_XDP, 0}, {-EINVAL, 0} },
+	{"perf_event", {0, BPF_PROG_TYPE_PERF_EVENT, 0}, {-EINVAL, 0} },
+	{"lwt_in", {0, BPF_PROG_TYPE_LWT_IN, 0}, {-EINVAL, 0} },
+	{"lwt_out", {0, BPF_PROG_TYPE_LWT_OUT, 0}, {-EINVAL, 0} },
+	{"lwt_xmit", {0, BPF_PROG_TYPE_LWT_XMIT, 0}, {-EINVAL, 0} },
+	{"lwt_seg6local", {0, BPF_PROG_TYPE_LWT_SEG6LOCAL, 0}, {-EINVAL, 0} },
+	{
+		"cgroup_skb/ingress",
+		{0, BPF_PROG_TYPE_CGROUP_SKB, 0},
+		{0, BPF_CGROUP_INET_INGRESS},
+	},
+	{
+		"cgroup_skb/egress",
+		{0, BPF_PROG_TYPE_CGROUP_SKB, 0},
+		{0, BPF_CGROUP_INET_EGRESS},
+	},
+	{"cgroup/skb", {0, BPF_PROG_TYPE_CGROUP_SKB, 0}, {-EINVAL, 0} },
+	{
+		"cgroup/sock",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK, 0},
+		{0, BPF_CGROUP_INET_SOCK_CREATE},
+	},
+	{
+		"cgroup/post_bind4",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND},
+		{0, BPF_CGROUP_INET4_POST_BIND},
+	},
+	{
+		"cgroup/post_bind6",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND},
+		{0, BPF_CGROUP_INET6_POST_BIND},
+	},
+	{
+		"cgroup/dev",
+		{0, BPF_PROG_TYPE_CGROUP_DEVICE, 0},
+		{0, BPF_CGROUP_DEVICE},
+	},
+	{"sockops", {0, BPF_PROG_TYPE_SOCK_OPS, 0}, {0, BPF_CGROUP_SOCK_OPS} },
+	{
+		"sk_skb/stream_parser",
+		{0, BPF_PROG_TYPE_SK_SKB, 0},
+		{0, BPF_SK_SKB_STREAM_PARSER},
+	},
+	{
+		"sk_skb/stream_verdict",
+		{0, BPF_PROG_TYPE_SK_SKB, 0},
+		{0, BPF_SK_SKB_STREAM_VERDICT},
+	},
+	{"sk_skb", {0, BPF_PROG_TYPE_SK_SKB, 0}, {-EINVAL, 0} },
+	{"sk_msg", {0, BPF_PROG_TYPE_SK_MSG, 0}, {0, BPF_SK_MSG_VERDICT} },
+	{"lirc_mode2", {0, BPF_PROG_TYPE_LIRC_MODE2, 0}, {0, BPF_LIRC_MODE2} },
+	{
+		"flow_dissector",
+		{0, BPF_PROG_TYPE_FLOW_DISSECTOR, 0},
+		{0, BPF_FLOW_DISSECTOR},
+	},
+	{
+		"cgroup/bind4",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND},
+		{0, BPF_CGROUP_INET4_BIND},
+	},
+	{
+		"cgroup/bind6",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND},
+		{0, BPF_CGROUP_INET6_BIND},
+	},
+	{
+		"cgroup/connect4",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT},
+		{0, BPF_CGROUP_INET4_CONNECT},
+	},
+	{
+		"cgroup/connect6",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT},
+		{0, BPF_CGROUP_INET6_CONNECT},
+	},
+	{
+		"cgroup/sendmsg4",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG},
+		{0, BPF_CGROUP_UDP4_SENDMSG},
+	},
+	{
+		"cgroup/sendmsg6",
+		{0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG},
+		{0, BPF_CGROUP_UDP6_SENDMSG},
+	},
+};
+
+static int test_prog_type_by_name(const struct sec_name_test *test)
+{
+	enum bpf_attach_type expected_attach_type;
+	enum bpf_prog_type prog_type;
+	int rc;
+
+	rc = libbpf_prog_type_by_name(test->sec_name, &prog_type,
+				      &expected_attach_type);
+
+	if (rc != test->expected_load.rc) {
+		warnx("prog: unexpected rc=%d for %s", rc, test->sec_name);
+		return -1;
+	}
+
+	if (rc)
+		return 0;
+
+	if (prog_type != test->expected_load.prog_type) {
+		warnx("prog: unexpected prog_type=%d for %s", prog_type,
+		      test->sec_name);
+		return -1;
+	}
+
+	if (expected_attach_type != test->expected_load.expected_attach_type) {
+		warnx("prog: unexpected expected_attach_type=%d for %s",
+		      expected_attach_type, test->sec_name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int test_attach_type_by_name(const struct sec_name_test *test)
+{
+	enum bpf_attach_type attach_type;
+	int rc;
+
+	rc = libbpf_attach_type_by_name(test->sec_name, &attach_type);
+
+	if (rc != test->expected_attach.rc) {
+		warnx("attach: unexpected rc=%d for %s", rc, test->sec_name);
+		return -1;
+	}
+
+	if (rc)
+		return 0;
+
+	if (attach_type != test->expected_attach.attach_type) {
+		warnx("attach: unexpected attach_type=%d for %s", attach_type,
+		      test->sec_name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int run_test_case(const struct sec_name_test *test)
+{
+	if (test_prog_type_by_name(test))
+		return -1;
+	if (test_attach_type_by_name(test))
+		return -1;
+	return 0;
+}
+
+static int run_tests(void)
+{
+	int passes = 0;
+	int fails = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		if (run_test_case(&tests[i]))
+			++fails;
+		else
+			++passes;
+	}
+	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
+	return fails ? -1 : 0;
+}
+
+int main(int argc, char **argv)
+{
+	return run_tests();
+}

From 5bf7a60b8e70969f65c961d7e2c4eb40eb2c664d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 27 Sep 2018 14:37:30 -0700
Subject: [PATCH 08/64] bpf: permit CGROUP_DEVICE programs accessing helper
 bpf_get_current_cgroup_id()

Currently, helper bpf_get_current_cgroup_id() is not permitted
for CGROUP_DEVICE type of programs. If the helper is used
in such cases, the verifier will log the following error:

  0: (bf) r6 = r1
  1: (69) r7 = *(u16 *)(r6 +0)
  2: (85) call bpf_get_current_cgroup_id#80
  unknown func bpf_get_current_cgroup_id#80

The bpf_get_current_cgroup_id() is useful for CGROUP_DEVICE
type of programs in order to customize action based on cgroup id.
This patch added such a support.

Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a7d931bbc55..549f6fbcc461 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -677,6 +677,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_trace_printk:
 		if (capable(CAP_SYS_ADMIN))
 			return bpf_get_trace_printk_proto();

From 8bad74f9840f87661f20ced3dc80c84ab4fd55a1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:36 +0000
Subject: [PATCH 09/64] bpf: extend cgroup bpf core to allow multiple cgroup
 storage types

In order to introduce per-cpu cgroup storage, let's generalize
bpf cgroup core to support multiple cgroup storage types.
Potentially, per-node cgroup storage can be added later.

This commit is mostly a formal change that replaces
cgroup_storage pointer with a array of cgroup_storage pointers.
It doesn't actually introduce a new storage type,
it will be done later.

Each bpf program is now able to have one cgroup storage of each type.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 44 ++++++++++++++++-------
 include/linux/bpf.h        | 11 ++++--
 kernel/bpf/cgroup.c        | 74 ++++++++++++++++++++++++++------------
 kernel/bpf/helpers.c       | 15 ++++----
 kernel/bpf/local_storage.c | 18 ++++++----
 kernel/bpf/syscall.c       |  9 +++--
 kernel/bpf/verifier.c      |  8 +++--
 net/bpf/test_run.c         | 20 +++++++----
 8 files changed, 139 insertions(+), 60 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index f91b0f8ff3a9..e9871b012dac 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -2,6 +2,7 @@
 #ifndef _BPF_CGROUP_H
 #define _BPF_CGROUP_H
 
+#include <linux/bpf.h>
 #include <linux/errno.h>
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
@@ -22,7 +23,10 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+
+#define for_each_cgroup_storage_type(stype) \
+	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
 
 struct bpf_cgroup_storage_map;
 
@@ -43,7 +47,7 @@ struct bpf_cgroup_storage {
 struct bpf_prog_list {
 	struct list_head node;
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *storage;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
 struct bpf_prog_array;
@@ -101,18 +105,29 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
+static inline enum bpf_cgroup_storage_type cgroup_storage_type(
+	struct bpf_map *map)
 {
-	struct bpf_storage_buffer *buf;
-
-	if (!storage)
-		return;
-
-	buf = READ_ONCE(storage->buf);
-	this_cpu_write(bpf_cgroup_storage, &buf->data[0]);
+	return BPF_CGROUP_STORAGE_SHARED;
 }
 
-struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
+static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
+					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+{
+	enum bpf_cgroup_storage_type stype;
+	struct bpf_storage_buffer *buf;
+
+	for_each_cgroup_storage_type(stype) {
+		if (!storage[stype])
+			continue;
+
+		buf = READ_ONCE(storage[stype]->buf);
+		this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]);
+	}
+}
+
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+					enum bpf_cgroup_storage_type stype);
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
 			     struct cgroup *cgroup,
@@ -265,13 +280,14 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
+static inline void bpf_cgroup_storage_set(
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
 					    struct bpf_map *map) { return 0; }
 static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
 					      struct bpf_map *map) {}
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
-	struct bpf_prog *prog) { return 0; }
+	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
 
@@ -293,6 +309,8 @@ static inline void bpf_cgroup_storage_free(
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
+#define for_each_cgroup_storage_type(stype) for (; false; )
+
 #endif /* CONFIG_CGROUP_BPF */
 
 #endif /* _BPF_CGROUP_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 988a00797bcd..b457fbe7b70b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -272,6 +272,13 @@ struct bpf_prog_offload {
 	u32			jited_len;
 };
 
+enum bpf_cgroup_storage_type {
+	BPF_CGROUP_STORAGE_SHARED,
+	__BPF_CGROUP_STORAGE_MAX
+};
+
+#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -289,7 +296,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
-	struct bpf_map *cgroup_storage;
+	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
 #ifdef CONFIG_SECURITY
 	void *security;
@@ -358,7 +365,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
  */
 struct bpf_prog_array_item {
 	struct bpf_prog *prog;
-	struct bpf_cgroup_storage *cgroup_storage;
+	struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 };
 
 struct bpf_prog_array {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 549f6fbcc461..00f6ed2e4f9a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  */
 void cgroup_bpf_put(struct cgroup *cgrp)
 {
+	enum bpf_cgroup_storage_type stype;
 	unsigned int type;
 
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 		list_for_each_entry_safe(pl, tmp, progs, node) {
 			list_del(&pl->node);
 			bpf_prog_put(pl->prog);
-			bpf_cgroup_storage_unlink(pl->storage);
-			bpf_cgroup_storage_free(pl->storage);
+			for_each_cgroup_storage_type(stype) {
+				bpf_cgroup_storage_unlink(pl->storage[stype]);
+				bpf_cgroup_storage_free(pl->storage[stype]);
+			}
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
@@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
 				   struct bpf_prog_array __rcu **array)
 {
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_array *progs;
 	struct bpf_prog_list *pl;
 	struct cgroup *p = cgrp;
@@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				continue;
 
 			progs->items[cnt].prog = pl->prog;
-			progs->items[cnt].cgroup_storage = pl->storage;
+			for_each_cgroup_storage_type(stype)
+				progs->items[cnt].cgroup_storage[stype] =
+					pl->storage[stype];
 			cnt++;
 		}
 	} while ((p = cgroup_parent(p)));
@@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	struct bpf_prog *old_prog = NULL;
-	struct bpf_cgroup_storage *storage, *old_storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
+		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_list *pl;
 	bool pl_was_allocated;
 	int err;
@@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 		return -E2BIG;
 
-	storage = bpf_cgroup_storage_alloc(prog);
-	if (IS_ERR(storage))
-		return -ENOMEM;
+	for_each_cgroup_storage_type(stype) {
+		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storage[stype])) {
+			storage[stype] = NULL;
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
+			return -ENOMEM;
+		}
+	}
 
 	if (flags & BPF_F_ALLOW_MULTI) {
 		list_for_each_entry(pl, progs, node) {
 			if (pl->prog == prog) {
 				/* disallow attaching the same prog twice */
-				bpf_cgroup_storage_free(storage);
+				for_each_cgroup_storage_type(stype)
+					bpf_cgroup_storage_free(storage[stype]);
 				return -EINVAL;
 			}
 		}
 
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
-			bpf_cgroup_storage_free(storage);
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
 			return -ENOMEM;
 		}
 
 		pl_was_allocated = true;
 		pl->prog = prog;
-		pl->storage = storage;
+		for_each_cgroup_storage_type(stype)
+			pl->storage[stype] = storage[stype];
 		list_add_tail(&pl->node, progs);
 	} else {
 		if (list_empty(progs)) {
 			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 			if (!pl) {
-				bpf_cgroup_storage_free(storage);
+				for_each_cgroup_storage_type(stype)
+					bpf_cgroup_storage_free(storage[stype]);
 				return -ENOMEM;
 			}
 			pl_was_allocated = true;
@@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		} else {
 			pl = list_first_entry(progs, typeof(*pl), node);
 			old_prog = pl->prog;
-			old_storage = pl->storage;
-			bpf_cgroup_storage_unlink(old_storage);
+			for_each_cgroup_storage_type(stype) {
+				old_storage[stype] = pl->storage[stype];
+				bpf_cgroup_storage_unlink(old_storage[stype]);
+			}
 			pl_was_allocated = false;
 		}
 		pl->prog = prog;
-		pl->storage = storage;
+		for_each_cgroup_storage_type(stype)
+			pl->storage[stype] = storage[stype];
 	}
 
 	cgrp->bpf.flags[type] = flags;
@@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		goto cleanup;
 
 	static_branch_inc(&cgroup_bpf_enabled_key);
-	if (old_storage)
-		bpf_cgroup_storage_free(old_storage);
+	for_each_cgroup_storage_type(stype) {
+		if (!old_storage[stype])
+			continue;
+		bpf_cgroup_storage_free(old_storage[stype]);
+	}
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
-	bpf_cgroup_storage_link(storage, cgrp, type);
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_link(storage[stype], cgrp, type);
 	return 0;
 
 cleanup:
 	/* and cleanup the prog list */
 	pl->prog = old_prog;
-	bpf_cgroup_storage_free(pl->storage);
-	pl->storage = old_storage;
-	bpf_cgroup_storage_link(old_storage, cgrp, type);
+	for_each_cgroup_storage_type(stype) {
+		bpf_cgroup_storage_free(pl->storage[stype]);
+		pl->storage[stype] = old_storage[stype];
+		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
+	}
 	if (pl_was_allocated) {
 		list_del(&pl->node);
 		kfree(pl);
@@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 unused_flags)
 {
 	struct list_head *progs = &cgrp->bpf.progs[type];
+	enum bpf_cgroup_storage_type stype;
 	u32 flags = cgrp->bpf.flags[type];
 	struct bpf_prog *old_prog = NULL;
 	struct bpf_prog_list *pl;
@@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 	/* now can actually delete it from this cgroup list */
 	list_del(&pl->node);
-	bpf_cgroup_storage_unlink(pl->storage);
-	bpf_cgroup_storage_free(pl->storage);
+	for_each_cgroup_storage_type(stype) {
+		bpf_cgroup_storage_unlink(pl->storage[stype]);
+		bpf_cgroup_storage_free(pl->storage[stype]);
+	}
 	kfree(pl);
 	if (list_empty(progs))
 		/* last program was detached, reset flags to zero */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1991466b8327..9070b2ace6aa 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -194,16 +194,18 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage);
+#ifdef CONFIG_CGROUP_BPF
+DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
-	/* map and flags arguments are not used now,
-	 * but provide an ability to extend the API
-	 * for other types of local storages.
-	 * verifier checks that their values are correct.
+	/* flags argument is not used now,
+	 * but provides an ability to extend the API.
+	 * verifier checks that its value is correct.
 	 */
-	return (unsigned long) this_cpu_read(bpf_cgroup_storage);
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+
+	return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]);
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
@@ -214,3 +216,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 #endif
+#endif
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 22ad967d1e5f..0bd9f19fc557 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,7 +7,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(void*, bpf_cgroup_storage);
+DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -251,6 +251,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 {
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
 	int ret = -EBUSY;
 
@@ -258,11 +259,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)
 
 	if (map->prog && map->prog != prog)
 		goto unlock;
-	if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map)
+	if (prog->aux->cgroup_storage[stype] &&
+	    prog->aux->cgroup_storage[stype] != _map)
 		goto unlock;
 
 	map->prog = prog;
-	prog->aux->cgroup_storage = _map;
+	prog->aux->cgroup_storage[stype] = _map;
 	ret = 0;
 unlock:
 	spin_unlock_bh(&map->lock);
@@ -272,24 +274,26 @@ unlock:
 
 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 {
+	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
 	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
 
 	spin_lock_bh(&map->lock);
 	if (map->prog == prog) {
-		WARN_ON(prog->aux->cgroup_storage != _map);
+		WARN_ON(prog->aux->cgroup_storage[stype] != _map);
 		map->prog = NULL;
-		prog->aux->cgroup_storage = NULL;
+		prog->aux->cgroup_storage[stype] = NULL;
 	}
 	spin_unlock_bh(&map->lock);
 }
 
-struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog)
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
 	u32 pages;
 
-	map = prog->aux->cgroup_storage;
+	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b3c2d09bcf7a..8c91d2b41b1e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -988,10 +988,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 /* drop refcnt on maps used by eBPF program and free auxilary data */
 static void free_used_maps(struct bpf_prog_aux *aux)
 {
+	enum bpf_cgroup_storage_type stype;
 	int i;
 
-	if (aux->cgroup_storage)
-		bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage);
+	for_each_cgroup_storage_type(stype) {
+		if (!aux->cgroup_storage[stype])
+			continue;
+		bpf_cgroup_storage_release(aux->prog,
+					   aux->cgroup_storage[stype]);
+	}
 
 	for (i = 0; i < aux->used_map_cnt; i++)
 		bpf_map_put(aux->used_maps[i]);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e986518d7bc3..e90899df585d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5171,11 +5171,15 @@ next_insn:
 /* drop refcnt of maps used by the rejected program */
 static void release_maps(struct bpf_verifier_env *env)
 {
+	enum bpf_cgroup_storage_type stype;
 	int i;
 
-	if (env->prog->aux->cgroup_storage)
+	for_each_cgroup_storage_type(stype) {
+		if (!env->prog->aux->cgroup_storage[stype])
+			continue;
 		bpf_cgroup_storage_release(env->prog,
-					   env->prog->aux->cgroup_storage);
+			env->prog->aux->cgroup_storage[stype]);
+	}
 
 	for (i = 0; i < env->used_map_cnt; i++)
 		bpf_map_put(env->used_maps[i]);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f4078830ea50..0c423b8cd75c 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -12,7 +12,7 @@
 #include <linux/sched/signal.h>
 
 static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
-					    struct bpf_cgroup_storage *storage)
+		struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	u32 ret;
 
@@ -28,13 +28,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
 
 static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 {
-	struct bpf_cgroup_storage *storage = NULL;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
+	enum bpf_cgroup_storage_type stype;
 	u64 time_start, time_spent = 0;
 	u32 ret = 0, i;
 
-	storage = bpf_cgroup_storage_alloc(prog);
-	if (IS_ERR(storage))
-		return PTR_ERR(storage);
+	for_each_cgroup_storage_type(stype) {
+		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+		if (IS_ERR(storage[stype])) {
+			storage[stype] = NULL;
+			for_each_cgroup_storage_type(stype)
+				bpf_cgroup_storage_free(storage[stype]);
+			return -ENOMEM;
+		}
+	}
 
 	if (!repeat)
 		repeat = 1;
@@ -53,7 +60,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
 	do_div(time_spent, repeat);
 	*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
 
-	bpf_cgroup_storage_free(storage);
+	for_each_cgroup_storage_type(stype)
+		bpf_cgroup_storage_free(storage[stype]);
 
 	return ret;
 }

From f294b37ec7b24a574884cd157497a3748081c0f0 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:40 +0000
Subject: [PATCH 10/64] bpf: rework cgroup storage pointer passing

To simplify the following introduction of per-cpu cgroup storage,
let's rework a bit a mechanism of passing a pointer to a cgroup
storage into the bpf_get_local_storage(). Let's save a pointer
to the corresponding bpf_cgroup_storage structure, instead of
a pointer to the actual buffer.

It will help us to handle per-cpu storage later, which has
a different way of accessing to the actual data.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 13 ++++---------
 kernel/bpf/helpers.c       |  8 ++++++--
 kernel/bpf/local_storage.c |  3 ++-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index e9871b012dac..7e0c9a1d48b7 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -23,7 +23,8 @@ struct bpf_cgroup_storage;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
-DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -115,15 +116,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
 					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	enum bpf_cgroup_storage_type stype;
-	struct bpf_storage_buffer *buf;
 
-	for_each_cgroup_storage_type(stype) {
-		if (!storage[stype])
-			continue;
-
-		buf = READ_ONCE(storage[stype]->buf);
-		this_cpu_write(bpf_cgroup_storage[stype], &buf->data[0]);
-	}
+	for_each_cgroup_storage_type(stype)
+		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
 }
 
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9070b2ace6aa..e42f8789b7ea 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -195,7 +195,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage*,
+		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -204,8 +205,11 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+	struct bpf_cgroup_storage *storage;
 
-	return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]);
+	storage = this_cpu_read(bpf_cgroup_storage[stype]);
+
+	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 0bd9f19fc557..6742292fb39e 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -7,7 +7,8 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 
-DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DEFINE_PER_CPU(struct bpf_cgroup_storage*,
+	       bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
 
 #ifdef CONFIG_CGROUP_BPF
 

From b741f1630346defcbc8cc60f1a2bdae8b3b0036f Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:43 +0000
Subject: [PATCH 11/64] bpf: introduce per-cpu cgroup local storage

This commit introduced per-cpu cgroup local storage.

Per-cpu cgroup local storage is very similar to simple cgroup storage
(let's call it shared), except all the data is per-cpu.

The main goal of per-cpu variant is to implement super fast
counters (e.g. packet counters), which don't require neither
lookups, neither atomic operations.

>From userspace's point of view, accessing a per-cpu cgroup storage
is similar to other per-cpu map types (e.g. per-cpu hashmaps and
arrays).

Writing to a per-cpu cgroup storage is not atomic, but is performed
by copying longs, so some minimal atomicity is here, exactly
as with other per-cpu maps.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h |  20 ++++-
 include/linux/bpf.h        |   1 +
 include/linux/bpf_types.h  |   1 +
 include/uapi/linux/bpf.h   |   1 +
 kernel/bpf/helpers.c       |   8 +-
 kernel/bpf/local_storage.c | 150 ++++++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c       |  11 ++-
 kernel/bpf/verifier.c      |  15 +++-
 8 files changed, 179 insertions(+), 28 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7e0c9a1d48b7..588dd5f0bd85 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -37,7 +37,10 @@ struct bpf_storage_buffer {
 };
 
 struct bpf_cgroup_storage {
-	struct bpf_storage_buffer *buf;
+	union {
+		struct bpf_storage_buffer *buf;
+		void __percpu *percpu_buf;
+	};
 	struct bpf_cgroup_storage_map *map;
 	struct bpf_cgroup_storage_key key;
 	struct list_head list;
@@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+		return BPF_CGROUP_STORAGE_PERCPU;
+
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
 int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
 void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
+				     void *value, u64 flags);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
 static inline void bpf_cgroup_storage_free(
 	struct bpf_cgroup_storage *storage) {}
+static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
+						 void *value) {
+	return 0;
+}
+static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
+					void *key, void *value, u64 flags) {
+	return 0;
+}
 
 #define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b457fbe7b70b..018299a595c8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -274,6 +274,7 @@ struct bpf_prog_offload {
 
 enum bpf_cgroup_storage_type {
 	BPF_CGROUP_STORAGE_SHARED,
+	BPF_CGROUP_STORAGE_PERCPU,
 	__BPF_CGROUP_STORAGE_MAX
 };
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c9bd6fb765b0..5432f4c9f50e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa5ccd2385ed..e2070d819e04 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -127,6 +127,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e42f8789b7ea..6502115e8f55 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
 	struct bpf_cgroup_storage *storage;
+	void *ptr;
 
 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
 
-	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		ptr = &READ_ONCE(storage->buf)->data[0];
+	else
+		ptr = this_cpu_ptr(storage->percpu_buf);
+
+	return (unsigned long)ptr;
 }
 
 const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 6742292fb39e..944eb297465f 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	return 0;
 }
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+				   void *value)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(storage->percpu_buf, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+				     void *value, u64 map_flags)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+		return -EINVAL;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+				value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
 				       void *_next_key)
 {
@@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 	spin_unlock_bh(&map->lock);
 }
 
+static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
+{
+	size_t size;
+
+	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
+		size = sizeof(struct bpf_storage_buffer) + map->value_size;
+		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	} else {
+		size = map->value_size;
+		*pages = round_up(round_up(size, 8) * num_possible_cpus(),
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	}
+
+	return size;
+}
+
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
+	gfp_t flags;
+	size_t size;
 	u32 pages;
 
 	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	size = bpf_cgroup_storage_calculate_size(map, &pages);
+
 	if (bpf_map_charge_memlock(map, pages))
 		return ERR_PTR(-EPERM);
 
 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
 			       __GFP_ZERO | GFP_USER, map->numa_node);
-	if (!storage) {
-		bpf_map_uncharge_memlock(map, pages);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!storage)
+		goto enomem;
 
-	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
-				    map->value_size, __GFP_ZERO | GFP_USER,
-				    map->numa_node);
-	if (!storage->buf) {
-		bpf_map_uncharge_memlock(map, pages);
-		kfree(storage);
-		return ERR_PTR(-ENOMEM);
+	flags = __GFP_ZERO | GFP_USER;
+
+	if (stype == BPF_CGROUP_STORAGE_SHARED) {
+		storage->buf = kmalloc_node(size, flags, map->numa_node);
+		if (!storage->buf)
+			goto enomem;
+	} else {
+		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+		if (!storage->percpu_buf)
+			goto enomem;
 	}
 
 	storage->map = (struct bpf_cgroup_storage_map *)map;
 
 	return storage;
+
+enomem:
+	bpf_map_uncharge_memlock(map, pages);
+	kfree(storage);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	kfree(storage->buf);
+	kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	free_percpu(storage->percpu_buf);
+	kfree(storage);
 }
 
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
 {
-	u32 pages;
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_map *map;
+	u32 pages;
 
 	if (!storage)
 		return;
 
 	map = &storage->map->map;
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+
+	bpf_cgroup_storage_calculate_size(map, &pages);
 	bpf_map_uncharge_memlock(map, pages);
 
-	kfree_rcu(storage->buf, rcu);
-	kfree_rcu(storage, rcu);
+	stype = cgroup_storage_type(map);
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+	else
+		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
 }
 
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8c91d2b41b1e..5742df21598c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else if (IS_FD_MAP(map))
 		value_size = sizeof(u32);
@@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
 	} else if (IS_FD_ARRAY(map)) {
@@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
 	else
 		value_size = map->value_size;
@@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       attr->flags);
 	} else if (IS_FD_ARRAY(map)) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e90899df585d..a8cc83a970d1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_STORAGE:
+	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
 		if (func_id != BPF_FUNC_get_local_storage)
 			goto error;
 		break;
@@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_get_local_storage:
-		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 			goto error;
 		break;
 	case BPF_FUNC_sk_select_reuseport:
@@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
 /* look for pseudo eBPF instructions that access map FDs and
  * replace them with actual map pointers
  */
@@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			}
 			env->used_maps[env->used_map_cnt++] = map;
 
-			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+			if (bpf_map_is_cgroup_storage(map) &&
 			    bpf_cgroup_storage_assign(env->prog, map)) {
-				verbose(env,
-					"only one cgroup storage is allowed\n");
+				verbose(env, "only one cgroup storage of each type is allowed\n");
 				fdput(f);
 				return -EBUSY;
 			}

From c6fdcd6e0cc4dc316e3eb261025fb0abd69540b9 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:46 +0000
Subject: [PATCH 12/64] bpf: don't allow create maps of per-cpu cgroup local
 storages

Explicitly forbid creating map of per-cpu cgroup local storages.
This behavior matches the behavior of shared cgroup storages.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/map_in_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3bfbf4464416..99d243e1ad6e 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -24,7 +24,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	 * in the verifier is not enough.
 	 */
 	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
-	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) {
+	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+	    inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
 		fdput(f);
 		return ERR_PTR(-ENOTSUPP);
 	}

From 25025e0aab2f91a93a1557b299a79814559b4dc2 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:48 +0000
Subject: [PATCH 13/64] bpf: sync include/uapi/linux/bpf.h to
 tools/include/uapi/linux/bpf.h

The sync is required due to the appearance of a new map type:
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, which implements per-cpu
cgroup local storage.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index aa5ccd2385ed..e2070d819e04 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -127,6 +127,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKHASH,
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 };
 
 enum bpf_prog_type {

From e54870924f4c51a8228be9ad427274f1032e0683 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:51 +0000
Subject: [PATCH 14/64] bpftool: add support for PERCPU_CGROUP_STORAGE maps

This commit adds support for BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
map type.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index e22fbe8b975f..6003e9598973 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -72,13 +72,15 @@ static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_SOCKHASH]		= "sockhash",
 	[BPF_MAP_TYPE_CGROUP_STORAGE]	= "cgroup_storage",
 	[BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
+	[BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE]	= "percpu_cgroup_storage",
 };
 
 static bool map_is_per_cpu(__u32 type)
 {
 	return type == BPF_MAP_TYPE_PERCPU_HASH ||
 	       type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	       type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+	       type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+	       type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE;
 }
 
 static bool map_is_map_of_maps(__u32 type)

From a3c6054f81e2c436d6b53b8060df996fec0cbfdb Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:53 +0000
Subject: [PATCH 15/64] selftests/bpf: add verifier per-cpu cgroup storage
 tests

This commits adds verifier tests covering per-cpu cgroup storage
functionality. There are 6 new tests, which are exactly the same
as for shared cgroup storage, but do use per-cpu cgroup storage
map.

Expected output:
  $ ./test_verifier
  #0/u add+sub+mul OK
  #0/p add+sub+mul OK
  ...
  #286/p invalid cgroup storage access 6 OK
  #287/p valid per-cpu cgroup storage access OK
  #288/p invalid per-cpu cgroup storage access 1 OK
  #289/p invalid per-cpu cgroup storage access 2 OK
  #290/p invalid per-cpu cgroup storage access 3 OK
  #291/p invalid per-cpu cgroup storage access 4 OK
  #292/p invalid per-cpu cgroup storage access 5 OK
  #293/p invalid per-cpu cgroup storage access 6 OK
  #294/p multiple registers share map_lookup_elem result OK
  ...
  #662/p mov64 src == dst OK
  #663/p mov64 src != dst OK
  Summary: 914 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 139 +++++++++++++++++++-
 1 file changed, 133 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 67c412d19c09..c7d25f23baf9 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -68,6 +68,7 @@ struct bpf_test {
 	int fixup_prog2[MAX_FIXUPS];
 	int fixup_map_in_map[MAX_FIXUPS];
 	int fixup_cgroup_storage[MAX_FIXUPS];
+	int fixup_percpu_cgroup_storage[MAX_FIXUPS];
 	const char *errstr;
 	const char *errstr_unpriv;
 	uint32_t retval;
@@ -4676,7 +4677,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
 	},
 	{
-		"invalid per-cgroup storage access 3",
+		"invalid cgroup storage access 3",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_2, 0),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
@@ -4743,6 +4744,121 @@ static struct bpf_test tests[] = {
 		.errstr = "get_local_storage() doesn't support non-zero flags",
 		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
 	},
+	{
+		"valid per-cpu cgroup storage access",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_percpu_cgroup_storage = { 1 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid per-cpu cgroup storage access 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 1 },
+		.result = REJECT,
+		.errstr = "cannot pass map_type 1 into func bpf_get_local_storage",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid per-cpu cgroup storage access 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "fd 1 is not pointing to valid bpf_map",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid per-cpu cgroup storage access 3",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 256),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_percpu_cgroup_storage = { 1 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=64 off=256 size=4",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid per-cpu cgroup storage access 4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, -2),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_cgroup_storage = { 1 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=64 off=-2 size=4",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid per-cpu cgroup storage access 5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 7),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_percpu_cgroup_storage = { 1 },
+		.result = REJECT,
+		.errstr = "get_local_storage() doesn't support non-zero flags",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid per-cpu cgroup storage access 6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_local_storage),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_percpu_cgroup_storage = { 1 },
+		.result = REJECT,
+		.errstr = "get_local_storage() doesn't support non-zero flags",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
 	{
 		"multiple registers share map_lookup_elem result",
 		.insns = {
@@ -12615,15 +12731,17 @@ static int create_map_in_map(void)
 	return outer_map_fd;
 }
 
-static int create_cgroup_storage(void)
+static int create_cgroup_storage(bool percpu)
 {
+	enum bpf_map_type type = percpu ? BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE :
+		BPF_MAP_TYPE_CGROUP_STORAGE;
 	int fd;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE,
-			    sizeof(struct bpf_cgroup_storage_key),
+	fd = bpf_create_map(type, sizeof(struct bpf_cgroup_storage_key),
 			    TEST_DATA_LEN, 0, 0);
 	if (fd < 0)
-		printf("Failed to create array '%s'!\n", strerror(errno));
+		printf("Failed to create cgroup storage '%s'!\n",
+		       strerror(errno));
 
 	return fd;
 }
@@ -12641,6 +12759,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	int *fixup_prog2 = test->fixup_prog2;
 	int *fixup_map_in_map = test->fixup_map_in_map;
 	int *fixup_cgroup_storage = test->fixup_cgroup_storage;
+	int *fixup_percpu_cgroup_storage = test->fixup_percpu_cgroup_storage;
 
 	if (test->fill_helper)
 		test->fill_helper(test);
@@ -12710,12 +12829,20 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_cgroup_storage) {
-		map_fds[7] = create_cgroup_storage();
+		map_fds[7] = create_cgroup_storage(false);
 		do {
 			prog[*fixup_cgroup_storage].imm = map_fds[7];
 			fixup_cgroup_storage++;
 		} while (*fixup_cgroup_storage);
 	}
+
+	if (*fixup_percpu_cgroup_storage) {
+		map_fds[8] = create_cgroup_storage(true);
+		do {
+			prog[*fixup_percpu_cgroup_storage].imm = map_fds[8];
+			fixup_percpu_cgroup_storage++;
+		} while (*fixup_percpu_cgroup_storage);
+	}
 }
 
 static void do_test_single(struct bpf_test *test, bool unpriv,

From 919646d2a3a90465cf8b30a7616c63005bef7570 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:55 +0000
Subject: [PATCH 16/64] selftests/bpf: extend the storage test to test per-cpu
 cgroup storage

This test extends the cgroup storage test to use per-cpu flavor
of the cgroup storage as well.

The test initializes a per-cpu cgroup storage to some non-zero initial
value (1000), and then simple bumps a per-cpu counter each time
the shared counter is atomically incremented. Then it reads all
per-cpu areas from the userspace side, and checks that the sum
of values adds to the expected sum.

Expected output:
  $ ./test_cgroup_storage
  test_cgroup_storage:PASS

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../selftests/bpf/test_cgroup_storage.c       | 60 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c
index 4e196e3bfecf..f44834155f25 100644
--- a/tools/testing/selftests/bpf/test_cgroup_storage.c
+++ b/tools/testing/selftests/bpf/test_cgroup_storage.c
@@ -4,6 +4,7 @@
 #include <linux/filter.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/sysinfo.h>
 
 #include "bpf_rlimit.h"
 #include "cgroup_helpers.h"
@@ -15,6 +16,14 @@ char bpf_log_buf[BPF_LOG_BUF_SIZE];
 int main(int argc, char **argv)
 {
 	struct bpf_insn prog[] = {
+		BPF_LD_MAP_FD(BPF_REG_1, 0), /* percpu map fd */
+		BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_local_storage),
+		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1),
+		BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
+
 		BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */
 		BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
@@ -28,9 +37,18 @@ int main(int argc, char **argv)
 	};
 	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
 	int error = EXIT_FAILURE;
-	int map_fd, prog_fd, cgroup_fd;
+	int map_fd, percpu_map_fd, prog_fd, cgroup_fd;
 	struct bpf_cgroup_storage_key key;
 	unsigned long long value;
+	unsigned long long *percpu_value;
+	int cpu, nproc;
+
+	nproc = get_nprocs_conf();
+	percpu_value = malloc(sizeof(*percpu_value) * nproc);
+	if (!percpu_value) {
+		printf("Not enough memory for per-cpu area (%d cpus)\n", nproc);
+		goto err;
+	}
 
 	map_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, sizeof(key),
 				sizeof(value), 0, 0);
@@ -39,7 +57,15 @@ int main(int argc, char **argv)
 		goto out;
 	}
 
-	prog[0].imm = map_fd;
+	percpu_map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+				       sizeof(key), sizeof(value), 0, 0);
+	if (percpu_map_fd < 0) {
+		printf("Failed to create map: %s\n", strerror(errno));
+		goto out;
+	}
+
+	prog[0].imm = percpu_map_fd;
+	prog[7].imm = map_fd;
 	prog_fd = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
 				   prog, insns_cnt, "GPL", 0,
 				   bpf_log_buf, BPF_LOG_BUF_SIZE);
@@ -77,7 +103,15 @@ int main(int argc, char **argv)
 	}
 
 	if (bpf_map_lookup_elem(map_fd, &key, &value)) {
-		printf("Failed to lookup cgroup storage\n");
+		printf("Failed to lookup cgroup storage 0\n");
+		goto err;
+	}
+
+	for (cpu = 0; cpu < nproc; cpu++)
+		percpu_value[cpu] = 1000;
+
+	if (bpf_map_update_elem(percpu_map_fd, &key, percpu_value, 0)) {
+		printf("Failed to update the data in the cgroup storage\n");
 		goto err;
 	}
 
@@ -120,11 +154,31 @@ int main(int argc, char **argv)
 		goto err;
 	}
 
+	/* Check the final value of the counter in the percpu local storage */
+
+	for (cpu = 0; cpu < nproc; cpu++)
+		percpu_value[cpu] = 0;
+
+	if (bpf_map_lookup_elem(percpu_map_fd, &key, percpu_value)) {
+		printf("Failed to lookup the per-cpu cgroup storage\n");
+		goto err;
+	}
+
+	value = 0;
+	for (cpu = 0; cpu < nproc; cpu++)
+		value += percpu_value[cpu];
+
+	if (value != nproc * 1000 + 6) {
+		printf("Unexpected data in the per-cpu cgroup storage\n");
+		goto err;
+	}
+
 	error = 0;
 	printf("test_cgroup_storage:PASS\n");
 
 err:
 	cleanup_cgroup_environment();
+	free(percpu_value);
 
 out:
 	return error;

From 5fcbd29b372402c96b182e0e80b3d95463431490 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:45:58 +0000
Subject: [PATCH 17/64] samples/bpf: extend test_cgrp2_attach2 test to use
 per-cpu cgroup storage

This commit extends the test_cgrp2_attach2 test to cover per-cpu
cgroup storage. Bpf program will use shared and per-cpu cgroup
storages simultaneously, so a better coverage of corresponding
core code will be achieved.

Expected output:
  $ ./test_cgrp2_attach2
  Attached DROP prog. This ping in cgroup /foo should fail...
  ping: sendmsg: Operation not permitted
  Attached DROP prog. This ping in cgroup /foo/bar should fail...
  ping: sendmsg: Operation not permitted
  Attached PASS prog. This ping in cgroup /foo/bar should pass...
  Detached PASS from /foo/bar while DROP is attached to /foo.
  This ping in cgroup /foo/bar should fail...
  ping: sendmsg: Operation not permitted
  Attached PASS from /foo/bar and detached DROP from /foo.
  This ping in cgroup /foo/bar should pass...
  ### override:PASS
  ### multi:PASS

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/test_cgrp2_attach2.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 180f9d813bca..d7b68ef5ba79 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -209,7 +209,7 @@ static int map_fd = -1;
 
 static int prog_load_cnt(int verdict, int val)
 {
-	int cgroup_storage_fd;
+	int cgroup_storage_fd, percpu_cgroup_storage_fd;
 
 	if (map_fd < 0)
 		map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
@@ -225,6 +225,14 @@ static int prog_load_cnt(int verdict, int val)
 		return -1;
 	}
 
+	percpu_cgroup_storage_fd = bpf_create_map(
+		BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+		sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
+	if (percpu_cgroup_storage_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		return -1;
+	}
+
 	struct bpf_insn prog[] = {
 		BPF_MOV32_IMM(BPF_REG_0, 0),
 		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
@@ -235,11 +243,20 @@ static int prog_load_cnt(int verdict, int val)
 		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 		BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
 		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
 		BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd),
 		BPF_MOV64_IMM(BPF_REG_2, 0),
 		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
 		BPF_MOV64_IMM(BPF_REG_1, val),
 		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0),
+
+		BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd),
+		BPF_MOV64_IMM(BPF_REG_2, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1),
+		BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
+
 		BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
 		BPF_EXIT_INSN(),
 	};

From 371e4fcc9d96ab1c8d72d59ca4ee3537402d1584 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Fri, 28 Sep 2018 14:46:00 +0000
Subject: [PATCH 18/64] selftests/bpf: cgroup local storage-based network
 counters

This commit adds a bpf kselftest, which demonstrates how percpu
and shared cgroup local storage can be used for efficient lookup-free
network accounting.

Cgroup local storage provides generic memory area with a very efficient
lookup free access. To avoid expensive atomic operations for each
packet, per-cpu cgroup local storage is used. Each packet is initially
charged to a per-cpu counter, and only if the counter reaches certain
value (32 in this case), the charge is moved into the global atomic
counter. This allows to amortize atomic operations, keeping reasonable
accuracy.

The test also implements a naive network traffic throttling, mostly to
demonstrate the possibility of bpf cgroup--based network bandwidth
control.

Expected output:
  ./test_netcnt
  test_netcnt:PASS

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile        |   6 +-
 tools/testing/selftests/bpf/netcnt_common.h |  24 +++
 tools/testing/selftests/bpf/netcnt_prog.c   |  71 +++++++++
 tools/testing/selftests/bpf/test_netcnt.c   | 158 ++++++++++++++++++++
 4 files changed, 257 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/netcnt_common.h
 create mode 100644 tools/testing/selftests/bpf/netcnt_prog.c
 create mode 100644 tools/testing/selftests/bpf/test_netcnt.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 059d64a0f897..f802de526f57 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,8 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
 	test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
-	test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names
+	test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \
+	test_netcnt
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -35,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
 	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-	test_skb_cgroup_id_kern.o bpf_flow.o
+	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -72,6 +73,7 @@ $(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c
 $(OUTPUT)/test_progs: trace_helpers.c
 $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
 $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c
+$(OUTPUT)/test_netcnt: cgroup_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h
new file mode 100644
index 000000000000..81084c1c2c23
--- /dev/null
+++ b/tools/testing/selftests/bpf/netcnt_common.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __NETCNT_COMMON_H
+#define __NETCNT_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_PERCPU_PACKETS 32
+
+struct percpu_net_cnt {
+	__u64 packets;
+	__u64 bytes;
+
+	__u64 prev_ts;
+
+	__u64 prev_packets;
+	__u64 prev_bytes;
+};
+
+struct net_cnt {
+	__u64 packets;
+	__u64 bytes;
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/netcnt_prog.c b/tools/testing/selftests/bpf/netcnt_prog.c
new file mode 100644
index 000000000000..1198abca1360
--- /dev/null
+++ b/tools/testing/selftests/bpf/netcnt_prog.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/version.h>
+
+#include "bpf_helpers.h"
+#include "netcnt_common.h"
+
+#define MAX_BPS	(3 * 1024 * 1024)
+
+#define REFRESH_TIME_NS	100000000
+#define NS_PER_SEC	1000000000
+
+struct bpf_map_def SEC("maps") percpu_netcnt = {
+	.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	.key_size = sizeof(struct bpf_cgroup_storage_key),
+	.value_size = sizeof(struct percpu_net_cnt),
+};
+
+struct bpf_map_def SEC("maps") netcnt = {
+	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
+	.key_size = sizeof(struct bpf_cgroup_storage_key),
+	.value_size = sizeof(struct net_cnt),
+};
+
+SEC("cgroup/skb")
+int bpf_nextcnt(struct __sk_buff *skb)
+{
+	struct percpu_net_cnt *percpu_cnt;
+	char fmt[] = "%d %llu %llu\n";
+	struct net_cnt *cnt;
+	__u64 ts, dt;
+	int ret;
+
+	cnt = bpf_get_local_storage(&netcnt, 0);
+	percpu_cnt = bpf_get_local_storage(&percpu_netcnt, 0);
+
+	percpu_cnt->packets++;
+	percpu_cnt->bytes += skb->len;
+
+	if (percpu_cnt->packets > MAX_PERCPU_PACKETS) {
+		__sync_fetch_and_add(&cnt->packets,
+				     percpu_cnt->packets);
+		percpu_cnt->packets = 0;
+
+		__sync_fetch_and_add(&cnt->bytes,
+				     percpu_cnt->bytes);
+		percpu_cnt->bytes = 0;
+	}
+
+	ts = bpf_ktime_get_ns();
+	dt = ts - percpu_cnt->prev_ts;
+
+	dt *= MAX_BPS;
+	dt /= NS_PER_SEC;
+
+	if (cnt->bytes + percpu_cnt->bytes - percpu_cnt->prev_bytes < dt)
+		ret = 1;
+	else
+		ret = 0;
+
+	if (dt > REFRESH_TIME_NS) {
+		percpu_cnt->prev_ts = ts;
+		percpu_cnt->prev_packets = cnt->packets;
+		percpu_cnt->prev_bytes = cnt->bytes;
+	}
+
+	return !!ret;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c
new file mode 100644
index 000000000000..7887df693399
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_netcnt.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/sysinfo.h>
+#include <sys/time.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+#include "netcnt_common.h"
+
+#define BPF_PROG "./netcnt_prog.o"
+#define TEST_CGROUP "/test-network-counters/"
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map) {
+		printf("%s:FAIL:map '%s' not found\n", test, name);
+		return -1;
+	}
+	return bpf_map__fd(map);
+}
+
+int main(int argc, char **argv)
+{
+	struct percpu_net_cnt *percpu_netcnt;
+	struct bpf_cgroup_storage_key key;
+	int map_fd, percpu_map_fd;
+	int error = EXIT_FAILURE;
+	struct net_cnt netcnt;
+	struct bpf_object *obj;
+	int prog_fd, cgroup_fd;
+	unsigned long packets;
+	unsigned long bytes;
+	int cpu, nproc;
+	__u32 prog_cnt;
+
+	nproc = get_nprocs_conf();
+	percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc);
+	if (!percpu_netcnt) {
+		printf("Not enough memory for per-cpu area (%d cpus)\n", nproc);
+		goto err;
+	}
+
+	if (bpf_prog_load(BPF_PROG, BPF_PROG_TYPE_CGROUP_SKB,
+			  &obj, &prog_fd)) {
+		printf("Failed to load bpf program\n");
+		goto out;
+	}
+
+	if (setup_cgroup_environment()) {
+		printf("Failed to load bpf program\n");
+		goto err;
+	}
+
+	/* Create a cgroup, get fd, and join it */
+	cgroup_fd = create_and_get_cgroup(TEST_CGROUP);
+	if (!cgroup_fd) {
+		printf("Failed to create test cgroup\n");
+		goto err;
+	}
+
+	if (join_cgroup(TEST_CGROUP)) {
+		printf("Failed to join cgroup\n");
+		goto err;
+	}
+
+	/* Attach bpf program */
+	if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) {
+		printf("Failed to attach bpf program");
+		goto err;
+	}
+
+	assert(system("ping localhost -6 -c 10000 -f -q > /dev/null") == 0);
+
+	if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0, NULL, NULL,
+			   &prog_cnt)) {
+		printf("Failed to query attached programs");
+		goto err;
+	}
+
+	map_fd = bpf_find_map(__func__, obj, "netcnt");
+	if (map_fd < 0) {
+		printf("Failed to find bpf map with net counters");
+		goto err;
+	}
+
+	percpu_map_fd = bpf_find_map(__func__, obj, "percpu_netcnt");
+	if (percpu_map_fd < 0) {
+		printf("Failed to find bpf map with percpu net counters");
+		goto err;
+	}
+
+	if (bpf_map_get_next_key(map_fd, NULL, &key)) {
+		printf("Failed to get key in cgroup storage\n");
+		goto err;
+	}
+
+	if (bpf_map_lookup_elem(map_fd, &key, &netcnt)) {
+		printf("Failed to lookup cgroup storage\n");
+		goto err;
+	}
+
+	if (bpf_map_lookup_elem(percpu_map_fd, &key, &percpu_netcnt[0])) {
+		printf("Failed to lookup percpu cgroup storage\n");
+		goto err;
+	}
+
+	/* Some packets can be still in per-cpu cache, but not more than
+	 * MAX_PERCPU_PACKETS.
+	 */
+	packets = netcnt.packets;
+	bytes = netcnt.bytes;
+	for (cpu = 0; cpu < nproc; cpu++) {
+		if (percpu_netcnt[cpu].packets > MAX_PERCPU_PACKETS) {
+			printf("Unexpected percpu value: %llu\n",
+			       percpu_netcnt[cpu].packets);
+			goto err;
+		}
+
+		packets += percpu_netcnt[cpu].packets;
+		bytes += percpu_netcnt[cpu].bytes;
+	}
+
+	/* No packets should be lost */
+	if (packets != 10000) {
+		printf("Unexpected packet count: %lu\n", packets);
+		goto err;
+	}
+
+	/* Let's check that bytes counter matches the number of packets
+	 * multiplied by the size of ipv6 ICMP packet.
+	 */
+	if (bytes != packets * 104) {
+		printf("Unexpected bytes count: %lu\n", bytes);
+		goto err;
+	}
+
+	error = 0;
+	printf("test_netcnt:PASS\n");
+
+err:
+	cleanup_cgroup_environment();
+	free(percpu_netcnt);
+
+out:
+	return error;
+}

From 28264eb227422165d23a3efeff24735fdedb7410 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 1 Oct 2018 18:30:32 -0700
Subject: [PATCH 19/64] nfp: bpf: parse global BPF ABI version capability

Up until now we only had per-vNIC BPF ABI version capabilities,
which are slightly awkward to use because bulk of the resources
and configuration does not relate to any particular vNIC.  Add
a new capability for global ABI version and check the per-vNIC
version are equal to it.  Assume the ABI version 2 if no explicit
version capability is present.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/fw.h   |  1 +
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 43 +++++++++++++++++--
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  4 ++
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index e4f9b7ec8528..58bad868bb6f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -52,6 +52,7 @@ enum bpf_cap_tlv_type {
 	NFP_BPF_CAP_TYPE_RANDOM		= 4,
 	NFP_BPF_CAP_TYPE_QUEUE_SELECT	= 5,
 	NFP_BPF_CAP_TYPE_ADJUST_TAIL	= 6,
+	NFP_BPF_CAP_TYPE_ABI_VERSION	= 7,
 };
 
 struct nfp_bpf_cap_tlv_func {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 970af07f4656..1f79246765d1 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -54,11 +54,14 @@ const struct rhashtable_params nfp_bpf_maps_neutral_params = {
 static bool nfp_net_ebpf_capable(struct nfp_net *nn)
 {
 #ifdef __LITTLE_ENDIAN
-	if (nn->cap & NFP_NET_CFG_CTRL_BPF &&
-	    nn_readb(nn, NFP_NET_CFG_BPF_ABI) == NFP_NET_BPF_ABI)
-		return true;
-#endif
+	struct nfp_app_bpf *bpf = nn->app->priv;
+
+	return nn->cap & NFP_NET_CFG_CTRL_BPF &&
+	       bpf->abi_version &&
+	       nn_readb(nn, NFP_NET_CFG_BPF_ABI) == bpf->abi_version;
+#else
 	return false;
+#endif
 }
 
 static int
@@ -342,6 +345,26 @@ nfp_bpf_parse_cap_adjust_tail(struct nfp_app_bpf *bpf, void __iomem *value,
 	return 0;
 }
 
+static int
+nfp_bpf_parse_cap_abi_version(struct nfp_app_bpf *bpf, void __iomem *value,
+			      u32 length)
+{
+	if (length < 4) {
+		nfp_err(bpf->app->cpp, "truncated ABI version TLV: %d\n",
+			length);
+		return -EINVAL;
+	}
+
+	bpf->abi_version = readl(value);
+	if (bpf->abi_version != 2) {
+		nfp_warn(bpf->app->cpp, "unsupported BPF ABI version: %d\n",
+			 bpf->abi_version);
+		bpf->abi_version = 0;
+	}
+
+	return 0;
+}
+
 static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 {
 	struct nfp_cpp *cpp = app->pf->cpp;
@@ -393,6 +416,11 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 							  length))
 				goto err_release_free;
 			break;
+		case NFP_BPF_CAP_TYPE_ABI_VERSION:
+			if (nfp_bpf_parse_cap_abi_version(app->priv, value,
+							  length))
+				goto err_release_free;
+			break;
 		default:
 			nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
 			break;
@@ -414,6 +442,11 @@ err_release_free:
 	return -EINVAL;
 }
 
+static void nfp_bpf_init_capabilities(struct nfp_app_bpf *bpf)
+{
+	bpf->abi_version = 2; /* Original BPF ABI version */
+}
+
 static int nfp_bpf_ndo_init(struct nfp_app *app, struct net_device *netdev)
 {
 	struct nfp_app_bpf *bpf = app->priv;
@@ -447,6 +480,8 @@ static int nfp_bpf_init(struct nfp_app *app)
 	if (err)
 		goto err_free_bpf;
 
+	nfp_bpf_init_capabilities(bpf);
+
 	err = nfp_bpf_parse_capabilities(app);
 	if (err)
 		goto err_free_neutral_maps;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index dbd00982fd2b..62cdb183efdb 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -127,6 +127,8 @@ enum pkt_vec {
  *
  * @maps_neutral:	hash table of offload-neutral maps (on pointer)
  *
+ * @abi_version:	global BPF ABI version
+ *
  * @adjust_head:	adjust head capability
  * @adjust_head.flags:		extra flags for adjust head
  * @adjust_head.off_min:	minimal packet offset within buffer required
@@ -170,6 +172,8 @@ struct nfp_app_bpf {
 
 	struct rhashtable maps_neutral;
 
+	u32 abi_version;
+
 	struct nfp_bpf_cap_adjust_head {
 		u32 flags;
 		int off_min;

From 9bbdd41b8a458d8a0bf1d9096bc1e704c7b46a00 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 1 Oct 2018 18:30:33 -0700
Subject: [PATCH 20/64] nfp: allow apps to request larger MTU on control vNIC

Some apps may want to have higher MTU on the control vNIC/queue.
Allow them to set the requested MTU at init time.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_app.h       |  4 ++++
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 14 ++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 4e1eb3395648..c896eb8f87a1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -40,6 +40,8 @@
 
 #include "nfp_net_repr.h"
 
+#define NFP_APP_CTRL_MTU_MAX	U32_MAX
+
 struct bpf_prog;
 struct net_device;
 struct netdev_bpf;
@@ -178,6 +180,7 @@ struct nfp_app_type {
  * @ctrl:	pointer to ctrl vNIC struct
  * @reprs:	array of pointers to representors
  * @type:	pointer to const application ops and info
+ * @ctrl_mtu:	MTU to set on the control vNIC (set in .init())
  * @priv:	app-specific priv data
  */
 struct nfp_app {
@@ -189,6 +192,7 @@ struct nfp_app {
 	struct nfp_reprs __rcu *reprs[NFP_REPR_TYPE_MAX + 1];
 
 	const struct nfp_app_type *type;
+	unsigned int ctrl_mtu;
 	void *priv;
 };
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index d05e37fcc1b2..8e8dc0db2493 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3877,10 +3877,20 @@ int nfp_net_init(struct nfp_net *nn)
 		return err;
 
 	/* Set default MTU and Freelist buffer size */
-	if (nn->max_mtu < NFP_NET_DEFAULT_MTU)
+	if (!nfp_net_is_data_vnic(nn) && nn->app->ctrl_mtu) {
+		if (nn->app->ctrl_mtu <= nn->max_mtu) {
+			nn->dp.mtu = nn->app->ctrl_mtu;
+		} else {
+			if (nn->app->ctrl_mtu != NFP_APP_CTRL_MTU_MAX)
+				nn_warn(nn, "app requested MTU above max supported %u > %u\n",
+					nn->app->ctrl_mtu, nn->max_mtu);
+			nn->dp.mtu = nn->max_mtu;
+		}
+	} else if (nn->max_mtu < NFP_NET_DEFAULT_MTU) {
 		nn->dp.mtu = nn->max_mtu;
-	else
+	} else {
 		nn->dp.mtu = NFP_NET_DEFAULT_MTU;
+	}
 	nn->dp.fl_bufsz = nfp_net_calc_fl_bufsz(&nn->dp);
 
 	if (nfp_app_ctrl_uses_data_vnics(nn->app))

From 0c9864c05f34b0b29109dc5f94707e5943fcfda3 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 1 Oct 2018 18:30:34 -0700
Subject: [PATCH 21/64] nfp: bpf: allow control message sizing for map ops

In current ABI the size of the messages carrying map elements was
statically defined to at most 16 words of key and 16 words of value
(NFP word is 4 bytes).  We should not make this assumption and use
the max key and value sizes from the BPF capability instead.

To make sure old kernels don't get surprised with larger (or smaller)
messages bump the FW ABI version to 3 when key/value size is different
than 16 words.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 73 ++++++++++++++++---
 drivers/net/ethernet/netronome/nfp/bpf/fw.h   | 10 +--
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 11 ++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  7 ++
 .../net/ethernet/netronome/nfp/nfp_net_ctrl.h |  1 -
 5 files changed, 84 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
index 2572a4b91c7c..fdcd2bc98916 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -89,15 +89,32 @@ nfp_bpf_cmsg_alloc(struct nfp_app_bpf *bpf, unsigned int size)
 	return skb;
 }
 
-static struct sk_buff *
-nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n)
+static unsigned int
+nfp_bpf_cmsg_map_req_size(struct nfp_app_bpf *bpf, unsigned int n)
 {
 	unsigned int size;
 
 	size = sizeof(struct cmsg_req_map_op);
-	size += sizeof(struct cmsg_key_value_pair) * n;
+	size += (bpf->cmsg_key_sz + bpf->cmsg_val_sz) * n;
 
-	return nfp_bpf_cmsg_alloc(bpf, size);
+	return size;
+}
+
+static struct sk_buff *
+nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n)
+{
+	return nfp_bpf_cmsg_alloc(bpf, nfp_bpf_cmsg_map_req_size(bpf, n));
+}
+
+static unsigned int
+nfp_bpf_cmsg_map_reply_size(struct nfp_app_bpf *bpf, unsigned int n)
+{
+	unsigned int size;
+
+	size = sizeof(struct cmsg_reply_map_op);
+	size += (bpf->cmsg_key_sz + bpf->cmsg_val_sz) * n;
+
+	return size;
 }
 
 static u8 nfp_bpf_cmsg_get_type(struct sk_buff *skb)
@@ -338,6 +355,34 @@ void nfp_bpf_ctrl_free_map(struct nfp_app_bpf *bpf, struct nfp_bpf_map *nfp_map)
 	dev_consume_skb_any(skb);
 }
 
+static void *
+nfp_bpf_ctrl_req_key(struct nfp_app_bpf *bpf, struct cmsg_req_map_op *req,
+		     unsigned int n)
+{
+	return &req->data[bpf->cmsg_key_sz * n + bpf->cmsg_val_sz * n];
+}
+
+static void *
+nfp_bpf_ctrl_req_val(struct nfp_app_bpf *bpf, struct cmsg_req_map_op *req,
+		     unsigned int n)
+{
+	return &req->data[bpf->cmsg_key_sz * (n + 1) + bpf->cmsg_val_sz * n];
+}
+
+static void *
+nfp_bpf_ctrl_reply_key(struct nfp_app_bpf *bpf, struct cmsg_reply_map_op *reply,
+		       unsigned int n)
+{
+	return &reply->data[bpf->cmsg_key_sz * n + bpf->cmsg_val_sz * n];
+}
+
+static void *
+nfp_bpf_ctrl_reply_val(struct nfp_app_bpf *bpf, struct cmsg_reply_map_op *reply,
+		       unsigned int n)
+{
+	return &reply->data[bpf->cmsg_key_sz * (n + 1) + bpf->cmsg_val_sz * n];
+}
+
 static int
 nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap,
 		      enum nfp_bpf_cmsg_type op,
@@ -366,12 +411,13 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap,
 
 	/* Copy inputs */
 	if (key)
-		memcpy(&req->elem[0].key, key, map->key_size);
+		memcpy(nfp_bpf_ctrl_req_key(bpf, req, 0), key, map->key_size);
 	if (value)
-		memcpy(&req->elem[0].value, value, map->value_size);
+		memcpy(nfp_bpf_ctrl_req_val(bpf, req, 0), value,
+		       map->value_size);
 
 	skb = nfp_bpf_cmsg_communicate(bpf, skb, op,
-				       sizeof(*reply) + sizeof(*reply->elem));
+				       nfp_bpf_cmsg_map_reply_size(bpf, 1));
 	if (IS_ERR(skb))
 		return PTR_ERR(skb);
 
@@ -382,9 +428,11 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap,
 
 	/* Copy outputs */
 	if (out_key)
-		memcpy(out_key, &reply->elem[0].key, map->key_size);
+		memcpy(out_key, nfp_bpf_ctrl_reply_key(bpf, reply, 0),
+		       map->key_size);
 	if (out_value)
-		memcpy(out_value, &reply->elem[0].value, map->value_size);
+		memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, 0),
+		       map->value_size);
 
 	dev_consume_skb_any(skb);
 
@@ -428,6 +476,13 @@ int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap,
 				     key, NULL, 0, next_key, NULL);
 }
 
+unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf)
+{
+	return max3((unsigned int)NFP_NET_DEFAULT_MTU,
+		    nfp_bpf_cmsg_map_req_size(bpf, 1),
+		    nfp_bpf_cmsg_map_reply_size(bpf, 1));
+}
+
 void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb)
 {
 	struct nfp_app_bpf *bpf = app->priv;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index 58bad868bb6f..813644e90b27 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -99,6 +99,7 @@ enum nfp_bpf_cmsg_type {
 #define CMSG_TYPE_MAP_REPLY_BIT		7
 #define __CMSG_REPLY(req)		(BIT(CMSG_TYPE_MAP_REPLY_BIT) | (req))
 
+/* BPF ABIv2 fixed-length control message fields */
 #define CMSG_MAP_KEY_LW			16
 #define CMSG_MAP_VALUE_LW		16
 
@@ -148,24 +149,19 @@ struct cmsg_reply_map_free_tbl {
 	__be32 count;
 };
 
-struct cmsg_key_value_pair {
-	__be32 key[CMSG_MAP_KEY_LW];
-	__be32 value[CMSG_MAP_VALUE_LW];
-};
-
 struct cmsg_req_map_op {
 	struct cmsg_hdr hdr;
 	__be32 tid;
 	__be32 count;
 	__be32 flags;
-	struct cmsg_key_value_pair elem[0];
+	u8 data[0];
 };
 
 struct cmsg_reply_map_op {
 	struct cmsg_reply_map_simple reply_hdr;
 	__be32 count;
 	__be32 resv;
-	struct cmsg_key_value_pair elem[0];
+	u8 data[0];
 };
 
 struct cmsg_bpf_event {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 1f79246765d1..d9d37aa860e0 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -356,7 +356,7 @@ nfp_bpf_parse_cap_abi_version(struct nfp_app_bpf *bpf, void __iomem *value,
 	}
 
 	bpf->abi_version = readl(value);
-	if (bpf->abi_version != 2) {
+	if (bpf->abi_version < 2 || bpf->abi_version > 3) {
 		nfp_warn(bpf->app->cpp, "unsupported BPF ABI version: %d\n",
 			 bpf->abi_version);
 		bpf->abi_version = 0;
@@ -486,6 +486,15 @@ static int nfp_bpf_init(struct nfp_app *app)
 	if (err)
 		goto err_free_neutral_maps;
 
+	if (bpf->abi_version < 3) {
+		bpf->cmsg_key_sz = CMSG_MAP_KEY_LW * 4;
+		bpf->cmsg_val_sz = CMSG_MAP_VALUE_LW * 4;
+	} else {
+		bpf->cmsg_key_sz = bpf->maps.max_key_sz;
+		bpf->cmsg_val_sz = bpf->maps.max_val_sz;
+		app->ctrl_mtu = nfp_bpf_ctrl_cmsg_mtu(bpf);
+	}
+
 	bpf->bpf_dev = bpf_offload_dev_create();
 	err = PTR_ERR_OR_ZERO(bpf->bpf_dev);
 	if (err)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 62cdb183efdb..792ebc4081a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -121,6 +121,9 @@ enum pkt_vec {
  * @cmsg_replies:	received cmsg replies waiting to be consumed
  * @cmsg_wq:		work queue for waiting for cmsg replies
  *
+ * @cmsg_key_sz:	size of key in cmsg element array
+ * @cmsg_val_sz:	size of value in cmsg element array
+ *
  * @map_list:		list of offloaded maps
  * @maps_in_use:	number of currently offloaded maps
  * @map_elems_in_use:	number of elements allocated to offloaded maps
@@ -166,6 +169,9 @@ struct nfp_app_bpf {
 	struct sk_buff_head cmsg_replies;
 	struct wait_queue_head cmsg_wq;
 
+	unsigned int cmsg_key_sz;
+	unsigned int cmsg_val_sz;
+
 	struct list_head map_list;
 	unsigned int maps_in_use;
 	unsigned int map_elems_in_use;
@@ -496,6 +502,7 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 
 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv);
 
+unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf);
 long long int
 nfp_bpf_ctrl_alloc_map(struct nfp_app_bpf *bpf, struct bpf_map *map);
 void
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
index 44d3ea75d043..4c6fb9ecb72c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
@@ -264,7 +264,6 @@
  * %NFP_NET_CFG_BPF_ADDR:	DMA address of the buffer with JITed BPF code
  */
 #define NFP_NET_CFG_BPF_ABI		0x0080
-#define   NFP_NET_BPF_ABI		2
 #define NFP_NET_CFG_BPF_CAP		0x0081
 #define   NFP_NET_BPF_CAP_RELO		(1 << 0) /* seamless reload */
 #define NFP_NET_CFG_BPF_MAX_LEN		0x0082

From f3709f69b7c5cba6323cc03c29b64293b93be817 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:29 -0700
Subject: [PATCH 22/64] bpf: Add iterator for spilled registers

Add this iterator for spilled registers, it concentrates the details of
how to get the current frame's spilled registers into a single macro
while clarifying the intention of the code which is calling the macro.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 11 +++++++++++
 kernel/bpf/verifier.c        | 16 +++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b42b60a83e19..d0e7f97e8b60 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -131,6 +131,17 @@ struct bpf_verifier_state {
 	u32 curframe;
 };
 
+#define bpf_get_spilled_reg(slot, frame)				\
+	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
+	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	 ? &frame->stack[slot].spilled_ptr : NULL)
+
+/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
+#define bpf_for_each_spilled_reg(iter, frame, reg)			\
+	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame);		\
+	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
+	     iter++, reg = bpf_get_spilled_reg(iter, frame))
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8cc83a970d1..9c82d8f58085 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2252,10 +2252,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
 		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(env, regs, i);
 
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] != STACK_SPILL)
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
 			continue;
-		reg = &state->stack[i].spilled_ptr;
 		if (reg_is_pkt_pointer_any(reg))
 			__mark_reg_unknown(reg);
 	}
@@ -3395,10 +3394,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
+		bpf_for_each_spilled_reg(i, state, reg) {
+			if (!reg)
 				continue;
-			reg = &state->stack[i].spilled_ptr;
 			if (reg->type == type && reg->id == dst_reg->id)
 				reg->range = max(reg->range, new_range);
 		}
@@ -3643,7 +3641,7 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 			  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *reg, *regs = state->regs;
 	u32 id = regs[regno].id;
 	int i, j;
 
@@ -3652,8 +3650,8 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
+		bpf_for_each_spilled_reg(i, state, reg) {
+			if (!reg)
 				continue;
 			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
 		}

From aad2eeaf46973a0968a75640cd1f8f1c650322a0 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:30 -0700
Subject: [PATCH 23/64] bpf: Simplify ptr_min_max_vals adjustment

An upcoming commit will add another two pointer types that need very
similar behaviour, so generalise this function now.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c                       | 22 ++++++++++-----------
 tools/testing/selftests/bpf/test_verifier.c | 14 ++++++-------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9c82d8f58085..abf567200574 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2669,20 +2669,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
-			dst);
+	switch (ptr_reg->type) {
+	case PTR_TO_MAP_VALUE_OR_NULL:
+		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
+			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
-	}
-	if (ptr_reg->type == CONST_PTR_TO_MAP) {
-		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
-			dst);
-		return -EACCES;
-	}
-	if (ptr_reg->type == PTR_TO_PACKET_END) {
-		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
-			dst);
+	case CONST_PTR_TO_MAP:
+	case PTR_TO_PACKET_END:
+		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
+			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
+	default:
+		break;
 	}
 
 	/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index c7d25f23baf9..a90be44f61e0 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3638,7 +3638,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+		.errstr = "R3 pointer arithmetic on pkt_end",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -4896,7 +4896,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+		.errstr = "R4 pointer arithmetic on map_value_or_null",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -4917,7 +4917,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+		.errstr = "R4 pointer arithmetic on map_value_or_null",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -4938,7 +4938,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+		.errstr = "R4 pointer arithmetic on map_value_or_null",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -7253,7 +7253,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map_in_map = { 3 },
-		.errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited",
+		.errstr = "R1 pointer arithmetic on map_ptr prohibited",
 		.result = REJECT,
 	},
 	{
@@ -8927,7 +8927,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+		.errstr = "R3 pointer arithmetic on pkt_end",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
@@ -8946,7 +8946,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+		.errstr = "R3 pointer arithmetic on pkt_end",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},

From 9d2be44a7f33d5ec4fbd3368317bcf5f404bb8f7 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:31 -0700
Subject: [PATCH 24/64] bpf: Reuse canonical string formatter for ctx errs

The array "reg_type_str" provides canonical formatting of register
types, however a couple of places would previously check whether a
register represented the context and write the name "context" directly.
An upcoming commit will add another pointer type to these statements, so
to provide more accurate error messages in the verifier, update these
error messages to use "reg_type_str" instead.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c                       |  7 +++----
 tools/testing/selftests/bpf/test_verifier.c | 10 +++++-----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index abf567200574..8b4e70eeced2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1763,8 +1763,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 	if (is_ctx_reg(env, insn->dst_reg) ||
 	    is_pkt_reg(env, insn->dst_reg)) {
 		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
-			insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ?
-			"context" : "packet");
+			insn->dst_reg, reg_type_str[insn->dst_reg]);
 		return -EACCES;
 	}
 
@@ -4871,8 +4870,8 @@ static int do_check(struct bpf_verifier_env *env)
 				return err;
 
 			if (is_ctx_reg(env, insn->dst_reg)) {
-				verbose(env, "BPF_ST stores into R%d context is not allowed\n",
-					insn->dst_reg);
+				verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
+					insn->dst_reg, reg_type_str[insn->dst_reg]);
 				return -EACCES;
 			}
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index a90be44f61e0..6e0b3f148cdb 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3276,7 +3276,7 @@ static struct bpf_test tests[] = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_1, offsetof(struct __sk_buff, mark), 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "BPF_ST stores into R1 context is not allowed",
+		.errstr = "BPF_ST stores into R1 inv is not allowed",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -3288,7 +3288,7 @@ static struct bpf_test tests[] = {
 				     BPF_REG_0, offsetof(struct __sk_buff, mark), 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "BPF_XADD stores into R1 context is not allowed",
+		.errstr = "BPF_XADD stores into R1 inv is not allowed",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -5266,7 +5266,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R2 leaks addr into mem",
 		.result_unpriv = REJECT,
 		.result = REJECT,
-		.errstr = "BPF_XADD stores into R1 context is not allowed",
+		.errstr = "BPF_XADD stores into R1 inv is not allowed",
 	},
 	{
 		"leak pointer into ctx 2",
@@ -5281,7 +5281,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R10 leaks addr into mem",
 		.result_unpriv = REJECT,
 		.result = REJECT,
-		.errstr = "BPF_XADD stores into R1 context is not allowed",
+		.errstr = "BPF_XADD stores into R1 inv is not allowed",
 	},
 	{
 		"leak pointer into ctx 3",
@@ -12230,7 +12230,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = REJECT,
-		.errstr = "BPF_XADD stores into R2 packet",
+		.errstr = "BPF_XADD stores into R2 ctx",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
 	{

From 840b9615d6e9d134178b4dd4f3c30aa30643a379 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:32 -0700
Subject: [PATCH 25/64] bpf: Generalize ptr_or_null regs check

This check will be reused by an upcoming commit for conditional jump
checks for sockets. Refactor it a bit to simplify the later commit.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8b4e70eeced2..98b218bd46e8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -249,6 +249,11 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 	       type == PTR_TO_PACKET_META;
 }
 
+static bool reg_type_may_be_null(enum bpf_reg_type type)
+{
+	return type == PTR_TO_MAP_VALUE_OR_NULL;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -3599,12 +3604,10 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src,
 	}
 }
 
-static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-			 bool is_null)
+static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
+				 bool is_null)
 {
-	struct bpf_reg_state *reg = &regs[regno];
-
-	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+	if (reg_type_may_be_null(reg->type) && reg->id == id) {
 		/* Old offset (both fixed and variable parts) should
 		 * have been known-zero, because we don't allow pointer
 		 * arithmetic on pointers that might be NULL.
@@ -3617,11 +3620,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
 		}
 		if (is_null) {
 			reg->type = SCALAR_VALUE;
-		} else if (reg->map_ptr->inner_map_meta) {
-			reg->type = CONST_PTR_TO_MAP;
-			reg->map_ptr = reg->map_ptr->inner_map_meta;
-		} else {
-			reg->type = PTR_TO_MAP_VALUE;
+		} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+			if (reg->map_ptr->inner_map_meta) {
+				reg->type = CONST_PTR_TO_MAP;
+				reg->map_ptr = reg->map_ptr->inner_map_meta;
+			} else {
+				reg->type = PTR_TO_MAP_VALUE;
+			}
 		}
 		/* We don't need id from this point onwards anymore, thus we
 		 * should better reset it, so that state pruning has chances
@@ -3634,8 +3639,8 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
-static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
-			  bool is_null)
+static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
+				  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *reg, *regs = state->regs;
@@ -3643,14 +3648,14 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 	int i, j;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_map_reg(regs, i, id, is_null);
+		mark_ptr_or_null_reg(&regs[i], id, is_null);
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
 		bpf_for_each_spilled_reg(i, state, reg) {
 			if (!reg)
 				continue;
-			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+			mark_ptr_or_null_reg(reg, id, is_null);
 		}
 	}
 }
@@ -3852,12 +3857,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	/* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
 	if (BPF_SRC(insn->code) == BPF_K &&
 	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
-	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		/* Mark all identical map registers in each branch as either
+	    reg_type_may_be_null(dst_reg->type)) {
+		/* Mark all identical registers in each branch as either
 		 * safe or unknown depending R == 0 or R != 0 conditional.
 		 */
-		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
-		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
+		mark_ptr_or_null_regs(this_branch, insn->dst_reg,
+				      opcode == BPF_JNE);
+		mark_ptr_or_null_regs(other_branch, insn->dst_reg,
+				      opcode == BPF_JEQ);
 	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
 					   this_branch, other_branch) &&
 		   is_pointer_value(env, insn->dst_reg)) {

From c64b7983288e636356f7f5f652de4813e1cfedac Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:33 -0700
Subject: [PATCH 26/64] bpf: Add PTR_TO_SOCKET verifier type

Teach the verifier a little bit about a new type of pointer, a
PTR_TO_SOCKET. This pointer type is accessed from BPF through the
'struct bpf_sock' structure.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h          |  34 ++++++++++
 include/linux/bpf_verifier.h |   2 +
 kernel/bpf/verifier.c        | 120 +++++++++++++++++++++++++++++++----
 net/core/filter.c            |  30 +++++----
 4 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 018299a595c8..027697b6a22f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -154,6 +154,7 @@ enum bpf_arg_type {
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
+	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
 };
 
 /* type of values returned from helper functions */
@@ -162,6 +163,7 @@ enum bpf_return_type {
 	RET_VOID,			/* function doesn't return anything */
 	RET_PTR_TO_MAP_VALUE,		/* returns a pointer to map elem value */
 	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
+	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -213,6 +215,8 @@ enum bpf_reg_type {
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 	PTR_TO_FLOW_KEYS,	 /* reg points to bpf_flow_keys */
+	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
+	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -343,6 +347,11 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
 					unsigned long off, unsigned long len);
+typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
+					const struct bpf_insn *src,
+					struct bpf_insn *dst,
+					struct bpf_prog *prog,
+					u32 *target_size);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
@@ -836,4 +845,29 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(CONFIG_NET)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+			      struct bpf_insn_access_aux *info);
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+				const struct bpf_insn *si,
+				struct bpf_insn *insn_buf,
+				struct bpf_prog *prog,
+				u32 *target_size);
+#else
+static inline bool bpf_sock_is_valid_access(int off, int size,
+					    enum bpf_access_type type,
+					    struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+					      const struct bpf_insn *si,
+					      struct bpf_insn *insn_buf,
+					      struct bpf_prog *prog,
+					      u32 *target_size)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d0e7f97e8b60..a411363098a5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -58,6 +58,8 @@ struct bpf_reg_state {
 	 * offset, so they can share range knowledge.
 	 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
 	 * came from, when one is tested for != NULL.
+	 * For PTR_TO_SOCKET this is used to share which pointers retain the
+	 * same reference to the socket, to determine proper reference freeing.
 	 */
 	u32 id;
 	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98b218bd46e8..f86386c9affd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * (like pointer plus pointer becomes SCALAR_VALUE type)
  *
  * When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
- * types recognized by check_mem_access() function.
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
+ * four pointer types recognized by check_mem_access() function.
  *
  * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
  * and the range of [ptr, ptr + map's value_size) is accessible.
@@ -267,6 +267,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 	[PTR_TO_FLOW_KEYS]	= "flow_keys",
+	[PTR_TO_SOCKET]		= "sock",
+	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
 };
 
 static char slot_type_char[] = {
@@ -973,6 +975,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PACKET_END:
 	case PTR_TO_FLOW_KEYS:
 	case CONST_PTR_TO_MAP:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -1341,6 +1345,28 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
 	return 0;
 }
 
+static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
+			     int size, enum bpf_access_type t)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[regno];
+	struct bpf_insn_access_aux info;
+
+	if (reg->smin_value < 0) {
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			regno);
+		return -EACCES;
+	}
+
+	if (!bpf_sock_is_valid_access(off, size, t, &info)) {
+		verbose(env, "invalid bpf_sock access off=%d size=%d\n",
+			off, size);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 static bool __is_pointer_value(bool allow_ptr_leaks,
 			       const struct bpf_reg_state *reg)
 {
@@ -1459,6 +1485,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		 */
 		strict = true;
 		break;
+	case PTR_TO_SOCKET:
+		pointer_desc = "sock ";
+		break;
 	default:
 		break;
 	}
@@ -1726,6 +1755,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_flow_keys_access(env, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_SOCKET) {
+		if (t == BPF_WRITE) {
+			verbose(env, "cannot write into socket\n");
+			return -EACCES;
+		}
+		err = check_sock_access(env, regno, off, size, t);
+		if (!err && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1948,6 +1985,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_ctx_reg(env, reg, regno);
 		if (err < 0)
 			return err;
+	} else if (arg_type == ARG_PTR_TO_SOCKET) {
+		expected_type = PTR_TO_SOCKET;
+		if (type != expected_type)
+			goto err_type;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2543,6 +2584,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].id = ++env->id_gen;
+	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -2680,6 +2725,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -3627,6 +3674,8 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
 			} else {
 				reg->type = PTR_TO_MAP_VALUE;
 			}
+		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
+			reg->type = PTR_TO_SOCKET;
 		}
 		/* We don't need id from this point onwards anymore, thus we
 		 * should better reset it, so that state pruning has chances
@@ -4402,6 +4451,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
 	case PTR_TO_FLOW_KEYS:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -4679,6 +4730,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	return 0;
 }
 
+/* Return true if it's OK to have the same insn return a different type. */
+static bool reg_type_mismatch_ok(enum bpf_reg_type type)
+{
+	switch (type) {
+	case PTR_TO_CTX:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/* If an instruction was previously used with particular pointer types, then we
+ * need to be careful to avoid cases such as the below, where it may be ok
+ * for one branch accessing the pointer, but not ok for the other branch:
+ *
+ * R1 = sock_ptr
+ * goto X;
+ * ...
+ * R1 = some_other_valid_ptr;
+ * goto X;
+ * ...
+ * R2 = *(u32 *)(R1 + 0);
+ */
+static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
+{
+	return src != prev && (!reg_type_mismatch_ok(src) ||
+			       !reg_type_mismatch_ok(prev));
+}
+
 static int do_check(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *state;
@@ -4811,9 +4893,7 @@ static int do_check(struct bpf_verifier_env *env)
 				 */
 				*prev_src_type = src_reg_type;
 
-			} else if (src_reg_type != *prev_src_type &&
-				   (src_reg_type == PTR_TO_CTX ||
-				    *prev_src_type == PTR_TO_CTX)) {
+			} else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
 				/* ABuser program is trying to use the same insn
 				 * dst_reg = *(u32*) (src_reg + off)
 				 * with different pointer types:
@@ -4858,9 +4938,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 			if (*prev_dst_type == NOT_INIT) {
 				*prev_dst_type = dst_reg_type;
-			} else if (dst_reg_type != *prev_dst_type &&
-				   (dst_reg_type == PTR_TO_CTX ||
-				    *prev_dst_type == PTR_TO_CTX)) {
+			} else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
 				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
@@ -5286,8 +5364,10 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
 	}
 }
 
-/* convert load instructions that access fields of 'struct __sk_buff'
- * into sequence of instructions that access fields of 'struct sk_buff'
+/* convert load instructions that access fields of a context type into a
+ * sequence of instructions that access fields of the underlying structure:
+ *     struct __sk_buff    -> struct sk_buff
+ *     struct bpf_sock_ops -> struct sock
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
@@ -5316,12 +5396,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 	}
 
-	if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
+	if (bpf_prog_is_dev_bound(env->prog->aux))
 		return 0;
 
 	insn = env->prog->insnsi + delta;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		bpf_convert_ctx_access_t convert_ctx_access;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
@@ -5363,8 +5445,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			continue;
 		}
 
-		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
+		switch (env->insn_aux_data[i + delta].ptr_type) {
+		case PTR_TO_CTX:
+			if (!ops->convert_ctx_access)
+				continue;
+			convert_ctx_access = ops->convert_ctx_access;
+			break;
+		case PTR_TO_SOCKET:
+			convert_ctx_access = bpf_sock_convert_ctx_access;
+			break;
+		default:
 			continue;
+		}
 
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
 		size = BPF_LDST_BYTES(insn);
@@ -5396,8 +5488,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		target_size = 0;
-		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
-					      &target_size);
+		cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
+					 &target_size);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
 		    (ctx_field_size && !target_size)) {
 			verbose(env, "bpf verifier is misconfigured\n");
diff --git a/net/core/filter.c b/net/core/filter.c
index 72db8afb7cb6..b2cb186252e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5394,23 +5394,29 @@ static bool __sock_filter_check_size(int off, int size,
 	return size == size_default;
 }
 
-static bool sock_filter_is_valid_access(int off, int size,
-					enum bpf_access_type type,
-					const struct bpf_prog *prog,
-					struct bpf_insn_access_aux *info)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+			      struct bpf_insn_access_aux *info)
 {
 	if (off < 0 || off >= sizeof(struct bpf_sock))
 		return false;
 	if (off % size != 0)
 		return false;
-	if (!__sock_filter_check_attach_type(off, type,
-					     prog->expected_attach_type))
-		return false;
 	if (!__sock_filter_check_size(off, size, info))
 		return false;
 	return true;
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	if (!bpf_sock_is_valid_access(off, size, type, info))
+		return false;
+	return __sock_filter_check_attach_type(off, type,
+					       prog->expected_attach_type);
+}
+
 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
 				const struct bpf_prog *prog, int drop_verdict)
 {
@@ -6122,10 +6128,10 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
-					  const struct bpf_insn *si,
-					  struct bpf_insn *insn_buf,
-					  struct bpf_prog *prog, u32 *target_size)
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+				const struct bpf_insn *si,
+				struct bpf_insn *insn_buf,
+				struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 	int off;
@@ -7037,7 +7043,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = {
 const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
-	.convert_ctx_access	= sock_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_sock_convert_ctx_access,
 };
 
 const struct bpf_prog_ops cg_sock_prog_ops = {

From 84dbf3507349696b505b6a500722538b0683e4ac Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:34 -0700
Subject: [PATCH 27/64] bpf: Macrofy stack state copy

An upcoming commit will need very similar copy/realloc boilerplate, so
refactor the existing stack copy/realloc functions into macros to
simplify it.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 104 ++++++++++++++++++++++++------------------
 1 file changed, 59 insertions(+), 45 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f86386c9affd..11e982381061 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -388,60 +388,74 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	verbose(env, "\n");
 }
 
-static int copy_stack_state(struct bpf_func_state *dst,
-			    const struct bpf_func_state *src)
-{
-	if (!src->stack)
-		return 0;
-	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
-		/* internal bug, make state invalid to reject the program */
-		memset(dst, 0, sizeof(*dst));
-		return -EFAULT;
-	}
-	memcpy(dst->stack, src->stack,
-	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
-	return 0;
+#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE)				\
+static int copy_##NAME##_state(struct bpf_func_state *dst,		\
+			       const struct bpf_func_state *src)	\
+{									\
+	if (!src->FIELD)						\
+		return 0;						\
+	if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) {			\
+		/* internal bug, make state invalid to reject the program */ \
+		memset(dst, 0, sizeof(*dst));				\
+		return -EFAULT;						\
+	}								\
+	memcpy(dst->FIELD, src->FIELD,					\
+	       sizeof(*src->FIELD) * (src->COUNT / SIZE));		\
+	return 0;							\
 }
+/* copy_stack_state() */
+COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
+#undef COPY_STATE_FN
+
+#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE)			\
+static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
+				  bool copy_old)			\
+{									\
+	u32 old_size = state->COUNT;					\
+	struct bpf_##NAME##_state *new_##FIELD;				\
+	int slot = size / SIZE;						\
+									\
+	if (size <= old_size || !size) {				\
+		if (copy_old)						\
+			return 0;					\
+		state->COUNT = slot * SIZE;				\
+		if (!size && old_size) {				\
+			kfree(state->FIELD);				\
+			state->FIELD = NULL;				\
+		}							\
+		return 0;						\
+	}								\
+	new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
+				    GFP_KERNEL);			\
+	if (!new_##FIELD)						\
+		return -ENOMEM;						\
+	if (copy_old) {							\
+		if (state->FIELD)					\
+			memcpy(new_##FIELD, state->FIELD,		\
+			       sizeof(*new_##FIELD) * (old_size / SIZE)); \
+		memset(new_##FIELD + old_size / SIZE, 0,		\
+		       sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
+	}								\
+	state->COUNT = slot * SIZE;					\
+	kfree(state->FIELD);						\
+	state->FIELD = new_##FIELD;					\
+	return 0;							\
+}
+/* realloc_stack_state() */
+REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
+#undef REALLOC_STATE_FN
 
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minimal amount of memory. check_stack_write() access from
  * the program calls into realloc_func_state() to grow the stack size.
- * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state
- * which this function copies over. It points to corresponding reg in previous
- * bpf_verifier_state which is never reallocated
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which realloc_stack_state() copies over. It points to previous
+ * bpf_verifier_state which is never reallocated.
  */
 static int realloc_func_state(struct bpf_func_state *state, int size,
 			      bool copy_old)
 {
-	u32 old_size = state->allocated_stack;
-	struct bpf_stack_state *new_stack;
-	int slot = size / BPF_REG_SIZE;
-
-	if (size <= old_size || !size) {
-		if (copy_old)
-			return 0;
-		state->allocated_stack = slot * BPF_REG_SIZE;
-		if (!size && old_size) {
-			kfree(state->stack);
-			state->stack = NULL;
-		}
-		return 0;
-	}
-	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
-				  GFP_KERNEL);
-	if (!new_stack)
-		return -ENOMEM;
-	if (copy_old) {
-		if (state->stack)
-			memcpy(new_stack, state->stack,
-			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
-		memset(new_stack + old_size / BPF_REG_SIZE, 0,
-		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
-	}
-	state->allocated_stack = slot * BPF_REG_SIZE;
-	kfree(state->stack);
-	state->stack = new_stack;
-	return 0;
+	return realloc_stack_state(state, size, copy_old);
 }
 
 static void free_func_state(struct bpf_func_state *state)

From fd978bf7fd312581a7ca454a991f0ffb34c4204b Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:35 -0700
Subject: [PATCH 28/64] bpf: Add reference tracking to verifier

Allow helper functions to acquire a reference and return it into a
register. Specific pointer types such as the PTR_TO_SOCKET will
implicitly represent such a reference. The verifier must ensure that
these references are released exactly once in each path through the
program.

To achieve this, this commit assigns an id to the pointer and tracks it
in the 'bpf_func_state', then when the function or program exits,
verifies that all of the acquired references have been freed. When the
pointer is passed to a function that frees the reference, it is removed
from the 'bpf_func_state` and all existing copies of the pointer in
registers are marked invalid.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  24 ++-
 kernel/bpf/verifier.c        | 306 ++++++++++++++++++++++++++++++++---
 2 files changed, 308 insertions(+), 22 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a411363098a5..7b6fd2ab3263 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -104,6 +104,17 @@ struct bpf_stack_state {
 	u8 slot_type[BPF_REG_SIZE];
 };
 
+struct bpf_reference_state {
+	/* Track each reference created with a unique id, even if the same
+	 * instruction creates the reference multiple times (eg, via CALL).
+	 */
+	int id;
+	/* Instruction where the allocation of this reference occurred. This
+	 * is used purely to inform the user of a reference leak.
+	 */
+	int insn_idx;
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -121,7 +132,9 @@ struct bpf_func_state {
 	 */
 	u32 subprogno;
 
-	/* should be second to last. See copy_func_state() */
+	/* The following fields should be last. See copy_func_state() */
+	int acquired_refs;
+	struct bpf_reference_state *refs;
 	int allocated_stack;
 	struct bpf_stack_state *stack;
 };
@@ -217,11 +230,16 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 
-static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 
-	return cur->frame[cur->curframe]->regs;
+	return cur->frame[cur->curframe];
+}
+
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+	return cur_func(env)->regs;
 }
 
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11e982381061..cd0d8bc00bd1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  *
  * After the call R0 is set to return type of the function and registers R1-R5
  * are set to NOT_INIT to indicate that they are no longer readable.
+ *
+ * The following reference types represent a potential reference to a kernel
+ * resource which, after first being allocated, must be checked and freed by
+ * the BPF program:
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
+ *
+ * When the verifier sees a helper call return a reference type, it allocates a
+ * pointer id for the reference and stores it in the current function state.
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
+ * passes through a NULL-check conditional. For the branch wherein the state is
+ * changed to CONST_IMM, the verifier releases the reference.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -189,6 +202,7 @@ struct bpf_call_arg_meta {
 	int access_size;
 	s64 msize_smax_value;
 	u64 msize_umax_value;
+	int ptr_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -251,7 +265,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
-	return type == PTR_TO_MAP_VALUE_OR_NULL;
+	return type == PTR_TO_MAP_VALUE_OR_NULL ||
+	       type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool type_is_refcounted(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET;
+}
+
+static bool type_is_refcounted_or_null(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool reg_is_refcounted(const struct bpf_reg_state *reg)
+{
+	return type_is_refcounted(reg->type);
+}
+
+static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+{
+	return type_is_refcounted_or_null(reg->type);
+}
+
+static bool arg_type_is_refcounted(enum bpf_arg_type type)
+{
+	return type == ARG_PTR_TO_SOCKET;
+}
+
+/* Determine whether the function releases some resources allocated by another
+ * function call. The first reference type argument will be assumed to be
+ * released by release_reference().
+ */
+static bool is_release_function(enum bpf_func_id func_id)
+{
+	return false;
 }
 
 /* string representation of 'enum bpf_reg_type' */
@@ -385,6 +434,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		else
 			verbose(env, "=%s", types_buf);
 	}
+	if (state->acquired_refs && state->refs[0].id) {
+		verbose(env, " refs=%d", state->refs[0].id);
+		for (i = 1; i < state->acquired_refs; i++)
+			if (state->refs[i].id)
+				verbose(env, ",%d", state->refs[i].id);
+	}
 	verbose(env, "\n");
 }
 
@@ -403,6 +458,8 @@ static int copy_##NAME##_state(struct bpf_func_state *dst,		\
 	       sizeof(*src->FIELD) * (src->COUNT / SIZE));		\
 	return 0;							\
 }
+/* copy_reference_state() */
+COPY_STATE_FN(reference, acquired_refs, refs, 1)
 /* copy_stack_state() */
 COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
 #undef COPY_STATE_FN
@@ -441,6 +498,8 @@ static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
 	state->FIELD = new_##FIELD;					\
 	return 0;							\
 }
+/* realloc_reference_state() */
+REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
 /* realloc_stack_state() */
 REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
 #undef REALLOC_STATE_FN
@@ -452,16 +511,89 @@ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
  * which realloc_stack_state() copies over. It points to previous
  * bpf_verifier_state which is never reallocated.
  */
-static int realloc_func_state(struct bpf_func_state *state, int size,
-			      bool copy_old)
+static int realloc_func_state(struct bpf_func_state *state, int stack_size,
+			      int refs_size, bool copy_old)
 {
-	return realloc_stack_state(state, size, copy_old);
+	int err = realloc_reference_state(state, refs_size, copy_old);
+	if (err)
+		return err;
+	return realloc_stack_state(state, stack_size, copy_old);
+}
+
+/* Acquire a pointer id from the env and update the state->refs to include
+ * this new pointer reference.
+ * On success, returns a valid pointer id to associate with the register
+ * On failure, returns a negative errno.
+ */
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int new_ofs = state->acquired_refs;
+	int id, err;
+
+	err = realloc_reference_state(state, state->acquired_refs + 1, true);
+	if (err)
+		return err;
+	id = ++env->id_gen;
+	state->refs[new_ofs].id = id;
+	state->refs[new_ofs].insn_idx = insn_idx;
+
+	return id;
+}
+
+/* release function corresponding to acquire_reference_state(). Idempotent. */
+static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+{
+	int i, last_idx;
+
+	if (!ptr_id)
+		return -EFAULT;
+
+	last_idx = state->acquired_refs - 1;
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].id == ptr_id) {
+			if (last_idx && i != last_idx)
+				memcpy(&state->refs[i], &state->refs[last_idx],
+				       sizeof(*state->refs));
+			memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+			state->acquired_refs--;
+			return 0;
+		}
+	}
+	return -EFAULT;
+}
+
+/* variation on the above for cases where we expect that there must be an
+ * outstanding reference for the specified ptr_id.
+ */
+static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int err;
+
+	err = __release_reference_state(state, ptr_id);
+	if (WARN_ON_ONCE(err != 0))
+		verbose(env, "verifier internal error: can't release reference\n");
+	return err;
+}
+
+static int transfer_reference_state(struct bpf_func_state *dst,
+				    struct bpf_func_state *src)
+{
+	int err = realloc_reference_state(dst, src->acquired_refs, false);
+	if (err)
+		return err;
+	err = copy_reference_state(dst, src);
+	if (err)
+		return err;
+	return 0;
 }
 
 static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
 		return;
+	kfree(state->refs);
 	kfree(state->stack);
 	kfree(state);
 }
@@ -487,10 +619,14 @@ static int copy_func_state(struct bpf_func_state *dst,
 {
 	int err;
 
-	err = realloc_func_state(dst, src->allocated_stack, false);
+	err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
+				 false);
+	if (err)
+		return err;
+	memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
+	err = copy_reference_state(dst, src);
 	if (err)
 		return err;
-	memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
 	return copy_stack_state(dst, src);
 }
 
@@ -1015,7 +1151,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	enum bpf_reg_type type;
 
 	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
-				 true);
+				 state->acquired_refs, true);
 	if (err)
 		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -1399,7 +1535,8 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 {
 	const struct bpf_reg_state *reg = cur_regs(env) + regno;
 
-	return reg->type == PTR_TO_CTX;
+	return reg->type == PTR_TO_CTX ||
+	       reg->type == PTR_TO_SOCKET;
 }
 
 static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -2003,6 +2140,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_SOCKET;
 		if (type != expected_type)
 			goto err_type;
+		if (meta->ptr_id || !reg->id) {
+			verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n",
+				meta->ptr_id, reg->id);
+			return -EFAULT;
+		}
+		meta->ptr_id = reg->id;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2292,10 +2435,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
+static bool check_refcount_ok(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (arg_type_is_refcounted(fn->arg1_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg2_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg3_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg4_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg5_type))
+		count++;
+
+	/* We only support one arg being unreferenced at the moment,
+	 * which is sufficient for the helper functions we have right now.
+	 */
+	return count <= 1;
+}
+
 static int check_func_proto(const struct bpf_func_proto *fn)
 {
 	return check_raw_mode_ok(fn) &&
-	       check_arg_pair_ok(fn) ? 0 : -EINVAL;
+	       check_arg_pair_ok(fn) &&
+	       check_refcount_ok(fn) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2328,12 +2493,45 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		__clear_all_pkt_pointers(env, vstate->frame[i]);
 }
 
+static void release_reg_references(struct bpf_verifier_env *env,
+				   struct bpf_func_state *state, int id)
+{
+	struct bpf_reg_state *regs = state->regs, *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].id == id)
+			mark_reg_unknown(env, regs, i);
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		if (reg_is_refcounted(reg) && reg->id == id)
+			__mark_reg_unknown(reg);
+	}
+}
+
+/* The pointer with the specified id has released its reference to kernel
+ * resources. Identify all copies of the same pointer and clear the reference.
+ */
+static int release_reference(struct bpf_verifier_env *env,
+			     struct bpf_call_arg_meta *meta)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	int i;
+
+	for (i = 0; i <= vstate->curframe; i++)
+		release_reg_references(env, vstate->frame[i], meta->ptr_id);
+
+	return release_reference_state(env, meta->ptr_id);
+}
+
 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			   int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
-	int i, subprog, target_insn;
+	int i, err, subprog, target_insn;
 
 	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
 		verbose(env, "the call stack of %d frames is too deep\n",
@@ -2371,6 +2569,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			state->curframe + 1 /* frameno within this callchain */,
 			subprog /* subprog number within this prog */);
 
+	/* Transfer references to the callee */
+	err = transfer_reference_state(callee, caller);
+	if (err)
+		return err;
+
 	/* copy r1 - r5 args that callee can access.  The copy includes parent
 	 * pointers, which connects us up to the liveness chain
 	 */
@@ -2403,6 +2606,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
 	struct bpf_reg_state *r0;
+	int err;
 
 	callee = state->frame[state->curframe];
 	r0 = &callee->regs[BPF_REG_0];
@@ -2422,6 +2626,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	/* return to the caller whatever r0 had in the callee */
 	caller->regs[BPF_REG_0] = *r0;
 
+	/* Transfer references to the caller */
+	err = transfer_reference_state(caller, callee);
+	if (err)
+		return err;
+
 	*insn_idx = callee->callsite + 1;
 	if (env->log.level) {
 		verbose(env, "returning from callee:\n");
@@ -2478,6 +2687,18 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	return 0;
 }
 
+static int check_reference_leak(struct bpf_verifier_env *env)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int i;
+
+	for (i = 0; i < state->acquired_refs; i++) {
+		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
+			state->refs[i].id, state->refs[i].insn_idx);
+	}
+	return state->acquired_refs ? -EINVAL : 0;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2556,6 +2777,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			return err;
 	}
 
+	if (func_id == BPF_FUNC_tail_call) {
+		err = check_reference_leak(env);
+		if (err) {
+			verbose(env, "tail_call would lead to reference leak\n");
+			return err;
+		}
+	} else if (is_release_function(func_id)) {
+		err = release_reference(env, &meta);
+		if (err)
+			return err;
+	}
+
 	regs = cur_regs(env);
 
 	/* check that flags argument in get_local_storage(map, flags) is 0,
@@ -2599,9 +2832,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].id = ++env->id_gen;
 	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+		int id = acquire_reference_state(env, insn_idx);
+		if (id < 0)
+			return id;
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
-		regs[BPF_REG_0].id = ++env->id_gen;
+		regs[BPF_REG_0].id = id;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -3665,7 +3901,8 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src,
 	}
 }
 
-static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
+static void mark_ptr_or_null_reg(struct bpf_func_state *state,
+				 struct bpf_reg_state *reg, u32 id,
 				 bool is_null)
 {
 	if (reg_type_may_be_null(reg->type) && reg->id == id) {
@@ -3691,11 +3928,13 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 			reg->type = PTR_TO_SOCKET;
 		}
-		/* We don't need id from this point onwards anymore, thus we
-		 * should better reset it, so that state pruning has chances
-		 * to take effect.
-		 */
-		reg->id = 0;
+		if (is_null || !reg_is_refcounted(reg)) {
+			/* We don't need id from this point onwards anymore,
+			 * thus we should better reset it, so that state
+			 * pruning has chances to take effect.
+			 */
+			reg->id = 0;
+		}
 	}
 }
 
@@ -3710,15 +3949,18 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 	u32 id = regs[regno].id;
 	int i, j;
 
+	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
+		__release_reference_state(state, id);
+
 	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(&regs[i], id, is_null);
+		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
 		bpf_for_each_spilled_reg(i, state, reg) {
 			if (!reg)
 				continue;
-			mark_ptr_or_null_reg(reg, id, is_null);
+			mark_ptr_or_null_reg(state, reg, id, is_null);
 		}
 	}
 }
@@ -4050,6 +4292,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	if (err)
 		return err;
 
+	/* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
+	 * gen_ld_abs() may terminate the program at runtime, leading to
+	 * reference leak.
+	 */
+	err = check_reference_leak(env);
+	if (err) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
+		return err;
+	}
+
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -4542,6 +4794,14 @@ static bool stacksafe(struct bpf_func_state *old,
 	return true;
 }
 
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+{
+	if (old->acquired_refs != cur->acquired_refs)
+		return false;
+	return !memcmp(old->refs, cur->refs,
+		       sizeof(*old->refs) * old->acquired_refs);
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -4587,6 +4847,9 @@ static bool func_states_equal(struct bpf_func_state *old,
 
 	if (!stacksafe(old, cur, idmap))
 		goto out_free;
+
+	if (!refsafe(old, cur))
+		goto out_free;
 	ret = true;
 out_free:
 	kfree(idmap);
@@ -4868,6 +5131,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		regs = cur_regs(env);
 		env->insn_aux_data[insn_idx].seen = true;
+
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -5032,6 +5296,10 @@ static int do_check(struct bpf_verifier_env *env)
 					continue;
 				}
 
+				err = check_reference_leak(env);
+				if (err)
+					return err;
+
 				/* eBPF calling convetion is such that R0 is used
 				 * to return the value from eBPF program.
 				 * Make sure that it's readable at this time

From 6acc9b432e6714d72d7d77ec7c27f6f8358d0c71 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:36 -0700
Subject: [PATCH 29/64] bpf: Add helper to retrieve socket in BPF

This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
socket listening on this host, and returns a socket pointer which the
BPF program can then access to determine, for instance, whether to
forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
socket, so when a BPF program makes use of this function, it must
subsequently pass the returned pointer into the newly added sk_release()
to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h                  |  93 ++++++++++++-
 kernel/bpf/verifier.c                     |   8 +-
 net/core/filter.c                         | 151 ++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  93 ++++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
 5 files changed, 354 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e2070d819e04..f9187b41dff6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2144,6 +2144,77 @@ union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for UDP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * int bpf_sk_release(struct bpf_sock *sk)
+ *	Description
+ *		Release the reference held by *sock*. *sock* must be a non-NULL
+ *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2229,7 +2300,10 @@ union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2399,6 +2473,23 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd0d8bc00bd1..73c81bef6ae8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
+ * bpf_sk_release(). When a reference type passes into the release function,
+ * the verifier also releases the reference. If any unchecked or unreleased
+ * reference remains at the end of the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index b2cb186252e4..591c698bc517 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,13 +58,17 @@
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
 #include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/ip_fib.h>
 #include <net/flow.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
+#include <net/net_namespace.h>
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
@@ -4813,6 +4817,141 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+		       struct sk_buff *skb, u8 family, u8 proto)
+{
+	int dif = skb->dev->ifindex;
+	bool refcounted = false;
+	struct sock *sk = NULL;
+
+	if (family == AF_INET) {
+		__be32 src4 = tuple->ipv4.saddr;
+		__be32 dst4 = tuple->ipv4.daddr;
+		int sdif = inet_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+					   src4, tuple->ipv4.sport,
+					   dst4, tuple->ipv4.dport,
+					   dif, sdif, &refcounted);
+		else
+			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
+					       dst4, tuple->ipv4.dport,
+					       dif, sdif, &udp_table, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
+		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+		int sdif = inet6_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+					    src6, tuple->ipv6.sport,
+					    dst6, tuple->ipv6.dport,
+					    dif, sdif, &refcounted);
+		else
+			sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport,
+					       dst6, tuple->ipv6.dport,
+					       dif, sdif, &udp_table, skb);
+#endif
+	}
+
+	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+		sk = NULL;
+	}
+	return sk;
+}
+
+/* bpf_sk_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ * Returns the socket as an 'unsigned long' to simplify the casting in the
+ * callers to satisfy BPF_CALL declarations.
+ */
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	      u8 proto, u64 netns_id, u64 flags)
+{
+	struct net *caller_net;
+	struct sock *sk = NULL;
+	u8 family = AF_UNSPEC;
+	struct net *net;
+
+	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
+	if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
+		goto out;
+
+	if (skb->dev)
+		caller_net = dev_net(skb->dev);
+	else
+		caller_net = sock_net(skb->sk);
+	if (netns_id) {
+		net = get_net_ns_by_id(caller_net, netns_id);
+		if (unlikely(!net))
+			goto out;
+		sk = sk_lookup(net, tuple, skb, family, proto);
+		put_net(net);
+	} else {
+		net = caller_net;
+		sk = sk_lookup(net, tuple, skb, family, proto);
+	}
+
+	if (sk)
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+	.func		= bpf_sk_lookup_tcp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+	.func		= bpf_sk_lookup_udp,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_release, struct sock *, sk)
+{
+	if (!sock_flag(sk, SOCK_RCU_FREE))
+		sock_gen_put(sk);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+};
+
 bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -5019,6 +5158,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_ancestor_cgroup_id:
 		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5119,6 +5264,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_hash_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e2070d819e04..f9187b41dff6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2144,6 +2144,77 @@ union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for UDP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * int bpf_sk_release(struct bpf_sock *sk)
+ *	Description
+ *		Release the reference held by *sock*. *sock* must be a non-NULL
+ *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2229,7 +2300,10 @@ union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2399,6 +2473,23 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index e4be7730222d..1d407b3494f9 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
 	(void *) BPF_FUNC_skb_cgroup_id;
 static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
 	(void *) BPF_FUNC_skb_ancestor_cgroup_id;
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_tcp;
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_udp;
+static int (*bpf_sk_release)(struct bpf_sock *sk) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions

From 0c586079f852187d19fea60c9a4981ad29e22ba8 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:37 -0700
Subject: [PATCH 30/64] selftests/bpf: Generalize dummy program types

Don't hardcode the dummy program types to SOCKET_FILTER type, as this
prevents testing bpf_tail_call in conjunction with other program types.
Instead, use the program type specified in the test case.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 31 +++++++++++----------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 6e0b3f148cdb..163fd1c0062c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -12652,18 +12652,18 @@ static int create_map(uint32_t type, uint32_t size_key,
 	return fd;
 }
 
-static int create_prog_dummy1(void)
+static int create_prog_dummy1(enum bpf_map_type prog_type)
 {
 	struct bpf_insn prog[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 42),
 		BPF_EXIT_INSN(),
 	};
 
-	return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+	return bpf_load_program(prog_type, prog,
 				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_dummy2(int mfd, int idx)
+static int create_prog_dummy2(enum bpf_map_type prog_type, int mfd, int idx)
 {
 	struct bpf_insn prog[] = {
 		BPF_MOV64_IMM(BPF_REG_3, idx),
@@ -12674,11 +12674,12 @@ static int create_prog_dummy2(int mfd, int idx)
 		BPF_EXIT_INSN(),
 	};
 
-	return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+	return bpf_load_program(prog_type, prog,
 				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_array(uint32_t max_elem, int p1key)
+static int create_prog_array(enum bpf_map_type prog_type, uint32_t max_elem,
+			     int p1key)
 {
 	int p2key = 1;
 	int mfd, p1fd, p2fd;
@@ -12690,8 +12691,8 @@ static int create_prog_array(uint32_t max_elem, int p1key)
 		return -1;
 	}
 
-	p1fd = create_prog_dummy1();
-	p2fd = create_prog_dummy2(mfd, p2key);
+	p1fd = create_prog_dummy1(prog_type);
+	p2fd = create_prog_dummy2(prog_type, mfd, p2key);
 	if (p1fd < 0 || p2fd < 0)
 		goto out;
 	if (bpf_map_update_elem(mfd, &p1key, &p1fd, BPF_ANY) < 0)
@@ -12748,8 +12749,8 @@ static int create_cgroup_storage(bool percpu)
 
 static char bpf_vlog[UINT_MAX >> 8];
 
-static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
-			  int *map_fds)
+static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
+			  struct bpf_insn *prog, int *map_fds)
 {
 	int *fixup_map1 = test->fixup_map1;
 	int *fixup_map2 = test->fixup_map2;
@@ -12805,7 +12806,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_prog1) {
-		map_fds[4] = create_prog_array(4, 0);
+		map_fds[4] = create_prog_array(prog_type, 4, 0);
 		do {
 			prog[*fixup_prog1].imm = map_fds[4];
 			fixup_prog1++;
@@ -12813,7 +12814,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_prog2) {
-		map_fds[5] = create_prog_array(8, 7);
+		map_fds[5] = create_prog_array(prog_type, 8, 7);
 		do {
 			prog[*fixup_prog2].imm = map_fds[5];
 			fixup_prog2++;
@@ -12859,11 +12860,13 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	for (i = 0; i < MAX_NR_MAPS; i++)
 		map_fds[i] = -1;
 
-	do_test_fixup(test, prog, map_fds);
+	if (!prog_type)
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	do_test_fixup(test, prog_type, prog, map_fds);
 	prog_len = probe_filter_length(prog);
 
-	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
-				     prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
+	fd_prog = bpf_verify_program(prog_type, prog, prog_len,
+				     test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
 				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1);
 
 	expected_ret = unpriv && test->result_unpriv != UNDEF ?

From b584ab8840fdc17f0d75ac62510396b8f657ed8f Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:38 -0700
Subject: [PATCH 31/64] selftests/bpf: Add tests for reference tracking

reference tracking: leak potential reference
reference tracking: leak potential reference on stack
reference tracking: leak potential reference on stack 2
reference tracking: zero potential reference
reference tracking: copy and zero potential references
reference tracking: release reference without check
reference tracking: release reference
reference tracking: release reference twice
reference tracking: release reference twice inside branch
reference tracking: alloc, check, free in one subbranch
reference tracking: alloc, check, free in both subbranches
reference tracking in call: free reference in subprog
reference tracking in call: free reference in subprog and outside
reference tracking in call: alloc & leak reference in subprog
reference tracking in call: alloc in subprog, release outside
reference tracking in call: sk_ptr leak into caller stack
reference tracking in call: sk_ptr spill into caller stack
reference tracking: allow LD_ABS
reference tracking: forbid LD_ABS while holding reference
reference tracking: allow LD_IND
reference tracking: forbid LD_IND while holding reference
reference tracking: check reference or tail call
reference tracking: release reference then tail call
reference tracking: leak possible reference over tail call
reference tracking: leak checked reference over tail call
reference tracking: mangle and release sock_or_null
reference tracking: mangle and release sock
reference tracking: access member
reference tracking: write to member
reference tracking: invalid 64-bit access of member
reference tracking: access after release
reference tracking: direct access for lookup
unpriv: spill/fill of different pointers stx - ctx and sock
unpriv: spill/fill of different pointers stx - leak sock
unpriv: spill/fill of different pointers stx - sock and ctx (read)
unpriv: spill/fill of different pointers stx - sock and ctx (write)

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 759 ++++++++++++++++++++
 1 file changed, 759 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 163fd1c0062c..bc9cd8537467 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2017 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -178,6 +179,24 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
 	self->retval = (uint32_t)res;
 }
 
+/* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
+#define BPF_SK_LOOKUP							\
+	/* struct bpf_sock_tuple tuple = {} */				\
+	BPF_MOV64_IMM(BPF_REG_2, 0),					\
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),			\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),		\
+	/* sk = sk_lookup_tcp(ctx, &tuple, sizeof tuple, 0, 0) */	\
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),				\
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),				\
+	BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)),	\
+	BPF_MOV64_IMM(BPF_REG_4, 0),					\
+	BPF_MOV64_IMM(BPF_REG_5, 0),					\
+	BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp)
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
@@ -2707,6 +2726,137 @@ static struct bpf_test tests[] = {
 		.errstr = "same insn cannot be used with different pointers",
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
+	{
+		"unpriv: spill/fill of different pointers stx - ctx and sock",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+			/* struct bpf_sock *sock = bpf_sock_lookup(...); */
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			/* u64 foo; */
+			/* void *target = &foo; */
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			/* if (skb == NULL) *target = sock; */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+			/* else *target = skb; */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+			/* struct __sk_buff *skb = *target; */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+			/* skb->mark = 42; */
+			BPF_MOV64_IMM(BPF_REG_3, 42),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct __sk_buff, mark)),
+			/* if (sk) bpf_sk_release(sk) */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+				BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "type=ctx expected=sock",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"unpriv: spill/fill of different pointers stx - leak sock",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+			/* struct bpf_sock *sock = bpf_sock_lookup(...); */
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			/* u64 foo; */
+			/* void *target = &foo; */
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			/* if (skb == NULL) *target = sock; */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+			/* else *target = skb; */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+			/* struct __sk_buff *skb = *target; */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+			/* skb->mark = 42; */
+			BPF_MOV64_IMM(BPF_REG_3, 42),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		//.errstr = "same insn cannot be used with different pointers",
+		.errstr = "Unreleased reference",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"unpriv: spill/fill of different pointers stx - sock and ctx (read)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+			/* struct bpf_sock *sock = bpf_sock_lookup(...); */
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			/* u64 foo; */
+			/* void *target = &foo; */
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			/* if (skb) *target = skb */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+			/* else *target = sock */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+			/* struct bpf_sock *sk = *target; */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+			/* if (sk) u32 foo = sk->mark; bpf_sk_release(sk); */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2),
+				BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+					    offsetof(struct bpf_sock, mark)),
+				BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "same insn cannot be used with different pointers",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"unpriv: spill/fill of different pointers stx - sock and ctx (write)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+			/* struct bpf_sock *sock = bpf_sock_lookup(...); */
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			/* u64 foo; */
+			/* void *target = &foo; */
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			/* if (skb) *target = skb */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+			/* else *target = sock */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+				BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+			/* struct bpf_sock *sk = *target; */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+			/* if (sk) sk->mark = 42; bpf_sk_release(sk); */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3),
+				BPF_MOV64_IMM(BPF_REG_3, 42),
+				BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+					    offsetof(struct bpf_sock, mark)),
+				BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		//.errstr = "same insn cannot be used with different pointers",
+		.errstr = "cannot write into socket",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
 	{
 		"unpriv: spill/fill of different pointers ldx",
 		.insns = {
@@ -12557,6 +12707,214 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
 	},
+	{
+		"reference tracking: leak potential reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak potential reference on stack",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak potential reference on stack 2",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: zero potential reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: copy and zero potential references",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_7, 0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference without check",
+		.insns = {
+			BPF_SK_LOOKUP,
+			/* reference in r0 may be NULL */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=sock_or_null expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference 2",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference twice",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference twice inside branch",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* goto end */
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: alloc, check, free in one subbranch",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16),
+			/* if (offsetof(skb, mark) > data_len) exit; */
+			BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_SK_LOOKUP,
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), /* mark == 0? */
+			/* Leak reference in R0 */
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: alloc, check, free in both subbranches",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16),
+			/* if (offsetof(skb, mark) > data_len) exit; */
+			BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_SK_LOOKUP,
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), /* mark == 0? */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: free reference in subprog",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
 	{
 		"pass modified ctx pointer to helper, 1",
 		.insns = {
@@ -12627,6 +12985,407 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
 	},
+	{
+		"reference tracking in call: free reference in subprog and outside",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: alloc & leak reference in subprog",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_4),
+			BPF_SK_LOOKUP,
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: alloc in subprog, release outside",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(), /* return sk */
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = POINTER_VALUE,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: sk_ptr leak into caller stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: sk_ptr spill into caller stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			/* now the sk_ptr is verified, free the reference */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_4, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: allow LD_ABS",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: forbid LD_ABS while holding reference",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_SK_LOOKUP,
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "BPF_LD_[ABS|IND] cannot be mixed with socket references",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: allow LD_IND",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_IMM(BPF_REG_7, 1),
+			BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"reference tracking: forbid LD_IND while holding reference",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_7, 1),
+			BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "BPF_LD_[ABS|IND] cannot be mixed with socket references",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: check reference or tail call",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
+			BPF_SK_LOOKUP,
+			/* if (sk) bpf_sk_release() */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 7),
+			/* bpf_tail_call() */
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog1 = { 17 },
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference then tail call",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
+			BPF_SK_LOOKUP,
+			/* if (sk) bpf_sk_release() */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			/* bpf_tail_call() */
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog1 = { 18 },
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: leak possible reference over tail call",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
+			/* Look up socket and store in REG_6 */
+			BPF_SK_LOOKUP,
+			/* bpf_tail_call() */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			/* if (sk) bpf_sk_release() */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog1 = { 16 },
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "tail_call would lead to reference leak",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak checked reference over tail call",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_1),
+			/* Look up socket and store in REG_6 */
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			/* if (!sk) goto end */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+			/* bpf_tail_call() */
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog1 = { 17 },
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "tail_call would lead to reference leak",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: mangle and release sock_or_null",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "R1 pointer arithmetic on sock_or_null prohibited",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: mangle and release sock",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "R1 pointer arithmetic on sock prohibited",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: access member",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_0, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: write to member",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_LD_IMM64(BPF_REG_2, 42),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_2,
+				    offsetof(struct bpf_sock, mark)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_LD_IMM64(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "cannot write into socket",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: invalid 64-bit access of member",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "invalid bpf_sock access off=0 size=8",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: access after release",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "!read_ok",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: direct access for lookup",
+		.insns = {
+			/* Check that the packet is at least 64B long */
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 9),
+			/* sk = sk_lookup_tcp(ctx, skb->data, ...) */
+			BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_5, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_0, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)

From 29cd77f41620dca22bdee092217c16b49ece8915 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:39 -0700
Subject: [PATCH 32/64] libbpf: Support loading individual progs

Allow the individual program load to be invoked. This will help with
testing, where a single ELF may contain several sections, some of which
denote subprograms that are expected to fail verification, along with
some which are expected to pass verification. By allowing programs to be
iterated and individually loaded, each program can be independently
checked against its expected verification result.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 4 ++--
 tools/lib/bpf/libbpf.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 425d5ca45c97..9e68fd9fcfca 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -228,7 +228,7 @@ struct bpf_object {
 };
 #define obj_elf_valid(o)	((o)->efile.elf)
 
-static void bpf_program__unload(struct bpf_program *prog)
+void bpf_program__unload(struct bpf_program *prog)
 {
 	int i;
 
@@ -1375,7 +1375,7 @@ out:
 	return ret;
 }
 
-static int
+int
 bpf_program__load(struct bpf_program *prog,
 		  char *license, u32 kern_version)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 511c1294dcbf..2ed24d3f80b3 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -128,10 +128,13 @@ void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex);
 
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
+int bpf_program__load(struct bpf_program *prog, char *license,
+		      u32 kern_version);
 int bpf_program__fd(struct bpf_program *prog);
 int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
 			      int instance);
 int bpf_program__pin(struct bpf_program *prog, const char *path);
+void bpf_program__unload(struct bpf_program *prog);
 
 struct bpf_insn;
 

From de375f4e91e3074a09ad337ac4b32a2f21c24d96 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:40 -0700
Subject: [PATCH 33/64] selftests/bpf: Add C tests for reference tracking

Add some tests that demonstrate and test the balanced lookup/free
nature of socket lookup. Section names that start with "fail" represent
programs that are expected to fail verification; all others should
succeed.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 tools/testing/selftests/bpf/test_progs.c      |  38 ++++
 .../selftests/bpf/test_sk_lookup_kern.c       | 180 ++++++++++++++++++
 3 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f802de526f57..1381ab81099c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -36,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
 	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o
+	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 63a671803ed6..e8becca9c521 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1698,6 +1698,43 @@ static void test_task_fd_query_tp(void)
 				   "sys_enter_read");
 }
 
+static void test_reference_tracking()
+{
+	const char *file = "./test_sk_lookup_kern.o";
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	__u32 duration;
+	int err = 0;
+
+	obj = bpf_object__open(file);
+	if (IS_ERR(obj)) {
+		error_cnt++;
+		return;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		const char *title;
+
+		/* Ignore .text sections */
+		title = bpf_program__title(prog, false);
+		if (strstr(title, ".text") != NULL)
+			continue;
+
+		bpf_program__set_type(prog, BPF_PROG_TYPE_SCHED_CLS);
+
+		/* Expect verifier failure if test name has 'fail' */
+		if (strstr(title, "fail") != NULL) {
+			libbpf_set_print(NULL, NULL, NULL);
+			err = !bpf_program__load(prog, "GPL", 0);
+			libbpf_set_print(printf, printf, NULL);
+		} else {
+			err = bpf_program__load(prog, "GPL", 0);
+		}
+		CHECK(err, title, "\n");
+	}
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1719,6 +1756,7 @@ int main(void)
 	test_get_stack_raw_tp();
 	test_task_fd_query_rawtp();
 	test_task_fd_query_tp();
+	test_reference_tracking();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
new file mode 100644
index 000000000000..b745bdc08c2b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
+					void *data_end, __u16 eth_proto,
+					bool *ipv4)
+{
+	struct bpf_sock_tuple *result;
+	__u8 proto = 0;
+	__u64 ihl_len;
+
+	if (eth_proto == bpf_htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+		if (iph + 1 > data_end)
+			return NULL;
+		ihl_len = iph->ihl * 4;
+		proto = iph->protocol;
+		*ipv4 = true;
+		result = (struct bpf_sock_tuple *)&iph->saddr;
+	} else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+		if (ip6h + 1 > data_end)
+			return NULL;
+		ihl_len = sizeof(*ip6h);
+		proto = ip6h->nexthdr;
+		*ipv4 = true;
+		result = (struct bpf_sock_tuple *)&ip6h->saddr;
+	}
+
+	if (data + nh_off + ihl_len > data_end || proto != IPPROTO_TCP)
+		return NULL;
+
+	return result;
+}
+
+SEC("sk_lookup_success")
+int bpf_sk_lookup_test0(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = (struct ethhdr *)(data);
+	struct bpf_sock_tuple *tuple;
+	struct bpf_sock *sk;
+	size_t tuple_len;
+	bool ipv4;
+
+	if (eth + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
+	if (!tuple || tuple + sizeof *tuple > data_end)
+		return TC_ACT_SHOT;
+
+	tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+	sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, 0, 0);
+	if (sk)
+		bpf_sk_release(sk);
+	return sk ? TC_ACT_OK : TC_ACT_UNSPEC;
+}
+
+SEC("sk_lookup_success_simple")
+int bpf_sk_lookup_test1(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	if (sk)
+		bpf_sk_release(sk);
+	return 0;
+}
+
+SEC("fail_use_after_free")
+int bpf_sk_lookup_uaf(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+	__u32 family = 0;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	if (sk) {
+		bpf_sk_release(sk);
+		family = sk->family;
+	}
+	return family;
+}
+
+SEC("fail_modify_sk_pointer")
+int bpf_sk_lookup_modptr(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+	__u32 family;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	if (sk) {
+		sk += 1;
+		bpf_sk_release(sk);
+	}
+	return 0;
+}
+
+SEC("fail_modify_sk_or_null_pointer")
+int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+	__u32 family;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	sk += 1;
+	if (sk)
+		bpf_sk_release(sk);
+	return 0;
+}
+
+SEC("fail_no_release")
+int bpf_sk_lookup_test2(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+
+	bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	return 0;
+}
+
+SEC("fail_release_twice")
+int bpf_sk_lookup_test3(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	bpf_sk_release(sk);
+	bpf_sk_release(sk);
+	return 0;
+}
+
+SEC("fail_release_unchecked")
+int bpf_sk_lookup_test4(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	bpf_sk_release(sk);
+	return 0;
+}
+
+void lookup_no_release(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+}
+
+SEC("fail_no_release_subcall")
+int bpf_sk_lookup_test5(struct __sk_buff *skb)
+{
+	lookup_no_release(skb);
+	return 0;
+}

From a610b665ec9ec9ba076e6dd8703750999c81eae4 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 13:35:41 -0700
Subject: [PATCH 34/64] Documentation: Describe bpf reference tracking

Document the new pointer types in the verifier and how the pointer ID
tracking works to ensure that references which are taken are later
released.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/filter.txt | 64 +++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index e6b4ebb2b243..4443ce958862 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1125,6 +1125,14 @@ pointer type.  The types of pointers describe their base, as follows:
     PTR_TO_STACK        Frame pointer.
     PTR_TO_PACKET       skb->data.
     PTR_TO_PACKET_END   skb->data + headlen; arithmetic forbidden.
+    PTR_TO_SOCKET       Pointer to struct bpf_sock_ops, implicitly refcounted.
+    PTR_TO_SOCKET_OR_NULL
+                        Either a pointer to a socket, or NULL; socket lookup
+                        returns this type, which becomes a PTR_TO_SOCKET when
+                        checked != NULL. PTR_TO_SOCKET is reference-counted,
+                        so programs must release the reference through the
+                        socket release function before the end of the program.
+                        Arithmetic on these pointers is forbidden.
 However, a pointer may be offset from this base (as a result of pointer
 arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable
 offset'.  The former is used when an exactly-known value (e.g. an immediate
@@ -1171,6 +1179,13 @@ over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting
 pointer will have a variable offset known to be 4n+2 for some n, so adding the 2
 bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through
 that pointer are safe.
+The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common
+to all copies of the pointer returned from a socket lookup. This has similar
+behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but
+it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly
+represents a reference to the corresponding 'struct sock'. To ensure that the
+reference is not leaked, it is imperative to NULL-check the reference and in
+the non-NULL case, and pass the valid reference to the socket release function.
 
 Direct packet access
 --------------------
@@ -1444,6 +1459,55 @@ Error:
   8: (7a) *(u64 *)(r0 +0) = 1
   R0 invalid mem access 'imm'
 
+Program that performs a socket lookup then sets the pointer to NULL without
+checking it:
+value:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+  BPF_MOV64_IMM(BPF_REG_0, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup_tcp#65
+  8: (b7) r0 = 0
+  9: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
+Program that performs a socket lookup but does not NULL-check the returned
+value:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup_tcp#65
+  8: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
 Testing
 -------
 

From d71019b54bff4b8f52903e354bf3b8cb2b4dfa75 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Tue, 2 Oct 2018 22:32:26 -0700
Subject: [PATCH 35/64] net: core: Fix build with CONFIG_IPV6=m

Stephen Rothwell reports the following link failure with IPv6 as module:

  x86_64-linux-gnu-ld: net/core/filter.o: in function `sk_lookup':
  (.text+0x19219): undefined reference to `__udp6_lib_lookup'

Fix the build by only enabling the IPv6 socket lookup if IPv6 support is
compiled into the kernel.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 591c698bc517..30c6b2d3ef16 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4838,7 +4838,7 @@ struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
 					       dst4, tuple->ipv4.dport,
 					       dif, sdif, &udp_table, skb);
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_REACHABLE(CONFIG_IPV6)
 	} else {
 		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
 		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;

From 434fe9d4b4bfa8becb0959ef32b9b9fa628ef6fe Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 3 Oct 2018 15:26:38 -0700
Subject: [PATCH 36/64] libbpf: Move __dump_nlmsg_t from API to implementation

This typedef is used only by implementation in netlink.c. Nothing uses
it in public API. Move it to netlink.c.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.h  | 3 ---
 tools/lib/bpf/netlink.c | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 2ed24d3f80b3..8388be525388 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -304,11 +304,8 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size,
 			       void **buf, size_t *buf_len,
 			       bpf_perf_event_print_t fn, void *priv);
 
-struct nlmsghdr;
 struct nlattr;
 typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
-typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
-			      void *cookie);
 int bpf_netlink_open(unsigned int *nl_pid);
 int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
 		void *cookie);
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index fde1d7bf8199..da46d9358d9d 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -18,6 +18,9 @@
 #define SOL_NETLINK 270
 #endif
 
+typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
+			      void *cookie);
+
 int bpf_netlink_open(__u32 *nl_pid)
 {
 	struct sockaddr_nl sa;

From aae57780107d92de2463e605cb054656ebd233d1 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 3 Oct 2018 15:26:39 -0700
Subject: [PATCH 37/64] libbpf: Consistent prefixes for interfaces in libbpf.h.

libbpf is used more and more outside kernel tree. That means the library
should follow good practices in library design and implementation to
play well with third party code that uses it.

One of such practices is to have a common prefix (or a few) for every
interface, function or data structure, library provides. I helps to
avoid name conflicts with other libraries and keeps API consistent.

Inconsistent names in libbpf already cause problems in real life. E.g.
an application can't use both libbpf and libnl due to conflicting
symbols.

Having common prefix will help to fix current and avoid future problems.

libbpf already uses the following prefixes for its interfaces:
* bpf_ for bpf system call wrappers, program/map/elf-object
  abstractions and a few other things;
* btf_ for BTF related API;
* libbpf_ for everything else.

The patch adds libbpf_ prefix to functions and typedef in libbpf.h that
use none of mentioned above prefixes and doesn't fit well into the first
two categories.

Since affected part of API is used in bpftool, the patch applies
corresponding change to bpftool as well. Having it in a separate patch
will cause a state of tree where bpftool is broken what may not be a
good idea.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/net.c | 31 +++++++++++++++----------------
 tools/lib/bpf/libbpf.h  | 20 ++++++++++----------
 tools/lib/bpf/netlink.c | 37 ++++++++++++++++++++-----------------
 3 files changed, 45 insertions(+), 43 deletions(-)

diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index ed205ee57655..ef83e8a08490 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -127,14 +127,14 @@ static int show_dev_tc_bpf(int sock, unsigned int nl_pid,
 	tcinfo.array_len = 0;
 
 	tcinfo.is_qdisc = false;
-	ret = nl_get_class(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
-			   &tcinfo);
+	ret = libbpf_nl_get_class(sock, nl_pid, dev->ifindex,
+				  dump_class_qdisc_nlmsg, &tcinfo);
 	if (ret)
 		goto out;
 
 	tcinfo.is_qdisc = true;
-	ret = nl_get_qdisc(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg,
-			   &tcinfo);
+	ret = libbpf_nl_get_qdisc(sock, nl_pid, dev->ifindex,
+				  dump_class_qdisc_nlmsg, &tcinfo);
 	if (ret)
 		goto out;
 
@@ -142,10 +142,9 @@ static int show_dev_tc_bpf(int sock, unsigned int nl_pid,
 	filter_info.ifindex = dev->ifindex;
 	for (i = 0; i < tcinfo.used_len; i++) {
 		filter_info.kind = tcinfo.handle_array[i].kind;
-		ret = nl_get_filter(sock, nl_pid, dev->ifindex,
-				    tcinfo.handle_array[i].handle,
-				    dump_filter_nlmsg,
-				    &filter_info);
+		ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex,
+					   tcinfo.handle_array[i].handle,
+					   dump_filter_nlmsg, &filter_info);
 		if (ret)
 			goto out;
 	}
@@ -153,22 +152,22 @@ static int show_dev_tc_bpf(int sock, unsigned int nl_pid,
 	/* root, ingress and egress handle */
 	handle = TC_H_ROOT;
 	filter_info.kind = "root";
-	ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
-			    dump_filter_nlmsg, &filter_info);
+	ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+				   dump_filter_nlmsg, &filter_info);
 	if (ret)
 		goto out;
 
 	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
 	filter_info.kind = "clsact/ingress";
-	ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
-			    dump_filter_nlmsg, &filter_info);
+	ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+				   dump_filter_nlmsg, &filter_info);
 	if (ret)
 		goto out;
 
 	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
 	filter_info.kind = "clsact/egress";
-	ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle,
-			    dump_filter_nlmsg, &filter_info);
+	ret = libbpf_nl_get_filter(sock, nl_pid, dev->ifindex, handle,
+				   dump_filter_nlmsg, &filter_info);
 	if (ret)
 		goto out;
 
@@ -196,7 +195,7 @@ static int do_show(int argc, char **argv)
 		usage();
 	}
 
-	sock = bpf_netlink_open(&nl_pid);
+	sock = libbpf_netlink_open(&nl_pid);
 	if (sock < 0) {
 		fprintf(stderr, "failed to open netlink sock\n");
 		return -1;
@@ -211,7 +210,7 @@ static int do_show(int argc, char **argv)
 		jsonw_start_array(json_wtr);
 	NET_START_OBJECT;
 	NET_START_ARRAY("xdp", "%s:\n");
-	ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
+	ret = libbpf_nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
 	NET_END_ARRAY("\n");
 
 	if (!ret) {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 8388be525388..710ff5724980 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -305,14 +305,14 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size,
 			       bpf_perf_event_print_t fn, void *priv);
 
 struct nlattr;
-typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
-int bpf_netlink_open(unsigned int *nl_pid);
-int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
-		void *cookie);
-int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
-		 dump_nlmsg_t dump_class_nlmsg, void *cookie);
-int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
-		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
-int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
-		  dump_nlmsg_t dump_filter_nlmsg, void *cookie);
+typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
+int libbpf_netlink_open(unsigned int *nl_pid);
+int libbpf_nl_get_link(int sock, unsigned int nl_pid,
+		       libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie);
+int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+			libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie);
+int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+			libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
+int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+			 libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie);
 #endif
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index da46d9358d9d..506bdfdbcab0 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -18,10 +18,10 @@
 #define SOL_NETLINK 270
 #endif
 
-typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
+typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t,
 			      void *cookie);
 
-int bpf_netlink_open(__u32 *nl_pid)
+int libbpf_netlink_open(__u32 *nl_pid)
 {
 	struct sockaddr_nl sa;
 	socklen_t addrlen;
@@ -65,7 +65,7 @@ cleanup:
 }
 
 static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
-			    __dump_nlmsg_t _fn, dump_nlmsg_t fn,
+			    __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn,
 			    void *cookie)
 {
 	bool multipart = true;
@@ -133,7 +133,7 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 	} req;
 	__u32 nl_pid;
 
-	sock = bpf_netlink_open(&nl_pid);
+	sock = libbpf_netlink_open(&nl_pid);
 	if (sock < 0)
 		return sock;
 
@@ -181,8 +181,8 @@ cleanup:
 	return ret;
 }
 
-static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg,
-			     void *cookie)
+static int __dump_link_nlmsg(struct nlmsghdr *nlh,
+			     libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie)
 {
 	struct nlattr *tb[IFLA_MAX + 1], *attr;
 	struct ifinfomsg *ifi = NLMSG_DATA(nlh);
@@ -196,8 +196,8 @@ static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg,
 	return dump_link_nlmsg(cookie, ifi, tb);
 }
 
-int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
-		void *cookie)
+int libbpf_nl_get_link(int sock, unsigned int nl_pid,
+		       libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie)
 {
 	struct {
 		struct nlmsghdr nlh;
@@ -219,7 +219,8 @@ int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
 }
 
 static int __dump_class_nlmsg(struct nlmsghdr *nlh,
-			      dump_nlmsg_t dump_class_nlmsg, void *cookie)
+			      libbpf_dump_nlmsg_t dump_class_nlmsg,
+			      void *cookie)
 {
 	struct nlattr *tb[TCA_MAX + 1], *attr;
 	struct tcmsg *t = NLMSG_DATA(nlh);
@@ -233,8 +234,8 @@ static int __dump_class_nlmsg(struct nlmsghdr *nlh,
 	return dump_class_nlmsg(cookie, t, tb);
 }
 
-int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
-		 dump_nlmsg_t dump_class_nlmsg, void *cookie)
+int libbpf_nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+			libbpf_dump_nlmsg_t dump_class_nlmsg, void *cookie)
 {
 	struct {
 		struct nlmsghdr nlh;
@@ -257,7 +258,8 @@ int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
 }
 
 static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
-			      dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+			      libbpf_dump_nlmsg_t dump_qdisc_nlmsg,
+			      void *cookie)
 {
 	struct nlattr *tb[TCA_MAX + 1], *attr;
 	struct tcmsg *t = NLMSG_DATA(nlh);
@@ -271,8 +273,8 @@ static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
 	return dump_qdisc_nlmsg(cookie, t, tb);
 }
 
-int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
-		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+			libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
 {
 	struct {
 		struct nlmsghdr nlh;
@@ -295,7 +297,8 @@ int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
 }
 
 static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
-			       dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+			       libbpf_dump_nlmsg_t dump_filter_nlmsg,
+			       void *cookie)
 {
 	struct nlattr *tb[TCA_MAX + 1], *attr;
 	struct tcmsg *t = NLMSG_DATA(nlh);
@@ -309,8 +312,8 @@ static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
 	return dump_filter_nlmsg(cookie, t, tb);
 }
 
-int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
-		  dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+			 libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie)
 {
 	struct {
 		struct nlmsghdr nlh;

From f04bc8a436e1b32f842a631ff889954bdf56b720 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 3 Oct 2018 15:26:40 -0700
Subject: [PATCH 38/64] libbpf: Consistent prefixes for interfaces in nlattr.h.

libbpf is used more and more outside kernel tree. That means the library
should follow good practices in library design and implementation to
play well with third party code that uses it.

One of such practices is to have a common prefix (or a few) for every
interface, function or data structure, library provides. I helps to
avoid name conflicts with other libraries and keeps API consistent.

Inconsistent names in libbpf already cause problems in real life. E.g.
an application can't use both libbpf and libnl due to conflicting
symbols.

Having common prefix will help to fix current and avoid future problems.

libbpf already uses the following prefixes for its interfaces:
* bpf_ for bpf system call wrappers, program/map/elf-object
  abstractions and a few other things;
* btf_ for BTF related API;
* libbpf_ for everything else.

The patch adds libbpf_ prefix to interfaces in nlattr.h that use none of
mentioned above prefixes and doesn't fit well into the first two
categories.

Since affected part of API is used in bpftool, the patch applies
corresponding change to bpftool as well. Having it in a separate patch
will cause a state of tree where bpftool is broken what may not be a
good idea.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/net.c            | 10 +++--
 tools/bpf/bpftool/netlink_dumper.c | 32 ++++++++-------
 tools/lib/bpf/netlink.c            | 10 ++---
 tools/lib/bpf/nlattr.c             | 64 ++++++++++++++++--------------
 tools/lib/bpf/nlattr.h             | 59 +++++++++++++--------------
 5 files changed, 94 insertions(+), 81 deletions(-)

diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index ef83e8a08490..d441bb7035ca 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -69,7 +69,9 @@ static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 	snprintf(netinfo->devices[netinfo->used_len].devname,
 		 sizeof(netinfo->devices[netinfo->used_len].devname),
 		 "%s",
-		 tb[IFLA_IFNAME] ? nla_getattr_str(tb[IFLA_IFNAME]) : "");
+		 tb[IFLA_IFNAME]
+			 ? libbpf_nla_getattr_str(tb[IFLA_IFNAME])
+			 : "");
 	netinfo->used_len++;
 
 	return do_xdp_dump(ifinfo, tb);
@@ -83,7 +85,7 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 	if (tcinfo->is_qdisc) {
 		/* skip clsact qdisc */
 		if (tb[TCA_KIND] &&
-		    strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0)
+		    strcmp(libbpf_nla_data(tb[TCA_KIND]), "clsact") == 0)
 			return 0;
 		if (info->tcm_handle == 0)
 			return 0;
@@ -101,7 +103,9 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 	snprintf(tcinfo->handle_array[tcinfo->used_len].kind,
 		 sizeof(tcinfo->handle_array[tcinfo->used_len].kind),
 		 "%s",
-		 tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown");
+		 tb[TCA_KIND]
+			 ? libbpf_nla_getattr_str(tb[TCA_KIND])
+			 : "unknown");
 	tcinfo->used_len++;
 
 	return 0;
diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c
index 6f5e9cc6836c..4e9f4531269f 100644
--- a/tools/bpf/bpftool/netlink_dumper.c
+++ b/tools/bpf/bpftool/netlink_dumper.c
@@ -21,7 +21,7 @@ static void xdp_dump_prog_id(struct nlattr **tb, int attr,
 	if (new_json_object)
 		NET_START_OBJECT
 	NET_DUMP_STR("mode", " %s", mode);
-	NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[attr]))
+	NET_DUMP_UINT("id", " id %u", libbpf_nla_getattr_u32(tb[attr]))
 	if (new_json_object)
 		NET_END_OBJECT
 }
@@ -32,13 +32,13 @@ static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
 	struct nlattr *tb[IFLA_XDP_MAX + 1];
 	unsigned char mode;
 
-	if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0)
+	if (libbpf_nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0)
 		return -1;
 
 	if (!tb[IFLA_XDP_ATTACHED])
 		return 0;
 
-	mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]);
+	mode = libbpf_nla_getattr_u8(tb[IFLA_XDP_ATTACHED]);
 	if (mode == XDP_ATTACHED_NONE)
 		return 0;
 
@@ -75,14 +75,14 @@ int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb)
 		return 0;
 
 	return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index,
-			       nla_getattr_str(tb[IFLA_IFNAME]));
+			       libbpf_nla_getattr_str(tb[IFLA_IFNAME]));
 }
 
 static int do_bpf_dump_one_act(struct nlattr *attr)
 {
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
 
-	if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0)
+	if (libbpf_nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	if (!tb[TCA_ACT_BPF_PARMS])
@@ -91,10 +91,10 @@ static int do_bpf_dump_one_act(struct nlattr *attr)
 	NET_START_OBJECT_NESTED2;
 	if (tb[TCA_ACT_BPF_NAME])
 		NET_DUMP_STR("name", "%s",
-			     nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
+			     libbpf_nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
 	if (tb[TCA_ACT_BPF_ID])
 		NET_DUMP_UINT("id", " id %u",
-			      nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
+			      libbpf_nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
 	NET_END_OBJECT_NESTED;
 	return 0;
 }
@@ -106,10 +106,11 @@ static int do_dump_one_act(struct nlattr *attr)
 	if (!attr)
 		return 0;
 
-	if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0)
+	if (libbpf_nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
-	if (tb[TCA_ACT_KIND] && strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") == 0)
+	if (tb[TCA_ACT_KIND] &&
+	    strcmp(libbpf_nla_data(tb[TCA_ACT_KIND]), "bpf") == 0)
 		return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]);
 
 	return 0;
@@ -120,7 +121,7 @@ static int do_bpf_act_dump(struct nlattr *attr)
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	int act, ret;
 
-	if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
+	if (libbpf_nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	NET_START_ARRAY("act", " %s [");
@@ -139,13 +140,15 @@ static int do_bpf_filter_dump(struct nlattr *attr)
 	struct nlattr *tb[TCA_BPF_MAX + 1];
 	int ret;
 
-	if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
+	if (libbpf_nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	if (tb[TCA_BPF_NAME])
-		NET_DUMP_STR("name", " %s", nla_getattr_str(tb[TCA_BPF_NAME]));
+		NET_DUMP_STR("name", " %s",
+			     libbpf_nla_getattr_str(tb[TCA_BPF_NAME]));
 	if (tb[TCA_BPF_ID])
-		NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[TCA_BPF_ID]));
+		NET_DUMP_UINT("id", " id %u",
+			      libbpf_nla_getattr_u32(tb[TCA_BPF_ID]));
 	if (tb[TCA_BPF_ACT]) {
 		ret = do_bpf_act_dump(tb[TCA_BPF_ACT]);
 		if (ret)
@@ -160,7 +163,8 @@ int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind,
 {
 	int ret = 0;
 
-	if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) {
+	if (tb[TCA_OPTIONS] &&
+	    strcmp(libbpf_nla_data(tb[TCA_KIND]), "bpf") == 0) {
 		NET_START_OBJECT;
 		if (devname[0] != '\0')
 			NET_DUMP_STR("devname", "%s", devname);
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 506bdfdbcab0..2d2edbbd8ae8 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -103,7 +103,7 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 				if (!err->error)
 					continue;
 				ret = err->error;
-				nla_dump_errormsg(nh);
+				libbpf_nla_dump_errormsg(nh);
 				goto done;
 			case NLMSG_DONE:
 				return 0;
@@ -190,7 +190,7 @@ static int __dump_link_nlmsg(struct nlmsghdr *nlh,
 
 	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
 	attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi)));
-	if (nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0)
+	if (libbpf_nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	return dump_link_nlmsg(cookie, ifi, tb);
@@ -228,7 +228,7 @@ static int __dump_class_nlmsg(struct nlmsghdr *nlh,
 
 	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
 	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
-	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+	if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	return dump_class_nlmsg(cookie, t, tb);
@@ -267,7 +267,7 @@ static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
 
 	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
 	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
-	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+	if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	return dump_qdisc_nlmsg(cookie, t, tb);
@@ -306,7 +306,7 @@ static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
 
 	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
 	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
-	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+	if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
 		return -LIBBPF_ERRNO__NLPARSE;
 
 	return dump_filter_nlmsg(cookie, t, tb);
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index 49f514119bdb..e52257a7367a 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -17,13 +17,13 @@
 #include <string.h>
 #include <stdio.h>
 
-static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
-	[NLA_U8]	= sizeof(uint8_t),
-	[NLA_U16]	= sizeof(uint16_t),
-	[NLA_U32]	= sizeof(uint32_t),
-	[NLA_U64]	= sizeof(uint64_t),
-	[NLA_STRING]	= 1,
-	[NLA_FLAG]	= 0,
+static uint16_t nla_attr_minlen[LIBBPF_NLA_TYPE_MAX+1] = {
+	[LIBBPF_NLA_U8]		= sizeof(uint8_t),
+	[LIBBPF_NLA_U16]	= sizeof(uint16_t),
+	[LIBBPF_NLA_U32]	= sizeof(uint32_t),
+	[LIBBPF_NLA_U64]	= sizeof(uint64_t),
+	[LIBBPF_NLA_STRING]	= 1,
+	[LIBBPF_NLA_FLAG]	= 0,
 };
 
 static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
@@ -47,9 +47,9 @@ static int nla_type(const struct nlattr *nla)
 }
 
 static int validate_nla(struct nlattr *nla, int maxtype,
-			struct nla_policy *policy)
+			struct libbpf_nla_policy *policy)
 {
-	struct nla_policy *pt;
+	struct libbpf_nla_policy *pt;
 	unsigned int minlen = 0;
 	int type = nla_type(nla);
 
@@ -58,23 +58,24 @@ static int validate_nla(struct nlattr *nla, int maxtype,
 
 	pt = &policy[type];
 
-	if (pt->type > NLA_TYPE_MAX)
+	if (pt->type > LIBBPF_NLA_TYPE_MAX)
 		return 0;
 
 	if (pt->minlen)
 		minlen = pt->minlen;
-	else if (pt->type != NLA_UNSPEC)
+	else if (pt->type != LIBBPF_NLA_UNSPEC)
 		minlen = nla_attr_minlen[pt->type];
 
-	if (nla_len(nla) < minlen)
+	if (libbpf_nla_len(nla) < minlen)
 		return -1;
 
-	if (pt->maxlen && nla_len(nla) > pt->maxlen)
+	if (pt->maxlen && libbpf_nla_len(nla) > pt->maxlen)
 		return -1;
 
-	if (pt->type == NLA_STRING) {
-		char *data = nla_data(nla);
-		if (data[nla_len(nla) - 1] != '\0')
+	if (pt->type == LIBBPF_NLA_STRING) {
+		char *data = libbpf_nla_data(nla);
+
+		if (data[libbpf_nla_len(nla) - 1] != '\0')
 			return -1;
 	}
 
@@ -104,15 +105,15 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
  * @see nla_validate
  * @return 0 on success or a negative error code.
  */
-int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
-	      struct nla_policy *policy)
+int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head,
+		     int len, struct libbpf_nla_policy *policy)
 {
 	struct nlattr *nla;
 	int rem, err;
 
 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 
-	nla_for_each_attr(nla, head, len, rem) {
+	libbpf_nla_for_each_attr(nla, head, len, rem) {
 		int type = nla_type(nla);
 
 		if (type > maxtype)
@@ -144,23 +145,25 @@ errout:
  * @arg policy          Attribute validation policy.
  *
  * Feeds the stream of attributes nested into the specified attribute
- * to nla_parse().
+ * to libbpf_nla_parse().
  *
- * @see nla_parse
+ * @see libbpf_nla_parse
  * @return 0 on success or a negative error code.
  */
-int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
-		     struct nla_policy *policy)
+int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype,
+			    struct nlattr *nla,
+			    struct libbpf_nla_policy *policy)
 {
-	return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy);
+	return libbpf_nla_parse(tb, maxtype, libbpf_nla_data(nla),
+				libbpf_nla_len(nla), policy);
 }
 
 /* dump netlink extended ack error message */
-int nla_dump_errormsg(struct nlmsghdr *nlh)
+int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh)
 {
-	struct nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = {
-		[NLMSGERR_ATTR_MSG]	= { .type = NLA_STRING },
-		[NLMSGERR_ATTR_OFFS]	= { .type = NLA_U32 },
+	struct libbpf_nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = {
+		[NLMSGERR_ATTR_MSG]	= { .type = LIBBPF_NLA_STRING },
+		[NLMSGERR_ATTR_OFFS]	= { .type = LIBBPF_NLA_U32 },
 	};
 	struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr;
 	struct nlmsgerr *err;
@@ -181,14 +184,15 @@ int nla_dump_errormsg(struct nlmsghdr *nlh)
 	attr = (struct nlattr *) ((void *) err + hlen);
 	alen = nlh->nlmsg_len - hlen;
 
-	if (nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, extack_policy) != 0) {
+	if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen,
+			     extack_policy) != 0) {
 		fprintf(stderr,
 			"Failed to parse extended error attributes\n");
 		return 0;
 	}
 
 	if (tb[NLMSGERR_ATTR_MSG])
-		errmsg = (char *) nla_data(tb[NLMSGERR_ATTR_MSG]);
+		errmsg = (char *) libbpf_nla_data(tb[NLMSGERR_ATTR_MSG]);
 
 	fprintf(stderr, "Kernel error message: %s\n", errmsg);
 
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index a6e2396bce7c..755a3312c87f 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -23,19 +23,19 @@
  * Standard attribute types to specify validation policy
  */
 enum {
-	NLA_UNSPEC,	/**< Unspecified type, binary data chunk */
-	NLA_U8,		/**< 8 bit integer */
-	NLA_U16,	/**< 16 bit integer */
-	NLA_U32,	/**< 32 bit integer */
-	NLA_U64,	/**< 64 bit integer */
-	NLA_STRING,	/**< NUL terminated character string */
-	NLA_FLAG,	/**< Flag */
-	NLA_MSECS,	/**< Micro seconds (64bit) */
-	NLA_NESTED,	/**< Nested attributes */
-	__NLA_TYPE_MAX,
+	LIBBPF_NLA_UNSPEC,	/**< Unspecified type, binary data chunk */
+	LIBBPF_NLA_U8,		/**< 8 bit integer */
+	LIBBPF_NLA_U16,		/**< 16 bit integer */
+	LIBBPF_NLA_U32,		/**< 32 bit integer */
+	LIBBPF_NLA_U64,		/**< 64 bit integer */
+	LIBBPF_NLA_STRING,	/**< NUL terminated character string */
+	LIBBPF_NLA_FLAG,	/**< Flag */
+	LIBBPF_NLA_MSECS,	/**< Micro seconds (64bit) */
+	LIBBPF_NLA_NESTED,	/**< Nested attributes */
+	__LIBBPF_NLA_TYPE_MAX,
 };
 
-#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)
+#define LIBBPF_NLA_TYPE_MAX (__LIBBPF_NLA_TYPE_MAX - 1)
 
 /**
  * @ingroup attr
@@ -43,8 +43,8 @@ enum {
  *
  * See section @core_doc{core_attr_parse,Attribute Parsing} for more details.
  */
-struct nla_policy {
-	/** Type of attribute or NLA_UNSPEC */
+struct libbpf_nla_policy {
+	/** Type of attribute or LIBBPF_NLA_UNSPEC */
 	uint16_t	type;
 
 	/** Minimal length of payload required */
@@ -62,49 +62,50 @@ struct nla_policy {
  * @arg len	length of attribute stream
  * @arg rem	initialized to len, holds bytes currently remaining in stream
  */
-#define nla_for_each_attr(pos, head, len, rem) \
+#define libbpf_nla_for_each_attr(pos, head, len, rem) \
 	for (pos = head, rem = len; \
 	     nla_ok(pos, rem); \
 	     pos = nla_next(pos, &(rem)))
 
 /**
- * nla_data - head of payload
+ * libbpf_nla_data - head of payload
  * @nla: netlink attribute
  */
-static inline void *nla_data(const struct nlattr *nla)
+static inline void *libbpf_nla_data(const struct nlattr *nla)
 {
 	return (char *) nla + NLA_HDRLEN;
 }
 
-static inline uint8_t nla_getattr_u8(const struct nlattr *nla)
+static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla)
 {
-	return *(uint8_t *)nla_data(nla);
+	return *(uint8_t *)libbpf_nla_data(nla);
 }
 
-static inline uint32_t nla_getattr_u32(const struct nlattr *nla)
+static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla)
 {
-	return *(uint32_t *)nla_data(nla);
+	return *(uint32_t *)libbpf_nla_data(nla);
 }
 
-static inline const char *nla_getattr_str(const struct nlattr *nla)
+static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla)
 {
-	return (const char *)nla_data(nla);
+	return (const char *)libbpf_nla_data(nla);
 }
 
 /**
- * nla_len - length of payload
+ * libbpf_nla_len - length of payload
  * @nla: netlink attribute
  */
-static inline int nla_len(const struct nlattr *nla)
+static inline int libbpf_nla_len(const struct nlattr *nla)
 {
 	return nla->nla_len - NLA_HDRLEN;
 }
 
-int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
-	      struct nla_policy *policy);
-int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
-		     struct nla_policy *policy);
+int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head,
+		     int len, struct libbpf_nla_policy *policy);
+int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype,
+			    struct nlattr *nla,
+			    struct libbpf_nla_policy *policy);
 
-int nla_dump_errormsg(struct nlmsghdr *nlh);
+int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh);
 
 #endif /* __NLATTR_H */

From 24d6a8084259e22acf9a80d4de3606a031f42f6d Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 3 Oct 2018 15:26:41 -0700
Subject: [PATCH 39/64] libbpf: Consistent prefixes for interfaces in
 str_error.h.

libbpf is used more and more outside kernel tree. That means the library
should follow good practices in library design and implementation to
play well with third party code that uses it.

One of such practices is to have a common prefix (or a few) for every
interface, function or data structure, library provides. I helps to
avoid name conflicts with other libraries and keeps API consistent.

Inconsistent names in libbpf already cause problems in real life. E.g.
an application can't use both libbpf and libnl due to conflicting
symbols.

Having common prefix will help to fix current and avoid future problems.

libbpf already uses the following prefixes for its interfaces:
* bpf_ for bpf system call wrappers, program/map/elf-object
  abstractions and a few other things;
* btf_ for BTF related API;
* libbpf_ for everything else.

The patch renames function in str_error.h to have libbpf_ prefix since it
misses one and doesn't fit well into the first two categories.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c    | 20 +++++++++++---------
 tools/lib/bpf/str_error.c |  2 +-
 tools/lib/bpf/str_error.h |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9e68fd9fcfca..02888d36b805 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -470,7 +470,8 @@ static int bpf_object__elf_init(struct bpf_object *obj)
 		obj->efile.fd = open(obj->path, O_RDONLY);
 		if (obj->efile.fd < 0) {
 			char errmsg[STRERR_BUFSIZE];
-			char *cp = str_error(errno, errmsg, sizeof(errmsg));
+			char *cp = libbpf_strerror_r(errno, errmsg,
+						     sizeof(errmsg));
 
 			pr_warning("failed to open %s: %s\n", obj->path, cp);
 			return -errno;
@@ -811,7 +812,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 						      data->d_size, name, idx);
 			if (err) {
 				char errmsg[STRERR_BUFSIZE];
-				char *cp = str_error(-err, errmsg, sizeof(errmsg));
+				char *cp = libbpf_strerror_r(-err, errmsg,
+							     sizeof(errmsg));
 
 				pr_warning("failed to alloc program %s (%s): %s",
 					   name, obj->path, cp);
@@ -1140,7 +1142,7 @@ bpf_object__create_maps(struct bpf_object *obj)
 
 		*pfd = bpf_create_map_xattr(&create_attr);
 		if (*pfd < 0 && create_attr.btf_key_type_id) {
-			cp = str_error(errno, errmsg, sizeof(errmsg));
+			cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 			pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n",
 				   map->name, cp, errno);
 			create_attr.btf_fd = 0;
@@ -1155,7 +1157,7 @@ bpf_object__create_maps(struct bpf_object *obj)
 			size_t j;
 
 			err = *pfd;
-			cp = str_error(errno, errmsg, sizeof(errmsg));
+			cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 			pr_warning("failed to create map (name: '%s'): %s\n",
 				   map->name, cp);
 			for (j = 0; j < i; j++)
@@ -1339,7 +1341,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	}
 
 	ret = -LIBBPF_ERRNO__LOAD;
-	cp = str_error(errno, errmsg, sizeof(errmsg));
+	cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 	pr_warning("load bpf program failed: %s\n", cp);
 
 	if (log_buf && log_buf[0] != '\0') {
@@ -1655,7 +1657,7 @@ static int check_path(const char *path)
 
 	dir = dirname(dname);
 	if (statfs(dir, &st_fs)) {
-		cp = str_error(errno, errmsg, sizeof(errmsg));
+		cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 		pr_warning("failed to statfs %s: %s\n", dir, cp);
 		err = -errno;
 	}
@@ -1691,7 +1693,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
 	}
 
 	if (bpf_obj_pin(prog->instances.fds[instance], path)) {
-		cp = str_error(errno, errmsg, sizeof(errmsg));
+		cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 		pr_warning("failed to pin program: %s\n", cp);
 		return -errno;
 	}
@@ -1709,7 +1711,7 @@ static int make_dir(const char *path)
 		err = -errno;
 
 	if (err) {
-		cp = str_error(-err, errmsg, sizeof(errmsg));
+		cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg));
 		pr_warning("failed to mkdir %s: %s\n", path, cp);
 	}
 	return err;
@@ -1771,7 +1773,7 @@ int bpf_map__pin(struct bpf_map *map, const char *path)
 	}
 
 	if (bpf_obj_pin(map->fd, path)) {
-		cp = str_error(errno, errmsg, sizeof(errmsg));
+		cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg));
 		pr_warning("failed to pin map: %s\n", cp);
 		return -errno;
 	}
diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c
index b8798114a357..3d211b642cb5 100644
--- a/tools/lib/bpf/str_error.c
+++ b/tools/lib/bpf/str_error.c
@@ -9,7 +9,7 @@
  * libc, while checking strerror_r() return to avoid having to check this in
  * all places calling it.
  */
-char *str_error(int err, char *dst, int len)
+char *libbpf_strerror_r(int err, char *dst, int len)
 {
 	int ret = strerror_r(err, dst, len);
 	if (ret)
diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h
index 355b1db571d1..998eff7d6710 100644
--- a/tools/lib/bpf/str_error.h
+++ b/tools/lib/bpf/str_error.h
@@ -2,5 +2,5 @@
 #ifndef BPF_STR_ERROR
 #define BPF_STR_ERROR
 
-char *str_error(int err, char *dst, int len);
+char *libbpf_strerror_r(int err, char *dst, int len);
 #endif // BPF_STR_ERROR

From eff8190880c005e754d7d1bd315fc53a3ae9f876 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 3 Oct 2018 15:26:42 -0700
Subject: [PATCH 40/64] libbpf: Make include guards consistent

Rename include guards to have consistent names "__LIBBPF_<header_name>".

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/bpf.h       | 6 +++---
 tools/lib/bpf/btf.h       | 6 +++---
 tools/lib/bpf/libbpf.h    | 6 +++---
 tools/lib/bpf/nlattr.h    | 6 +++---
 tools/lib/bpf/str_error.h | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 6f38164b2618..4c78f61b7c71 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -20,8 +20,8 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not,  see <http://www.gnu.org/licenses>
  */
-#ifndef __BPF_BPF_H
-#define __BPF_BPF_H
+#ifndef __LIBBPF_BPF_H
+#define __LIBBPF_BPF_H
 
 #include <linux/bpf.h>
 #include <stdbool.h>
@@ -111,4 +111,4 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
 		      __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
 		      __u64 *probe_addr);
-#endif
+#endif /* __LIBBPF_BPF_H */
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 4897e0724d4e..d5d20682eeb6 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: LGPL-2.1 */
 /* Copyright (c) 2018 Facebook */
 
-#ifndef __BPF_BTF_H
-#define __BPF_BTF_H
+#ifndef __LIBBPF_BTF_H
+#define __LIBBPF_BTF_H
 
 #include <linux/types.h>
 
@@ -23,4 +23,4 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id);
 int btf__fd(const struct btf *btf);
 const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
 
-#endif
+#endif /* __LIBBPF_BTF_H */
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 710ff5724980..28f83dd6022b 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -20,8 +20,8 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this program; if not,  see <http://www.gnu.org/licenses>
  */
-#ifndef __BPF_LIBBPF_H
-#define __BPF_LIBBPF_H
+#ifndef __LIBBPF_LIBBPF_H
+#define __LIBBPF_LIBBPF_H
 
 #include <stdio.h>
 #include <stdint.h>
@@ -315,4 +315,4 @@ int libbpf_nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
 			libbpf_dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
 int libbpf_nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
 			 libbpf_dump_nlmsg_t dump_filter_nlmsg, void *cookie);
-#endif
+#endif /* __LIBBPF_LIBBPF_H */
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index 755a3312c87f..7198584a3040 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -11,8 +11,8 @@
  * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch>
  */
 
-#ifndef __NLATTR_H
-#define __NLATTR_H
+#ifndef __LIBBPF_NLATTR_H
+#define __LIBBPF_NLATTR_H
 
 #include <stdint.h>
 #include <linux/netlink.h>
@@ -108,4 +108,4 @@ int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype,
 
 int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh);
 
-#endif /* __NLATTR_H */
+#endif /* __LIBBPF_NLATTR_H */
diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h
index 998eff7d6710..b9157f5eebde 100644
--- a/tools/lib/bpf/str_error.h
+++ b/tools/lib/bpf/str_error.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: LGPL-2.1
-#ifndef BPF_STR_ERROR
-#define BPF_STR_ERROR
+#ifndef __LIBBPF_STR_ERROR_H
+#define __LIBBPF_STR_ERROR_H
 
 char *libbpf_strerror_r(int err, char *dst, int len);
-#endif // BPF_STR_ERROR
+#endif /* __LIBBPF_STR_ERROR_H */

From e5b0863c2064f2d40de9de4862317f9db4ccffff Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 3 Oct 2018 15:26:43 -0700
Subject: [PATCH 41/64] libbpf: Use __u32 instead of u32 in bpf_program__load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make bpf_program__load consistent with other interfaces: use __u32
instead of u32. That in turn fixes build of samples:

In file included from ./samples/bpf/trace_output_user.c:21:0:
./tools/lib/bpf/libbpf.h:132:9: error: unknown type name ‘u32’
         u32 kern_version);
         ^

Fixes: commit 29cd77f41620d ("libbpf: Support loading individual progs")
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 2 +-
 tools/lib/bpf/libbpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 02888d36b805..85de1ebd4cb0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1379,7 +1379,7 @@ out:
 
 int
 bpf_program__load(struct bpf_program *prog,
-		  char *license, u32 kern_version)
+		  char *license, __u32 kern_version)
 {
 	int err = 0, fd, i;
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 28f83dd6022b..fbfc2aec0f0d 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -129,7 +129,7 @@ void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex);
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
 int bpf_program__load(struct bpf_program *prog, char *license,
-		      u32 kern_version);
+		      __u32 kern_version);
 int bpf_program__fd(struct bpf_program *prog);
 int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
 			      int instance);

From 20cdeb5408da5149c39d577c91ad6f33c73d1b05 Mon Sep 17 00:00:00 2001
From: Bo YU <tsu.yubo@gmail.com>
Date: Wed, 3 Oct 2018 10:02:50 -0400
Subject: [PATCH 42/64] bpf, tracex3_user: erase "ARRAY_SIZE" redefined

There is a warning when compiling bpf sample programs in sample/bpf:

  make -C /home/foo/bpf/samples/bpf/../../tools/lib/bpf/ RM='rm -rf' LDFLAGS= srctree=/home/foo/bpf/samples/bpf/../../ O=
    HOSTCC  /home/foo/bpf/samples/bpf/tracex3_user.o
  /home/foo/bpf/samples/bpf/tracex3_user.c:20:0: warning: "ARRAY_SIZE" redefined
   #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

  In file included from /home/foo/bpf/samples/bpf/tracex3_user.c:18:0:
  ./tools/testing/selftests/bpf/bpf_util.h:48:0: note: this is the location of the previous definition
   # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

Signed-off-by: Bo YU <tsu.yubo@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/tracex3_user.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
index 6c6b10f4c3ee..56466d010139 100644
--- a/samples/bpf/tracex3_user.c
+++ b/samples/bpf/tracex3_user.c
@@ -17,8 +17,6 @@
 #include "bpf_load.h"
 #include "bpf_util.h"
 
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
-
 #define SLOTS 100
 
 static void clear_stats(int fd)

From 7ccc4f1887518a392752ad44c014fd6e1f526da7 Mon Sep 17 00:00:00 2001
From: Konrad Djimeli <kdjimeli@igalia.com>
Date: Thu, 4 Oct 2018 18:01:32 +0100
Subject: [PATCH 43/64] bpf: typo fix in Documentation/networking/af_xdp.rst
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a simple typo: Completetion -> Completion

Signed-off-by: Konrad Djimeli <kdjimeli@igalia.com>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/af_xdp.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index ff929cfab4f4..4ae4f9d8f8fe 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -159,8 +159,8 @@ log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
 and 3000 refers to the same chunk.
 
 
-UMEM Completetion Ring
-~~~~~~~~~~~~~~~~~~~~~~
+UMEM Completion Ring
+~~~~~~~~~~~~~~~~~~~~
 
 The Completion Ring is used transfer ownership of UMEM frames from
 kernel-space to user-space. Just like the Fill ring, UMEM indicies are

From 661b8d1b0e3a745e25f05adef2ebd00d830eeea7 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 1 Oct 2018 14:51:33 +0200
Subject: [PATCH 44/64] net: add umem reference in netdev{_rx}_queue

These references to the umem will be used to store information
on what kind of AF_XDP umem that is bound to a queue id, if any.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1cbbf77a685f..8318f79586c2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -609,6 +609,9 @@ struct netdev_queue {
 
 	/* Subordinate device that the queue has been assigned to */
 	struct net_device	*sb_dev;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xdp_umem         *umem;
+#endif
 /*
  * write-mostly part
  */
@@ -738,6 +741,9 @@ struct netdev_rx_queue {
 	struct kobject			kobj;
 	struct net_device		*dev;
 	struct xdp_rxq_info		xdp_rxq;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xdp_umem                 *umem;
+#endif
 } ____cacheline_aligned_in_smp;
 
 /*

From c9b47cc1fabca533fb8ebaeb417778036c1ce27d Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 1 Oct 2018 14:51:34 +0200
Subject: [PATCH 45/64] xsk: fix bug when trying to use both copy and zero-copy
 on one queue id

Previously, the xsk code did not record which umem was bound to a
specific queue id. This was not required if all drivers were zero-copy
enabled as this had to be recorded in the driver anyway. So if a user
tried to bind two umems to the same queue, the driver would say
no. But if copy-mode was first enabled and then zero-copy mode (or the
reverse order), we mistakenly enabled both of them on the same umem
leading to buggy behavior. The main culprit for this is that we did
not store the association of umem to queue id in the copy case and
only relied on the driver reporting this. As this relation was not
stored in the driver for copy mode (it does not rely on the AF_XDP
NDOs), this obviously could not work.

This patch fixes the problem by always recording the umem to queue id
relationship in the netdev_queue and netdev_rx_queue structs. This way
we always know what kind of umem has been bound to a queue id and can
act appropriately at bind time.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 92 ++++++++++++++++++++++++++++++++--------------
 net/xdp/xdp_umem.h |  2 +-
 net/xdp/xsk.c      |  7 ----
 3 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 555427b3e0fe..4d6c6652f5d1 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -42,27 +42,47 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
 	}
 }
 
-int xdp_umem_query(struct net_device *dev, u16 queue_id)
+/* The umem is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
+				u16 queue_id)
 {
-	struct netdev_bpf bpf;
+	if (queue_id < dev->real_num_rx_queues)
+		dev->_rx[queue_id].umem = umem;
+	if (queue_id < dev->real_num_tx_queues)
+		dev->_tx[queue_id].umem = umem;
+}
 
-	ASSERT_RTNL();
+static struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+					      u16 queue_id)
+{
+	if (queue_id < dev->real_num_rx_queues)
+		return dev->_rx[queue_id].umem;
+	if (queue_id < dev->real_num_tx_queues)
+		return dev->_tx[queue_id].umem;
 
-	memset(&bpf, 0, sizeof(bpf));
-	bpf.command = XDP_QUERY_XSK_UMEM;
-	bpf.xsk.queue_id = queue_id;
+	return NULL;
+}
 
-	if (!dev->netdev_ops->ndo_bpf)
-		return 0;
-	return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
+static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
+{
+	/* Zero out the entry independent on how many queues are configured
+	 * at this point in time, as it might be used in the future.
+	 */
+	if (queue_id < dev->num_rx_queues)
+		dev->_rx[queue_id].umem = NULL;
+	if (queue_id < dev->num_tx_queues)
+		dev->_tx[queue_id].umem = NULL;
 }
 
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
-			u32 queue_id, u16 flags)
+			u16 queue_id, u16 flags)
 {
 	bool force_zc, force_copy;
 	struct netdev_bpf bpf;
-	int err;
+	int err = 0;
 
 	force_zc = flags & XDP_ZEROCOPY;
 	force_copy = flags & XDP_COPY;
@@ -70,17 +90,23 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	if (force_zc && force_copy)
 		return -EINVAL;
 
-	if (force_copy)
-		return 0;
-
-	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
-		return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */
-
 	rtnl_lock();
-	err = xdp_umem_query(dev, queue_id);
-	if (err) {
-		err = err < 0 ? -EOPNOTSUPP : -EBUSY;
-		goto err_rtnl_unlock;
+	if (xdp_get_umem_from_qid(dev, queue_id)) {
+		err = -EBUSY;
+		goto out_rtnl_unlock;
+	}
+
+	xdp_reg_umem_at_qid(dev, umem, queue_id);
+	umem->dev = dev;
+	umem->queue_id = queue_id;
+	if (force_copy)
+		/* For copy-mode, we are done. */
+		goto out_rtnl_unlock;
+
+	if (!dev->netdev_ops->ndo_bpf ||
+	    !dev->netdev_ops->ndo_xsk_async_xmit) {
+		err = -EOPNOTSUPP;
+		goto err_unreg_umem;
 	}
 
 	bpf.command = XDP_SETUP_XSK_UMEM;
@@ -89,18 +115,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 
 	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
 	if (err)
-		goto err_rtnl_unlock;
+		goto err_unreg_umem;
 	rtnl_unlock();
 
 	dev_hold(dev);
-	umem->dev = dev;
-	umem->queue_id = queue_id;
 	umem->zc = true;
 	return 0;
 
-err_rtnl_unlock:
+err_unreg_umem:
+	xdp_clear_umem_at_qid(dev, queue_id);
+	if (!force_zc)
+		err = 0; /* fallback to copy mode */
+out_rtnl_unlock:
 	rtnl_unlock();
-	return force_zc ? err : 0; /* fail or fallback */
+	return err;
 }
 
 static void xdp_umem_clear_dev(struct xdp_umem *umem)
@@ -108,7 +136,7 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
 	struct netdev_bpf bpf;
 	int err;
 
-	if (umem->dev) {
+	if (umem->zc) {
 		bpf.command = XDP_SETUP_XSK_UMEM;
 		bpf.xsk.umem = NULL;
 		bpf.xsk.queue_id = umem->queue_id;
@@ -119,9 +147,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
 
 		if (err)
 			WARN(1, "failed to disable umem!\n");
+	}
 
+	if (umem->dev) {
+		rtnl_lock();
+		xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
+		rtnl_unlock();
+	}
+
+	if (umem->zc) {
 		dev_put(umem->dev);
-		umem->dev = NULL;
+		umem->zc = false;
 	}
 }
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index c8be1ad3eb88..27603227601b 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -9,7 +9,7 @@
 #include <net/xdp_sock.h>
 
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
-			u32 queue_id, u16 flags);
+			u16 queue_id, u16 flags);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 5a432dfee4ee..caeddad15b7c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -419,13 +419,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	}
 
 	qid = sxdp->sxdp_queue_id;
-
-	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
-	    (xs->tx && qid >= dev->real_num_tx_queues)) {
-		err = -EINVAL;
-		goto out_unlock;
-	}
-
 	flags = sxdp->sxdp_flags;
 
 	if (flags & XDP_SHARED_UMEM) {

From b8c8a2e2e3758be93959587e6b19aadbb0ce7275 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 1 Oct 2018 14:51:35 +0200
Subject: [PATCH 46/64] ethtool: rename local variable max -> curr

ethtool_set_channels() validates the config against driver's max
settings. It retrieves the current config and stores it in a
variable called max. This was okay when only max settings were
accessed but we will soon want to access current settings as
well, so calling the entire structure max makes the code less
readable.

While at it drop unnecessary parenthesis.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/ethtool.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 96afc55aa61e..9a648fb09594 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1655,7 +1655,7 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
 static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 						   void __user *useraddr)
 {
-	struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
+	struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
 	u32 max_rx_in_use = 0;
 
 	if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
@@ -1664,13 +1664,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	if (copy_from_user(&channels, useraddr, sizeof(channels)))
 		return -EFAULT;
 
-	dev->ethtool_ops->get_channels(dev, &max);
+	dev->ethtool_ops->get_channels(dev, &curr);
 
 	/* ensure new counts are within the maximums */
-	if ((channels.rx_count > max.max_rx) ||
-	    (channels.tx_count > max.max_tx) ||
-	    (channels.combined_count > max.max_combined) ||
-	    (channels.other_count > max.max_other))
+	if (channels.rx_count > curr.max_rx ||
+	    channels.tx_count > curr.max_tx ||
+	    channels.combined_count > curr.max_combined ||
+	    channels.other_count > curr.max_other)
 		return -EINVAL;
 
 	/* ensure the new Rx count fits within the configured Rx flow

From 1661d346628115c364e2b7d5b15a64ca3bd0dbd4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 1 Oct 2018 14:51:36 +0200
Subject: [PATCH 47/64] ethtool: don't allow disabling queues with umem
 installed

We already check the RSS indirection table does not use queues which
would be disabled by channel reconfiguration. Make sure user does not
try to disable queues which have a UMEM and zero-copy AF_XDP socket
installed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  7 +++++++
 net/core/ethtool.c     | 11 +++++++++++
 net/xdp/xdp_umem.c     |  4 ++--
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 70a115bea4f4..13acb9803a6d 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -86,6 +86,7 @@ struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
 struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
 					  struct xdp_umem_fq_reuse *newq);
 void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -183,6 +184,12 @@ static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
 {
 }
 
+static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+						     u16 queue_id)
+{
+	return NULL;
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
 	return NULL;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9a648fb09594..5a788adeba0b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -27,6 +27,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/sched/signal.h>
 #include <linux/net.h>
+#include <net/xdp_sock.h>
 
 /*
  * Some useful ethtool_ops methods that're device independent.
@@ -1656,7 +1657,9 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 						   void __user *useraddr)
 {
 	struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
+	u16 from_channel, to_channel;
 	u32 max_rx_in_use = 0;
+	unsigned int i;
 
 	if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
 		return -EOPNOTSUPP;
@@ -1680,6 +1683,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	    (channels.combined_count + channels.rx_count) <= max_rx_in_use)
 	    return -EINVAL;
 
+	/* Disabling channels, query zero-copy AF_XDP sockets */
+	from_channel = channels.combined_count +
+		min(channels.rx_count, channels.tx_count);
+	to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
+	for (i = from_channel; i < to_channel; i++)
+		if (xdp_get_umem_from_qid(dev, i))
+			return -EINVAL;
+
 	return dev->ethtool_ops->set_channels(dev, &channels);
 }
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 4d6c6652f5d1..773326f682b1 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -55,8 +55,8 @@ static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
 		dev->_tx[queue_id].umem = umem;
 }
 
-static struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
-					      u16 queue_id)
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+				       u16 queue_id)
 {
 	if (queue_id < dev->real_num_rx_queues)
 		return dev->_rx[queue_id].umem;

From a41b4f3c58dddcb39b7072a97ac8cb9036ce53f6 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 1 Oct 2018 14:51:37 +0200
Subject: [PATCH 48/64] xsk: simplify xdp_clear_umem_at_qid implementation

As we now do not allow ethtool to deactivate the queue id we are
running an AF_XDP socket on, we can simplify the implementation of
xdp_clear_umem_at_qid().

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 773326f682b1..c6007c58231c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -68,12 +68,9 @@ struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
 
 static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
 {
-	/* Zero out the entry independent on how many queues are configured
-	 * at this point in time, as it might be used in the future.
-	 */
-	if (queue_id < dev->num_rx_queues)
+	if (queue_id < dev->real_num_rx_queues)
 		dev->_rx[queue_id].umem = NULL;
-	if (queue_id < dev->num_tx_queues)
+	if (queue_id < dev->real_num_tx_queues)
 		dev->_tx[queue_id].umem = NULL;
 }
 

From 541d7fdd7694560404c502f64298a90ffe017e6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Fri, 5 Oct 2018 13:25:15 +0200
Subject: [PATCH 49/64] xsk: proper AF_XDP socket teardown ordering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AF_XDP socket struct can exist in three different, implicit
states: setup, bound and released. Setup is prior the socket has been
bound to a device. Bound is when the socket is active for receive and
send. Released is when the process/userspace side of the socket is
released, but the sock object is still lingering, e.g. when there is a
reference to the socket in an XSKMAP after process termination.

The Rx fast-path code uses the "dev" member of struct xdp_sock to
check whether a socket is bound or relased, and the Tx code uses the
struct xdp_umem "xsk_list" member in conjunction with "dev" to
determine the state of a socket.

However, the transition from bound to released did not tear the socket
down in correct order.

On the Rx side "dev" was cleared after synchronize_net() making the
synchronization useless. On the Tx side, the internal queues were
destroyed prior removing them from the "xsk_list".

This commit corrects the cleanup order, and by doing so
xdp_del_sk_umem() can be simplified and one synchronize_net() can be
removed.

Fixes: 965a99098443 ("xsk: add support for bind for Rx")
Fixes: ac98d8aab61b ("xsk: wire upp Tx zero-copy functions")
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 11 +++--------
 net/xdp/xsk.c      | 13 ++++++++-----
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index c6007c58231c..a264cf2accd0 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -32,14 +32,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
 {
 	unsigned long flags;
 
-	if (xs->dev) {
-		spin_lock_irqsave(&umem->xsk_list_lock, flags);
-		list_del_rcu(&xs->list);
-		spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
-
-		if (umem->zc)
-			synchronize_net();
-	}
+	spin_lock_irqsave(&umem->xsk_list_lock, flags);
+	list_del_rcu(&xs->list);
+	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
 }
 
 /* The umem is stored both in the _rx struct and the _tx struct as we do
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index caeddad15b7c..0577cd49aa72 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -355,12 +355,18 @@ static int xsk_release(struct socket *sock)
 	local_bh_enable();
 
 	if (xs->dev) {
+		struct net_device *dev = xs->dev;
+
 		/* Wait for driver to stop using the xdp socket. */
-		synchronize_net();
-		dev_put(xs->dev);
+		xdp_del_sk_umem(xs->umem, xs);
 		xs->dev = NULL;
+		synchronize_net();
+		dev_put(dev);
 	}
 
+	xskq_destroy(xs->rx);
+	xskq_destroy(xs->tx);
+
 	sock_orphan(sk);
 	sock->sk = NULL;
 
@@ -714,9 +720,6 @@ static void xsk_destruct(struct sock *sk)
 	if (!sock_flag(sk, SOCK_DEAD))
 		return;
 
-	xskq_destroy(xs->rx);
-	xskq_destroy(xs->tx);
-	xdp_del_sk_umem(xs->umem, xs);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);

From 1bc38b8ff6cc54b1cd925525c0a9ffa7eef4d23b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Fri, 5 Oct 2018 16:40:00 -0700
Subject: [PATCH 50/64] libbpf: relicense libbpf as LGPL-2.1 OR BSD-2-Clause
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

libbpf is maturing as a library and gaining features that no other bpf libraries support
(BPF Type Format, bpf to bpf calls, etc)
Many Apache2 licensed projects (like bcc, bpftrace, gobpf, cilium, etc)
would like to use libbpf, but cannot do this yet, since Apache Foundation explicitly
states that LGPL is incompatible with Apache2.
Hence let's relicense libbpf as dual license LGPL-2.1 or BSD-2-Clause,
since BSD-2 is compatible with Apache2.
Dual LGPL or Apache2 is invalid combination.
Fix license mistake in Makefile as well.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: David Beckett <david.beckett@netronome.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Joe Stringer <joe@ovn.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/Makefile       |  2 +-
 tools/lib/bpf/bpf.c          |  2 +-
 tools/lib/bpf/bpf.h          |  2 +-
 tools/lib/bpf/btf.c          |  2 +-
 tools/lib/bpf/btf.h          |  2 +-
 tools/lib/bpf/libbpf.c       | 15 +--------------
 tools/lib/bpf/libbpf.h       | 15 +--------------
 tools/lib/bpf/libbpf_errno.c | 15 +--------------
 tools/lib/bpf/netlink.c      |  2 +-
 tools/lib/bpf/nlattr.c       |  7 +------
 tools/lib/bpf/nlattr.h       |  7 +------
 tools/lib/bpf/str_error.c    |  2 +-
 tools/lib/bpf/str_error.h    |  2 +-
 13 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index d49902e818b5..6ad27257fd67 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 # Most of this file is copied from tools/lib/traceevent/Makefile
 
 BPF_VERSION = 0
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 3878a26a2071..d70a255cb05e 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 
 /*
  * common eBPF ELF operations.
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 4c78f61b7c71..87520a87a75f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: LGPL-2.1 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 
 /*
  * common eBPF ELF operations.
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index cf94b0770522..449591aa9900 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 /* Copyright (c) 2018 Facebook */
 
 #include <stdlib.h>
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index d5d20682eeb6..6db5462bb2ef 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: LGPL-2.1 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 /* Copyright (c) 2018 Facebook */
 
 #ifndef __LIBBPF_BTF_H
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 85de1ebd4cb0..ceb918c14d80 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 
 /*
  * Common eBPF ELF object loading operations.
@@ -7,19 +7,6 @@
  * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
  * Copyright (C) 2015 Huawei Inc.
  * Copyright (C) 2017 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License (not later!)
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not,  see <http://www.gnu.org/licenses>
  */
 
 #define _GNU_SOURCE
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index fbfc2aec0f0d..8af8d3663991 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: LGPL-2.1 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 
 /*
  * Common eBPF ELF object loading operations.
@@ -6,19 +6,6 @@
  * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
  * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
  * Copyright (C) 2015 Huawei Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License (not later!)
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not,  see <http://www.gnu.org/licenses>
  */
 #ifndef __LIBBPF_LIBBPF_H
 #define __LIBBPF_LIBBPF_H
diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index 2464ade3b326..d83b17f8435c 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -1,23 +1,10 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 
 /*
  * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
  * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
  * Copyright (C) 2015 Huawei Inc.
  * Copyright (C) 2017 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License (not later!)
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not,  see <http://www.gnu.org/licenses>
  */
 
 #include <stdio.h>
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index 2d2edbbd8ae8..0ce67aea8f3b 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 /* Copyright (c) 2018 Facebook */
 
 #include <stdlib.h>
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index e52257a7367a..1e69c0c8d413 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -1,13 +1,8 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 
 /*
  * NETLINK      Netlink attributes
  *
- *	This library is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU Lesser General Public
- *	License as published by the Free Software Foundation version 2.1
- *	of the License.
- *
  * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch>
  */
 
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index 7198584a3040..6cc3ac91690f 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -1,13 +1,8 @@
-/* SPDX-License-Identifier: LGPL-2.1 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 
 /*
  * NETLINK      Netlink attributes
  *
- *	This library is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU Lesser General Public
- *	License as published by the Free Software Foundation version 2.1
- *	of the License.
- *
  * Copyright (c) 2003-2013 Thomas Graf <tgraf@suug.ch>
  */
 
diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c
index 3d211b642cb5..00e48ac5b806 100644
--- a/tools/lib/bpf/str_error.c
+++ b/tools/lib/bpf/str_error.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 #undef _GNU_SOURCE
 #include <string.h>
 #include <stdio.h>
diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h
index b9157f5eebde..a139334d57b6 100644
--- a/tools/lib/bpf/str_error.h
+++ b/tools/lib/bpf/str_error.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 #ifndef __LIBBPF_STR_ERROR_H
 #define __LIBBPF_STR_ERROR_H
 

From 31ce8c4a1ad26852c91de900ebf6888a4d53fc1a Mon Sep 17 00:00:00 2001
From: Arthur Fabre <arthur@arthurfabre.com>
Date: Sun, 7 Oct 2018 09:45:19 +0100
Subject: [PATCH 51/64] bpf, doc: Document Jump X addressing mode

bpf_asm and the other classic BPF tools support jump conditions
comparing register A to register X, in addition to comparing
register A with constant K.

Only the latter was documented in filter.txt, add two new addressing
modes that describe the former.

Signed-off-by: Arthur Fabre <arthur@arthurfabre.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/filter.txt | 30 +++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 4443ce958862..2196b824e96c 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -203,11 +203,11 @@ opcodes as defined in linux/filter.h stand for:
 
   Instruction      Addressing mode      Description
 
-  ld               1, 2, 3, 4, 10       Load word into A
+  ld               1, 2, 3, 4, 12       Load word into A
   ldi              4                    Load word into A
   ldh              1, 2                 Load half-word into A
   ldb              1, 2                 Load byte into A
-  ldx              3, 4, 5, 10          Load word into X
+  ldx              3, 4, 5, 12          Load word into X
   ldxi             4                    Load word into X
   ldxb             5                    Load byte into X
 
@@ -216,14 +216,14 @@ opcodes as defined in linux/filter.h stand for:
 
   jmp              6                    Jump to label
   ja               6                    Jump to label
-  jeq              7, 8                 Jump on A == k
-  jneq             8                    Jump on A != k
-  jne              8                    Jump on A != k
-  jlt              8                    Jump on A <  k
-  jle              8                    Jump on A <= k
-  jgt              7, 8                 Jump on A >  k
-  jge              7, 8                 Jump on A >= k
-  jset             7, 8                 Jump on A &  k
+  jeq              7, 8, 9, 10          Jump on A == <x>
+  jneq             9, 10                Jump on A != <x>
+  jne              9, 10                Jump on A != <x>
+  jlt              9, 10                Jump on A <  <x>
+  jle              9, 10                Jump on A <= <x>
+  jgt              7, 8, 9, 10          Jump on A >  <x>
+  jge              7, 8, 9, 10          Jump on A >= <x>
+  jset             7, 8, 9, 10          Jump on A &  <x>
 
   add              0, 4                 A + <x>
   sub              0, 4                 A - <x>
@@ -240,7 +240,7 @@ opcodes as defined in linux/filter.h stand for:
   tax                                   Copy A into X
   txa                                   Copy X into A
 
-  ret              4, 9                 Return
+  ret              4, 11                Return
 
 The next table shows addressing formats from the 2nd column:
 
@@ -254,9 +254,11 @@ The next table shows addressing formats from the 2nd column:
    5               4*([k]&0xf)          Lower nibble * 4 at byte offset k in the packet
    6               L                    Jump label L
    7               #k,Lt,Lf             Jump to Lt if true, otherwise jump to Lf
-   8               #k,Lt                Jump to Lt if predicate is true
-   9               a/%a                 Accumulator A
-  10               extension            BPF extension
+   8               x/%x,Lt,Lf           Jump to Lt if true, otherwise jump to Lf
+   9               #k,Lt                Jump to Lt if predicate is true
+  10               x/%x,Lt              Jump to Lt if predicate is true
+  11               a/%a                 Accumulator A
+  12               extension            BPF extension
 
 The Linux kernel also has a couple of BPF extensions that are used along
 with the class of load instructions by "overloading" the k argument with

From c941ce9c282cc606e6517356fcc186a9da2b4ab9 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:47 +0100
Subject: [PATCH 52/64] bpf: add verifier callback to get stack usage info for
 offloaded progs

In preparation for BPF-to-BPF calls in offloaded programs, add a new
function attribute to the struct bpf_prog_offload_ops so that drivers
supporting eBPF offload can hook at the end of program verification, and
potentially extract information collected by the verifier.

Implement a minimal callback (returning 0) in the drivers providing the
structs, namely netdevsim and nfp.

This will be useful in the nfp driver, in later commits, to extract the
number of subprograms as well as the stack depth for those subprograms.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/bpf/verifier.c  |  8 +++++++-
 drivers/net/netdevsim/bpf.c                    |  8 +++++++-
 include/linux/bpf.h                            |  1 +
 include/linux/bpf_verifier.h                   |  1 +
 kernel/bpf/offload.c                           | 18 ++++++++++++++++++
 kernel/bpf/verifier.c                          |  3 +++
 6 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index a6e9248669e1..e470489021e3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -640,6 +640,12 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	return 0;
 }
 
+static int nfp_bpf_finalize(struct bpf_verifier_env *env)
+{
+	return 0;
+}
+
 const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = {
-	.insn_hook = nfp_verify_insn,
+	.insn_hook	= nfp_verify_insn,
+	.finalize	= nfp_bpf_finalize,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 81444208b216..cb3518474f0e 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -86,8 +86,14 @@ nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn)
 	return 0;
 }
 
+static int nsim_bpf_finalize(struct bpf_verifier_env *env)
+{
+	return 0;
+}
+
 static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = {
-	.insn_hook = nsim_bpf_verify_insn,
+	.insn_hook	= nsim_bpf_verify_insn,
+	.finalize	= nsim_bpf_finalize,
 };
 
 static bool nsim_xdp_offload_active(struct netdevsim *ns)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 027697b6a22f..9b558713447f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -263,6 +263,7 @@ struct bpf_verifier_ops {
 struct bpf_prog_offload_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
 			 int insn_idx, int prev_insn_idx);
+	int (*finalize)(struct bpf_verifier_env *env);
 };
 
 struct bpf_prog_offload {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7b6fd2ab3263..9e8056ec20fa 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -245,5 +245,6 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
 int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 				 int insn_idx, int prev_insn_idx);
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env);
 
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 177a52436394..8e93c47f0779 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 	return ret;
 }
 
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
+{
+	struct bpf_prog_offload *offload;
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload) {
+		if (offload->dev_ops->finalize)
+			ret = offload->dev_ops->finalize(env);
+		else
+			ret = 0;
+	}
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_prog_offload *offload = prog->aux->offload;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 73c81bef6ae8..a0454cb299ba 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6309,6 +6309,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		env->cur_state = NULL;
 	}
 
+	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
+		ret = bpf_prog_offload_finalize(env);
+
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
 	free_states(env);

From 1a7e62e6329c210ff67c5706fbe91187f2452baf Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:48 +0100
Subject: [PATCH 53/64] nfp: bpf: rename nfp_prog->stack_depth as
 nfp_prog->stack_frame_depth

In preparation for support for BPF to BPF calls in offloaded programs,
rename the "stack_depth" field of the struct nfp_prog as
"stack_frame_depth". This is to make it clear that the field refers to
the maximum size of the current stack frame (as opposed to the maximum
size of the whole stack memory).

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c     | 10 +++++-----
 drivers/net/ethernet/netronome/nfp/bpf/main.h    |  4 ++--
 drivers/net/ethernet/netronome/nfp/bpf/offload.c |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index eff57f7d056a..98a94ca36bfa 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1137,7 +1137,7 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
 	     bool clr_gpr, lmem_step step)
 {
-	s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off;
+	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
 	bool first = true, last;
 	bool needs_inc = false;
 	swreg stack_off_reg;
@@ -1695,7 +1695,7 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	s64 lm_off;
 
 	/* We only have to reload LM0 if the key is not at start of stack */
-	lm_off = nfp_prog->stack_depth;
+	lm_off = nfp_prog->stack_frame_depth;
 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
 	load_lm_ptr = meta->arg2.var_off || lm_off;
 
@@ -1808,10 +1808,10 @@ static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		swreg stack_depth_reg;
 
 		stack_depth_reg = ur_load_imm_any(nfp_prog,
-						  nfp_prog->stack_depth,
+						  nfp_prog->stack_frame_depth,
 						  stack_imm(nfp_prog));
-		emit_alu(nfp_prog, reg_both(dst),
-			 stack_reg(nfp_prog), ALU_OP_ADD, stack_depth_reg);
+		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
+			 ALU_OP_ADD, stack_depth_reg);
 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
 	} else {
 		wrp_reg_mov(nfp_prog, dst, src);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 792ebc4081a3..7050535383b8 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -436,7 +436,7 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
  * @tgt_abort: jump target for abort (e.g. access outside of packet buffer)
  * @n_translated: number of successfully translated instructions (for errors)
  * @error: error code if something went wrong
- * @stack_depth: max stack depth from the verifier
+ * @stack_frame_depth: max stack depth for current frame
  * @adjust_head_location: if program has single adjust head call - the insn no.
  * @map_records_cnt: the number of map pointers recorded for this prog
  * @map_records: the map record pointers from bpf->maps_neutral
@@ -460,7 +460,7 @@ struct nfp_prog {
 	unsigned int n_translated;
 	int error;
 
-	unsigned int stack_depth;
+	unsigned int stack_frame_depth;
 	unsigned int adjust_head_location;
 
 	unsigned int map_records_cnt;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 1ccd6371a15b..c9519bb00f8a 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -260,7 +260,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 			prog->aux->stack_depth, stack_size);
 		return -EOPNOTSUPP;
 	}
-	nfp_prog->stack_depth = round_up(prog->aux->stack_depth, 4);
+	nfp_prog->stack_frame_depth = round_up(prog->aux->stack_depth, 4);
 
 	max_instr = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
 	nfp_prog->__prog_alloc_len = max_instr * sizeof(u64);

From c5da54d93eb43461a5b79e1fdad8409abad83a77 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:49 +0100
Subject: [PATCH 54/64] nfp: bpf: copy eBPF subprograms information from kernel
 verifier

In order to support BPF-to-BPF calls in offloaded programs, the nfp
driver must collect information about the distinct subprograms: namely,
the number of subprograms composing the complete program and the stack
depth of those subprograms. The latter in particular is non-trivial to
collect, so we copy those elements from the kernel verifier via the
newly added post-verification hook. The struct nfp_prog is extended to
store this information. Stack depths are stored in an array of dedicated
structs.

Subprogram start indexes are not collected. Instead, meta instructions
associated to the start of a subprogram will be marked with a flag in a
later patch.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 12 ++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  |  2 ++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 15 +++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 7050535383b8..7f6e850e42da 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -423,6 +423,14 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
 	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_DIV;
 }
 
+/**
+ * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
+ * @stack_depth:	maximum stack depth used by this sub-program
+ */
+struct nfp_bpf_subprog_info {
+	u16 stack_depth;
+};
+
 /**
  * struct nfp_prog - nfp BPF program
  * @bpf: backpointer to the bpf app priv structure
@@ -439,7 +447,9 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
  * @stack_frame_depth: max stack depth for current frame
  * @adjust_head_location: if program has single adjust head call - the insn no.
  * @map_records_cnt: the number of map pointers recorded for this prog
+ * @subprog_cnt: number of sub-programs, including main function
  * @map_records: the map record pointers from bpf->maps_neutral
+ * @subprog: pointer to an array of objects holding info about sub-programs
  * @insns: list of BPF instruction wrappers (struct nfp_insn_meta)
  */
 struct nfp_prog {
@@ -464,7 +474,9 @@ struct nfp_prog {
 	unsigned int adjust_head_location;
 
 	unsigned int map_records_cnt;
+	unsigned int subprog_cnt;
 	struct nfp_bpf_neutral_map **map_records;
+	struct nfp_bpf_subprog_info *subprog;
 
 	struct list_head insns;
 };
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index c9519bb00f8a..b683b03efd22 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -208,6 +208,8 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta, *tmp;
 
+	kfree(nfp_prog->subprog);
+
 	list_for_each_entry_safe(meta, tmp, &nfp_prog->insns, l) {
 		list_del(&meta->l);
 		kfree(meta);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e470489021e3..9ef74bc1ec1d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -642,6 +642,21 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
+	struct bpf_subprog_info *info;
+	struct nfp_prog *nfp_prog;
+	int i;
+
+	nfp_prog = env->prog->aux->offload->dev_priv;
+	nfp_prog->subprog_cnt = env->subprog_cnt;
+	nfp_prog->subprog = kcalloc(nfp_prog->subprog_cnt,
+				    sizeof(nfp_prog->subprog[0]), GFP_KERNEL);
+	if (!nfp_prog->subprog)
+		return -ENOMEM;
+
+	info = env->subprog_info;
+	for (i = 0; i < nfp_prog->subprog_cnt; i++)
+		nfp_prog->subprog[i].stack_depth = info[i].stack_depth;
+
 	return 0;
 }
 

From bcfdfb7c962d84ef07aa2b400ef1c9d70e1120e5 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:50 +0100
Subject: [PATCH 55/64] nfp: bpf: ignore helper-related checks for BPF calls in
 nfp verifier

The checks related to eBPF helper calls are performed each time the nfp
driver meets a BPF_JUMP | BPF_CALL instruction. However, these checks
are not relevant for BPF-to-BPF call (same instruction code, different
value in source register), so just skip the checks for such calls.

While at it, rename the function that runs those checks to make it clear
they apply to _helper_ calls only.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 8 ++++++++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 9 +++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 7f6e850e42da..853a5346378c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -423,6 +423,14 @@ static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
 	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_DIV;
 }
 
+static inline bool is_mbpf_helper_call(const struct nfp_insn_meta *meta)
+{
+	struct bpf_insn insn = meta->insn;
+
+	return insn.code == (BPF_JMP | BPF_CALL) &&
+		insn.src_reg != BPF_PSEUDO_CALL;
+}
+
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 9ef74bc1ec1d..c642c2c07d96 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -155,8 +155,9 @@ nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env,
 }
 
 static int
-nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
-		   struct nfp_insn_meta *meta)
+nfp_bpf_check_helper_call(struct nfp_prog *nfp_prog,
+			  struct bpf_verifier_env *env,
+			  struct nfp_insn_meta *meta)
 {
 	const struct bpf_reg_state *reg1 = cur_regs(env) + BPF_REG_1;
 	const struct bpf_reg_state *reg2 = cur_regs(env) + BPF_REG_2;
@@ -620,8 +621,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 		return -EINVAL;
 	}
 
-	if (meta->insn.code == (BPF_JMP | BPF_CALL))
-		return nfp_bpf_check_call(nfp_prog, env, meta);
+	if (is_mbpf_helper_call(meta))
+		return nfp_bpf_check_helper_call(nfp_prog, env, meta);
 	if (meta->insn.code == (BPF_JMP | BPF_EXIT))
 		return nfp_bpf_check_exit(nfp_prog, env);
 

From e3b49dc69b320ba93059509ff2b31bde9242a7fa Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:51 +0100
Subject: [PATCH 56/64] nfp: bpf: account for BPF-to-BPF calls when preparing
 nfp JIT

Similarly to "exit" or "helper call" instructions, BPF-to-BPF calls will
require additional processing before translation starts, in order to
record and mark jump destinations.

We also mark the instructions where each subprogram begins. This will be
used in a following commit to determine where to add prologues for
subprograms.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 35 +++++++++++++------
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  3 +-
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 98a94ca36bfa..ccb80a5ac828 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -4018,20 +4018,35 @@ void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt)
 
 	/* Another pass to record jump information. */
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		struct nfp_insn_meta *dst_meta;
 		u64 code = meta->insn.code;
+		unsigned int dst_idx;
+		bool pseudo_call;
 
-		if (BPF_CLASS(code) == BPF_JMP && BPF_OP(code) != BPF_EXIT &&
-		    BPF_OP(code) != BPF_CALL) {
-			struct nfp_insn_meta *dst_meta;
-			unsigned short dst_indx;
+		if (BPF_CLASS(code) != BPF_JMP)
+			continue;
+		if (BPF_OP(code) == BPF_EXIT)
+			continue;
+		if (is_mbpf_helper_call(meta))
+			continue;
 
-			dst_indx = meta->n + 1 + meta->insn.off;
-			dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_indx,
-						     cnt);
+		/* If opcode is BPF_CALL at this point, this can only be a
+		 * BPF-to-BPF call (a.k.a pseudo call).
+		 */
+		pseudo_call = BPF_OP(code) == BPF_CALL;
 
-			meta->jmp_dst = dst_meta;
-			dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
-		}
+		if (pseudo_call)
+			dst_idx = meta->n + 1 + meta->insn.imm;
+		else
+			dst_idx = meta->n + 1 + meta->insn.off;
+
+		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx, cnt);
+
+		if (pseudo_call)
+			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
+
+		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
+		meta->jmp_dst = dst_meta;
 	}
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 853a5346378c..20a98ce4b345 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -262,7 +262,8 @@ struct nfp_bpf_reg_state {
 	bool var_off;
 };
 
-#define FLAG_INSN_IS_JUMP_DST	BIT(0)
+#define FLAG_INSN_IS_JUMP_DST			BIT(0)
+#define FLAG_INSN_IS_SUBPROG_START		BIT(1)
 
 /**
  * struct nfp_insn_meta - BPF instruction wrapper

From 389f263b60c9a3168d47eeb44b6a07bd2b8c66e2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:52 +0100
Subject: [PATCH 57/64] nfp: bpf: add main logics for BPF-to-BPF calls support
 in nfp driver

This is the main patch for the logics of BPF-to-BPF calls in the nfp
driver.

The functions called on BPF_JUMP | BPF_CALL and BPF_JUMP | BPF_EXIT were
used to call helpers and exit from the program, respectively; make them
usable for calling into, or returning from, a BPF subprogram as well.

For all calls, push the return address as well as the callee-saved
registers (R6 to R9) to the stack, and pop them upon returning from the
calls. In order to limit the overhead in terms of instruction number,
this is done through dedicated subroutines. Jumping to the callee
actually consists in jumping to the subroutine, that "returns" to the
callee: this will require some fixup for passing the address in a later
patch. Similarly, returning consists in jumping to the subroutine, which
pops registers and then return directly to the caller (but no fixup is
needed here).

Return to the caller is performed with the RTN instruction newly added
to the JIT.

For the few steps where we need to know what subprogram an instruction
belongs to, the struct nfp_insn_meta is extended with a new subprog_idx
field.

Note that checks on the available stack size, to take into account the
additional requirements associated to BPF-to-BPF calls (storing R6-R9
and return addresses), are added in a later patch.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 235 +++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  20 ++
 .../net/ethernet/netronome/nfp/bpf/offload.c  |   1 -
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  34 ++-
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |   9 +
 5 files changed, 295 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index ccb80a5ac828..2d2c9148bd44 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -266,6 +266,38 @@ emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
 }
 
+static void
+__emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
+	      u8 defer, bool dst_lmextn, bool src_lmextn)
+{
+	u64 insn;
+
+	insn = OP_BR_ALU_BASE |
+		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
+		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
+		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
+		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
+		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
+		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);
+
+	nfp_prog_push(nfp_prog, insn);
+}
+
+static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
+{
+	struct nfp_insn_ur_regs reg;
+	int err;
+
+	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
+	if (err) {
+		nfp_prog->error = err;
+		return;
+	}
+
+	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
+		      reg.src_lmextn);
+}
+
 static void
 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
 	     enum immed_width width, bool invert,
@@ -3081,7 +3113,73 @@ static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
 }
 
-static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int
+bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	u32 ret_tgt, stack_depth;
+	swreg tmp_reg;
+
+	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
+	/* Space for saving the return address is accounted for by the callee,
+	 * so stack_depth can be zero for the main function.
+	 */
+	if (stack_depth) {
+		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
+					  stack_imm(nfp_prog));
+		emit_alu(nfp_prog, stack_reg(nfp_prog),
+			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
+		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
+			    NFP_CSR_ACT_LM_ADDR0);
+	}
+
+	/* The following steps are performed:
+	 *     1. Put the start offset of the callee into imm_b(). This will
+	 *        require a fixup step, as we do not necessarily know this
+	 *        address yet.
+	 *     2. Put the return address from the callee to the caller into
+	 *        register ret_reg().
+	 *     3. (After defer slots are consumed) Jump to the subroutine that
+	 *        pushes the registers to the stack.
+	 * The subroutine acts as a trampoline, and returns to the address in
+	 * imm_b(), i.e. jumps to the callee.
+	 *
+	 * Using ret_reg() to pass the return address to the callee is set here
+	 * as a convention. The callee can then push this address onto its
+	 * stack frame in its prologue. The advantages of passing the return
+	 * address through ret_reg(), instead of pushing it to the stack right
+	 * here, are the following:
+	 * - It looks cleaner.
+	 * - If the called function is called multiple time, we get a lower
+	 *   program size.
+	 * - We save two no-op instructions that should be added just before
+	 *   the emit_br() when stack depth is not null otherwise.
+	 * - If we ever find a register to hold the return address during whole
+	 *   execution of the callee, we will not have to push the return
+	 *   address to the stack for leaf functions.
+	 */
+	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
+		     RELO_BR_GO_CALL_PUSH_REGS);
+	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
+		return -EINVAL;
+
+	if (stack_depth) {
+		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
+					  stack_imm(nfp_prog));
+		emit_alu(nfp_prog, stack_reg(nfp_prog),
+			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
+		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
+			    NFP_CSR_ACT_LM_ADDR0);
+		wrp_nops(nfp_prog, 3);
+	}
+
+	return 0;
+}
+
+static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	switch (meta->insn.imm) {
 	case BPF_FUNC_xdp_adjust_head:
@@ -3102,6 +3200,19 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	}
 }
 
+static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	if (is_mbpf_pseudo_call(meta))
+		return bpf_to_bpf_call(nfp_prog, meta);
+	else
+		return helper_call(nfp_prog, meta);
+}
+
+static bool nfp_is_main_function(struct nfp_insn_meta *meta)
+{
+	return meta->subprog_idx == 0;
+}
+
 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);
@@ -3109,6 +3220,30 @@ static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int
+nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	/* Pop R6~R9 to the stack via related subroutine.
+	 * Pop return address for BPF-to-BPF call from the stack and load it
+	 * into ret_reg() before we jump. This means that the subroutine does
+	 * not come back here, we make it jump back to the subprogram caller
+	 * directly!
+	 */
+	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
+		     RELO_BR_GO_CALL_POP_REGS);
+	wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+
+	return 0;
+}
+
+static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	if (nfp_is_main_function(meta))
+		return goto_out(nfp_prog, meta);
+	else
+		return nfp_subprog_epilogue(nfp_prog, meta);
+}
+
 static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
 	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
@@ -3197,7 +3332,7 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
 	[BPF_JMP | BPF_CALL] =		call,
-	[BPF_JMP | BPF_EXIT] =		goto_out,
+	[BPF_JMP | BPF_EXIT] =		jmp_exit,
 };
 
 /* --- Assembler logic --- */
@@ -3258,6 +3393,27 @@ static void nfp_intro(struct nfp_prog *nfp_prog)
 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
 }
 
+static void
+nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	/* Save return address into the stack. */
+	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
+}
+
+static void
+nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
+
+	nfp_prog->stack_frame_depth = round_up(depth, 4);
+	nfp_subprog_prologue(nfp_prog, meta);
+}
+
+bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
+{
+	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
+}
+
 static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
 {
 	/* TC direct-action mode:
@@ -3348,6 +3504,56 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
+{
+	u8 reg;
+
+	/* Subroutine: Save all callee saved registers (R6 ~ R9).
+	 * imm_b() holds the return address.
+	 */
+	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
+	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
+		u8 adj = (reg - BPF_REG_0) * 2;
+		u8 idx = (reg - BPF_REG_6) * 2;
+
+		/* The first slot in the stack frame is used to push the return
+		 * address in bpf_to_bpf_call(), start just after.
+		 */
+		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));
+
+		if (reg == BPF_REG_8)
+			/* Prepare to jump back, last 3 insns use defer slots */
+			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);
+
+		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
+	}
+}
+
+static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
+{
+	u8 reg;
+
+	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
+	 * ret_reg() holds the return address.
+	 */
+	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
+	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
+		u8 adj = (reg - BPF_REG_0) * 2;
+		u8 idx = (reg - BPF_REG_6) * 2;
+
+		/* The first slot in the stack frame holds the return address,
+		 * start popping just after that.
+		 */
+		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));
+
+		if (reg == BPF_REG_8)
+			/* Prepare to jump back, last 3 insns use defer slots */
+			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);
+
+		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
+	}
+}
+
 static void nfp_outro(struct nfp_prog *nfp_prog)
 {
 	switch (nfp_prog->type) {
@@ -3360,13 +3566,23 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 	default:
 		WARN_ON(1);
 	}
+
+	if (nfp_prog->subprog_cnt == 1)
+		return;
+
+	nfp_push_callee_registers(nfp_prog);
+	nfp_pop_callee_registers(nfp_prog);
 }
 
 static int nfp_translate(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta;
+	unsigned int depth;
 	int err;
 
+	depth = nfp_prog->subprog[0].stack_depth;
+	nfp_prog->stack_frame_depth = round_up(depth, 4);
+
 	nfp_intro(nfp_prog);
 	if (nfp_prog->error)
 		return nfp_prog->error;
@@ -3376,6 +3592,12 @@ static int nfp_translate(struct nfp_prog *nfp_prog)
 
 		meta->off = nfp_prog_current_offset(nfp_prog);
 
+		if (nfp_is_subprog_start(meta)) {
+			nfp_start_subprog(nfp_prog, meta);
+			if (nfp_prog->error)
+				return nfp_prog->error;
+		}
+
 		if (meta->skip) {
 			nfp_prog->n_translated++;
 			continue;
@@ -4069,6 +4291,7 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 	for (i = 0; i < nfp_prog->prog_len; i++) {
 		enum nfp_relo_type special;
 		u32 val;
+		u16 off;
 
 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
 		switch (special) {
@@ -4085,6 +4308,14 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 			br_set_offset(&prog[i],
 				      nfp_prog->tgt_abort + bv->start_off);
 			break;
+		case RELO_BR_GO_CALL_PUSH_REGS:
+			off = nfp_prog->tgt_call_push_regs + bv->start_off;
+			br_set_offset(&prog[i], off);
+			break;
+		case RELO_BR_GO_CALL_POP_REGS:
+			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
+			br_set_offset(&prog[i], off);
+			break;
 		case RELO_BR_NEXT_PKT:
 			br_set_offset(&prog[i], bv->tgt_done);
 			break;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 20a98ce4b345..d9695bc316dd 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -61,6 +61,8 @@ enum nfp_relo_type {
 	/* internal jumps to parts of the outro */
 	RELO_BR_GO_OUT,
 	RELO_BR_GO_ABORT,
+	RELO_BR_GO_CALL_PUSH_REGS,
+	RELO_BR_GO_CALL_POP_REGS,
 	/* external jumps to fixed addresses */
 	RELO_BR_NEXT_PKT,
 	RELO_BR_HELPER,
@@ -104,6 +106,7 @@ enum pkt_vec {
 #define imma_a(np)	reg_a(STATIC_REG_IMMA)
 #define imma_b(np)	reg_b(STATIC_REG_IMMA)
 #define imm_both(np)	reg_both(STATIC_REG_IMM)
+#define ret_reg(np)	imm_a(np)
 
 #define NFP_BPF_ABI_FLAGS	reg_imm(0)
 #define   NFP_BPF_ABI_FLAG_MARK	1
@@ -290,6 +293,7 @@ struct nfp_bpf_reg_state {
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @flags: eBPF instruction extra optimization flags
+ * @subprog_idx: index of subprogram to which the instruction belongs
  * @skip: skip this instruction (optimized out)
  * @double_cb: callback for second part of the instruction
  * @l: link on nfp_prog->insns list
@@ -336,6 +340,7 @@ struct nfp_insn_meta {
 	unsigned int off;
 	unsigned short n;
 	unsigned short flags;
+	unsigned short subprog_idx;
 	bool skip;
 	instr_cb_t double_cb;
 
@@ -432,6 +437,16 @@ static inline bool is_mbpf_helper_call(const struct nfp_insn_meta *meta)
 		insn.src_reg != BPF_PSEUDO_CALL;
 }
 
+static inline bool is_mbpf_pseudo_call(const struct nfp_insn_meta *meta)
+{
+	struct bpf_insn insn = meta->insn;
+
+	return insn.code == (BPF_JMP | BPF_CALL) &&
+		insn.src_reg == BPF_PSEUDO_CALL;
+}
+
+#define STACK_FRAME_ALIGN 64
+
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
@@ -451,6 +466,8 @@ struct nfp_bpf_subprog_info {
  * @last_bpf_off: address of the last instruction translated from BPF
  * @tgt_out: jump target for normal exit
  * @tgt_abort: jump target for abort (e.g. access outside of packet buffer)
+ * @tgt_call_push_regs: jump target for subroutine for saving R6~R9 to stack
+ * @tgt_call_pop_regs: jump target for subroutine used for restoring R6~R9
  * @n_translated: number of successfully translated instructions (for errors)
  * @error: error code if something went wrong
  * @stack_frame_depth: max stack depth for current frame
@@ -475,6 +492,8 @@ struct nfp_prog {
 	unsigned int last_bpf_off;
 	unsigned int tgt_out;
 	unsigned int tgt_abort;
+	unsigned int tgt_call_push_regs;
+	unsigned int tgt_call_pop_regs;
 
 	unsigned int n_translated;
 	int error;
@@ -502,6 +521,7 @@ struct nfp_bpf_vnic {
 	unsigned int tgt_done;
 };
 
+bool nfp_is_subprog_start(struct nfp_insn_meta *meta);
 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt);
 int nfp_bpf_jit(struct nfp_prog *prog);
 bool nfp_bpf_supported_opcode(u8 code);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index b683b03efd22..2ebd13b29c97 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -262,7 +262,6 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 			prog->aux->stack_depth, stack_size);
 		return -EOPNOTSUPP;
 	}
-	nfp_prog->stack_frame_depth = round_up(prog->aux->stack_depth, 4);
 
 	max_instr = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
 	nfp_prog->__prog_alloc_len = max_instr * sizeof(u64);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index c642c2c07d96..cc1b2c601f4e 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -641,6 +641,27 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	return 0;
 }
 
+static int
+nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *meta;
+	int index = 0;
+
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		if (nfp_is_subprog_start(meta))
+			index++;
+		meta->subprog_idx = index;
+	}
+
+	if (index + 1 != nfp_prog->subprog_cnt) {
+		pr_vlog(env, "BUG: number of processed BPF functions is not consistent (processed %d, expected %d)\n",
+			index + 1, nfp_prog->subprog_cnt);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
 	struct bpf_subprog_info *info;
@@ -654,10 +675,21 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	if (!nfp_prog->subprog)
 		return -ENOMEM;
 
+	nfp_assign_subprog_idx(env, nfp_prog);
+
 	info = env->subprog_info;
-	for (i = 0; i < nfp_prog->subprog_cnt; i++)
+	for (i = 0; i < nfp_prog->subprog_cnt; i++) {
 		nfp_prog->subprog[i].stack_depth = info[i].stack_depth;
 
+		if (i == 0)
+			continue;
+
+		/* Account for size of return address. */
+		nfp_prog->subprog[i].stack_depth += REG_WIDTH;
+		/* Account for size of saved registers. */
+		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index fad0e62a910c..5b257c603e91 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -82,6 +82,15 @@
 #define OP_BR_BIT_ADDR_LO	OP_BR_ADDR_LO
 #define OP_BR_BIT_ADDR_HI	OP_BR_ADDR_HI
 
+#define OP_BR_ALU_BASE		0x0e800000000ULL
+#define OP_BR_ALU_BASE_MASK	0x0ff80000000ULL
+#define OP_BR_ALU_A_SRC		0x000000003ffULL
+#define OP_BR_ALU_B_SRC		0x000000ffc00ULL
+#define OP_BR_ALU_DEFBR		0x00000300000ULL
+#define OP_BR_ALU_IMM_HI	0x0007fc00000ULL
+#define OP_BR_ALU_SRC_LMEXTN	0x40000000000ULL
+#define OP_BR_ALU_DST_LMEXTN	0x80000000000ULL
+
 static inline bool nfp_is_br(u64 insn)
 {
 	return (insn & OP_BR_BASE_MASK) == OP_BR_BASE ||

From fb1981654129e9c71af4c0f2782c5ffde8cbf37f Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:53 +0100
Subject: [PATCH 58/64] nfp: bpf: account for additional stack usage when
 checking stack limit

Offloaded programs using BPF-to-BPF calls use the stack to store the
return address when calling into a subprogram. Callees also need some
space to save eBPF registers R6 to R9. And contrarily to kernel
verifier, we align stack frames on 64 bytes (and not 32). Account for
all this when checking the stack size limit before JIT-ing the program.
This means we have to recompute maximum stack usage for the program, we
cannot get the value from the kernel.

In addition to adapting the checks on stack usage, move them to the
finalize() callback, now that we have it and because such checks are
part of the verification step rather than translation.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../net/ethernet/netronome/nfp/bpf/offload.c  |  8 ---
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 68 +++++++++++++++++++
 2 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 2ebd13b29c97..49c7bead8113 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -252,17 +252,9 @@ err_free:
 static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 {
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-	unsigned int stack_size;
 	unsigned int max_instr;
 	int err;
 
-	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
-	if (prog->aux->stack_depth > stack_size) {
-		nn_info(nn, "stack too large: program %dB > FW stack %dB\n",
-			prog->aux->stack_depth, stack_size);
-		return -EOPNOTSUPP;
-	}
-
 	max_instr = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
 	nfp_prog->__prog_alloc_len = max_instr * sizeof(u64);
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index cc1b2c601f4e..81a463726d55 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -34,10 +34,12 @@
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/kernel.h>
+#include <linux/netdevice.h>
 #include <linux/pkt_cls.h>
 
 #include "../nfp_app.h"
 #include "../nfp_main.h"
+#include "../nfp_net.h"
 #include "fw.h"
 #include "main.h"
 
@@ -662,10 +664,67 @@ nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
 	return 0;
 }
 
+static unsigned int
+nfp_bpf_get_stack_usage(struct nfp_prog *nfp_prog, unsigned int cnt)
+{
+	struct nfp_insn_meta *meta = nfp_prog_first_meta(nfp_prog);
+	unsigned int max_depth = 0, depth = 0, frame = 0;
+	struct nfp_insn_meta *ret_insn[MAX_CALL_FRAMES];
+	unsigned short frame_depths[MAX_CALL_FRAMES];
+	unsigned short ret_prog[MAX_CALL_FRAMES];
+	unsigned short idx = meta->subprog_idx;
+
+	/* Inspired from check_max_stack_depth() from kernel verifier.
+	 * Starting from main subprogram, walk all instructions and recursively
+	 * walk all callees that given subprogram can call. Since recursion is
+	 * prevented by the kernel verifier, this algorithm only needs a local
+	 * stack of MAX_CALL_FRAMES to remember callsites.
+	 */
+process_subprog:
+	frame_depths[frame] = nfp_prog->subprog[idx].stack_depth;
+	frame_depths[frame] = round_up(frame_depths[frame], STACK_FRAME_ALIGN);
+	depth += frame_depths[frame];
+	max_depth = max(max_depth, depth);
+
+continue_subprog:
+	for (; meta != nfp_prog_last_meta(nfp_prog) && meta->subprog_idx == idx;
+	     meta = nfp_meta_next(meta)) {
+		if (!is_mbpf_pseudo_call(meta))
+			continue;
+
+		/* We found a call to a subprogram. Remember instruction to
+		 * return to and subprog id.
+		 */
+		ret_insn[frame] = nfp_meta_next(meta);
+		ret_prog[frame] = idx;
+
+		/* Find the callee and start processing it. */
+		meta = nfp_bpf_goto_meta(nfp_prog, meta,
+					 meta->n + 1 + meta->insn.imm, cnt);
+		idx = meta->subprog_idx;
+		frame++;
+		goto process_subprog;
+	}
+	/* End of for() loop means the last instruction of the subprog was
+	 * reached. If we popped all stack frames, return; otherwise, go on
+	 * processing remaining instructions from the caller.
+	 */
+	if (frame == 0)
+		return max_depth;
+
+	depth -= frame_depths[frame];
+	frame--;
+	meta = ret_insn[frame];
+	idx = ret_prog[frame];
+	goto continue_subprog;
+}
+
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
+	unsigned int stack_size, stack_needed;
 	struct bpf_subprog_info *info;
 	struct nfp_prog *nfp_prog;
+	struct nfp_net *nn;
 	int i;
 
 	nfp_prog = env->prog->aux->offload->dev_priv;
@@ -690,6 +749,15 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
 	}
 
+	nn = netdev_priv(env->prog->aux->offload->netdev);
+	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
+	stack_needed = nfp_bpf_get_stack_usage(nfp_prog, env->prog->len);
+	if (stack_needed > stack_size) {
+		pr_vlog(env, "stack too large: program %dB > FW stack %dB\n",
+			stack_needed, stack_size);
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 

From bdf4c66faf5fa6fd5ffb0b59c39c7629103d6479 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:54 +0100
Subject: [PATCH 59/64] nfp: bpf: update fixup function for BPF-to-BPF calls
 support

Relocation for targets of BPF-to-BPF calls are required at the end of
translation. Update the nfp_fixup_branches() function in that regard.

When checking that the last instruction of each bloc is a branch, we
must account for the length of the instructions required to pop the
return address from the stack.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 25 ++++++++++++++++---
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  2 ++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 2d2c9148bd44..e8b03d8f54f7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3116,7 +3116,7 @@ static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int
 bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	u32 ret_tgt, stack_depth;
+	u32 ret_tgt, stack_depth, offset_br;
 	swreg tmp_reg;
 
 	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
@@ -3160,6 +3160,7 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
 		     RELO_BR_GO_CALL_PUSH_REGS);
+	offset_br = nfp_prog_current_offset(nfp_prog);
 	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
 
@@ -3176,6 +3177,9 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		wrp_nops(nfp_prog, 3);
 	}
 
+	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
+	meta->num_insns_after_br -= offset_br;
+
 	return 0;
 }
 
@@ -3344,21 +3348,36 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
 			continue;
-		if (meta->insn.code == (BPF_JMP | BPF_CALL))
-			continue;
 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
 			continue;
+		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
+		    !nfp_is_main_function(meta))
+			continue;
+		if (is_mbpf_helper_call(meta))
+			continue;
 
 		if (list_is_last(&meta->l, &nfp_prog->insns))
 			br_idx = nfp_prog->last_bpf_off;
 		else
 			br_idx = list_next_entry(meta, l)->off - 1;
 
+		/* For BPF-to-BPF function call, a stack adjustment sequence is
+		 * generated after the return instruction. Therefore, we must
+		 * withdraw the length of this sequence to have br_idx pointing
+		 * to where the "branch" NFP instruction is expected to be.
+		 */
+		if (is_mbpf_pseudo_call(meta))
+			br_idx -= meta->num_insns_after_br;
+
 		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
 			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
 			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
 			return -ELOOP;
 		}
+
+		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
+			continue;
+
 		/* Leave special branches for later */
 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
 		    RELO_BR_REL)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index d9695bc316dd..1cef5136c198 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -283,6 +283,7 @@ struct nfp_bpf_reg_state {
  * @xadd_maybe_16bit: 16bit immediate is possible
  * @jmp_dst: destination info for jump instructions
  * @jump_neg_op: jump instruction has inverted immediate, use ADD instead of SUB
+ * @num_insns_after_br: number of insns following a branch jump, used for fixup
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
@@ -319,6 +320,7 @@ struct nfp_insn_meta {
 		struct {
 			struct nfp_insn_meta *jmp_dst;
 			bool jump_neg_op;
+			u32 num_insns_after_br; /* only for BPF-to-BPF calls */
 		};
 		/* function calls */
 		struct {

From 2178f3f0dc200557312e783aa683b87794084ae2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:55 +0100
Subject: [PATCH 60/64] nfp: bpf: fix return address from register-saving
 subroutine to callee

On performing a BPF-to-BPF call, we first jump to a subroutine that
pushes callee-saved registers (R6~R9) to the stack, and from there we
goes to the start of the callee next. In order to do so, the caller must
pass to the subroutine the address of the NFP instruction to jump to at
the end of that subroutine. This cannot be reliably implemented when
translated the caller, as we do not always know the start offset of the
callee yet.

This patch implement the required fixup step for passing the start
offset in the callee via the register used by the subroutine to hold its
return address.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 28 +++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index e8b03d8f54f7..74423d3e714d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3340,10 +3340,25 @@ static const instr_cb_t instr_cb[256] = {
 };
 
 /* --- Assembler logic --- */
+static int
+nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
+{
+	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
+		pr_err("BUG: failed to fix up callee register saving\n");
+		return -EINVAL;
+	}
+
+	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
+
+	return 0;
+}
+
 static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta, *jmp_dst;
 	u32 idx, br_idx;
+	int err;
 
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
@@ -3380,7 +3395,7 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 
 		/* Leave special branches for later */
 		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
-		    RELO_BR_REL)
+		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
 			continue;
 
 		if (!meta->jmp_dst) {
@@ -3395,6 +3410,17 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 			return -ELOOP;
 		}
 
+		if (is_mbpf_pseudo_call(meta)) {
+			err = nfp_fixup_immed_relo(nfp_prog, meta,
+						   jmp_dst, br_idx);
+			if (err)
+				return err;
+		}
+
+		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
+		    RELO_BR_REL)
+			continue;
+
 		for (idx = meta->off; idx <= br_idx; idx++) {
 			if (!nfp_is_br(nfp_prog->prog[idx]))
 				continue;

From 445496231445aad46866a858a384b428cd073977 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:56 +0100
Subject: [PATCH 61/64] nfp: bpf: optimise save/restore for R6~R9 based on
 register usage

When pre-processing the instructions, it is trivial to detect what
subprograms are using R6, R7, R8 or R9 as destination registers. If a
subprogram uses none of those, then we do not need to jump to the
subroutines dedicated to saving and restoring callee-saved registers in
its prologue and epilogue.

This patch introduces detection of callee-saved registers in subprograms
and prevents the JIT from adding calls to those subroutines whenever we
can: we save some instructions in the translated program, and some time
at runtime on BPF-to-BPF calls and returns.

If no subprogram needs to save those registers, we can avoid appending
the subroutines at the end of the program.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 85 ++++++++++++++-----
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  2 +
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 14 ++-
 3 files changed, 78 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 74423d3e714d..b393f9dea584 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3132,7 +3132,9 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 			    NFP_CSR_ACT_LM_ADDR0);
 	}
 
-	/* The following steps are performed:
+	/* Two cases for jumping to the callee:
+	 *
+	 * - If callee uses and needs to save R6~R9 then:
 	 *     1. Put the start offset of the callee into imm_b(). This will
 	 *        require a fixup step, as we do not necessarily know this
 	 *        address yet.
@@ -3140,8 +3142,12 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	 *        register ret_reg().
 	 *     3. (After defer slots are consumed) Jump to the subroutine that
 	 *        pushes the registers to the stack.
-	 * The subroutine acts as a trampoline, and returns to the address in
-	 * imm_b(), i.e. jumps to the callee.
+	 *   The subroutine acts as a trampoline, and returns to the address in
+	 *   imm_b(), i.e. jumps to the callee.
+	 *
+	 * - If callee does not need to save R6~R9 then just load return
+	 *   address to the caller in ret_reg(), and jump to the callee
+	 *   directly.
 	 *
 	 * Using ret_reg() to pass the return address to the callee is set here
 	 * as a convention. The callee can then push this address onto its
@@ -3157,11 +3163,21 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	 *   execution of the callee, we will not have to push the return
 	 *   address to the stack for leaf functions.
 	 */
-	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
-		     RELO_BR_GO_CALL_PUSH_REGS);
-	offset_br = nfp_prog_current_offset(nfp_prog);
-	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	if (!meta->jmp_dst) {
+		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
+		return -ELOOP;
+	}
+	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
+		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
+			     RELO_BR_GO_CALL_PUSH_REGS);
+		offset_br = nfp_prog_current_offset(nfp_prog);
+		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	} else {
+		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
+		emit_br(nfp_prog, BR_UNC, meta->n + 1 + meta->insn.imm, 1);
+		offset_br = nfp_prog_current_offset(nfp_prog);
+	}
 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
 
 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
@@ -3227,15 +3243,24 @@ static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int
 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	/* Pop R6~R9 to the stack via related subroutine.
-	 * Pop return address for BPF-to-BPF call from the stack and load it
-	 * into ret_reg() before we jump. This means that the subroutine does
-	 * not come back here, we make it jump back to the subprogram caller
-	 * directly!
-	 */
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
-		     RELO_BR_GO_CALL_POP_REGS);
-	wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
+		/* Pop R6~R9 to the stack via related subroutine.
+		 * We loaded the return address to the caller into ret_reg().
+		 * This means that the subroutine does not come back here, we
+		 * make it jump back to the subprogram caller directly!
+		 */
+		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
+			     RELO_BR_GO_CALL_POP_REGS);
+		/* Pop return address from the stack. */
+		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+	} else {
+		/* Pop return address from the stack. */
+		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+		/* Jump back to caller if no callee-saved registers were used
+		 * by the subprogram.
+		 */
+		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
+	}
 
 	return 0;
 }
@@ -3410,7 +3435,8 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 			return -ELOOP;
 		}
 
-		if (is_mbpf_pseudo_call(meta)) {
+		if (is_mbpf_pseudo_call(meta) &&
+		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
 			err = nfp_fixup_immed_relo(nfp_prog, meta,
 						   jmp_dst, br_idx);
 			if (err)
@@ -3549,6 +3575,17 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
+{
+	unsigned int idx;
+
+	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
+		if (nfp_prog->subprog[idx].needs_reg_push)
+			return true;
+
+	return false;
+}
+
 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
 {
 	u8 reg;
@@ -3612,7 +3649,7 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 		WARN_ON(1);
 	}
 
-	if (nfp_prog->subprog_cnt == 1)
+	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
 		return;
 
 	nfp_push_callee_registers(nfp_prog);
@@ -4354,10 +4391,20 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 				      nfp_prog->tgt_abort + bv->start_off);
 			break;
 		case RELO_BR_GO_CALL_PUSH_REGS:
+			if (!nfp_prog->tgt_call_push_regs) {
+				pr_err("BUG: failed to detect subprogram registers needs\n");
+				err = -EINVAL;
+				goto err_free_prog;
+			}
 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
 			br_set_offset(&prog[i], off);
 			break;
 		case RELO_BR_GO_CALL_POP_REGS:
+			if (!nfp_prog->tgt_call_pop_regs) {
+				pr_err("BUG: failed to detect subprogram registers needs\n");
+				err = -EINVAL;
+				goto err_free_prog;
+			}
 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
 			br_set_offset(&prog[i], off);
 			break;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 1cef5136c198..44b787a0bd4b 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -452,9 +452,11 @@ static inline bool is_mbpf_pseudo_call(const struct nfp_insn_meta *meta)
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
+ * @needs_reg_push:	whether sub-program uses callee-saved registers
  */
 struct nfp_bpf_subprog_info {
 	u16 stack_depth;
+	u8 needs_reg_push : 1;
 };
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 81a463726d55..f31721bd1fac 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -644,7 +644,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 }
 
 static int
-nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
+nfp_assign_subprog_idx_and_regs(struct bpf_verifier_env *env,
+				struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta;
 	int index = 0;
@@ -653,6 +654,10 @@ nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
 		if (nfp_is_subprog_start(meta))
 			index++;
 		meta->subprog_idx = index;
+
+		if (meta->insn.dst_reg >= BPF_REG_6 &&
+		    meta->insn.dst_reg <= BPF_REG_9)
+			nfp_prog->subprog[index].needs_reg_push = 1;
 	}
 
 	if (index + 1 != nfp_prog->subprog_cnt) {
@@ -734,7 +739,7 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	if (!nfp_prog->subprog)
 		return -ENOMEM;
 
-	nfp_assign_subprog_idx(env, nfp_prog);
+	nfp_assign_subprog_idx_and_regs(env, nfp_prog);
 
 	info = env->subprog_info;
 	for (i = 0; i < nfp_prog->subprog_cnt; i++) {
@@ -745,8 +750,9 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 
 		/* Account for size of return address. */
 		nfp_prog->subprog[i].stack_depth += REG_WIDTH;
-		/* Account for size of saved registers. */
-		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
+		/* Account for size of saved registers, if necessary. */
+		if (nfp_prog->subprog[i].needs_reg_push)
+			nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
 	}
 
 	nn = netdev_priv(env->prog->aux->offload->netdev);

From 7ff0ccde43664e3de9fe60edc19466f16cda6b7a Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:57 +0100
Subject: [PATCH 62/64] nfp: bpf: support pointers to other stack frames for
 BPF-to-BPF calls

Mark instructions that use pointers to areas in the stack outside of the
current stack frame, and process them accordingly in mem_op_stack().
This way, we also support BPF-to-BPF calls where the caller passes a
pointer to data in its own stack frame to the callee (typically, when
the caller passes an address to one of its local variables located in
the stack, as an argument).

Thanks to Jakub and Jiong for figuring out how to deal with this case,
I just had to turn their email discussion into this patch.

Suggested-by: Jiong Wang <jiong.wang@netronome.com>
Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 3 ++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 1 +
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index b393f9dea584..6ed1b5207ecd 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1178,7 +1178,8 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	bool lm3 = true;
 	int ret;
 
-	if (meta->ptr_not_const) {
+	if (meta->ptr_not_const ||
+	    meta->flags & FLAG_INSN_PTR_CALLER_STACK_FRAME) {
 		/* Use of the last encountered ptr_off is OK, they all have
 		 * the same alignment.  Depend on low bits of value being
 		 * discarded when written to LMaddr register.
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 44b787a0bd4b..25e10cfa2678 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -267,6 +267,7 @@ struct nfp_bpf_reg_state {
 
 #define FLAG_INSN_IS_JUMP_DST			BIT(0)
 #define FLAG_INSN_IS_SUBPROG_START		BIT(1)
+#define FLAG_INSN_PTR_CALLER_STACK_FRAME	BIT(2)
 
 /**
  * struct nfp_insn_meta - BPF instruction wrapper
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index f31721bd1fac..cddb70786a58 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -336,6 +336,9 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog,
 {
 	s32 old_off, new_off;
 
+	if (reg->frameno != env->cur_state->curframe)
+		meta->flags |= FLAG_INSN_PTR_CALLER_STACK_FRAME;
+
 	if (!tnum_is_const(reg->var_off)) {
 		pr_vlog(env, "variable ptr stack access\n");
 		return -EINVAL;

From e4052d06a5195b29271a7af262711d69f9ecfd04 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Sun, 7 Oct 2018 12:56:58 +0100
Subject: [PATCH 63/64] bpf: allow offload of programs with BPF-to-BPF function
 calls

Now that there is at least one driver supporting BPF-to-BPF function
calls, lift the restriction, in the verifier, on hardware offload of
eBPF programs containing such calls. But prevent jit_subprogs(), still
in the verifier, from being run for offloaded programs.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a0454cb299ba..73cc136915fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1009,10 +1009,6 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			verbose(env, "function calls to other bpf functions are allowed for root only\n");
 			return -EPERM;
 		}
-		if (bpf_prog_is_dev_bound(env->prog->aux)) {
-			verbose(env, "function calls in offloaded programs are not supported yet\n");
-			return -EINVAL;
-		}
 		ret = add_subprog(env, i + insn[i].imm + 1);
 		if (ret < 0)
 			return ret;
@@ -5968,10 +5964,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	int i, depth;
 #endif
-	int err;
+	int err = 0;
 
-	err = 0;
-	if (env->prog->jit_requested) {
+	if (env->prog->jit_requested &&
+	    !bpf_prog_is_dev_bound(env->prog->aux)) {
 		err = jit_subprogs(env);
 		if (err == 0)
 			return 0;

From df3f94a0bbeb6cb6a02eb16b8e76f16b33cb2f8f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 8 Oct 2018 11:30:01 -0700
Subject: [PATCH 64/64] bpf: fix building without CONFIG_INET

The newly added TCP and UDP handling fails to link when CONFIG_INET
is disabled:

net/core/filter.o: In function `sk_lookup':
filter.c:(.text+0x7ff8): undefined reference to `tcp_hashinfo'
filter.c:(.text+0x7ffc): undefined reference to `tcp_hashinfo'
filter.c:(.text+0x8020): undefined reference to `__inet_lookup_established'
filter.c:(.text+0x8058): undefined reference to `__inet_lookup_listener'
filter.c:(.text+0x8068): undefined reference to `udp_table'
filter.c:(.text+0x8070): undefined reference to `udp_table'
filter.c:(.text+0x808c): undefined reference to `__udp4_lib_lookup'
net/core/filter.o: In function `bpf_sk_release':
filter.c:(.text+0x82e8): undefined reference to `sock_gen_put'

Wrap the related sections of code in #ifdefs for the config option.

Furthermore, sk_lookup() should always have been marked 'static', this
also avoids a warning about a missing prototype when building with
'make W=1'.

Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Joe Stringer <joe@wand.net.nz>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 30c6b2d3ef16..4bbc6567fcb8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4817,8 +4817,9 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
-struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
-		       struct sk_buff *skb, u8 family, u8 proto)
+#ifdef CONFIG_INET
+static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+			      struct sk_buff *skb, u8 family, u8 proto)
 {
 	int dif = skb->dev->ifindex;
 	bool refcounted = false;
@@ -4951,6 +4952,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_SOCKET,
 };
+#endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
 {
@@ -5158,12 +5160,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_ancestor_cgroup_id:
 		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
+#ifdef CONFIG_INET
 	case BPF_FUNC_sk_lookup_tcp:
 		return &bpf_sk_lookup_tcp_proto;
 	case BPF_FUNC_sk_lookup_udp:
 		return &bpf_sk_lookup_udp_proto;
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5264,12 +5268,14 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_hash_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+#ifdef CONFIG_INET
 	case BPF_FUNC_sk_lookup_tcp:
 		return &bpf_sk_lookup_tcp_proto;
 	case BPF_FUNC_sk_lookup_udp:
 		return &bpf_sk_lookup_udp_proto;
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}