From cf83b2d2e2b64920bd6999b199dfa271d7e94cf8 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Tue, 27 Oct 2020 23:10:54 -0700
Subject: [PATCH 01/66] bpf: Permit cond_resched for some iterators

Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with bpf
task/task_file iterator") tries to fix an rcu_sched stall warning
caused by the bpf task_file iterator when running "bpftool prog".

  rcu: INFO: rcu_sched self-detected stall on CPU
  rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
  \x09(t=21031 jiffies g=2534773 q=179750)
  NMI backtrace for cpu 7
  CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
  Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
  Call Trace:
   dump_stack+0x57/0x70
   nmi_cpu_backtrace.cold+0x14/0x53
   ? lapic_can_unplug_cpu.cold+0x39/0x39
   nmi_trigger_cpumask_backtrace+0xb7/0xc7
   rcu_dump_cpu_stacks+0xa2/0xd0
   rcu_sched_clock_irq.cold+0x1ff/0x3d9
   ? tick_nohz_handler+0x100/0x100
   update_process_times+0x5b/0x90
   tick_sched_timer+0x5e/0xf0
   __hrtimer_run_queues+0x12a/0x2a0
   hrtimer_interrupt+0x10e/0x280
   __sysvec_apic_timer_interrupt+0x51/0xe0
   asm_call_on_stack+0xf/0x20
   sysvec_apic_timer_interrupt+0x6f/0x80
   ...
   task_file_seq_next+0x52/0xa0
   bpf_seq_read+0xb9/0x320
   vfs_read+0x9d/0x180
   ksys_read+0x5f/0xe0
   do_syscall_64+0x38/0x60
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

The fix is to limit the number of bpf program runs to one million.
This fixed the problem in most cases. But we also found that under
heavy load, which can increase the wallclock time of bpf_seq_read(),
the warning may still show up. For example, calling bpf_delay() in
the "while" loop of bpf_seq_read() to introduce an artificial delay
makes the warning show up in my qemu run.

  static unsigned q;
  volatile unsigned *p = &q;
  volatile unsigned long long ll;
  static void bpf_delay(void)
  {
          int i, j;

          for (i = 0; i < 10000; i++)
                  for (j = 0; j < 10000; j++)
                          ll += *p;
  }

There are two ways to fix this issue. One is to reduce the above one
million threshold to, say, 100,000 and hope the rcu warning will not
show up any more. The other is to introduce a target feature which
allows bpf_seq_read() to call cond_resched(). This patch takes the
second approach, as the first one may cause more -EAGAIN failures for
read() syscalls. Note that not all bpf_iter targets can permit
cond_resched() in bpf_seq_read(): in some of them, e.g., the netlink
seq iterator, an rcu read lock critical section spans
seq_ops->next() -> seq_ops->show() -> seq_ops->next().

For the kernel code with the above hack, "bpftool p" takes roughly
38 seconds to finish on my VM with 184 bpf program runs. Using the
following command, I am able to collect the number of context
switches:

  perf stat -e context-switches -- ./bpftool p >& log

Without this patch: 69 context-switches
With this patch:    75 context-switches

This patch adds 6 additional context switches, roughly one reschedule
every 6 seconds, to avoid the lengthy no-rescheduling period which may
cause the above RCU warnings.
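To make the opt-in mechanism concrete, the behavior can be modeled in plain
userspace C: the read loop only inserts a rescheduling point when the target
has declared it safe to do so. This is a minimal sketch with hypothetical
names (iter_target and ITER_RESCHED stand in for bpf_iter_reg::feature and
BPF_ITER_RESCHED), and sched_yield() stands in for the kernel's
cond_resched():

  #include <sched.h>
  #include <stdbool.h>
  #include <stdio.h>

  #define ITER_RESCHED (1U << 0)          /* models BPF_ITER_RESCHED */

  struct iter_target {
          const char *name;
          unsigned int feature;           /* models bpf_iter_reg::feature */
  };

  /* Models bpf_seq_read(): yield between objects only if the target opted in. */
  static void read_loop(const struct iter_target *tgt, int nr_objs)
  {
          bool can_resched = tgt->feature & ITER_RESCHED;

          for (int i = 0; i < nr_objs; i++) {
                  /* ... visit one object, possibly running a bpf program ... */
                  if (can_resched)
                          sched_yield();  /* stands in for cond_resched() */
          }
  }

  int main(void)
  {
          struct iter_target task_file = { "task_file", ITER_RESCHED };
          struct iter_target netlink = { "netlink", 0 };

          read_loop(&task_file, 1000);    /* may reschedule between objects */
          read_loop(&netlink, 1000);      /* must not: rcu section spans next/show */
          printf("done\n");
          return 0;
  }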
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com --- include/linux/bpf.h | 5 +++++ kernel/bpf/bpf_iter.c | 14 ++++++++++++++ kernel/bpf/task_iter.c | 2 ++ 3 files changed, 21 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b16bf48aab6..2fffd30e13ac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1294,6 +1294,10 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +enum bpf_iter_feature { + BPF_ITER_RESCHED = BIT(0), +}; + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; @@ -1302,6 +1306,7 @@ struct bpf_iter_reg { bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; + u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8f10e30ea0b0..5454161407f1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,15 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static bool bpf_iter_support_resched(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 @@ -83,6 +92,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; + bool can_resched; void *p; mutex_lock(&seq->lock); @@ -135,6 +145,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, goto done; } + can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; @@ -180,6 +191,9 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } break; } + + if (can_resched) + cond_resched(); } stop: offs = seq->count; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5b6af30bfbcd..1fdb2fc196cd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -337,6 +337,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), @@ -354,6 +355,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), From c50eb518e262fa06bd334e6eec172eaf5d7a5bd9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:24 -0700 Subject: [PATCH 02/66] bpf: Use separate lockdep class for each hashtab If a hashtab is accessed in both NMI and non-NMI contexts, it may cause deadlock in bucket->lock. 
LOCKDEP NMI warning highlighted this issue: ./test_progs -t stacktrace [ 74.828970] [ 74.828971] ================================ [ 74.828972] WARNING: inconsistent lock state [ 74.828973] 5.9.0-rc8+ #275 Not tainted [ 74.828974] -------------------------------- [ 74.828975] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 74.828976] taskset/1174 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 74.828977] ffffc90000ee96b0 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x271/0x5a0 [ 74.828981] {INITIAL USE} state was registered at: [ 74.828982] lock_acquire+0x137/0x510 [ 74.828983] _raw_spin_lock_irqsave+0x43/0x90 [ 74.828984] htab_map_update_elem+0x271/0x5a0 [ 74.828984] 0xffffffffa0040b34 [ 74.828985] trace_call_bpf+0x159/0x310 [ 74.828986] perf_trace_run_bpf_submit+0x5f/0xd0 [ 74.828987] perf_trace_urandom_read+0x1be/0x220 [ 74.828988] urandom_read_nowarn.isra.0+0x26f/0x380 [ 74.828989] vfs_read+0xf8/0x280 [ 74.828989] ksys_read+0xc9/0x160 [ 74.828990] do_syscall_64+0x33/0x40 [ 74.828991] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 74.828992] irq event stamp: 1766 [ 74.828993] hardirqs last enabled at (1765): [] asm_exc_page_fault+0x1e/0x30 [ 74.828994] hardirqs last disabled at (1766): [] irqentry_enter+0x37/0x60 [ 74.828995] softirqs last enabled at (856): [] fpu__clear+0xac/0x120 [ 74.828996] softirqs last disabled at (854): [] fpu__clear+0x20/0x120 [ 74.828997] [ 74.828998] other info that might help us debug this: [ 74.828999] Possible unsafe locking scenario: [ 74.828999] [ 74.829000] CPU0 [ 74.829001] ---- [ 74.829001] lock(&htab->buckets[i].raw_lock); [ 74.829003] [ 74.829004] lock(&htab->buckets[i].raw_lock); [ 74.829006] [ 74.829006] *** DEADLOCK *** [ 74.829007] [ 74.829008] 1 lock held by taskset/1174: [ 74.829008] #0: ffff8883ec3fd020 (&cpuctx_lock){-...}-{2:2}, at: perf_event_task_tick+0x101/0x650 [ 74.829012] [ 74.829013] stack backtrace: [ 74.829014] CPU: 0 PID: 1174 Comm: taskset Not tainted 5.9.0-rc8+ #275 [ 74.829015] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 74.829016] Call Trace: [ 74.829016] [ 74.829017] dump_stack+0x9a/0xd0 [ 74.829018] lock_acquire+0x461/0x510 [ 74.829019] ? lock_release+0x6b0/0x6b0 [ 74.829020] ? stack_map_get_build_id_offset+0x45e/0x800 [ 74.829021] ? htab_map_update_elem+0x271/0x5a0 [ 74.829022] ? rcu_read_lock_held_common+0x1a/0x50 [ 74.829022] ? rcu_read_lock_held+0x5f/0xb0 [ 74.829023] _raw_spin_lock_irqsave+0x43/0x90 [ 74.829024] ? htab_map_update_elem+0x271/0x5a0 [ 74.829025] htab_map_update_elem+0x271/0x5a0 [ 74.829026] bpf_prog_1fd9e30e1438d3c5_oncpu+0x9c/0xe88 [ 74.829027] bpf_overflow_handler+0x127/0x320 [ 74.829028] ? perf_event_text_poke_output+0x4d0/0x4d0 [ 74.829029] ? sched_clock_cpu+0x18/0x130 [ 74.829030] __perf_event_overflow+0xae/0x190 [ 74.829030] handle_pmi_common+0x34c/0x470 [ 74.829031] ? intel_pmu_save_and_restart+0x90/0x90 [ 74.829032] ? lock_acquire+0x3f8/0x510 [ 74.829033] ? lock_release+0x6b0/0x6b0 [ 74.829034] intel_pmu_handle_irq+0x11e/0x240 [ 74.829034] perf_event_nmi_handler+0x40/0x60 [ 74.829035] nmi_handle+0x110/0x360 [ 74.829036] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829037] default_do_nmi+0x6b/0x170 [ 74.829038] exc_nmi+0x106/0x130 [ 74.829038] end_repeat_nmi+0x16/0x55 [ 74.829039] RIP: 0010:__intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829042] Code: 2f 1f 03 48 8d bb b8 0c 00 00 e8 29 09 41 00 48 ... 
[ 74.829043] RSP: 0000:ffff8880a604fc90 EFLAGS: 00000002 [ 74.829044] RAX: 000000070000000f RBX: ffff8883ec2195a0 RCX: 000000000000038f [ 74.829045] RDX: 0000000000000007 RSI: ffffffff82e72c20 RDI: ffff8883ec21a258 [ 74.829046] RBP: 000000070000000f R08: ffffffff8101b013 R09: fffffbfff0a7982d [ 74.829047] R10: ffffffff853cc167 R11: fffffbfff0a7982c R12: 0000000000000000 [ 74.829049] R13: ffff8883ec3f0af0 R14: ffff8883ec3fd120 R15: ffff8883e9c92098 [ 74.829049] ? intel_pmu_lbr_enable_all+0x43/0x240 [ 74.829050] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829051] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829052] [ 74.829053] perf_event_task_tick+0x48d/0x650 [ 74.829054] scheduler_tick+0x129/0x210 [ 74.829054] update_process_times+0x37/0x70 [ 74.829055] tick_sched_handle.isra.0+0x35/0x90 [ 74.829056] tick_sched_timer+0x8f/0xb0 [ 74.829057] __hrtimer_run_queues+0x364/0x7d0 [ 74.829058] ? tick_sched_do_timer+0xa0/0xa0 [ 74.829058] ? enqueue_hrtimer+0x1e0/0x1e0 [ 74.829059] ? recalibrate_cpu_khz+0x10/0x10 [ 74.829060] ? ktime_get_update_offsets_now+0x1a3/0x360 [ 74.829061] hrtimer_interrupt+0x1bb/0x360 [ 74.829062] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 74.829063] __sysvec_apic_timer_interrupt+0xed/0x3d0 [ 74.829064] sysvec_apic_timer_interrupt+0x3f/0xd0 [ 74.829064] ? asm_sysvec_apic_timer_interrupt+0xa/0x20 [ 74.829065] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 74.829066] RIP: 0033:0x7fba18d579b4 [ 74.829068] Code: 74 54 44 0f b6 4a 04 41 83 e1 0f 41 80 f9 ... [ 74.829069] RSP: 002b:00007ffc9ba69570 EFLAGS: 00000206 [ 74.829071] RAX: 00007fba192084c0 RBX: 00007fba18c24d28 RCX: 00000000000007a4 [ 74.829072] RDX: 00007fba18c30488 RSI: 0000000000000000 RDI: 000000000000037b [ 74.829073] RBP: 00007fba18ca5760 R08: 00007fba18c248fc R09: 00007fba18c94c30 [ 74.829074] R10: 000000000000002f R11: 0000000000073c30 R12: 00007ffc9ba695e0 [ 74.829075] R13: 00000000000003f3 R14: 00007fba18c21ac8 R15: 00000000000058d6 However, such warning should not apply across multiple hashtabs. The system will not deadlock if one hashtab is used in NMI, while another hashtab is used in non-NMI. Use separate lockdep class for each hashtab, so that we don't get this false alert. 
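The general pattern applied here -- giving each dynamically allocated object
its own lockdep class so that lock usage in one instance is not conflated
with lock usage in another -- looks roughly like the following kernel-style
sketch (an illustrative struct foo, not the hashtab code itself):

  #include <linux/lockdep.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>

  struct foo {
          spinlock_t lock;
          struct lock_class_key lockdep_key;      /* one lockdep class per instance */
  };

  static struct foo *foo_alloc(void)
  {
          struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

          if (!f)
                  return NULL;

          /* Register a dynamic key and bind this instance's lock to it, so
           * lockdep tracks each foo's lock as its own class.
           */
          lockdep_register_key(&f->lockdep_key);
          spin_lock_init(&f->lock);
          lockdep_set_class(&f->lock, &f->lockdep_key);
          return f;
  }

  static void foo_free(struct foo *f)
  {
          /* The key must be unregistered before the object is freed. */
          lockdep_unregister_key(&f->lockdep_key);
          kfree(f);
  }

Keeping the register/unregister pairing correct on every path, including
error paths, is exactly what a follow-up fix later in this series addresses.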
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-2-songliubraving@fb.com --- kernel/bpf/hashtab.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..278da031c91a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -99,6 +99,7 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; + struct lock_class_key lockdep_key; }; /* each htab element is struct htab_elem + key + value */ @@ -136,12 +137,18 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; + lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) + if (htab_use_raw_lock(htab)) { raw_spin_lock_init(&htab->buckets[i].raw_lock); - else + lockdep_set_class(&htab->buckets[i].raw_lock, + &htab->lockdep_key); + } else { spin_lock_init(&htab->buckets[i].lock); + lockdep_set_class(&htab->buckets[i].lock, + &htab->lockdep_key); + } } } @@ -1312,6 +1319,7 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } From 20b6cc34ea74b6a84599c1f8a70f3315b56a1883 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:25 -0700 Subject: [PATCH 03/66] bpf: Avoid hashtab deadlock with map_locked If a hashtab is accessed in both non-NMI and NMI context, the system may deadlock on bucket->lock. Fix this issue with percpu counter map_locked. map_locked rejects concurrent access to the same bucket from the same CPU. To reduce memory overhead, map_locked is not added per bucket. Instead, 8 percpu counters are added to each hashtab. buckets are assigned to these counters based on the lower bits of its hash. 
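Condensed from the htab_lock_bucket() change in the diff below, the guard
amounts to a per-CPU "try-enter" on one of the 8 lock groups selected by the
low bits of the bucket hash; if the same CPU is already inside that group
(e.g. an NMI-context program interrupted a non-NMI update), the caller backs
off with -EBUSY instead of spinning on a lock it can never acquire. A rough
sketch, with illustrative names:

  #include <linux/percpu.h>
  #include <linux/preempt.h>
  #include <linux/types.h>

  #define HASHTAB_MAP_LOCK_COUNT 8
  #define HASHTAB_MAP_LOCK_MASK  (HASHTAB_MAP_LOCK_COUNT - 1)

  /* Sketch of the re-entrancy guard taken before the bucket lock. */
  static inline int try_enter_lock_group(int __percpu *map_locked[], u32 hash)
  {
          int __percpu *cnt = map_locked[hash & HASHTAB_MAP_LOCK_MASK];

          migrate_disable();
          if (unlikely(__this_cpu_inc_return(*cnt) != 1)) {
                  /* This CPU is already inside this lock group -- taking the
                   * bucket lock again could deadlock, so bail out instead.
                   */
                  __this_cpu_dec(*cnt);
                  migrate_enable();
                  return -EBUSY;
          }
          return 0;       /* caller may now take the bucket's spinlock */
  }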
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-3-songliubraving@fb.com --- kernel/bpf/hashtab.c | 114 +++++++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 32 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 278da031c91a..da59ba978d17 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -86,6 +86,9 @@ struct bucket { }; }; +#define HASHTAB_MAP_LOCK_COUNT 8 +#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) + struct bpf_htab { struct bpf_map map; struct bucket *buckets; @@ -100,6 +103,7 @@ struct bpf_htab { u32 elem_size; /* size of each element in bytes */ u32 hashrnd; struct lock_class_key lockdep_key; + int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -152,26 +156,41 @@ static void htab_init_buckets(struct bpf_htab *htab) } } -static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b) +static inline int htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b, u32 hash, + unsigned long *pflags) { unsigned long flags; + hash = hash & HASHTAB_MAP_LOCK_MASK; + + migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); + return -EBUSY; + } + if (htab_use_raw_lock(htab)) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); - return flags; + *pflags = flags; + + return 0; } static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, + struct bucket *b, u32 hash, unsigned long flags) { + hash = hash & HASHTAB_MAP_LOCK_MASK; if (htab_use_raw_lock(htab)) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -429,8 +448,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; + int err, i; u64 cost; - int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -487,6 +506,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_charge; + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { + htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), + sizeof(int), GFP_USER); + if (!htab->map_locked[i]) + goto free_map_locked; + } + if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -497,7 +523,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (prealloc) { err = prealloc_init(htab); if (err) - goto free_buckets; + goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so @@ -513,7 +539,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); -free_buckets: +free_map_locked: + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); free_charge: bpf_map_charge_finish(&htab->map.memory); @@ -694,12 +722,15 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; + int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = 
&b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + if (ret) + return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -707,7 +738,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, tgt_l->hash, flags); return l == tgt_l; } @@ -979,7 +1010,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1020,7 +1053,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1058,7 +1091,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1077,7 +1112,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1112,7 +1147,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1135,7 +1172,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1175,7 +1212,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1197,7 +1236,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1225,7 +1264,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1235,17 +1274,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); - ret = 0; + } else { + ret = -ENOENT; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1257,7 +1299,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && 
!rcu_read_lock_trace_held()); @@ -1267,16 +1309,18 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - ret = 0; - } + else + ret = -ENOENT; - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1302,6 +1346,7 @@ static void delete_all_elements(struct bpf_htab *htab) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1320,6 +1365,8 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); lockdep_unregister_key(&htab->lockdep_key); + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); kfree(htab); } @@ -1423,8 +1470,11 @@ again_nocopy: b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ - if (locked) - flags = htab_lock_bucket(htab, b); + if (locked) { + ret = htab_lock_bucket(htab, b, batch, &flags); + if (ret) + goto next_batch; + } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1441,7 +1491,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1452,7 +1502,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1505,7 +1555,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); locked = false; while (node_to_free) { From 8aaeed81fcb917b5cf4976932c5baefa1471128b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Nov 2020 03:41:00 -0800 Subject: [PATCH 04/66] bpf: Fix error path in htab_map_alloc() syzbot was able to trigger a use-after-free in htab_map_alloc() [1] htab_map_alloc() lacks a call to lockdep_unregister_key() in its error path. 
lockdep_register_key() and lockdep_unregister_key() can not fail, it seems better to use them right after htab allocation and before htab freeing, avoiding more goto/labels in htab_map_alloc() [1] BUG: KASAN: use-after-free in lockdep_register_key+0x356/0x3e0 kernel/locking/lockdep.c:1182 Read of size 8 at addr ffff88805fa67ad8 by task syz-executor.3/2356 CPU: 1 PID: 2356 Comm: syz-executor.3 Not tainted 5.9.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x4c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:562 lockdep_register_key+0x356/0x3e0 kernel/locking/lockdep.c:1182 htab_init_buckets kernel/bpf/hashtab.c:144 [inline] htab_map_alloc+0x6c5/0x14a0 kernel/bpf/hashtab.c:521 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45deb9 Code: 0d b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f0eafee1c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000001a00 RCX: 000000000045deb9 RDX: 0000000000000040 RSI: 0000000020000040 RDI: 405a020000000000 RBP: 000000000118bf60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000118bf2c R13: 00007ffd3cf9eabf R14: 00007f0eafee29c0 R15: 000000000118bf2c Allocated by task 2053: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc.constprop.0+0xc2/0xd0 mm/kasan/common.c:461 kmalloc include/linux/slab.h:554 [inline] kzalloc include/linux/slab.h:666 [inline] htab_map_alloc+0xdf/0x14a0 kernel/bpf/hashtab.c:454 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 2053: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0x102/0x140 mm/kasan/common.c:422 slab_free_hook mm/slub.c:1544 [inline] slab_free_freelist_hook+0x5d/0x150 mm/slub.c:1577 slab_free mm/slub.c:3142 [inline] kfree+0xdb/0x360 mm/slub.c:4124 htab_map_alloc+0x3f9/0x14a0 kernel/bpf/hashtab.c:549 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88805fa67800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 728 bytes inside of 1024-byte region [ffff88805fa67800, ffff88805fa67c00) The buggy address belongs to the page: page:000000003c5582c4 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5fa60 head:000000003c5582c4 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head) raw: 00fff00000010200 ffffea0000bc1200 0000000200000002 
ffff888010041140 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88805fa67980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88805fa67a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88805fa67b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88805fa67b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: c50eb518e262 ("bpf: Use separate lockdep class for each hashtab") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201102114100.3103180-1-eric.dumazet@gmail.com --- kernel/bpf/hashtab.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da59ba978d17..23f73d4649c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -141,7 +141,6 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; - lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); if (htab_use_raw_lock(htab)) { @@ -455,6 +454,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); + lockdep_register_key(&htab->lockdep_key); + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -546,6 +547,7 @@ free_map_locked: free_charge: bpf_map_charge_finish(&htab->map.memory); free_htab: + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); return ERR_PTR(err); } @@ -1364,9 +1366,9 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); - lockdep_unregister_key(&htab->lockdep_key); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } From aaf376bddf68d0afe5f4b5f25fc555da358e2287 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Nov 2020 13:34:48 -0800 Subject: [PATCH 05/66] selftests/bpf: Move test_tcppbf_user into test_progs Recently a bug was missed due to the fact that test_tcpbpf_user is not a part of test_progs. In order to prevent similar issues in the future move the test functionality into test_progs. By doing this we can make certain that it is a part of standard testing and will not be overlooked. As a part of moving the functionality into test_progs it is necessary to integrate with the test_progs framework and to drop any redundant code. This patch: 1. Cleans up the include headers 2. Dropped a duplicate definition of bpf_find_map 3. Switched over to using test_progs specific cgroup functions 4. Renamed main to test_tcpbpf_user 5. Dropped return value in favor of CHECK_FAIL to check for errors The general idea is that I wanted to keep the changes as small as possible while moving the file into the test_progs framework. The follow-on patches are meant to clean up the remaining issues such as the use of CHECK_FAIL. 
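For context, a test under prog_tests/ follows a fixed convention: the file
defines a void test_<name>(void) entry point that the test_progs harness
discovers and runs, joins any cgroup it needs through the harness helper, and
reports failures through the harness macros instead of printf()/exit(). A
minimal sketch of that shape (simplified, not the actual tcpbpf test):

  // SPDX-License-Identifier: GPL-2.0
  /* tools/testing/selftests/bpf/prog_tests/example.c -- illustrative only */
  #include <test_progs.h>

  static __u32 duration;                  /* referenced by the CHECK() macro */

  void test_example(void)
  {
          int cg_fd;

          cg_fd = test__join_cgroup("/example-test");
          if (CHECK(cg_fd < 0, "join_cgroup", "cg_fd:%d errno:%d\n",
                    cg_fd, errno))
                  return;

          /* ... load the BPF object, attach, exercise the workload ... */

          CHECK_FAIL(0 /* replace with the test's error condition */);
          close(cg_fd);
  }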
Signed-off-by: Alexander Duyck Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160443928881.1086697.17661359319919165370.stgit@localhost.localdomain --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 3 +- .../tcpbpf_user.c} | 42 ++++--------------- 3 files changed, 10 insertions(+), 36 deletions(-) rename tools/testing/selftests/bpf/{test_tcpbpf_user.c => prog_tests/tcpbpf_user.c} (81%) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3ab1200e172f..395ae040ce1f 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -8,7 +8,6 @@ FEATURE-DUMP.libbpf fixdep test_dev_cgroup /test_progs* -test_tcpbpf_user test_verifier_log feature test_sock diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 542768f5195b..50e5b18fc455 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -32,7 +32,7 @@ LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_verifier_log test_dev_cgroup test_tcpbpf_user \ + test_verifier_log test_dev_cgroup \ test_sock test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sysctl \ @@ -163,7 +163,6 @@ $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c $(OUTPUT)/test_socket_cookie: cgroup_helpers.c $(OUTPUT)/test_sockmap: cgroup_helpers.c -$(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c $(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c similarity index 81% rename from tools/testing/selftests/bpf/test_tcpbpf_user.c rename to tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c index 74a9e49988b6..caa8d3adec8a 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf_user.c +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -1,21 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "bpf_rlimit.h" -#include "bpf_util.h" -#include "cgroup_helpers.h" +#include #include "test_tcpbpf.h" +#define CG_NAME "/tcpbpf-user-test" + /* 3 comes from one listening socket + both ends of the connection */ #define EXPECTED_CLOSE_EVENTS 3 @@ -76,25 +66,11 @@ int verify_sockopt_result(int sock_map_fd) return ret; } -static int bpf_find_map(const char *test, struct bpf_object *obj, - const char *name) -{ - struct bpf_map *map; - - map = bpf_object__find_map_by_name(obj, name); - if (!map) { - printf("%s:FAIL:map '%s' not found\n", test, name); - return -1; - } - return bpf_map__fd(map); -} - -int main(int argc, char **argv) +void test_tcpbpf_user(void) { const char *file = "test_tcpbpf_kern.o"; int prog_fd, map_fd, sock_map_fd; struct tcpbpf_globals g = {0}; - const char *cg_path = "/foo"; int error = EXIT_FAILURE; struct bpf_object *obj; int cg_fd = -1; @@ -102,7 +78,7 @@ int main(int argc, char **argv) __u32 key = 0; int rv; - cg_fd = cgroup_setup_and_join(cg_path); + cg_fd = test__join_cgroup(CG_NAME); if (cg_fd < 0) goto err; @@ -155,11 +131,11 @@ retry_lookup: goto err; } - printf("PASSED!\n"); error = 0; 
err: bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); - close(cg_fd); - cleanup_cgroup_environment(); - return error; + if (cg_fd != -1) + close(cg_fd); + + CHECK_FAIL(error); } From 247f0ec361b7e0c5c67db8222873182fb8be5146 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Nov 2020 13:34:56 -0800 Subject: [PATCH 06/66] selftests/bpf: Drop python client/server in favor of threads Drop the tcp_client/server.py files in favor of using a client and server thread within the test case. Specifically we spawn a new thread to play the role of the server, and the main testing thread plays the role of client. Add logic to the end of the run_test function to guarantee that the sockets are closed when we begin verifying results. Doing this we are able to reduce overhead since we don't have two python workers possibly floating around. In addition we don't have to worry about synchronization issues and as such the retry loop waiting for the threads to close the sockets can be dropped as we will have already closed the sockets in the local executable and synchronized the server thread. Signed-off-by: Alexander Duyck Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160443929638.1086697.2430242340980315521.stgit@localhost.localdomain --- .../selftests/bpf/prog_tests/tcpbpf_user.c | 95 +++++++++++++++---- tools/testing/selftests/bpf/tcp_client.py | 50 ---------- tools/testing/selftests/bpf/tcp_server.py | 80 ---------------- 3 files changed, 78 insertions(+), 147 deletions(-) delete mode 100755 tools/testing/selftests/bpf/tcp_client.py delete mode 100755 tools/testing/selftests/bpf/tcp_server.py diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c index caa8d3adec8a..616269abdb41 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include "test_tcpbpf.h" +#define LO_ADDR6 "::1" #define CG_NAME "/tcpbpf-user-test" -/* 3 comes from one listening socket + both ends of the connection */ -#define EXPECTED_CLOSE_EVENTS 3 +static __u32 duration; #define EXPECT_EQ(expected, actual, fmt) \ do { \ @@ -42,7 +43,9 @@ int verify_result(const struct tcpbpf_globals *result) EXPECT_EQ(0x80, result->bad_cb_test_rv, PRIu32); EXPECT_EQ(0, result->good_cb_test_rv, PRIu32); EXPECT_EQ(1, result->num_listen, PRIu32); - EXPECT_EQ(EXPECTED_CLOSE_EVENTS, result->num_close_events, PRIu32); + + /* 3 comes from one listening socket + both ends of the connection */ + EXPECT_EQ(3, result->num_close_events, PRIu32); return ret; } @@ -66,6 +69,75 @@ int verify_sockopt_result(int sock_map_fd) return ret; } +static int run_test(void) +{ + int listen_fd = -1, cli_fd = -1, accept_fd = -1; + char buf[1000]; + int err = -1; + int i, rv; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0); + if (CHECK(listen_fd == -1, "start_server", "listen_fd:%d errno:%d\n", + listen_fd, errno)) + goto done; + + cli_fd = connect_to_fd(listen_fd, 0); + if (CHECK(cli_fd == -1, "connect_to_fd(listen_fd)", + "cli_fd:%d errno:%d\n", cli_fd, errno)) + goto done; + + accept_fd = accept(listen_fd, NULL, NULL); + if (CHECK(accept_fd == -1, "accept(listen_fd)", + "accept_fd:%d errno:%d\n", accept_fd, errno)) + goto done; + + /* Send 1000B of '+'s from cli_fd -> accept_fd */ + for (i = 0; i < 1000; i++) + buf[i] = '+'; + + rv = send(cli_fd, buf, 1000, 0); + if (CHECK(rv != 1000, "send(cli_fd)", "rv:%d 
errno:%d\n", rv, errno)) + goto done; + + rv = recv(accept_fd, buf, 1000, 0); + if (CHECK(rv != 1000, "recv(accept_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + /* Send 500B of '.'s from accept_fd ->cli_fd */ + for (i = 0; i < 500; i++) + buf[i] = '.'; + + rv = send(accept_fd, buf, 500, 0); + if (CHECK(rv != 500, "send(accept_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + rv = recv(cli_fd, buf, 500, 0); + if (CHECK(rv != 500, "recv(cli_fd)", "rv:%d errno:%d\n", rv, errno)) + goto done; + + /* + * shutdown accept first to guarantee correct ordering for + * bytes_received and bytes_acked when we go to verify the results. + */ + shutdown(accept_fd, SHUT_WR); + err = recv(cli_fd, buf, 1, 0); + if (CHECK(err, "recv(cli_fd) for fin", "err:%d errno:%d\n", err, errno)) + goto done; + + shutdown(cli_fd, SHUT_WR); + err = recv(accept_fd, buf, 1, 0); + CHECK(err, "recv(accept_fd) for fin", "err:%d errno:%d\n", err, errno); +done: + if (accept_fd != -1) + close(accept_fd); + if (cli_fd != -1) + close(cli_fd); + if (listen_fd != -1) + close(listen_fd); + + return err; +} + void test_tcpbpf_user(void) { const char *file = "test_tcpbpf_kern.o"; @@ -74,7 +146,6 @@ void test_tcpbpf_user(void) int error = EXIT_FAILURE; struct bpf_object *obj; int cg_fd = -1; - int retry = 10; __u32 key = 0; int rv; @@ -94,11 +165,6 @@ void test_tcpbpf_user(void) goto err; } - if (system("./tcp_server.py")) { - printf("FAILED: TCP server\n"); - goto err; - } - map_fd = bpf_find_map(__func__, obj, "global_map"); if (map_fd < 0) goto err; @@ -107,20 +173,15 @@ void test_tcpbpf_user(void) if (sock_map_fd < 0) goto err; -retry_lookup: + if (run_test()) + goto err; + rv = bpf_map_lookup_elem(map_fd, &key, &g); if (rv != 0) { printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); goto err; } - if (g.num_close_events != EXPECTED_CLOSE_EVENTS && retry--) { - printf("Unexpected number of close events (%d), retrying!\n", - g.num_close_events); - usleep(100); - goto retry_lookup; - } - if (verify_result(&g)) { printf("FAILED: Wrong stats\n"); goto err; diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py deleted file mode 100755 index bfff82be3fc1..000000000000 --- a/tools/testing/selftests/bpf/tcp_client.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# -# SPDX-License-Identifier: GPL-2.0 -# - -import sys, os, os.path, getopt -import socket, time -import subprocess -import select - -def read(sock, n): - buf = b'' - while len(buf) < n: - rem = n - len(buf) - try: s = sock.recv(rem) - except (socket.error) as e: return b'' - buf += s - return buf - -def send(sock, s): - total = len(s) - count = 0 - while count < total: - try: n = sock.send(s) - except (socket.error) as e: n = 0 - if n == 0: - return count; - count += n - return count - - -serverPort = int(sys.argv[1]) - -# create active socket -sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) -try: - sock.connect(('::1', serverPort)) -except socket.error as e: - sys.exit(1) - -buf = b'' -n = 0 -while n < 1000: - buf += b'+' - n += 1 - -sock.settimeout(1); -n = send(sock, buf) -n = read(sock, 500) -sys.exit(0) diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py deleted file mode 100755 index 42ab8882f00f..000000000000 --- a/tools/testing/selftests/bpf/tcp_server.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -# -# SPDX-License-Identifier: GPL-2.0 -# - -import sys, os, os.path, getopt -import socket, time -import subprocess -import select - -def 
read(sock, n): - buf = b'' - while len(buf) < n: - rem = n - len(buf) - try: s = sock.recv(rem) - except (socket.error) as e: return b'' - buf += s - return buf - -def send(sock, s): - total = len(s) - count = 0 - while count < total: - try: n = sock.send(s) - except (socket.error) as e: n = 0 - if n == 0: - return count; - count += n - return count - - -SERVER_PORT = 12877 -MAX_PORTS = 2 - -serverPort = SERVER_PORT -serverSocket = None - -# create passive socket -serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) - -try: serverSocket.bind(('::1', 0)) -except socket.error as msg: - print('bind fails: ' + str(msg)) - -sn = serverSocket.getsockname() -serverPort = sn[1] - -cmdStr = ("./tcp_client.py %d &") % (serverPort) -os.system(cmdStr) - -buf = b'' -n = 0 -while n < 500: - buf += b'.' - n += 1 - -serverSocket.listen(MAX_PORTS) -readList = [serverSocket] - -while True: - readyRead, readyWrite, inError = \ - select.select(readList, [], [], 2) - - if len(readyRead) > 0: - waitCount = 0 - for sock in readyRead: - if sock == serverSocket: - (clientSocket, address) = serverSocket.accept() - address = str(address[0]) - readList.append(clientSocket) - else: - sock.settimeout(1); - s = read(sock, 1000) - n = send(sock, buf) - sock.close() - serverSocket.close() - sys.exit(0) - else: - print('Select timeout!') - sys.exit(1) From d3813ea14b696053c076123239093822b527f0f7 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Nov 2020 13:35:04 -0800 Subject: [PATCH 07/66] selftests/bpf: Replace EXPECT_EQ with ASSERT_EQ and refactor verify_results There is already logic in test_progs.h for asserting that a value is expected to be another value. So instead of reinventing it we should just make use of ASSERT_EQ in tcpbpf_user.c. This will allow for better debugging and integrates much more closely with the test_progs framework. In addition we can refactor the code a bit to merge together the two verify functions and tie them together into a single function. Doing this helps to clean the code up a bit and makes it more readable as all the verification is now done in one function. Lastly we can relocate the verification to the end of the run_test since it is logically part of the test itself. With this we can drop the need for a return value from run_test since verification becomes the last step of the call and then immediately following is the tear down of the test setup. 
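In practice the change is mostly about plugging into the harness's
accounting: ASSERT_EQ() and CHECK() record the failure with test_progs (and
print the actual vs. expected values) instead of decrementing a local return
counter that the caller must remember to inspect. Side by side, with values
taken from the diff below:

  /* Before: local helper macro; a mismatch only adjusts a local 'ret'. */
  EXPECT_EQ(501ULL, result->bytes_received, "llu");

  /* After: harness macros; failures are reported by test_progs itself. */
  ASSERT_EQ(result->bytes_received, 501, "bytes_received");
  CHECK(expected_events != result->event_map, "event_map",
        "unexpected event_map: actual 0x%08x != expected 0x%08x\n",
        result->event_map, expected_events);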
Signed-off-by: Alexander Duyck Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/160443930408.1086697.16101205859962113000.stgit@localhost.localdomain --- .../selftests/bpf/prog_tests/tcpbpf_user.c | 111 +++++++----------- 1 file changed, 41 insertions(+), 70 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c index 616269abdb41..22c359871af6 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include #include @@ -10,66 +9,56 @@ static __u32 duration; -#define EXPECT_EQ(expected, actual, fmt) \ - do { \ - if ((expected) != (actual)) { \ - printf(" Value of: " #actual "\n" \ - " Actual: %" fmt "\n" \ - " Expected: %" fmt "\n", \ - (actual), (expected)); \ - ret--; \ - } \ - } while (0) - -int verify_result(const struct tcpbpf_globals *result) +static void verify_result(int map_fd, int sock_map_fd) { - __u32 expected_events; - int ret = 0; + __u32 expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) | + (1 << BPF_SOCK_OPS_RWND_INIT) | + (1 << BPF_SOCK_OPS_TCP_CONNECT_CB) | + (1 << BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) | + (1 << BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) | + (1 << BPF_SOCK_OPS_NEEDS_ECN) | + (1 << BPF_SOCK_OPS_STATE_CB) | + (1 << BPF_SOCK_OPS_TCP_LISTEN_CB)); + struct tcpbpf_globals result; + __u32 key = 0; + int res, rv; - expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) | - (1 << BPF_SOCK_OPS_RWND_INIT) | - (1 << BPF_SOCK_OPS_TCP_CONNECT_CB) | - (1 << BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) | - (1 << BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) | - (1 << BPF_SOCK_OPS_NEEDS_ECN) | - (1 << BPF_SOCK_OPS_STATE_CB) | - (1 << BPF_SOCK_OPS_TCP_LISTEN_CB)); + rv = bpf_map_lookup_elem(map_fd, &key, &result); + if (CHECK(rv, "bpf_map_lookup_elem(map_fd)", "err:%d errno:%d", + rv, errno)) + return; - EXPECT_EQ(expected_events, result->event_map, "#" PRIx32); - EXPECT_EQ(501ULL, result->bytes_received, "llu"); - EXPECT_EQ(1002ULL, result->bytes_acked, "llu"); - EXPECT_EQ(1, result->data_segs_in, PRIu32); - EXPECT_EQ(1, result->data_segs_out, PRIu32); - EXPECT_EQ(0x80, result->bad_cb_test_rv, PRIu32); - EXPECT_EQ(0, result->good_cb_test_rv, PRIu32); - EXPECT_EQ(1, result->num_listen, PRIu32); + /* check global map */ + CHECK(expected_events != result.event_map, "event_map", + "unexpected event_map: actual 0x%08x != expected 0x%08x\n", + result.event_map, expected_events); + + ASSERT_EQ(result.bytes_received, 501, "bytes_received"); + ASSERT_EQ(result.bytes_acked, 1002, "bytes_acked"); + ASSERT_EQ(result.data_segs_in, 1, "data_segs_in"); + ASSERT_EQ(result.data_segs_out, 1, "data_segs_out"); + ASSERT_EQ(result.bad_cb_test_rv, 0x80, "bad_cb_test_rv"); + ASSERT_EQ(result.good_cb_test_rv, 0, "good_cb_test_rv"); + ASSERT_EQ(result.num_listen, 1, "num_listen"); /* 3 comes from one listening socket + both ends of the connection */ - EXPECT_EQ(3, result->num_close_events, PRIu32); - - return ret; -} - -int verify_sockopt_result(int sock_map_fd) -{ - __u32 key = 0; - int ret = 0; - int res; - int rv; + ASSERT_EQ(result.num_close_events, 3, "num_close_events"); /* check setsockopt for SAVE_SYN */ rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); - EXPECT_EQ(0, rv, "d"); - EXPECT_EQ(0, res, "d"); - key = 1; + CHECK(rv, "bpf_map_lookup_elem(sock_map_fd)", "err:%d errno:%d", + rv, errno); + ASSERT_EQ(res, 0, "bpf_setsockopt(TCP_SAVE_SYN)"); + /* check 
getsockopt for SAVED_SYN */ + key = 1; rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); - EXPECT_EQ(0, rv, "d"); - EXPECT_EQ(1, res, "d"); - return ret; + CHECK(rv, "bpf_map_lookup_elem(sock_map_fd)", "err:%d errno:%d", + rv, errno); + ASSERT_EQ(res, 1, "bpf_getsockopt(TCP_SAVED_SYN)"); } -static int run_test(void) +static void run_test(int map_fd, int sock_map_fd) { int listen_fd = -1, cli_fd = -1, accept_fd = -1; char buf[1000]; @@ -135,18 +124,17 @@ done: if (listen_fd != -1) close(listen_fd); - return err; + if (!err) + verify_result(map_fd, sock_map_fd); } void test_tcpbpf_user(void) { const char *file = "test_tcpbpf_kern.o"; int prog_fd, map_fd, sock_map_fd; - struct tcpbpf_globals g = {0}; int error = EXIT_FAILURE; struct bpf_object *obj; int cg_fd = -1; - __u32 key = 0; int rv; cg_fd = test__join_cgroup(CG_NAME); @@ -173,24 +161,7 @@ void test_tcpbpf_user(void) if (sock_map_fd < 0) goto err; - if (run_test()) - goto err; - - rv = bpf_map_lookup_elem(map_fd, &key, &g); - if (rv != 0) { - printf("FAILED: bpf_map_lookup_elem returns %d\n", rv); - goto err; - } - - if (verify_result(&g)) { - printf("FAILED: Wrong stats\n"); - goto err; - } - - if (verify_sockopt_result(sock_map_fd)) { - printf("FAILED: Wrong sockopt stats\n"); - goto err; - } + run_test(map_fd, sock_map_fd); error = 0; err: From 0a099d1429c709020277d24a460d11ff8356a080 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Nov 2020 13:35:11 -0800 Subject: [PATCH 08/66] selftests/bpf: Migrate tcpbpf_user.c to use BPF skeleton Update tcpbpf_user.c to make use of the BPF skeleton. Doing this we can simplify test_tcpbpf_user and reduce the overhead involved in setting up the test. In addition we can clean up the remaining bits such as the one remaining CHECK_FAIL at the end of test_tcpbpf_user so that the function only makes use of CHECK as needed. 
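The test_tcpbpf_kern.skel.h header this patch starts using is generated at
build time from the compiled BPF object (the selftests Makefile runs
"bpftool gen skeleton" for it), giving userspace typed handles to the
object's programs, maps, and links. Condensed from the diff below, the
lifecycle looks roughly like this (helper function name is illustrative):

  #include <bpf/libbpf.h>
  #include "test_tcpbpf_kern.skel.h"    /* generated from test_tcpbpf_kern.o */

  static struct test_tcpbpf_kern *load_and_attach(int cg_fd)
  {
          struct test_tcpbpf_kern *skel;
          struct bpf_link *link;

          skel = test_tcpbpf_kern__open_and_load();  /* open + load in one call */
          if (!skel)
                  return NULL;

          link = bpf_program__attach_cgroup(skel->progs.bpf_testcb, cg_fd);
          if (libbpf_get_error(link)) {
                  test_tcpbpf_kern__destroy(skel);
                  return NULL;
          }
          skel->links.bpf_testcb = link;  /* detached and freed by __destroy() */

          return skel;    /* caller reads results, then calls __destroy() */
  }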
Signed-off-by: Alexander Duyck Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160443931155.1086697.17869006617113525162.stgit@localhost.localdomain --- .../selftests/bpf/prog_tests/tcpbpf_user.c | 41 +++++++------------ 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c index 22c359871af6..bef81648797a 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -3,6 +3,7 @@ #include #include "test_tcpbpf.h" +#include "test_tcpbpf_kern.skel.h" #define LO_ADDR6 "::1" #define CG_NAME "/tcpbpf-user-test" @@ -130,44 +131,30 @@ done: void test_tcpbpf_user(void) { - const char *file = "test_tcpbpf_kern.o"; - int prog_fd, map_fd, sock_map_fd; - int error = EXIT_FAILURE; - struct bpf_object *obj; + struct test_tcpbpf_kern *skel; + int map_fd, sock_map_fd; int cg_fd = -1; - int rv; + + skel = test_tcpbpf_kern__open_and_load(); + if (CHECK(!skel, "open and load skel", "failed")) + return; cg_fd = test__join_cgroup(CG_NAME); - if (cg_fd < 0) + if (CHECK(cg_fd < 0, "test__join_cgroup(" CG_NAME ")", + "cg_fd:%d errno:%d", cg_fd, errno)) goto err; - if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { - printf("FAILED: load_bpf_file failed for: %s\n", file); - goto err; - } + map_fd = bpf_map__fd(skel->maps.global_map); + sock_map_fd = bpf_map__fd(skel->maps.sockopt_results); - rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (rv) { - printf("FAILED: bpf_prog_attach: %d (%s)\n", - error, strerror(errno)); - goto err; - } - - map_fd = bpf_find_map(__func__, obj, "global_map"); - if (map_fd < 0) - goto err; - - sock_map_fd = bpf_find_map(__func__, obj, "sockopt_results"); - if (sock_map_fd < 0) + skel->links.bpf_testcb = bpf_program__attach_cgroup(skel->progs.bpf_testcb, cg_fd); + if (!ASSERT_OK_PTR(skel->links.bpf_testcb, "attach_cgroup(bpf_testcb)")) goto err; run_test(map_fd, sock_map_fd); - error = 0; err: - bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); if (cg_fd != -1) close(cg_fd); - - CHECK_FAIL(error); + test_tcpbpf_kern__destroy(skel); } From 21b5177e997c98643eaabd4b917f2e287395af86 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 3 Nov 2020 13:35:19 -0800 Subject: [PATCH 09/66] selftest/bpf: Use global variables instead of maps for test_tcpbpf_kern Use global variables instead of global_map and sockopt_results_map to track test data. Doing this greatly simplifies the code as there is not need to take the extra steps of updating the maps or looking up elements. 
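This works because non-static globals in a BPF object are backed by the
.data/.bss maps that libbpf creates at load time, and the skeleton exposes
them as plain struct fields: the BPF program updates them with ordinary
memory accesses and userspace reads them back without any
bpf_map_lookup_elem() call. A hypothetical, minimal illustration of the
pattern (not the tcpbpf program itself):

  /* --- BPF side (e.g. progs/example_kern.c) --- */
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  __u32 hits = 0;                         /* zero-initialized global -> .bss map */

  SEC("sockops")
  int count_ops(struct bpf_sock_ops *skops)
  {
          hits++;                         /* plain global access, no map helpers */
          return 1;
  }

  char _license[] SEC("license") = "GPL";

  /* --- Userspace side, via the generated skeleton ---
   *   struct example_kern *skel = example_kern__open_and_load();
   *   ... attach, run the workload ...
   *   printf("hits = %u\n", skel->bss->hits);   // direct .bss access
   */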
Signed-off-by: Alexander Duyck Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/160443931900.1086697.6588858453575682351.stgit@localhost.localdomain --- .../selftests/bpf/prog_tests/tcpbpf_user.c | 51 ++++------- .../selftests/bpf/progs/test_tcpbpf_kern.c | 86 +++---------------- tools/testing/selftests/bpf/test_tcpbpf.h | 2 + 3 files changed, 31 insertions(+), 108 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c index bef81648797a..ab5281475f44 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c +++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c @@ -10,7 +10,7 @@ static __u32 duration; -static void verify_result(int map_fd, int sock_map_fd) +static void verify_result(struct tcpbpf_globals *result) { __u32 expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) | (1 << BPF_SOCK_OPS_RWND_INIT) | @@ -20,46 +20,31 @@ static void verify_result(int map_fd, int sock_map_fd) (1 << BPF_SOCK_OPS_NEEDS_ECN) | (1 << BPF_SOCK_OPS_STATE_CB) | (1 << BPF_SOCK_OPS_TCP_LISTEN_CB)); - struct tcpbpf_globals result; - __u32 key = 0; - int res, rv; - - rv = bpf_map_lookup_elem(map_fd, &key, &result); - if (CHECK(rv, "bpf_map_lookup_elem(map_fd)", "err:%d errno:%d", - rv, errno)) - return; /* check global map */ - CHECK(expected_events != result.event_map, "event_map", + CHECK(expected_events != result->event_map, "event_map", "unexpected event_map: actual 0x%08x != expected 0x%08x\n", - result.event_map, expected_events); + result->event_map, expected_events); - ASSERT_EQ(result.bytes_received, 501, "bytes_received"); - ASSERT_EQ(result.bytes_acked, 1002, "bytes_acked"); - ASSERT_EQ(result.data_segs_in, 1, "data_segs_in"); - ASSERT_EQ(result.data_segs_out, 1, "data_segs_out"); - ASSERT_EQ(result.bad_cb_test_rv, 0x80, "bad_cb_test_rv"); - ASSERT_EQ(result.good_cb_test_rv, 0, "good_cb_test_rv"); - ASSERT_EQ(result.num_listen, 1, "num_listen"); + ASSERT_EQ(result->bytes_received, 501, "bytes_received"); + ASSERT_EQ(result->bytes_acked, 1002, "bytes_acked"); + ASSERT_EQ(result->data_segs_in, 1, "data_segs_in"); + ASSERT_EQ(result->data_segs_out, 1, "data_segs_out"); + ASSERT_EQ(result->bad_cb_test_rv, 0x80, "bad_cb_test_rv"); + ASSERT_EQ(result->good_cb_test_rv, 0, "good_cb_test_rv"); + ASSERT_EQ(result->num_listen, 1, "num_listen"); /* 3 comes from one listening socket + both ends of the connection */ - ASSERT_EQ(result.num_close_events, 3, "num_close_events"); + ASSERT_EQ(result->num_close_events, 3, "num_close_events"); /* check setsockopt for SAVE_SYN */ - rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); - CHECK(rv, "bpf_map_lookup_elem(sock_map_fd)", "err:%d errno:%d", - rv, errno); - ASSERT_EQ(res, 0, "bpf_setsockopt(TCP_SAVE_SYN)"); + ASSERT_EQ(result->tcp_save_syn, 0, "tcp_save_syn"); /* check getsockopt for SAVED_SYN */ - key = 1; - rv = bpf_map_lookup_elem(sock_map_fd, &key, &res); - CHECK(rv, "bpf_map_lookup_elem(sock_map_fd)", "err:%d errno:%d", - rv, errno); - ASSERT_EQ(res, 1, "bpf_getsockopt(TCP_SAVED_SYN)"); + ASSERT_EQ(result->tcp_saved_syn, 1, "tcp_saved_syn"); } -static void run_test(int map_fd, int sock_map_fd) +static void run_test(struct tcpbpf_globals *result) { int listen_fd = -1, cli_fd = -1, accept_fd = -1; char buf[1000]; @@ -126,13 +111,12 @@ done: close(listen_fd); if (!err) - verify_result(map_fd, sock_map_fd); + verify_result(result); } void test_tcpbpf_user(void) { struct test_tcpbpf_kern *skel; - int map_fd, sock_map_fd; int cg_fd = 
-1; skel = test_tcpbpf_kern__open_and_load(); @@ -144,14 +128,11 @@ void test_tcpbpf_user(void) "cg_fd:%d errno:%d", cg_fd, errno)) goto err; - map_fd = bpf_map__fd(skel->maps.global_map); - sock_map_fd = bpf_map__fd(skel->maps.sockopt_results); - skel->links.bpf_testcb = bpf_program__attach_cgroup(skel->progs.bpf_testcb, cg_fd); if (!ASSERT_OK_PTR(skel->links.bpf_testcb, "attach_cgroup(bpf_testcb)")) goto err; - run_test(map_fd, sock_map_fd); + run_test(&skel->bss->global); err: if (cg_fd != -1) diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c index 3e6912e4df3d..e85e49deba70 100644 --- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c @@ -14,40 +14,7 @@ #include #include "test_tcpbpf.h" -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 4); - __type(key, __u32); - __type(value, struct tcpbpf_globals); -} global_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 2); - __type(key, __u32); - __type(value, int); -} sockopt_results SEC(".maps"); - -static inline void update_event_map(int event) -{ - __u32 key = 0; - struct tcpbpf_globals g, *gp; - - gp = bpf_map_lookup_elem(&global_map, &key); - if (gp == NULL) { - struct tcpbpf_globals g = {0}; - - g.event_map |= (1 << event); - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); - } else { - g = *gp; - g.event_map |= (1 << event); - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); - } -} - +struct tcpbpf_globals global = {}; int _version SEC("version") = 1; SEC("sockops") @@ -105,29 +72,15 @@ int bpf_testcb(struct bpf_sock_ops *skops) op = (int) skops->op; - update_event_map(op); + global.event_map |= (1 << op); switch (op) { case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: /* Test failure to set largest cb flag (assumes not defined) */ - bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); + global.bad_cb_test_rv = bpf_sock_ops_cb_flags_set(skops, 0x80); /* Set callback */ - good_call_rv = bpf_sock_ops_cb_flags_set(skops, + global.good_cb_test_rv = bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG); - /* Update results */ - { - __u32 key = 0; - struct tcpbpf_globals g, *gp; - - gp = bpf_map_lookup_elem(&global_map, &key); - if (!gp) - break; - g = *gp; - g.bad_cb_test_rv = bad_call_rv; - g.good_cb_test_rv = good_call_rv; - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); - } break; case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: skops->sk_txhash = 0x12345f; @@ -143,10 +96,8 @@ int bpf_testcb(struct bpf_sock_ops *skops) thdr = (struct tcphdr *)(header + offset); v = thdr->syn; - __u32 key = 1; - bpf_map_update_elem(&sockopt_results, &key, &v, - BPF_ANY); + global.tcp_saved_syn = v; } } break; @@ -156,25 +107,16 @@ int bpf_testcb(struct bpf_sock_ops *skops) break; case BPF_SOCK_OPS_STATE_CB: if (skops->args[1] == BPF_TCP_CLOSE) { - __u32 key = 0; - struct tcpbpf_globals g, *gp; - - gp = bpf_map_lookup_elem(&global_map, &key); - if (!gp) - break; - g = *gp; if (skops->args[0] == BPF_TCP_LISTEN) { - g.num_listen++; + global.num_listen++; } else { - g.total_retrans = skops->total_retrans; - g.data_segs_in = skops->data_segs_in; - g.data_segs_out = skops->data_segs_out; - g.bytes_received = skops->bytes_received; - g.bytes_acked = skops->bytes_acked; + global.total_retrans = skops->total_retrans; + global.data_segs_in = skops->data_segs_in; + global.data_segs_out = skops->data_segs_out; + global.bytes_received = skops->bytes_received; + 
global.bytes_acked = skops->bytes_acked; } - g.num_close_events++; - bpf_map_update_elem(&global_map, &key, &g, - BPF_ANY); + global.num_close_events++; } break; case BPF_SOCK_OPS_TCP_LISTEN_CB: @@ -182,9 +124,7 @@ int bpf_testcb(struct bpf_sock_ops *skops) v = bpf_setsockopt(skops, IPPROTO_TCP, TCP_SAVE_SYN, &save_syn, sizeof(save_syn)); /* Update global map w/ result of setsock opt */ - __u32 key = 0; - - bpf_map_update_elem(&sockopt_results, &key, &v, BPF_ANY); + global.tcp_save_syn = v; break; default: rv = -1; diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h index 6220b95cbd02..0ed33521cbbb 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf.h +++ b/tools/testing/selftests/bpf/test_tcpbpf.h @@ -14,5 +14,7 @@ struct tcpbpf_globals { __u64 bytes_acked; __u32 num_listen; __u32 num_close_events; + __u32 tcp_save_syn; + __u32 tcp_saved_syn; }; #endif From c81ed6d81e0560713ceb94917ff1981848d0614e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:51 -0800 Subject: [PATCH 10/66] libbpf: Factor out common operations in BTF writing APIs Factor out commiting of appended type data. Also extract fetching the very last type in the BTF (to append members to). These two operations are common across many APIs and will be easier to refactor with split BTF, if they are extracted into a single place. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-2-andrii@kernel.org --- tools/lib/bpf/btf.c | 123 ++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 80 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 231b07203e3d..89fecfe5cb2b 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1560,6 +1560,20 @@ static void btf_type_inc_vlen(struct btf_type *t) t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); } +static int btf_commit_type(struct btf *btf, int data_sz) +{ + int err; + + err = btf_add_type_idx_entry(btf, btf->hdr->type_len); + if (err) + return err; + + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types++; + return btf->nr_types; +} + /* * Append new BTF_KIND_INT type with: * - *name* - non-empty, non-NULL type name; @@ -1572,7 +1586,7 @@ static void btf_type_inc_vlen(struct btf_type *t) int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding) { struct btf_type *t; - int sz, err, name_off; + int sz, name_off; /* non-empty name */ if (!name || !name[0]) @@ -1606,14 +1620,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding /* set INT info, we don't allow setting legacy bit offset/size */ *(__u32 *)(t + 1) = (encoding << 24) | (byte_sz * 8); - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* it's completely legal to append BTF types with type IDs pointing forward to @@ -1631,7 +1638,7 @@ static int validate_type_id(int id) static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref_type_id) { struct btf_type *t; - int sz, name_off = 0, err; + int sz, name_off = 0; if (validate_type_id(ref_type_id)) return -EINVAL; @@ -1654,14 +1661,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref t->info = btf_type_info(kind, 0, 0); t->type = ref_type_id; - err = 
btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -1689,7 +1689,7 @@ int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 n { struct btf_type *t; struct btf_array *a; - int sz, err; + int sz; if (validate_type_id(index_type_id) || validate_type_id(elem_type_id)) return -EINVAL; @@ -1711,21 +1711,14 @@ int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 n a->index_type = index_type_id; a->nelems = nr_elems; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* generic STRUCT/UNION append function */ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 bytes_sz) { struct btf_type *t; - int sz, err, name_off = 0; + int sz, name_off = 0; if (btf_ensure_modifiable(btf)) return -ENOMEM; @@ -1748,14 +1741,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 t->info = btf_type_info(kind, 0, 0); t->size = bytes_sz; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -1793,6 +1779,11 @@ int btf__add_union(struct btf *btf, const char *name, __u32 byte_sz) return btf_add_composite(btf, BTF_KIND_UNION, name, byte_sz); } +static struct btf_type *btf_last_type(struct btf *btf) +{ + return btf_type_by_id(btf, btf__get_nr_types(btf)); +} + /* * Append new field for the current STRUCT/UNION type with: * - *name* - name of the field, can be NULL or empty for anonymous field; @@ -1814,7 +1805,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, /* last type should be union/struct */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_composite(t)) return -EINVAL; @@ -1849,7 +1840,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, m->offset = bit_offset | (bit_size << 24); /* btf_add_type_mem can invalidate t pointer */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); /* update parent type's vlen and kflag */ t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t)); @@ -1874,7 +1865,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) { struct btf_type *t; - int sz, err, name_off = 0; + int sz, name_off = 0; /* byte_sz must be power of 2 */ if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 8) @@ -1899,14 +1890,7 @@ int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) t->info = btf_type_info(BTF_KIND_ENUM, 0, 0); t->size = byte_sz; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -1926,7 +1910,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) /* last type should be BTF_KIND_ENUM */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_enum(t)) return -EINVAL; @@ -1953,7 +1937,7 @@ int 
btf__add_enum_value(struct btf *btf, const char *name, __s64 value) v->val = value; /* update parent type's vlen */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); btf_type_inc_vlen(t); btf->hdr->type_len += sz; @@ -2093,7 +2077,7 @@ int btf__add_func(struct btf *btf, const char *name, int btf__add_func_proto(struct btf *btf, int ret_type_id) { struct btf_type *t; - int sz, err; + int sz; if (validate_type_id(ret_type_id)) return -EINVAL; @@ -2113,14 +2097,7 @@ int btf__add_func_proto(struct btf *btf, int ret_type_id) t->info = btf_type_info(BTF_KIND_FUNC_PROTO, 0, 0); t->type = ret_type_id; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -2143,7 +2120,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) /* last type should be BTF_KIND_FUNC_PROTO */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_func_proto(t)) return -EINVAL; @@ -2166,7 +2143,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) p->type = type_id; /* update parent type's vlen */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); btf_type_inc_vlen(t); btf->hdr->type_len += sz; @@ -2188,7 +2165,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) { struct btf_type *t; struct btf_var *v; - int sz, err, name_off; + int sz, name_off; /* non-empty name */ if (!name || !name[0]) @@ -2219,14 +2196,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) v = btf_var(t); v->linkage = linkage; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -2244,7 +2214,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) { struct btf_type *t; - int sz, err, name_off; + int sz, name_off; /* non-empty name */ if (!name || !name[0]) @@ -2267,14 +2237,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) t->info = btf_type_info(BTF_KIND_DATASEC, 0, 0); t->size = byte_sz; - err = btf_add_type_idx_entry(btf, btf->hdr->type_len); - if (err) - return err; - - btf->hdr->type_len += sz; - btf->hdr->str_off += sz; - btf->nr_types++; - return btf->nr_types; + return btf_commit_type(btf, sz); } /* @@ -2296,7 +2259,7 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __ /* last type should be BTF_KIND_DATASEC */ if (btf->nr_types == 0) return -EINVAL; - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); if (!btf_is_datasec(t)) return -EINVAL; @@ -2317,7 +2280,7 @@ int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __ v->size = byte_sz; /* update parent type's vlen */ - t = btf_type_by_id(btf, btf->nr_types); + t = btf_last_type(btf); btf_type_inc_vlen(t); btf->hdr->type_len += sz; From d9448f94962bd28554df7d9a342d37c7f13d6232 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:52 -0800 Subject: [PATCH 11/66] selftest/bpf: Relax btf_dedup test checks Remove the requirement of a strictly exact string section contents. 
This used to be true when string deduplication was done through sorting, but with string dedup done through hash table, it's no longer true. So relax test harness to relax strings checks and, consequently, type checks, which now don't have to have exactly the same string offsets. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105043402.2530976-3-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/btf.c | 40 ++++++++++++-------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 93162484c2ca..8ae97e2a4b9d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -6652,7 +6652,7 @@ static void do_test_dedup(unsigned int test_num) const void *test_btf_data, *expect_btf_data; const char *ret_test_next_str, *ret_expect_next_str; const char *test_strs, *expect_strs; - const char *test_str_cur, *test_str_end; + const char *test_str_cur; const char *expect_str_cur, *expect_str_end; unsigned int raw_btf_size; void *raw_btf; @@ -6719,12 +6719,18 @@ static void do_test_dedup(unsigned int test_num) goto done; } - test_str_cur = test_strs; - test_str_end = test_strs + test_hdr->str_len; expect_str_cur = expect_strs; expect_str_end = expect_strs + expect_hdr->str_len; - while (test_str_cur < test_str_end && expect_str_cur < expect_str_end) { + while (expect_str_cur < expect_str_end) { size_t test_len, expect_len; + int off; + + off = btf__find_str(test_btf, expect_str_cur); + if (CHECK(off < 0, "exp str '%s' not found: %d\n", expect_str_cur, off)) { + err = -1; + goto done; + } + test_str_cur = btf__str_by_offset(test_btf, off); test_len = strlen(test_str_cur); expect_len = strlen(expect_str_cur); @@ -6741,15 +6747,8 @@ static void do_test_dedup(unsigned int test_num) err = -1; goto done; } - test_str_cur += test_len + 1; expect_str_cur += expect_len + 1; } - if (CHECK(test_str_cur != test_str_end, - "test_str_cur:%p != test_str_end:%p", - test_str_cur, test_str_end)) { - err = -1; - goto done; - } test_nr_types = btf__get_nr_types(test_btf); expect_nr_types = btf__get_nr_types(expect_btf); @@ -6775,10 +6774,21 @@ static void do_test_dedup(unsigned int test_num) err = -1; goto done; } - if (CHECK(memcmp((void *)test_type, - (void *)expect_type, - test_size), - "type #%d: contents differ", i)) { + if (CHECK(btf_kind(test_type) != btf_kind(expect_type), + "type %d kind: exp %d != got %u\n", + i, btf_kind(expect_type), btf_kind(test_type))) { + err = -1; + goto done; + } + if (CHECK(test_type->info != expect_type->info, + "type %d info: exp %d != got %u\n", + i, expect_type->info, test_type->info)) { + err = -1; + goto done; + } + if (CHECK(test_type->size != expect_type->size, + "type %d size/type: exp %d != got %u\n", + i, expect_type->size, test_type->size)) { err = -1; goto done; } From 88a82c2a9ab5b2ce533c3de3f4517853c2f67f53 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:53 -0800 Subject: [PATCH 12/66] libbpf: Unify and speed up BTF string deduplication Revamp BTF dedup's string deduplication to match the approach of writable BTF string management. This allows to transfer deduplicated strings index back to BTF object after deduplication without expensive extra memory copying and hash map re-construction. It also simplifies the code and speeds it up, because hashmap-based string deduplication is faster than sort + unique approach. 
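To make the hashmap-based idea concrete, here is a minimal, self-contained sketch in plain C. It illustrates only the general technique, not libbpf's actual implementation (which uses libbpf's internal hashmap and appends into the BTF string section through its own helpers); the names str_hash(), dedup_str(), tab[] and out[] are made up for this example.

#include <stdio.h>
#include <string.h>

#define TAB_SZ 64
static long tab[TAB_SZ];        /* hash slot -> offset of a unique string in out[] */
static char out[1024];          /* deduplicated string blob; offset 0 is the empty string */
static size_t out_len = 1;      /* offset 0 reserved, mirroring BTF's empty-string convention */

static unsigned long str_hash(const char *s)
{
        unsigned long h = 5381;

        while (*s)
                h = h * 33 + (unsigned char)*s++;
        return h;
}

/* return offset of s in out[], appending it only on first occurrence */
static long dedup_str(const char *s)
{
        unsigned long i = str_hash(s) % TAB_SZ;

        while (tab[i] != -1) {
                if (strcmp(out + tab[i], s) == 0)
                        return tab[i];          /* duplicate: reuse existing offset */
                i = (i + 1) % TAB_SZ;           /* linear probing */
        }
        tab[i] = (long)out_len;
        strcpy(out + out_len, s);               /* first occurrence: append to blob */
        out_len += strlen(s) + 1;
        return tab[i];
}

int main(void)
{
        const char *in[] = { "int", "char", "int", "struct s1", "char" };
        size_t i;

        for (i = 0; i < TAB_SZ; i++)
                tab[i] = -1;
        for (i = 0; i < sizeof(in) / sizeof(in[0]); i++)
                printf("%-10s -> offset %ld\n", in[i], dedup_str(in[i]));
        return 0;
}

Each unique string is appended to the output blob exactly once and the hash table maps contents to that single copy's offset; duplicates are simply remapped to the existing offset, which is what the remapping pass over all string offsets does in this patch.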
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-4-andrii@kernel.org --- tools/lib/bpf/btf.c | 263 +++++++++++++++++--------------------------- 1 file changed, 98 insertions(+), 165 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 89fecfe5cb2b..fa1b147c63c6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -90,6 +90,14 @@ struct btf { struct hashmap *strs_hash; /* whether strings are already deduplicated */ bool strs_deduped; + /* extra indirection layer to make strings hashmap work with stable + * string offsets and ability to transparently choose between + * btf->strs_data or btf_dedup->strs_data as a source of strings. + * This is used for BTF strings dedup to transfer deduplicated strings + * data back to struct btf without re-building strings index. + */ + void **strs_data_ptr; + /* BTF object FD, if loaded into kernel */ int fd; @@ -1363,17 +1371,19 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, static size_t strs_hash_fn(const void *key, void *ctx) { - struct btf *btf = ctx; - const char *str = btf->strs_data + (long)key; + const struct btf *btf = ctx; + const char *strs = *btf->strs_data_ptr; + const char *str = strs + (long)key; return str_hash(str); } static bool strs_hash_equal_fn(const void *key1, const void *key2, void *ctx) { - struct btf *btf = ctx; - const char *str1 = btf->strs_data + (long)key1; - const char *str2 = btf->strs_data + (long)key2; + const struct btf *btf = ctx; + const char *strs = *btf->strs_data_ptr; + const char *str1 = strs + (long)key1; + const char *str2 = strs + (long)key2; return strcmp(str1, str2) == 0; } @@ -1418,6 +1428,9 @@ static int btf_ensure_modifiable(struct btf *btf) memcpy(types, btf->types_data, btf->hdr->type_len); memcpy(strs, btf->strs_data, btf->hdr->str_len); + /* make hashmap below use btf->strs_data as a source of strings */ + btf->strs_data_ptr = &btf->strs_data; + /* build lookup index for all strings */ hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, btf); if (IS_ERR(hash)) { @@ -2824,19 +2837,11 @@ struct btf_dedup { size_t hypot_cap; /* Various option modifying behavior of algorithm */ struct btf_dedup_opts opts; -}; - -struct btf_str_ptr { - const char *str; - __u32 new_off; - bool used; -}; - -struct btf_str_ptrs { - struct btf_str_ptr *ptrs; - const char *data; - __u32 cnt; - __u32 cap; + /* temporary strings deduplication state */ + void *strs_data; + size_t strs_cap; + size_t strs_len; + struct hashmap* strs_hash; }; static long hash_combine(long h, long value) @@ -3063,64 +3068,41 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) return 0; } -static int str_sort_by_content(const void *a1, const void *a2) +static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) { - const struct btf_str_ptr *p1 = a1; - const struct btf_str_ptr *p2 = a2; - - return strcmp(p1->str, p2->str); -} - -static int str_sort_by_offset(const void *a1, const void *a2) -{ - const struct btf_str_ptr *p1 = a1; - const struct btf_str_ptr *p2 = a2; - - if (p1->str != p2->str) - return p1->str < p2->str ? -1 : 1; - return 0; -} - -static int btf_dedup_str_ptr_cmp(const void *str_ptr, const void *pelem) -{ - const struct btf_str_ptr *p = pelem; - - if (str_ptr != p->str) - return (const char *)str_ptr < p->str ? 
-1 : 1; - return 0; -} - -static int btf_str_mark_as_used(__u32 *str_off_ptr, void *ctx) -{ - struct btf_str_ptrs *strs; - struct btf_str_ptr *s; + struct btf_dedup *d = ctx; + long old_off, new_off, len; + const char *s; + void *p; + int err; if (*str_off_ptr == 0) return 0; - strs = ctx; - s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt, - sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp); - if (!s) - return -EINVAL; - s->used = true; - return 0; -} + s = btf__str_by_offset(d->btf, *str_off_ptr); + len = strlen(s) + 1; -static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx) -{ - struct btf_str_ptrs *strs; - struct btf_str_ptr *s; + new_off = d->strs_len; + p = btf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); + if (!p) + return -ENOMEM; - if (*str_off_ptr == 0) - return 0; + memcpy(p, s, len); - strs = ctx; - s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt, - sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp); - if (!s) - return -EINVAL; - *str_off_ptr = s->new_off; + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off, + HASHMAP_ADD, (const void **)&old_off, NULL); + if (err == -EEXIST) { + *str_off_ptr = old_off; + } else if (err) { + return err; + } else { + *str_off_ptr = new_off; + d->strs_len += len; + } return 0; } @@ -3137,118 +3119,69 @@ static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx) */ static int btf_dedup_strings(struct btf_dedup *d) { - char *start = d->btf->strs_data; - char *end = start + d->btf->hdr->str_len; - char *p = start, *tmp_strs = NULL; - struct btf_str_ptrs strs = { - .cnt = 0, - .cap = 0, - .ptrs = NULL, - .data = start, - }; - int i, j, err = 0, grp_idx; - bool grp_used; + char *s; + int err; if (d->btf->strs_deduped) return 0; - /* build index of all strings */ - while (p < end) { - if (strs.cnt + 1 > strs.cap) { - struct btf_str_ptr *new_ptrs; + s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); + if (!s) + return -ENOMEM; + /* initial empty string */ + s[0] = 0; + d->strs_len = 1; - strs.cap += max(strs.cnt / 2, 16U); - new_ptrs = libbpf_reallocarray(strs.ptrs, strs.cap, sizeof(strs.ptrs[0])); - if (!new_ptrs) { - err = -ENOMEM; - goto done; - } - strs.ptrs = new_ptrs; - } - - strs.ptrs[strs.cnt].str = p; - strs.ptrs[strs.cnt].used = false; - - p += strlen(p) + 1; - strs.cnt++; - } - - /* temporary storage for deduplicated strings */ - tmp_strs = malloc(d->btf->hdr->str_len); - if (!tmp_strs) { - err = -ENOMEM; - goto done; - } - - /* mark all used strings */ - strs.ptrs[0].used = true; - err = btf_for_each_str_off(d, btf_str_mark_as_used, &strs); - if (err) - goto done; - - /* sort strings by context, so that we can identify duplicates */ - qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_content); - - /* - * iterate groups of equal strings and if any instance in a group was - * referenced, emit single instance and remember new offset + /* temporarily switch to use btf_dedup's strs_data for strings for hash + * functions; later we'll just transfer hashmap to struct btf as is, + * along the strs_data */ - p = tmp_strs; - grp_idx = 0; - grp_used = strs.ptrs[0].used; - /* iterate past end to avoid code duplication after loop */ - for (i = 1; i <= strs.cnt; i++) { - /* - * when i == strs.cnt, we want to skip 
string comparison and go - * straight to handling last group of strings (otherwise we'd - * need to handle last group after the loop w/ duplicated code) - */ - if (i < strs.cnt && - !strcmp(strs.ptrs[i].str, strs.ptrs[grp_idx].str)) { - grp_used = grp_used || strs.ptrs[i].used; - continue; - } + d->btf->strs_data_ptr = &d->strs_data; - /* - * this check would have been required after the loop to handle - * last group of strings, but due to <= condition in a loop - * we avoid that duplication - */ - if (grp_used) { - int new_off = p - tmp_strs; - __u32 len = strlen(strs.ptrs[grp_idx].str); - - memmove(p, strs.ptrs[grp_idx].str, len + 1); - for (j = grp_idx; j < i; j++) - strs.ptrs[j].new_off = new_off; - p += len + 1; - } - - if (i < strs.cnt) { - grp_idx = i; - grp_used = strs.ptrs[i].used; - } + d->strs_hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, d->btf); + if (IS_ERR(d->strs_hash)) { + err = PTR_ERR(d->strs_hash); + d->strs_hash = NULL; + goto err_out; } - /* replace original strings with deduped ones */ - d->btf->hdr->str_len = p - tmp_strs; - memmove(start, tmp_strs, d->btf->hdr->str_len); - end = start + d->btf->hdr->str_len; - - /* restore original order for further binary search lookups */ - qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_offset); + /* insert empty string; we won't be looking it up during strings + * dedup, but it's good to have it for generic BTF string lookups + */ + err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, + HASHMAP_ADD, NULL, NULL); + if (err) + goto err_out; /* remap string offsets */ - err = btf_for_each_str_off(d, btf_str_remap_offset, &strs); + err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d); if (err) - goto done; + goto err_out; - d->btf->hdr->str_len = end - start; + /* replace BTF string data and hash with deduped ones */ + free(d->btf->strs_data); + hashmap__free(d->btf->strs_hash); + d->btf->strs_data = d->strs_data; + d->btf->strs_data_cap = d->strs_cap; + d->btf->hdr->str_len = d->strs_len; + d->btf->strs_hash = d->strs_hash; + /* now point strs_data_ptr back to btf->strs_data */ + d->btf->strs_data_ptr = &d->btf->strs_data; + + d->strs_data = d->strs_hash = NULL; + d->strs_len = d->strs_cap = 0; d->btf->strs_deduped = true; + return 0; + +err_out: + free(d->strs_data); + hashmap__free(d->strs_hash); + d->strs_data = d->strs_hash = NULL; + d->strs_len = d->strs_cap = 0; + + /* restore strings pointer for existing d->btf->strs_hash back */ + d->btf->strs_data_ptr = &d->strs_data; -done: - free(tmp_strs); - free(strs.ptrs); return err; } From ba451366bf44498f22dd16c31a792083bd6f2ae1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:54 -0800 Subject: [PATCH 13/66] libbpf: Implement basic split BTF support Support split BTF operation, in which one BTF (base BTF) provides basic set of types and strings, while another one (split BTF) builds on top of base's types and strings and adds its own new types and strings. From API standpoint, the fact that the split BTF is built on top of the base BTF is transparent. Type numeration is transparent. If the base BTF had last type ID #N, then all types in the split BTF start at type ID N+1. Any type in split BTF can reference base BTF types, but not vice versa. 
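To illustrate the numbering, below is a minimal usage sketch (error handling omitted) based on the public API added by this patch; it mirrors the selftest added later in this series, and the <bpf/btf.h> include path simply assumes an installed libbpf.

#include <bpf/btf.h>

void split_btf_id_example(void)
{
        struct btf *base, *split;
        int int_id, s1_id;

        base = btf__new_empty();
        int_id = btf__add_int(base, "int", 4, BTF_INT_SIGNED); /* type ID 1 in base BTF */

        split = btf__new_empty_split(base);
        /* the first type added to the split BTF continues the base numbering,
         * i.e. it gets ID btf__get_nr_types(base) + 1 == 2
         */
        s1_id = btf__add_struct(split, "s1", 4);
        btf__add_field(split, "f1", int_id, 0, 0);      /* refers to base type ID 1 */

        /* lookups through the split BTF resolve both its own and base type IDs;
         * lookups through the base BTF never see split types:
         * btf__type_by_id(split, int_id) != NULL, btf__type_by_id(base, s1_id) == NULL
         */

        btf__free(split);
        btf__free(base);
}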
Programmatic construction of a split BTF on top of a base BTF is supported: one can create an empty split BTF with btf__new_empty_split() and pass the base BTF as an input, or pass raw binary data to btf__new_split(), or use btf__parse_xxx_split() variants to get the initial set of split types/strings from an ELF file with a .BTF section. String offsets are similarly transparent and are a logical continuation of base BTF's strings. When building BTF programmatically and adding a new string (explicitly with btf__add_str() or implicitly through appending new types/members), the string to be added is first looked up in the base BTF's string section and re-used if it's there. If not, it will be looked up and/or added to the split BTF string section. Similarly to type IDs, types in split BTF can refer to strings from base BTF absolutely transparently (but not vice versa, of course, because base BTF doesn't "know" about the existence of the split BTF). The internal type index is slightly adjusted to be zero-indexed, ignoring a fake [0] VOID type. This allows handling split/base BTF type lookups transparently by using the btf->start_id type ID offset, which is always 1 for base/non-split BTF and equals btf__get_nr_types(base_btf) + 1 for the split BTF. BTF deduplication is not yet supported for split BTF and support for it will be added in a separate patch. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-5-andrii@kernel.org --- tools/lib/bpf/btf.c | 197 ++++++++++++++++++++++++++++++--------- tools/lib/bpf/btf.h | 8 ++ tools/lib/bpf/libbpf.map | 9 ++ 3 files changed, 169 insertions(+), 45 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index fa1b147c63c6..0258cf108c0a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -78,10 +78,32 @@ struct btf { void *types_data; size_t types_data_cap; /* used size stored in hdr->type_len */ - /* type ID to `struct btf_type *` lookup index */ + /* type ID to `struct btf_type *` lookup index + * type_offs[0] corresponds to the first non-VOID type: + * - for base BTF it's type [1]; + * - for split BTF it's the first non-base BTF type. + */ __u32 *type_offs; size_t type_offs_cap; + /* number of types in this BTF instance: + * - doesn't include special [0] void type; + * - for split BTF counts number of types added on top of base BTF. + */ __u32 nr_types; + /* if not NULL, points to the base BTF on top of which the current + * split BTF is based + */ + struct btf *base_btf; + /* BTF type ID of the first type in this BTF instance: + * - for base BTF it's equal to 1; + * - for split BTF it's equal to biggest type ID of base BTF plus 1. + */ + int start_id; + /* logical string offset of this BTF instance: + * - for base BTF it's equal to 0; + * - for split BTF it's equal to total size of base BTF's string section size. 
+ */ + int start_str_off; void *strs_data; size_t strs_data_cap; /* used size stored in hdr->str_len */ @@ -176,7 +198,7 @@ static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) __u32 *p; p = btf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), - btf->nr_types + 1, BTF_MAX_NR_TYPES, 1); + btf->nr_types, BTF_MAX_NR_TYPES, 1); if (!p) return -ENOMEM; @@ -252,12 +274,16 @@ static int btf_parse_str_sec(struct btf *btf) const char *start = btf->strs_data; const char *end = start + btf->hdr->str_len; - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || - start[0] || end[-1]) { + if (btf->base_btf && hdr->str_len == 0) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || end[-1]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { pr_debug("Invalid BTF string section\n"); return -EINVAL; } - return 0; } @@ -372,19 +398,9 @@ static int btf_parse_type_sec(struct btf *btf) struct btf_header *hdr = btf->hdr; void *next_type = btf->types_data; void *end_type = next_type + hdr->type_len; - int err, i = 0, type_size; - - /* VOID (type_id == 0) is specially handled by btf__get_type_by_id(), - * so ensure we can never properly use its offset from index by - * setting it to a large value - */ - err = btf_add_type_idx_entry(btf, UINT_MAX); - if (err) - return err; + int err, type_size; while (next_type + sizeof(struct btf_type) <= end_type) { - i++; - if (btf->swapped_endian) btf_bswap_type_base(next_type); @@ -392,7 +408,7 @@ static int btf_parse_type_sec(struct btf *btf) if (type_size < 0) return type_size; if (next_type + type_size > end_type) { - pr_warn("BTF type [%d] is malformed\n", i); + pr_warn("BTF type [%d] is malformed\n", btf->start_id + btf->nr_types); return -EINVAL; } @@ -417,7 +433,7 @@ static int btf_parse_type_sec(struct btf *btf) __u32 btf__get_nr_types(const struct btf *btf) { - return btf->nr_types; + return btf->start_id + btf->nr_types - 1; } /* internal helper returning non-const pointer to a type */ @@ -425,13 +441,14 @@ static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) { if (type_id == 0) return &btf_void; - - return btf->types_data + btf->type_offs[type_id]; + if (type_id < btf->start_id) + return btf_type_by_id(btf->base_btf, type_id); + return btf->types_data + btf->type_offs[type_id - btf->start_id]; } const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) { - if (type_id > btf->nr_types) + if (type_id >= btf->start_id + btf->nr_types) return NULL; return btf_type_by_id((struct btf *)btf, type_id); } @@ -440,9 +457,13 @@ static int determine_ptr_size(const struct btf *btf) { const struct btf_type *t; const char *name; - int i; + int i, n; - for (i = 1; i <= btf->nr_types; i++) { + if (btf->base_btf && btf->base_btf->ptr_sz > 0) + return btf->base_btf->ptr_sz; + + n = btf__get_nr_types(btf); + for (i = 1; i <= n; i++) { t = btf__type_by_id(btf, i); if (!btf_is_int(t)) continue; @@ -725,7 +746,7 @@ void btf__free(struct btf *btf) free(btf); } -struct btf *btf__new_empty(void) +static struct btf *btf_new_empty(struct btf *base_btf) { struct btf *btf; @@ -733,12 +754,21 @@ struct btf *btf__new_empty(void) if (!btf) return ERR_PTR(-ENOMEM); + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; btf->fd = -1; btf->ptr_sz = sizeof(void *); btf->swapped_endian = false; + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__get_nr_types(base_btf) + 1; + btf->start_str_off = base_btf->hdr->str_len; + 
} + /* +1 for empty string at offset 0 */ - btf->raw_size = sizeof(struct btf_header) + 1; + btf->raw_size = sizeof(struct btf_header) + (base_btf ? 0 : 1); btf->raw_data = calloc(1, btf->raw_size); if (!btf->raw_data) { free(btf); @@ -752,12 +782,22 @@ struct btf *btf__new_empty(void) btf->types_data = btf->raw_data + btf->hdr->hdr_len; btf->strs_data = btf->raw_data + btf->hdr->hdr_len; - btf->hdr->str_len = 1; /* empty string at offset 0 */ + btf->hdr->str_len = base_btf ? 0 : 1; /* empty string at offset 0 */ return btf; } -struct btf *btf__new(const void *data, __u32 size) +struct btf *btf__new_empty(void) +{ + return btf_new_empty(NULL); +} + +struct btf *btf__new_empty_split(struct btf *base_btf) +{ + return btf_new_empty(base_btf); +} + +static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) { struct btf *btf; int err; @@ -766,6 +806,16 @@ struct btf *btf__new(const void *data, __u32 size) if (!btf) return ERR_PTR(-ENOMEM); + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__get_nr_types(base_btf) + 1; + btf->start_str_off = base_btf->hdr->str_len; + } + btf->raw_data = malloc(size); if (!btf->raw_data) { err = -ENOMEM; @@ -798,7 +848,13 @@ done: return btf; } -struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +struct btf *btf__new(const void *data, __u32 size) +{ + return btf_new(data, size, NULL); +} + +static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, + struct btf_ext **btf_ext) { Elf_Data *btf_data = NULL, *btf_ext_data = NULL; int err = 0, fd = -1, idx = 0; @@ -876,7 +932,7 @@ struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) err = -ENOENT; goto done; } - btf = btf__new(btf_data->d_buf, btf_data->d_size); + btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); if (IS_ERR(btf)) goto done; @@ -921,7 +977,17 @@ done: return btf; } -struct btf *btf__parse_raw(const char *path) +struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +{ + return btf_parse_elf(path, NULL, btf_ext); +} + +struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf) +{ + return btf_parse_elf(path, base_btf, NULL); +} + +static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) { struct btf *btf = NULL; void *data = NULL; @@ -975,7 +1041,7 @@ struct btf *btf__parse_raw(const char *path) } /* finally parse BTF data */ - btf = btf__new(data, sz); + btf = btf_new(data, sz, base_btf); err_out: free(data); @@ -984,18 +1050,38 @@ err_out: return err ? 
ERR_PTR(err) : btf; } -struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +struct btf *btf__parse_raw(const char *path) +{ + return btf_parse_raw(path, NULL); +} + +struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) +{ + return btf_parse_raw(path, base_btf); +} + +static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { struct btf *btf; if (btf_ext) *btf_ext = NULL; - btf = btf__parse_raw(path); + btf = btf_parse_raw(path, base_btf); if (!IS_ERR(btf) || PTR_ERR(btf) != -EPROTO) return btf; - return btf__parse_elf(path, btf_ext); + return btf_parse_elf(path, base_btf, btf_ext); +} + +struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +{ + return btf_parse(path, NULL, btf_ext); +} + +struct btf *btf__parse_split(const char *path, struct btf *base_btf) +{ + return btf_parse(path, base_btf, NULL); } static int compare_vsi_off(const void *_a, const void *_b) @@ -1179,8 +1265,8 @@ static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endi memcpy(p, btf->types_data, hdr->type_len); if (swap_endian) { - for (i = 1; i <= btf->nr_types; i++) { - t = p + btf->type_offs[i]; + for (i = 0; i < btf->nr_types; i++) { + t = p + btf->type_offs[i]; /* btf_bswap_type_rest() relies on native t->info, so * we swap base type info after we swapped all the * additional information @@ -1223,8 +1309,10 @@ const void *btf__get_raw_data(const struct btf *btf_ro, __u32 *size) const char *btf__str_by_offset(const struct btf *btf, __u32 offset) { - if (offset < btf->hdr->str_len) - return btf->strs_data + offset; + if (offset < btf->start_str_off) + return btf__str_by_offset(btf->base_btf, offset); + else if (offset - btf->start_str_off < btf->hdr->str_len) + return btf->strs_data + (offset - btf->start_str_off); else return NULL; } @@ -1461,7 +1549,10 @@ static int btf_ensure_modifiable(struct btf *btf) /* if BTF was created from scratch, all strings are guaranteed to be * unique and deduplicated */ - btf->strs_deduped = btf->hdr->str_len <= 1; + if (btf->hdr->str_len == 0) + btf->strs_deduped = true; + if (!btf->base_btf && btf->hdr->str_len == 1) + btf->strs_deduped = true; /* invalidate raw_data representation */ btf_invalidate_raw_data(btf); @@ -1493,6 +1584,14 @@ int btf__find_str(struct btf *btf, const char *s) long old_off, new_off, len; void *p; + if (btf->base_btf) { + int ret; + + ret = btf__find_str(btf->base_btf, s); + if (ret != -ENOENT) + return ret; + } + /* BTF needs to be in a modifiable state to build string lookup index */ if (btf_ensure_modifiable(btf)) return -ENOMEM; @@ -1507,7 +1606,7 @@ int btf__find_str(struct btf *btf, const char *s) memcpy(p, s, len); if (hashmap__find(btf->strs_hash, (void *)new_off, (void **)&old_off)) - return old_off; + return btf->start_str_off + old_off; return -ENOENT; } @@ -1523,6 +1622,14 @@ int btf__add_str(struct btf *btf, const char *s) void *p; int err; + if (btf->base_btf) { + int ret; + + ret = btf__find_str(btf->base_btf, s); + if (ret != -ENOENT) + return ret; + } + if (btf_ensure_modifiable(btf)) return -ENOMEM; @@ -1549,12 +1656,12 @@ int btf__add_str(struct btf *btf, const char *s) err = hashmap__insert(btf->strs_hash, (void *)new_off, (void *)new_off, HASHMAP_ADD, (const void **)&old_off, NULL); if (err == -EEXIST) - return old_off; /* duplicated string, return existing offset */ + return btf->start_str_off + old_off; /* duplicated string, return existing offset */ if (err) return err; btf->hdr->str_len += len; /* new unique string, adjust 
data length */ - return new_off; + return btf->start_str_off + new_off; } static void *btf_add_type_mem(struct btf *btf, size_t add_sz) @@ -1584,7 +1691,7 @@ static int btf_commit_type(struct btf *btf, int data_sz) btf->hdr->type_len += data_sz; btf->hdr->str_off += data_sz; btf->nr_types++; - return btf->nr_types; + return btf->start_id + btf->nr_types - 1; } /* @@ -4167,14 +4274,14 @@ static int btf_dedup_compact_types(struct btf_dedup *d) memmove(p, btf__type_by_id(d->btf, i), len); d->hypot_map[i] = next_type_id; - d->btf->type_offs[next_type_id] = p - d->btf->types_data; + d->btf->type_offs[next_type_id - 1] = p - d->btf->types_data; p += len; next_type_id++; } /* shrink struct btf's internal types index and update btf_header */ d->btf->nr_types = next_type_id - 1; - d->btf->type_offs_cap = d->btf->nr_types + 1; + d->btf->type_offs_cap = d->btf->nr_types; d->btf->hdr->type_len = p - d->btf->types_data; new_offs = libbpf_reallocarray(d->btf->type_offs, d->btf->type_offs_cap, sizeof(*new_offs)); diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 57247240a20a..1093f6fe6800 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -31,11 +31,19 @@ enum btf_endianness { }; LIBBPF_API void btf__free(struct btf *btf); + LIBBPF_API struct btf *btf__new(const void *data, __u32 size); +LIBBPF_API struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf); LIBBPF_API struct btf *btf__new_empty(void); +LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); + LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf); LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf); LIBBPF_API struct btf *btf__parse_raw(const char *path); +LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); + LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4ebfadf45b47..29ff4807b909 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -337,3 +337,12 @@ LIBBPF_0.2.0 { perf_buffer__consume_buffer; xsk_socket__create_shared; } LIBBPF_0.1.0; + +LIBBPF_0.3.0 { + global: + btf__parse_elf_split; + btf__parse_raw_split; + btf__parse_split; + btf__new_empty_split; + btf__new_split; +} LIBBPF_0.2.0; From 197389da2fbfbc3cefb229268c32d858d9575c96 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:55 -0800 Subject: [PATCH 14/66] selftests/bpf: Add split BTF basic test Add selftest validating ability to programmatically generate and then dump split BTF. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-6-andrii@kernel.org --- .../selftests/bpf/prog_tests/btf_split.c | 99 +++++++++++++++++++ tools/testing/selftests/bpf/test_progs.h | 11 +++ 2 files changed, 110 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_split.c diff --git a/tools/testing/selftests/bpf/prog_tests/btf_split.c b/tools/testing/selftests/bpf/prog_tests/btf_split.c new file mode 100644 index 000000000000..ca7c2a91610a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_split.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include + +static char *dump_buf; +static size_t dump_buf_sz; +static FILE *dump_buf_file; + +static void btf_dump_printf(void *ctx, const char *fmt, va_list args) +{ + vfprintf(ctx, fmt, args); +} + +void test_btf_split() { + struct btf_dump_opts opts; + struct btf_dump *d = NULL; + const struct btf_type *t; + struct btf *btf1, *btf2; + int str_off, i, err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 1); /* [2] ptr to int */ + + btf__add_struct(btf1, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ + /* } */ + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + /* pointer size should be "inherited" from main BTF */ + ASSERT_EQ(btf__pointer_size(btf2), 8, "inherit_ptr_sz"); + + str_off = btf__find_str(btf2, "int"); + ASSERT_NEQ(str_off, -ENOENT, "str_int_missing"); + + t = btf__type_by_id(btf2, 1); + if (!ASSERT_OK_PTR(t, "int_type")) + goto cleanup; + ASSERT_EQ(btf_is_int(t), true, "int_kind"); + ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name"); + + btf__add_struct(btf2, "s2", 16); /* [4] struct s2 { */ + btf__add_field(btf2, "f1", 3, 0, 0); /* struct s1 f1; */ + btf__add_field(btf2, "f2", 1, 32, 0); /* int f2; */ + btf__add_field(btf2, "f3", 2, 64, 0); /* int *f3; */ + /* } */ + + t = btf__type_by_id(btf1, 4); + ASSERT_NULL(t, "split_type_in_main"); + + t = btf__type_by_id(btf2, 4); + if (!ASSERT_OK_PTR(t, "split_struct_type")) + goto cleanup; + ASSERT_EQ(btf_is_struct(t), true, "split_struct_kind"); + ASSERT_EQ(btf_vlen(t), 3, "split_struct_vlen"); + ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "s2", "split_struct_name"); + + /* BTF-to-C dump of split BTF */ + dump_buf_file = open_memstream(&dump_buf, &dump_buf_sz); + if (!ASSERT_OK_PTR(dump_buf_file, "dump_memstream")) + return; + opts.ctx = dump_buf_file; + d = btf_dump__new(btf2, NULL, &opts, btf_dump_printf); + if (!ASSERT_OK_PTR(d, "btf_dump__new")) + goto cleanup; + for (i = 1; i <= btf__get_nr_types(btf2); i++) { + err = btf_dump__dump_type(d, i); + ASSERT_OK(err, "dump_type_ok"); + } + fflush(dump_buf_file); + dump_buf[dump_buf_sz] = 0; /* some libc implementations don't do this */ + ASSERT_STREQ(dump_buf, +"struct s1 {\n" +" int f1;\n" +"};\n" +"\n" +"struct s2 {\n" +" struct s1 f1;\n" +" int f2;\n" +" int *f3;\n" +"};\n\n", "c_dump"); + +cleanup: + if (dump_buf_file) + fclose(dump_buf_file); + free(dump_buf); + btf_dump__free(d); + btf__free(btf1); + btf__free(btf2); +} diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 238f5f61189e..d6b14853f3bc 100644 --- 
a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -141,6 +141,17 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_NEQ(actual, expected, name) ({ \ + static int duration = 0; \ + typeof(actual) ___act = (actual); \ + typeof(expected) ___exp = (expected); \ + bool ___ok = ___act != ___exp; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual %lld == expected %lld\n", \ + (name), (long long)(___act), (long long)(___exp)); \ + ___ok; \ +}) + #define ASSERT_STREQ(actual, expected, name) ({ \ static int duration = 0; \ const char *___act = actual; \ From 1306c980cf892bc17e7296d3e9ab8e9082f893a1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:56 -0800 Subject: [PATCH 15/66] selftests/bpf: Add checking of raw type dump in BTF writer APIs selftests Add re-usable btf_helpers.{c,h} to provide BTF-related testing routines. Start with adding a raw BTF dumping helpers. Raw BTF dump is the most succinct and at the same time a very human-friendly way to validate exact contents of BTF types. Cross-validate raw BTF dump and writable BTF in a single selftest. Raw type dump checks also serve as a good self-documentation. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-7-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/btf_helpers.c | 200 ++++++++++++++++++ tools/testing/selftests/bpf/btf_helpers.h | 12 ++ .../selftests/bpf/prog_tests/btf_write.c | 43 ++++ 4 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/btf_helpers.c create mode 100644 tools/testing/selftests/bpf/btf_helpers.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 50e5b18fc455..c1708ffa6b1c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -386,7 +386,7 @@ TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ network_helpers.c testing_helpers.c \ - flow_dissector_load.h + btf_helpers.c flow_dissector_load.h TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c new file mode 100644 index 000000000000..abc3f6c04cfc --- /dev/null +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include + +static const char * const btf_kind_str_mapping[] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", +}; + +static const char *btf_kind_str(__u16 kind) +{ + if (kind > BTF_KIND_DATASEC) + return "UNKNOWN"; + return btf_kind_str_mapping[kind]; +} + +static const char *btf_int_enc_str(__u8 encoding) +{ + switch (encoding) { + case 0: + return "(none)"; + case BTF_INT_SIGNED: + return "SIGNED"; + 
case BTF_INT_CHAR: + return "CHAR"; + case BTF_INT_BOOL: + return "BOOL"; + default: + return "UNKN"; + } +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: + return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: + return "global-alloc"; + default: + return "(unknown)"; + } +} + +static const char *btf_func_linkage_str(const struct btf_type *t) +{ + switch (btf_vlen(t)) { + case BTF_FUNC_STATIC: + return "static"; + case BTF_FUNC_GLOBAL: + return "global"; + case BTF_FUNC_EXTERN: + return "extern"; + default: + return "(unknown)"; + } +} + +static const char *btf_str(const struct btf *btf, __u32 off) +{ + if (!off) + return "(anon)"; + return btf__str_by_offset(btf, off) ?: "(invalid)"; +} + +int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id) +{ + const struct btf_type *t; + int kind, i; + __u32 vlen; + + t = btf__type_by_id(btf, id); + if (!t) + return -EINVAL; + + vlen = btf_vlen(t); + kind = btf_kind(t); + + fprintf(out, "[%u] %s '%s'", id, btf_kind_str(kind), btf_str(btf, t->name_off)); + + switch (kind) { + case BTF_KIND_INT: + fprintf(out, " size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, btf_int_offset(t), btf_int_bits(t), + btf_int_enc_str(btf_int_encoding(t))); + break; + case BTF_KIND_PTR: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + fprintf(out, " type_id=%u", t->type); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *arr = btf_array(t); + + fprintf(out, " type_id=%u index_type_id=%u nr_elems=%u", + arr->type, arr->index_type, arr->nelems); + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + fprintf(out, " size=%u vlen=%u", t->size, vlen); + for (i = 0; i < vlen; i++, m++) { + __u32 bit_off, bit_sz; + + bit_off = btf_member_bit_offset(t, i); + bit_sz = btf_member_bitfield_size(t, i); + fprintf(out, "\n\t'%s' type_id=%u bits_offset=%u", + btf_str(btf, m->name_off), m->type, bit_off); + if (bit_sz) + fprintf(out, " bitfield_size=%u", bit_sz); + } + break; + } + case BTF_KIND_ENUM: { + const struct btf_enum *v = btf_enum(t); + + fprintf(out, " size=%u vlen=%u", t->size, vlen); + for (i = 0; i < vlen; i++, v++) { + fprintf(out, "\n\t'%s' val=%u", + btf_str(btf, v->name_off), v->val); + } + break; + } + case BTF_KIND_FWD: + fprintf(out, " fwd_kind=%s", btf_kflag(t) ? "union" : "struct"); + break; + case BTF_KIND_FUNC: + fprintf(out, " type_id=%u linkage=%s", t->type, btf_func_linkage_str(t)); + break; + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + fprintf(out, " ret_type_id=%u vlen=%u", t->type, vlen); + for (i = 0; i < vlen; i++, p++) { + fprintf(out, "\n\t'%s' type_id=%u", + btf_str(btf, p->name_off), p->type); + } + break; + } + case BTF_KIND_VAR: + fprintf(out, " type_id=%u, linkage=%s", + t->type, btf_var_linkage_str(btf_var(t)->linkage)); + break; + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + fprintf(out, " size=%u vlen=%u", t->size, vlen); + for (i = 0; i < vlen; i++, v++) { + fprintf(out, "\n\ttype_id=%u offset=%u size=%u", + v->type, v->offset, v->size); + } + break; + } + default: + break; + } + + return 0; +} + +/* Print raw BTF type dump into a local buffer and return string pointer back. 
+ * Buffer *will* be overwritten by subsequent btf_type_raw_dump() calls + */ +const char *btf_type_raw_dump(const struct btf *btf, int type_id) +{ + static char buf[16 * 1024]; + FILE *buf_file; + + buf_file = fmemopen(buf, sizeof(buf) - 1, "w"); + if (!buf_file) { + fprintf(stderr, "Failed to open memstream: %d\n", errno); + return NULL; + } + + fprintf_btf_type_raw(buf_file, btf, type_id); + fflush(buf_file); + fclose(buf_file); + + return buf; +} diff --git a/tools/testing/selftests/bpf/btf_helpers.h b/tools/testing/selftests/bpf/btf_helpers.h new file mode 100644 index 000000000000..2c9ce1b61dc9 --- /dev/null +++ b/tools/testing/selftests/bpf/btf_helpers.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2020 Facebook */ +#ifndef __BTF_HELPERS_H +#define __BTF_HELPERS_H + +#include +#include + +int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id); +const char *btf_type_raw_dump(const struct btf *btf, int type_id); + +#endif diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c index 314e1e7c36df..f36da15b134f 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_write.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include #include +#include "btf_helpers.h" static int duration = 0; @@ -39,6 +40,8 @@ void test_btf_write() { ASSERT_EQ(t->size, 4, "int_sz"); ASSERT_EQ(btf_int_encoding(t), BTF_INT_SIGNED, "int_enc"); ASSERT_EQ(btf_int_bits(t), 32, "int_bits"); + ASSERT_STREQ(btf_type_raw_dump(btf, 1), + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", "raw_dump"); /* invalid int size */ id = btf__add_int(btf, "bad sz int", 7, 0); @@ -59,24 +62,32 @@ void test_btf_write() { t = btf__type_by_id(btf, 2); ASSERT_EQ(btf_kind(t), BTF_KIND_PTR, "ptr_kind"); ASSERT_EQ(t->type, 1, "ptr_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 2), + "[2] PTR '(anon)' type_id=1", "raw_dump"); id = btf__add_const(btf, 5); /* points forward to restrict */ ASSERT_EQ(id, 3, "const_id"); t = btf__type_by_id(btf, 3); ASSERT_EQ(btf_kind(t), BTF_KIND_CONST, "const_kind"); ASSERT_EQ(t->type, 5, "const_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 3), + "[3] CONST '(anon)' type_id=5", "raw_dump"); id = btf__add_volatile(btf, 3); ASSERT_EQ(id, 4, "volatile_id"); t = btf__type_by_id(btf, 4); ASSERT_EQ(btf_kind(t), BTF_KIND_VOLATILE, "volatile_kind"); ASSERT_EQ(t->type, 3, "volatile_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 4), + "[4] VOLATILE '(anon)' type_id=3", "raw_dump"); id = btf__add_restrict(btf, 4); ASSERT_EQ(id, 5, "restrict_id"); t = btf__type_by_id(btf, 5); ASSERT_EQ(btf_kind(t), BTF_KIND_RESTRICT, "restrict_kind"); ASSERT_EQ(t->type, 4, "restrict_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 5), + "[5] RESTRICT '(anon)' type_id=4", "raw_dump"); /* ARRAY */ id = btf__add_array(btf, 1, 2, 10); /* int *[10] */ @@ -86,6 +97,8 @@ void test_btf_write() { ASSERT_EQ(btf_array(t)->index_type, 1, "array_index_type"); ASSERT_EQ(btf_array(t)->type, 2, "array_elem_type"); ASSERT_EQ(btf_array(t)->nelems, 10, "array_nelems"); + ASSERT_STREQ(btf_type_raw_dump(btf, 6), + "[6] ARRAY '(anon)' type_id=2 index_type_id=1 nr_elems=10", "raw_dump"); /* STRUCT */ err = btf__add_field(btf, "field", 1, 0, 0); @@ -113,6 +126,10 @@ void test_btf_write() { ASSERT_EQ(m->type, 1, "f2_type"); ASSERT_EQ(btf_member_bit_offset(t, 1), 32, "f2_bit_off"); ASSERT_EQ(btf_member_bitfield_size(t, 1), 16, "f2_bit_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 7), + "[7] 
STRUCT 's1' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32 bitfield_size=16", "raw_dump"); /* UNION */ id = btf__add_union(btf, "u1", 8); @@ -136,6 +153,9 @@ void test_btf_write() { ASSERT_EQ(m->type, 1, "f1_type"); ASSERT_EQ(btf_member_bit_offset(t, 0), 0, "f1_bit_off"); ASSERT_EQ(btf_member_bitfield_size(t, 0), 16, "f1_bit_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 8), + "[8] UNION 'u1' size=8 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0 bitfield_size=16", "raw_dump"); /* ENUM */ id = btf__add_enum(btf, "e1", 4); @@ -156,6 +176,10 @@ void test_btf_write() { v = btf_enum(t) + 1; ASSERT_STREQ(btf__str_by_offset(btf, v->name_off), "v2", "v2_name"); ASSERT_EQ(v->val, 2, "v2_val"); + ASSERT_STREQ(btf_type_raw_dump(btf, 9), + "[9] ENUM 'e1' size=4 vlen=2\n" + "\t'v1' val=1\n" + "\t'v2' val=2", "raw_dump"); /* FWDs */ id = btf__add_fwd(btf, "struct_fwd", BTF_FWD_STRUCT); @@ -164,6 +188,8 @@ void test_btf_write() { ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "struct_fwd", "fwd_name"); ASSERT_EQ(btf_kind(t), BTF_KIND_FWD, "fwd_kind"); ASSERT_EQ(btf_kflag(t), 0, "fwd_kflag"); + ASSERT_STREQ(btf_type_raw_dump(btf, 10), + "[10] FWD 'struct_fwd' fwd_kind=struct", "raw_dump"); id = btf__add_fwd(btf, "union_fwd", BTF_FWD_UNION); ASSERT_EQ(id, 11, "union_fwd_id"); @@ -171,6 +197,8 @@ void test_btf_write() { ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "union_fwd", "fwd_name"); ASSERT_EQ(btf_kind(t), BTF_KIND_FWD, "fwd_kind"); ASSERT_EQ(btf_kflag(t), 1, "fwd_kflag"); + ASSERT_STREQ(btf_type_raw_dump(btf, 11), + "[11] FWD 'union_fwd' fwd_kind=union", "raw_dump"); id = btf__add_fwd(btf, "enum_fwd", BTF_FWD_ENUM); ASSERT_EQ(id, 12, "enum_fwd_id"); @@ -179,6 +207,8 @@ void test_btf_write() { ASSERT_EQ(btf_kind(t), BTF_KIND_ENUM, "enum_fwd_kind"); ASSERT_EQ(btf_vlen(t), 0, "enum_fwd_kind"); ASSERT_EQ(t->size, 4, "enum_fwd_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 12), + "[12] ENUM 'enum_fwd' size=4 vlen=0", "raw_dump"); /* TYPEDEF */ id = btf__add_typedef(btf, "typedef1", 1); @@ -187,6 +217,8 @@ void test_btf_write() { ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "typedef1", "typedef_name"); ASSERT_EQ(btf_kind(t), BTF_KIND_TYPEDEF, "typedef_kind"); ASSERT_EQ(t->type, 1, "typedef_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 13), + "[13] TYPEDEF 'typedef1' type_id=1", "raw_dump"); /* FUNC & FUNC_PROTO */ id = btf__add_func(btf, "func1", BTF_FUNC_GLOBAL, 15); @@ -196,6 +228,8 @@ void test_btf_write() { ASSERT_EQ(t->type, 15, "func_type"); ASSERT_EQ(btf_kind(t), BTF_KIND_FUNC, "func_kind"); ASSERT_EQ(btf_vlen(t), BTF_FUNC_GLOBAL, "func_vlen"); + ASSERT_STREQ(btf_type_raw_dump(btf, 14), + "[14] FUNC 'func1' type_id=15 linkage=global", "raw_dump"); id = btf__add_func_proto(btf, 1); ASSERT_EQ(id, 15, "func_proto_id"); @@ -214,6 +248,10 @@ void test_btf_write() { p = btf_params(t) + 1; ASSERT_STREQ(btf__str_by_offset(btf, p->name_off), "p2", "p2_name"); ASSERT_EQ(p->type, 2, "p2_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 15), + "[15] FUNC_PROTO '(anon)' ret_type_id=1 vlen=2\n" + "\t'p1' type_id=1\n" + "\t'p2' type_id=2", "raw_dump"); /* VAR */ id = btf__add_var(btf, "var1", BTF_VAR_GLOBAL_ALLOCATED, 1); @@ -223,6 +261,8 @@ void test_btf_write() { ASSERT_EQ(btf_kind(t), BTF_KIND_VAR, "var_kind"); ASSERT_EQ(t->type, 1, "var_type"); ASSERT_EQ(btf_var(t)->linkage, BTF_VAR_GLOBAL_ALLOCATED, "var_type"); + ASSERT_STREQ(btf_type_raw_dump(btf, 16), + "[16] VAR 'var1' type_id=1, linkage=global-alloc", "raw_dump"); /* DATASECT */ id = btf__add_datasec(btf, 
"datasec1", 12); @@ -239,6 +279,9 @@ void test_btf_write() { ASSERT_EQ(vi->type, 1, "v1_type"); ASSERT_EQ(vi->offset, 4, "v1_off"); ASSERT_EQ(vi->size, 8, "v1_sz"); + ASSERT_STREQ(btf_type_raw_dump(btf, 17), + "[17] DATASEC 'datasec1' size=12 vlen=1\n" + "\ttype_id=1 offset=4 size=8", "raw_dump"); btf__free(btf); } From d8123624506cd62730c9cd9c7672c698e462703d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:57 -0800 Subject: [PATCH 16/66] libbpf: Fix BTF data layout checks and allow empty BTF Make data section layout checks stricter, disallowing overlap of types and strings data. Additionally, allow BTFs with no type data. There is nothing inherently wrong with having BTF with no types (put potentially with some strings). This could be a situation with kernel module BTFs, if module doesn't introduce any new type information. Also fix invalid offset alignment check for btf->hdr->type_off. Fixes: 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105043402.2530976-8-andrii@kernel.org --- tools/lib/bpf/btf.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 0258cf108c0a..20bb88e71f07 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -245,22 +245,18 @@ static int btf_parse_hdr(struct btf *btf) return -EINVAL; } - if (meta_left < hdr->type_off) { - pr_debug("Invalid BTF type section offset:%u\n", hdr->type_off); + if (meta_left < hdr->str_off + hdr->str_len) { + pr_debug("Invalid BTF total size:%u\n", btf->raw_size); return -EINVAL; } - if (meta_left < hdr->str_off) { - pr_debug("Invalid BTF string section offset:%u\n", hdr->str_off); + if (hdr->type_off + hdr->type_len > hdr->str_off) { + pr_debug("Invalid BTF data sections layout: type data at %u + %u, strings data at %u + %u\n", + hdr->type_off, hdr->type_len, hdr->str_off, hdr->str_len); return -EINVAL; } - if (hdr->type_off >= hdr->str_off) { - pr_debug("BTF type section offset >= string section offset. No type?\n"); - return -EINVAL; - } - - if (hdr->type_off & 0x02) { + if (hdr->type_off % 4) { pr_debug("BTF type section is not aligned to 4 bytes\n"); return -EINVAL; } From f86524efcf9e3f3a7cf75ebcd82cf8f58ec716cc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:58 -0800 Subject: [PATCH 17/66] libbpf: Support BTF dedup of split BTFs Add support for deduplication split BTFs. When deduplicating split BTF, base BTF is considered to be immutable and can't be modified or adjusted. 99% of BTF deduplication logic is left intact (module some type numbering adjustments). There are only two differences. First, each type in base BTF gets hashed (expect VAR and DATASEC, of course, those are always considered to be self-canonical instances) and added into a table of canonical table candidates. Hashing is a shallow, fast operation, so mostly eliminates the overhead of having entire base BTF to be a part of BTF dedup. Second difference is very critical and subtle. While deduplicating split BTF types, it is possible to discover that one of immutable base BTF BTF_KIND_FWD types can and should be resolved to a full STRUCT/UNION type from the split BTF part. This is, obviously, can't happen because we can't modify the base BTF types anymore. 
So because of that, any type in split BTF that directly or indirectly references that newly-to-be-resolved FWD type can't be considered to be equivalent to the corresponding canonical types in base BTF, because that would result in a loss of type resolution information. So in such case, split BTF types will be deduplicated separately and will cause some duplication of type information, which is unavoidable. With those two changes, the rest of the algorithm manages to deduplicate split BTF correctly, pointing all the duplicates to their canonical counter-parts in base BTF, but also is deduplicating whatever unique types are present in split BTF on their own. Also, theoretically, split BTF after deduplication could end up with either empty type section or empty string section. This is handled by libbpf correctly in one of previous patches in the series. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-9-andrii@kernel.org --- tools/lib/bpf/btf.c | 221 +++++++++++++++++++++++++++++++++----------- 1 file changed, 168 insertions(+), 53 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 20bb88e71f07..8d04d9becb67 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2718,6 +2718,7 @@ struct btf_dedup; static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, const struct btf_dedup_opts *opts); static void btf_dedup_free(struct btf_dedup *d); +static int btf_dedup_prep(struct btf_dedup *d); static int btf_dedup_strings(struct btf_dedup *d); static int btf_dedup_prim_types(struct btf_dedup *d); static int btf_dedup_struct_types(struct btf_dedup *d); @@ -2876,6 +2877,11 @@ int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, if (btf_ensure_modifiable(btf)) return -ENOMEM; + err = btf_dedup_prep(d); + if (err) { + pr_debug("btf_dedup_prep failed:%d\n", err); + goto done; + } err = btf_dedup_strings(d); if (err < 0) { pr_debug("btf_dedup_strings failed:%d\n", err); @@ -2938,6 +2944,13 @@ struct btf_dedup { __u32 *hypot_list; size_t hypot_cnt; size_t hypot_cap; + /* Whether hypothetical mapping, if successful, would need to adjust + * already canonicalized types (due to a new forward declaration to + * concrete type resolution). In such case, during split BTF dedup + * candidate type would still be considered as different, because base + * BTF is considered to be immutable. 
+ */ + bool hypot_adjust_canon; /* Various option modifying behavior of algorithm */ struct btf_dedup_opts opts; /* temporary strings deduplication state */ @@ -2985,6 +2998,7 @@ static void btf_dedup_clear_hypot_map(struct btf_dedup *d) for (i = 0; i < d->hypot_cnt; i++) d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID; d->hypot_cnt = 0; + d->hypot_adjust_canon = false; } static void btf_dedup_free(struct btf_dedup *d) @@ -3024,7 +3038,7 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, { struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); hashmap_hash_fn hash_fn = btf_dedup_identity_hash_fn; - int i, err = 0; + int i, err = 0, type_cnt; if (!d) return ERR_PTR(-ENOMEM); @@ -3044,14 +3058,15 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, goto done; } - d->map = malloc(sizeof(__u32) * (1 + btf->nr_types)); + type_cnt = btf__get_nr_types(btf) + 1; + d->map = malloc(sizeof(__u32) * type_cnt); if (!d->map) { err = -ENOMEM; goto done; } /* special BTF "void" type is made canonical immediately */ d->map[0] = 0; - for (i = 1; i <= btf->nr_types; i++) { + for (i = 1; i < type_cnt; i++) { struct btf_type *t = btf_type_by_id(d->btf, i); /* VAR and DATASEC are never deduped and are self-canonical */ @@ -3061,12 +3076,12 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, d->map[i] = BTF_UNPROCESSED_ID; } - d->hypot_map = malloc(sizeof(__u32) * (1 + btf->nr_types)); + d->hypot_map = malloc(sizeof(__u32) * type_cnt); if (!d->hypot_map) { err = -ENOMEM; goto done; } - for (i = 0; i <= btf->nr_types; i++) + for (i = 0; i < type_cnt; i++) d->hypot_map[i] = BTF_UNPROCESSED_ID; done: @@ -3090,8 +3105,8 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) int i, j, r, rec_size; struct btf_type *t; - for (i = 1; i <= d->btf->nr_types; i++) { - t = btf_type_by_id(d->btf, i); + for (i = 0; i < d->btf->nr_types; i++) { + t = btf_type_by_id(d->btf, d->btf->start_id + i); r = fn(&t->name_off, ctx); if (r) return r; @@ -3174,15 +3189,27 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) { struct btf_dedup *d = ctx; + __u32 str_off = *str_off_ptr; long old_off, new_off, len; const char *s; void *p; int err; - if (*str_off_ptr == 0) + /* don't touch empty string or string in main BTF */ + if (str_off == 0 || str_off < d->btf->start_str_off) return 0; - s = btf__str_by_offset(d->btf, *str_off_ptr); + s = btf__str_by_offset(d->btf, str_off); + if (d->btf->base_btf) { + err = btf__find_str(d->btf->base_btf, s); + if (err >= 0) { + *str_off_ptr = err; + return 0; + } + if (err != -ENOENT) + return err; + } + len = strlen(s) + 1; new_off = d->strs_len; @@ -3199,11 +3226,11 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off, HASHMAP_ADD, (const void **)&old_off, NULL); if (err == -EEXIST) { - *str_off_ptr = old_off; + *str_off_ptr = d->btf->start_str_off + old_off; } else if (err) { return err; } else { - *str_off_ptr = new_off; + *str_off_ptr = d->btf->start_str_off + new_off; d->strs_len += len; } return 0; @@ -3228,13 +3255,6 @@ static int btf_dedup_strings(struct btf_dedup *d) if (d->btf->strs_deduped) return 0; - s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); - if (!s) - return -ENOMEM; - /* initial empty string */ - s[0] = 0; - d->strs_len = 1; - /* 
temporarily switch to use btf_dedup's strs_data for strings for hash * functions; later we'll just transfer hashmap to struct btf as is, * along the strs_data @@ -3248,13 +3268,22 @@ static int btf_dedup_strings(struct btf_dedup *d) goto err_out; } - /* insert empty string; we won't be looking it up during strings - * dedup, but it's good to have it for generic BTF string lookups - */ - err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, - HASHMAP_ADD, NULL, NULL); - if (err) - goto err_out; + if (!d->btf->base_btf) { + s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); + if (!s) + return -ENOMEM; + /* initial empty string */ + s[0] = 0; + d->strs_len = 1; + + /* insert empty string; we won't be looking it up during strings + * dedup, but it's good to have it for generic BTF string lookups + */ + err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, + HASHMAP_ADD, NULL, NULL); + if (err) + goto err_out; + } /* remap string offsets */ err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d); @@ -3549,6 +3578,66 @@ static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) return true; } +/* Prepare split BTF for deduplication by calculating hashes of base BTF's + * types and initializing the rest of the state (canonical type mapping) for + * the fixed base BTF part. + */ +static int btf_dedup_prep(struct btf_dedup *d) +{ + struct btf_type *t; + int type_id; + long h; + + if (!d->btf->base_btf) + return 0; + + for (type_id = 1; type_id < d->btf->start_id; type_id++) { + t = btf_type_by_id(d->btf, type_id); + + /* all base BTF types are self-canonical by definition */ + d->map[type_id] = type_id; + + switch (btf_kind(t)) { + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + /* VAR and DATASEC are never hash/deduplicated */ + continue; + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_FWD: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + h = btf_hash_common(t); + break; + case BTF_KIND_INT: + h = btf_hash_int(t); + break; + case BTF_KIND_ENUM: + h = btf_hash_enum(t); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + h = btf_hash_struct(t); + break; + case BTF_KIND_ARRAY: + h = btf_hash_array(t); + break; + case BTF_KIND_FUNC_PROTO: + h = btf_hash_fnproto(t); + break; + default: + pr_debug("unknown kind %d for type [%d]\n", btf_kind(t), type_id); + return -EINVAL; + } + if (btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + } + + return 0; +} + /* * Deduplicate primitive types, that can't reference other types, by calculating * their type signature hash and comparing them with any possible canonical @@ -3642,8 +3731,8 @@ static int btf_dedup_prim_types(struct btf_dedup *d) { int i, err; - for (i = 1; i <= d->btf->nr_types; i++) { - err = btf_dedup_prim_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_prim_type(d, d->btf->start_id + i); if (err) return err; } @@ -3833,6 +3922,9 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, } else { real_kind = cand_kind; fwd_kind = btf_fwd_kind(canon_type); + /* we'd need to resolve base FWD to STRUCT/UNION */ + if (fwd_kind == real_kind && canon_id < d->btf->start_id) + d->hypot_adjust_canon = true; } return fwd_kind == real_kind; } @@ -3870,8 +3962,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return 0; cand_arr = btf_array(cand_type); canon_arr = btf_array(canon_type); - eq = btf_dedup_is_equiv(d, - cand_arr->index_type, canon_arr->index_type); + eq = 
btf_dedup_is_equiv(d, cand_arr->index_type, canon_arr->index_type); if (eq <= 0) return eq; return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type); @@ -3954,16 +4045,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, */ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) { - __u32 cand_type_id, targ_type_id; + __u32 canon_type_id, targ_type_id; __u16 t_kind, c_kind; __u32 t_id, c_id; int i; for (i = 0; i < d->hypot_cnt; i++) { - cand_type_id = d->hypot_list[i]; - targ_type_id = d->hypot_map[cand_type_id]; + canon_type_id = d->hypot_list[i]; + targ_type_id = d->hypot_map[canon_type_id]; t_id = resolve_type_id(d, targ_type_id); - c_id = resolve_type_id(d, cand_type_id); + c_id = resolve_type_id(d, canon_type_id); t_kind = btf_kind(btf__type_by_id(d->btf, t_id)); c_kind = btf_kind(btf__type_by_id(d->btf, c_id)); /* @@ -3978,9 +4069,26 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) * stability is not a requirement for STRUCT/UNION equivalence * checks, though. */ + + /* if it's the split BTF case, we still need to point base FWD + * to STRUCT/UNION in a split BTF, because FWDs from split BTF + * will be resolved against base FWD. If we don't point base + * canonical FWD to the resolved STRUCT/UNION, then all the + * FWDs in split BTF won't be correctly resolved to a proper + * STRUCT/UNION. + */ if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD) d->map[c_id] = t_id; - else if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) + + /* if graph equivalence determined that we'd need to adjust + * base canonical types, then we need to only point base FWDs + * to STRUCTs/UNIONs and do no more modifications. For all + * other purposes the type graphs were not equivalent. + */ + if (d->hypot_adjust_canon) + continue; + + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) d->map[t_id] = c_id; if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) && @@ -4064,8 +4172,10 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) return eq; if (!eq) continue; - new_id = cand_id; btf_dedup_merge_hypot_map(d); + if (d->hypot_adjust_canon) /* not really equivalent */ + continue; + new_id = cand_id; break; } @@ -4080,8 +4190,8 @@ static int btf_dedup_struct_types(struct btf_dedup *d) { int i, err; - for (i = 1; i <= d->btf->nr_types; i++) { - err = btf_dedup_struct_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_struct_type(d, d->btf->start_id + i); if (err) return err; } @@ -4224,8 +4334,8 @@ static int btf_dedup_ref_types(struct btf_dedup *d) { int i, err; - for (i = 1; i <= d->btf->nr_types; i++) { - err = btf_dedup_ref_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_ref_type(d, d->btf->start_id + i); if (err < 0) return err; } @@ -4249,39 +4359,44 @@ static int btf_dedup_ref_types(struct btf_dedup *d) static int btf_dedup_compact_types(struct btf_dedup *d) { __u32 *new_offs; - __u32 next_type_id = 1; + __u32 next_type_id = d->btf->start_id; + const struct btf_type *t; void *p; - int i, len; + int i, id, len; /* we are going to reuse hypot_map to store compaction remapping */ d->hypot_map[0] = 0; - for (i = 1; i <= d->btf->nr_types; i++) - d->hypot_map[i] = BTF_UNPROCESSED_ID; + /* base BTF types are not renumbered */ + for (id = 1; id < d->btf->start_id; id++) + d->hypot_map[id] = id; + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) + d->hypot_map[id] = BTF_UNPROCESSED_ID; p = d->btf->types_data; - for (i = 1; i <= d->btf->nr_types; i++) { - if (d->map[i] != i) + 
for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) { + if (d->map[id] != id) continue; - len = btf_type_size(btf__type_by_id(d->btf, i)); + t = btf__type_by_id(d->btf, id); + len = btf_type_size(t); if (len < 0) return len; - memmove(p, btf__type_by_id(d->btf, i), len); - d->hypot_map[i] = next_type_id; - d->btf->type_offs[next_type_id - 1] = p - d->btf->types_data; + memmove(p, t, len); + d->hypot_map[id] = next_type_id; + d->btf->type_offs[next_type_id - d->btf->start_id] = p - d->btf->types_data; p += len; next_type_id++; } /* shrink struct btf's internal types index and update btf_header */ - d->btf->nr_types = next_type_id - 1; + d->btf->nr_types = next_type_id - d->btf->start_id; d->btf->type_offs_cap = d->btf->nr_types; d->btf->hdr->type_len = p - d->btf->types_data; new_offs = libbpf_reallocarray(d->btf->type_offs, d->btf->type_offs_cap, sizeof(*new_offs)); - if (!new_offs) + if (d->btf->type_offs_cap && !new_offs) return -ENOMEM; d->btf->type_offs = new_offs; d->btf->hdr->str_off = d->btf->hdr->type_len; @@ -4413,8 +4528,8 @@ static int btf_dedup_remap_types(struct btf_dedup *d) { int i, r; - for (i = 1; i <= d->btf->nr_types; i++) { - r = btf_dedup_remap_type(d, i); + for (i = 0; i < d->btf->nr_types; i++) { + r = btf_dedup_remap_type(d, d->btf->start_id + i); if (r < 0) return r; } From 6b6e6b1d09aa20c351a1fce0ea6402da436624a4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:33:59 -0800 Subject: [PATCH 18/66] libbpf: Accomodate DWARF/compiler bug with duplicated identical arrays In some cases compiler seems to generate distinct DWARF types for identical arrays within the same CU. That seems like a bug, but it's already out there and breaks type graph equivalence checks, so accommodate it anyway by checking for identical arrays, regardless of their type ID. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-10-andrii@kernel.org --- tools/lib/bpf/btf.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 8d04d9becb67..2d0d064c6d31 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -3785,6 +3785,19 @@ static inline __u16 btf_fwd_kind(struct btf_type *t) return btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT; } +/* Check if given two types are identical ARRAY definitions */ +static int btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + struct btf_type *t1, *t2; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + if (!btf_is_array(t1) || !btf_is_array(t2)) + return 0; + + return btf_equal_array(t1, t2); +} + /* * Check equivalence of BTF type graph formed by candidate struct/union (we'll * call it "candidate graph" in this description for brevity) to a type graph @@ -3895,8 +3908,18 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, canon_id = resolve_fwd_id(d, canon_id); hypot_type_id = d->hypot_map[canon_id]; - if (hypot_type_id <= BTF_MAX_NR_TYPES) - return hypot_type_id == cand_id; + if (hypot_type_id <= BTF_MAX_NR_TYPES) { + /* In some cases compiler will generate different DWARF types + * for *identical* array type definitions and use them for + * different fields within the *same* struct. This breaks type + * equivalence check, which makes an assumption that candidate + * types sub-graph has a consistent and deduped-by-compiler + * types within a single CU. 
So work around that by explicitly + * allowing identical array types here. + */ + return hypot_type_id == cand_id || + btf_dedup_identical_arrays(d, hypot_type_id, cand_id); + } if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) return -ENOMEM; From 232338fa2fb47726ab7c459419115a6ab6bfb3e3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:34:00 -0800 Subject: [PATCH 19/66] selftests/bpf: Add split BTF dedup selftests Add selftests validating BTF deduplication for split BTF case. Add a helper macro that allows to validate entire BTF with raw BTF dump, not just type-by-type. This saves tons of code and complexity. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-11-andrii@kernel.org --- tools/testing/selftests/bpf/btf_helpers.c | 59 ++++ tools/testing/selftests/bpf/btf_helpers.h | 7 + .../bpf/prog_tests/btf_dedup_split.c | 325 ++++++++++++++++++ 3 files changed, 391 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c diff --git a/tools/testing/selftests/bpf/btf_helpers.c b/tools/testing/selftests/bpf/btf_helpers.c index abc3f6c04cfc..48f90490f922 100644 --- a/tools/testing/selftests/bpf/btf_helpers.c +++ b/tools/testing/selftests/bpf/btf_helpers.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include "test_progs.h" static const char * const btf_kind_str_mapping[] = { [BTF_KIND_UNKN] = "UNKNOWN", @@ -198,3 +200,60 @@ const char *btf_type_raw_dump(const struct btf *btf, int type_id) return buf; } + +int btf_validate_raw(struct btf *btf, int nr_types, const char *exp_types[]) +{ + int i; + bool ok = true; + + ASSERT_EQ(btf__get_nr_types(btf), nr_types, "btf_nr_types"); + + for (i = 1; i <= nr_types; i++) { + if (!ASSERT_STREQ(btf_type_raw_dump(btf, i), exp_types[i - 1], "raw_dump")) + ok = false; + } + + return ok; +} + +static void btf_dump_printf(void *ctx, const char *fmt, va_list args) +{ + vfprintf(ctx, fmt, args); +} + +/* Print BTF-to-C dump into a local buffer and return string pointer back. + * Buffer *will* be overwritten by subsequent btf_type_raw_dump() calls + */ +const char *btf_type_c_dump(const struct btf *btf) +{ + static char buf[16 * 1024]; + FILE *buf_file; + struct btf_dump *d = NULL; + struct btf_dump_opts opts = {}; + int err, i; + + buf_file = fmemopen(buf, sizeof(buf) - 1, "w"); + if (!buf_file) { + fprintf(stderr, "Failed to open memstream: %d\n", errno); + return NULL; + } + + opts.ctx = buf_file; + d = btf_dump__new(btf, NULL, &opts, btf_dump_printf); + if (libbpf_get_error(d)) { + fprintf(stderr, "Failed to create btf_dump instance: %ld\n", libbpf_get_error(d)); + return NULL; + } + + for (i = 1; i <= btf__get_nr_types(btf); i++) { + err = btf_dump__dump_type(d, i); + if (err) { + fprintf(stderr, "Failed to dump type [%d]: %d\n", i, err); + return NULL; + } + } + + fflush(buf_file); + fclose(buf_file); + return buf; +} diff --git a/tools/testing/selftests/bpf/btf_helpers.h b/tools/testing/selftests/bpf/btf_helpers.h index 2c9ce1b61dc9..295c0137d9bd 100644 --- a/tools/testing/selftests/bpf/btf_helpers.h +++ b/tools/testing/selftests/bpf/btf_helpers.h @@ -8,5 +8,12 @@ int fprintf_btf_type_raw(FILE *out, const struct btf *btf, __u32 id); const char *btf_type_raw_dump(const struct btf *btf, int type_id); +int btf_validate_raw(struct btf *btf, int nr_types, const char *exp_types[]); +#define VALIDATE_RAW_BTF(btf, raw_types...) 
\ + btf_validate_raw(btf, \ + sizeof((const char *[]){raw_types})/sizeof(void *),\ + (const char *[]){raw_types}) + +const char *btf_type_c_dump(const struct btf *btf); #endif diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c new file mode 100644 index 000000000000..64554fd33547 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include "btf_helpers.h" + +static void test_split_simple() { + const struct btf_type *t; + struct btf *btf1, *btf2; + int str_off, err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 1); /* [2] ptr to int */ + btf__add_struct(btf1, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ + /* } */ + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0"); + + ASSERT_STREQ(btf_type_c_dump(btf1), "\ +struct s1 {\n\ + int f1;\n\ +};\n\n", "c_dump"); + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + /* pointer size should be "inherited" from main BTF */ + ASSERT_EQ(btf__pointer_size(btf2), 8, "inherit_ptr_sz"); + + str_off = btf__find_str(btf2, "int"); + ASSERT_NEQ(str_off, -ENOENT, "str_int_missing"); + + t = btf__type_by_id(btf2, 1); + if (!ASSERT_OK_PTR(t, "int_type")) + goto cleanup; + ASSERT_EQ(btf_is_int(t), true, "int_kind"); + ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name"); + + btf__add_struct(btf2, "s2", 16); /* [4] struct s2 { */ + btf__add_field(btf2, "f1", 6, 0, 0); /* struct s1 f1; */ + btf__add_field(btf2, "f2", 5, 32, 0); /* int f2; */ + btf__add_field(btf2, "f3", 2, 64, 0); /* int *f3; */ + /* } */ + + /* duplicated int */ + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [5] int */ + + /* duplicated struct s1 */ + btf__add_struct(btf2, "s1", 4); /* [6] struct s1 { */ + btf__add_field(btf2, "f1", 5, 0, 0); /* int f1; */ + /* } */ + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=16 vlen=3\n" + "\t'f1' type_id=6 bits_offset=0\n" + "\t'f2' type_id=5 bits_offset=32\n" + "\t'f3' type_id=2 bits_offset=64", + "[5] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[6] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=5 bits_offset=0"); + + ASSERT_STREQ(btf_type_c_dump(btf2), "\ +struct s1 {\n\ + int f1;\n\ +};\n\ +\n\ +struct s1___2 {\n\ + int f1;\n\ +};\n\ +\n\ +struct s2 {\n\ + struct s1___2 f1;\n\ + int f2;\n\ + int *f3;\n\ +};\n\n", "c_dump"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=16 vlen=3\n" + "\t'f1' type_id=3 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32\n" + "\t'f3' type_id=2 bits_offset=64"); + + ASSERT_STREQ(btf_type_c_dump(btf2), "\ +struct s1 
{\n\ + int f1;\n\ +};\n\ +\n\ +struct s2 {\n\ + struct s1 f1;\n\ + int f2;\n\ + int *f3;\n\ +};\n\n", "c_dump"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + +static void test_split_fwd_resolve() { + struct btf *btf1, *btf2; + int err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 4); /* [2] ptr to struct s1 */ + btf__add_ptr(btf1, 5); /* [3] ptr to struct s2 */ + btf__add_struct(btf1, "s1", 16); /* [4] struct s1 { */ + btf__add_field(btf1, "f1", 2, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf1, "f2", 3, 64, 0); /* struct s2 *f2; */ + /* } */ + btf__add_struct(btf1, "s2", 4); /* [5] struct s2 { */ + btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ + /* } */ + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=4", + "[3] PTR '(anon)' type_id=5", + "[4] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=3 bits_offset=64", + "[5] STRUCT 's2' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0"); + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [6] int */ + btf__add_ptr(btf2, 10); /* [7] ptr to struct s1 */ + btf__add_fwd(btf2, "s2", BTF_FWD_STRUCT); /* [8] fwd for struct s2 */ + btf__add_ptr(btf2, 8); /* [9] ptr to fwd struct s2 */ + btf__add_struct(btf2, "s1", 16); /* [10] struct s1 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf2, "f2", 9, 64, 0); /* struct s2 *f2; */ + /* } */ + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=4", + "[3] PTR '(anon)' type_id=5", + "[4] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=3 bits_offset=64", + "[5] STRUCT 's2' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0", + "[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[7] PTR '(anon)' type_id=10", + "[8] FWD 's2' fwd_kind=struct", + "[9] PTR '(anon)' type_id=8", + "[10] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=9 bits_offset=64"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=4", + "[3] PTR '(anon)' type_id=5", + "[4] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=3 bits_offset=64", + "[5] STRUCT 's2' size=4 vlen=1\n" + "\t'f1' type_id=1 bits_offset=0"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + +static void test_split_struct_duped() { + struct btf *btf1, *btf2; + int err; + + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf__set_pointer_size(btf1, 8); /* enforce 64-bit arch */ + + btf__add_int(btf1, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf1, 5); /* [2] ptr to struct s1 */ + btf__add_fwd(btf1, "s2", BTF_FWD_STRUCT); /* [3] fwd for struct s2 */ + btf__add_ptr(btf1, 3); /* [4] ptr to fwd struct s2 */ + btf__add_struct(btf1, "s1", 16); /* [5] struct s1 { */ + btf__add_field(btf1, "f1", 2, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf1, "f2", 4, 64, 0); /* struct s2 *f2; */ + /* } */ + + VALIDATE_RAW_BTF( + btf1, + 
"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=5", + "[3] FWD 's2' fwd_kind=struct", + "[4] PTR '(anon)' type_id=3", + "[5] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=64"); + + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [6] int */ + btf__add_ptr(btf2, 10); /* [7] ptr to struct s1 */ + btf__add_fwd(btf2, "s2", BTF_FWD_STRUCT); /* [8] fwd for struct s2 */ + btf__add_ptr(btf2, 11); /* [9] ptr to struct s2 */ + btf__add_struct(btf2, "s1", 16); /* [10] struct s1 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf2, "f2", 9, 64, 0); /* struct s2 *f2; */ + /* } */ + btf__add_struct(btf2, "s2", 40); /* [11] struct s2 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 *f1; */ + btf__add_field(btf2, "f2", 9, 64, 0); /* struct s2 *f2; */ + btf__add_field(btf2, "f3", 6, 128, 0); /* int f3; */ + btf__add_field(btf2, "f4", 10, 192, 0); /* struct s1 f4; */ + /* } */ + btf__add_ptr(btf2, 8); /* [12] ptr to fwd struct s2 */ + btf__add_struct(btf2, "s3", 8); /* [13] struct s3 { */ + btf__add_field(btf2, "f1", 12, 0, 0); /* struct s2 *f1; (fwd) */ + /* } */ + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=5", + "[3] FWD 's2' fwd_kind=struct", + "[4] PTR '(anon)' type_id=3", + "[5] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=64", + "[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[7] PTR '(anon)' type_id=10", + "[8] FWD 's2' fwd_kind=struct", + "[9] PTR '(anon)' type_id=11", + "[10] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=9 bits_offset=64", + "[11] STRUCT 's2' size=40 vlen=4\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=9 bits_offset=64\n" + "\t'f3' type_id=6 bits_offset=128\n" + "\t'f4' type_id=10 bits_offset=192", + "[12] PTR '(anon)' type_id=8", + "[13] STRUCT 's3' size=8 vlen=1\n" + "\t'f1' type_id=12 bits_offset=0"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=5", + "[3] FWD 's2' fwd_kind=struct", + "[4] PTR '(anon)' type_id=3", + "[5] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=2 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=64", + "[6] PTR '(anon)' type_id=8", + "[7] PTR '(anon)' type_id=9", + "[8] STRUCT 's1' size=16 vlen=2\n" + "\t'f1' type_id=6 bits_offset=0\n" + "\t'f2' type_id=7 bits_offset=64", + "[9] STRUCT 's2' size=40 vlen=4\n" + "\t'f1' type_id=6 bits_offset=0\n" + "\t'f2' type_id=7 bits_offset=64\n" + "\t'f3' type_id=1 bits_offset=128\n" + "\t'f4' type_id=8 bits_offset=192", + "[10] STRUCT 's3' size=8 vlen=1\n" + "\t'f1' type_id=7 bits_offset=0"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + +void test_btf_dedup_split() +{ + if (test__start_subtest("split_simple")) + test_split_simple(); + if (test__start_subtest("split_struct_duped")) + test_split_struct_duped(); + if (test__start_subtest("split_fwd_resolve")) + test_split_fwd_resolve(); +} From 75fa1777694c245c1e59ac774cb1d58a15ecefeb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 4 Nov 2020 20:34:01 -0800 Subject: [PATCH 20/66] tools/bpftool: Add bpftool support for split BTF Add ability to 
work with split BTF by providing extra -B flag, which allows to specify the path to the base BTF file. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105043402.2530976-12-andrii@kernel.org --- tools/bpf/bpftool/btf.c | 9 ++++++--- tools/bpf/bpftool/main.c | 15 ++++++++++++++- tools/bpf/bpftool/main.h | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 8ab142ff5eac..c96b56e8e3a4 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -358,8 +358,12 @@ static int dump_btf_raw(const struct btf *btf, } } else { int cnt = btf__get_nr_types(btf); + int start_id = 1; - for (i = 1; i <= cnt; i++) { + if (base_btf) + start_id = btf__get_nr_types(base_btf) + 1; + + for (i = start_id; i <= cnt; i++) { t = btf__type_by_id(btf, i); dump_btf_type(btf, i, t); } @@ -438,7 +442,6 @@ static int do_dump(int argc, char **argv) return -1; } src = GET_ARG(); - if (is_prefix(src, "map")) { struct bpf_map_info info = {}; __u32 len = sizeof(info); @@ -499,7 +502,7 @@ static int do_dump(int argc, char **argv) } NEXT_ARG(); } else if (is_prefix(src, "file")) { - btf = btf__parse(*argv, NULL); + btf = btf__parse_split(*argv, base_btf); if (IS_ERR(btf)) { err = -PTR_ERR(btf); btf = NULL; diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 682daaa49e6a..b86f450e6fce 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -11,6 +11,7 @@ #include #include +#include #include "main.h" @@ -28,6 +29,7 @@ bool show_pinned; bool block_mount; bool verifier_logs; bool relaxed_maps; +struct btf *base_btf; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; struct pinned_obj_table link_table; @@ -391,6 +393,7 @@ int main(int argc, char **argv) { "mapcompat", no_argument, NULL, 'm' }, { "nomount", no_argument, NULL, 'n' }, { "debug", no_argument, NULL, 'd' }, + { "base-btf", required_argument, NULL, 'B' }, { 0 } }; int opt, ret; @@ -407,7 +410,7 @@ int main(int argc, char **argv) hash_init(link_table.table); opterr = 0; - while ((opt = getopt_long(argc, argv, "Vhpjfmnd", + while ((opt = getopt_long(argc, argv, "VhpjfmndB:", options, NULL)) >= 0) { switch (opt) { case 'V': @@ -441,6 +444,15 @@ int main(int argc, char **argv) libbpf_set_print(print_all_levels); verifier_logs = true; break; + case 'B': + base_btf = btf__parse(optarg, NULL); + if (libbpf_get_error(base_btf)) { + p_err("failed to parse base BTF at '%s': %ld\n", + optarg, libbpf_get_error(base_btf)); + base_btf = NULL; + return -1; + } + break; default: p_err("unrecognized option '%s'", argv[optind - 1]); if (json_output) @@ -465,6 +477,7 @@ int main(int argc, char **argv) delete_pinned_obj_table(&map_table); delete_pinned_obj_table(&link_table); } + btf__free(base_btf); return ret; } diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index c46e52137b87..76e91641262b 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -90,6 +90,7 @@ extern bool show_pids; extern bool block_mount; extern bool verifier_logs; extern bool relaxed_maps; +extern struct btf *base_btf; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; extern struct pinned_obj_table link_table; From c6bde958a62b8ca5ee8d2c1fe429aec4ad54efad Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Thu, 29 Oct 2020 21:14:42 +0100 Subject: [PATCH 21/66] bpf: Lift hashtab key_size limit Currently key_size of hashtab is limited to 
MAX_BPF_STACK. As the key of hashtab can also be a value from a per cpu map it can be larger than MAX_BPF_STACK. The use-case for this patch originates to implement allow/disallow lists for files and file paths. The maximum length of file paths is defined by PATH_MAX with 4096 chars including nul. This limit exceeds MAX_BPF_STACK. Changelog: v5: - Fix cast overflow v4: - Utilize BPF skeleton in tests - Rebase v3: - Rebase v2: - Add a test for bpf side Signed-off-by: Florian Lehner Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201029201442.596690-1-dev@der-flo.net --- kernel/bpf/hashtab.c | 16 +++---- .../selftests/bpf/prog_tests/hash_large_key.c | 43 ++++++++++++++++++ .../selftests/bpf/progs/test_hash_large_key.c | 44 +++++++++++++++++++ tools/testing/selftests/bpf/test_maps.c | 3 +- 4 files changed, 94 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/hash_large_key.c create mode 100644 tools/testing/selftests/bpf/progs/test_hash_large_key.c diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 23f73d4649c9..7bf18d92af41 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -415,17 +415,11 @@ static int htab_map_alloc_check(union bpf_attr *attr) attr->value_size == 0) return -EINVAL; - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return -E2BIG; - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's + if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - + sizeof(struct htab_elem)) + /* if key_size + value_size is bigger, the user space won't be + * able to access the elements via bpf syscall. 
This check + * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; diff --git a/tools/testing/selftests/bpf/prog_tests/hash_large_key.c b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c new file mode 100644 index 000000000000..34684c0fc76d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "test_hash_large_key.skel.h" + +void test_hash_large_key(void) +{ + int err, value = 21, duration = 0, hash_map_fd; + struct test_hash_large_key *skel; + + struct bigelement { + int a; + char b[4096]; + long long c; + } key; + bzero(&key, sizeof(key)); + + skel = test_hash_large_key__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + hash_map_fd = bpf_map__fd(skel->maps.hash_map); + if (CHECK(hash_map_fd < 0, "bpf_map__fd", "failed\n")) + goto cleanup; + + err = test_hash_large_key__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + err = bpf_map_update_elem(hash_map_fd, &key, &value, BPF_ANY); + if (CHECK(err, "bpf_map_update_elem", "errno=%d\n", errno)) + goto cleanup; + + key.c = 1; + err = bpf_map_lookup_elem(hash_map_fd, &key, &value); + if (CHECK(err, "bpf_map_lookup_elem", "errno=%d\n", errno)) + goto cleanup; + + CHECK_FAIL(value != 42); + +cleanup: + test_hash_large_key__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_hash_large_key.c b/tools/testing/selftests/bpf/progs/test_hash_large_key.c new file mode 100644 index 000000000000..473a22794a62 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_hash_large_key.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __type(key, struct bigelement); + __type(value, __u32); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct bigelement); +} key_map SEC(".maps"); + +struct bigelement { + int a; + char b[4096]; + long long c; +}; + +SEC("raw_tracepoint/sys_enter") +int bpf_hash_large_key_test(void *ctx) +{ + int zero = 0, err = 1, value = 42; + struct bigelement *key; + + key = bpf_map_lookup_elem(&key_map, &zero); + if (!key) + return 0; + + key->c = 1; + if (bpf_map_update_elem(&hash_map, key, &value, BPF_ANY)) + return 0; + + return 0; +} + diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 0d92ebcb335d..0ad3e6305ff0 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1223,9 +1223,10 @@ out_map_in_map: static void test_map_large(void) { + struct bigkey { int a; - char b[116]; + char b[4096]; long long c; } key; int fd, i, value; From 9e7a4d9831e836eb03dedab89902277ee94eb7a6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:39 +0000 Subject: [PATCH 22/66] bpf: Allow LSM programs to use bpf spin locks Usage of spin locks was not allowed for tracing programs due to insufficient preemption checks. The verifier does not currently prevent LSM programs from using spin locks, but the helpers are not exposed via bpf_lsm_func_proto. Based on the discussion in [1], non-sleepable LSM programs should be able to use bpf_spin_{lock, unlock}. 
Sleepable LSM programs can be preempted which means that allowng spin locks will need more work (disabling preemption and the verifier ensuring that no sleepable helpers are called when a spin lock is held). [1]: https://lore.kernel.org/bpf/20201103153132.2717326-1-kpsingh@chromium.org/T/#md601a053229287659071600d3483523f752cd2fb Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-2-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 4 ++++ kernel/bpf/verifier.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..cd8a617f2109 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -59,6 +59,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..f863aa84d0a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9719,11 +9719,21 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog_type) || - prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; + if (map_value_has_spin_lock(map)) { + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (prog->aux->sleepable) { + verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && From 4cf1bc1f10452065a29d576fc5693fc4fab5b919 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:40 +0000 Subject: [PATCH 23/66] bpf: Implement task local storage Similar to bpf_local_storage for sockets and inodes add local storage for task_struct. The life-cycle of storage is managed with the life-cycle of the task_struct. i.e. the storage is destroyed along with the owning task with a callback to the bpf_task_storage_free from the task_free LSM hook. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. The userspace map operations can be done by using a pid fd as a key passed to the lookup, update and delete operations. 
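For illustration only (this sketch is not part of the patch's diff; the
bpf_get_current_task_btf() helper and the chosen LSM hook are assumptions
made for the example), a non-sleepable BPF LSM program could use the new
map type and helpers roughly as follows:

/* sketch: per-task exec counter using task local storage
 * (needs vmlinux.h, bpf_helpers.h and bpf_tracing.h)
 */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} exec_cnt SEC(".maps");

SEC("lsm/bprm_committed_creds")
int BPF_PROG(on_exec, struct linux_binprm *bprm)
{
	__u64 *cnt;

	/* get, or create on first use, this task's storage for this map */
	cnt = bpf_task_storage_get(&exec_cnt, bpf_get_current_task_btf(), 0,
				   BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (cnt)
		(*cnt)++;
	return 0;
}

User space can then read the same element by passing a pidfd for the task
as the key, e.g. bpf_map_lookup_elem(map_fd, &pidfd, &val).
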
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-3-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 23 +++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 39 ++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 4 + kernel/bpf/bpf_task_storage.c | 315 +++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 ++ security/bpf/hooks.c | 2 + tools/include/uapi/linux/bpf.h | 39 ++++ 10 files changed, 436 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_task_storage.c diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87..73226181b744 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2e6f568377f1..99f7fd657d87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. 
+ * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..f0b93ced5a7f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_i obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index cd8a617f2109..e92c51bebb47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,6 +63,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 000000000000..39a45fba4fb0 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. 
+ */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. 
+ */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8f50c9c19f1b..f3fe9f53f93c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f863aa84d0a2..00960f6a83ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4469,6 +4469,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -4547,6 +4552,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 788667d582ae..e5971fa74fd7 100644 --- 
a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -12,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { #include #undef LSM_HOOK LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), + LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; static int __init bpf_lsm_init(void) @@ -23,6 +24,7 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), + .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 8885274d225985807deeb95de74642073c3b97bb Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:41 +0000 Subject: [PATCH 24/66] libbpf: Add support for task local storage Updates the bpf_probe_map_type API to also support BPF_MAP_TYPE_TASK_STORAGE similar to other local storage maps. 
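For illustration only (not part of this patch), a minimal userspace probe could look
roughly like the sketch below. It assumes libbpf's existing bpf_probe_map_type()
API and headers that already define BPF_MAP_TYPE_TASK_STORAGE:

  #include <stdio.h>
  #include <linux/bpf.h>
  #include <bpf/libbpf.h>

  int main(void)
  {
          /* second argument is an ifindex, only relevant for offloaded maps */
          if (bpf_probe_map_type(BPF_MAP_TYPE_TASK_STORAGE, 0))
                  printf("task local storage maps are supported\n");
          else
                  printf("task local storage maps are not supported\n");
          return 0;
  }
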
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-4-kpsingh@chromium.org --- tools/lib/bpf/libbpf_probes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 5482a9b7ae2d..ecaae2927ab8 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -230,6 +230,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) break; case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: btf_key_type_id = 1; btf_value_type_id = 3; value_size = 8; From 864ab0616dccf14e51618a24acc7cf23ddd2b98a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:42 +0000 Subject: [PATCH 25/66] bpftool: Add support for task local storage Updates the binary to handle the BPF_MAP_TYPE_TASK_STORAGE as "task_storage" for printing and parsing. Also updates the documentation and bash completion Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-5-kpsingh@chromium.org --- tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++- tools/bpf/bpftool/bash-completion/bpftool | 2 +- tools/bpf/bpftool/map.c | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index dade10cdf295..3d52256ba75f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -50,7 +50,8 @@ MAP COMMANDS | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** } +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** + | **task_storage** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 3f1da30c4da6..fdffbc64c65c 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -705,7 +705,7 @@ _bpftool() hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ percpu_cgroup_storage queue stack sk_storage \ - struct_ops inode_storage' -- \ + struct_ops inode_storage task_storage' -- \ "$cur" ) ) return 0 ;; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index a7efbd84fbcc..b400364ee054 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -51,6 +51,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", [BPF_MAP_TYPE_RINGBUF] = "ringbuf", [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", }; const size_t map_type_name_size = ARRAY_SIZE(map_type_name); @@ -1464,7 +1465,8 @@ static int do_help(int argc, char **argv) " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" - " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage }\n" + " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" 
+ " task_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); From 3ca1032ab7ab010eccb107aa515598788f7d93bb Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:43 +0000 Subject: [PATCH 26/66] bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID The currently available bpf_get_current_task returns an unsigned integer which can be used along with BPF_CORE_READ to read data from the task_struct but still cannot be used as an input argument to a helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct. In order to implement this helper a new return type, RET_PTR_TO_BTF_ID, is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not require checking the nullness of returned pointer. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-6-kpsingh@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/verifier.c | 7 +++++-- kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fffd30e13ac..73d5381a5d5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 00960f6a83ec..10da26e55130 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5186,11 +5186,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? 
+ PTR_TO_BTF_ID : + PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %d of func %s#%d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..e4515b0f62a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1022,6 +1022,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1265,6 +1279,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From f0e5ba0bc481df77cf0afac2b33e420b33eeb463 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:44 +0000 Subject: [PATCH 27/66] bpf: Fix tests for local_storage The {inode,sk}_storage_result checking if the correct value was retrieved was being clobbered unconditionally by the return value of the bpf_{inode,sk}_storage_delete call. Also, consistently use the newly added BPF_LOCAL_STORAGE_GET_F_CREATE flag. 
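Condensed from the diff below, the corrected inode hook flow lets the delete result
overwrite inode_storage_result only when the delete succeeds, instead of
unconditionally clobbering it:

  storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0,
                                  BPF_LOCAL_STORAGE_GET_F_CREATE);
  if (!storage)
          return 0;

  if (storage->value != DUMMY_STORAGE_VALUE)
          inode_storage_result = -1;

  err = bpf_inode_storage_delete(&inode_storage_map, victim->d_inode);
  if (!err)
          inode_storage_result = err;
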
Fixes: cd324d7abb3d ("bpf: Add selftests for local_storage") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201106103747.2780972-7-kpsingh@chromium.org --- .../selftests/bpf/progs/local_storage.c | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index 0758ba229ae0..09529e33be98 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -58,20 +58,22 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct dummy_storage *storage; + int err; if (pid != monitored_pid) return 0; storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0, - BPF_SK_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (storage->value == DUMMY_STORAGE_VALUE) + if (storage->value != DUMMY_STORAGE_VALUE) inode_storage_result = -1; - inode_storage_result = - bpf_inode_storage_delete(&inode_storage_map, victim->d_inode); + err = bpf_inode_storage_delete(&inode_storage_map, victim->d_inode); + if (!err) + inode_storage_result = err; return 0; } @@ -82,19 +84,23 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, { __u32 pid = bpf_get_current_pid_tgid() >> 32; struct dummy_storage *storage; + int err; if (pid != monitored_pid) return 0; storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, - BPF_SK_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; - if (storage->value == DUMMY_STORAGE_VALUE) + if (storage->value != DUMMY_STORAGE_VALUE) sk_storage_result = -1; - sk_storage_result = bpf_sk_storage_delete(&sk_storage_map, sock->sk); + err = bpf_sk_storage_delete(&sk_storage_map, sock->sk); + if (!err) + sk_storage_result = err; + return 0; } @@ -109,7 +115,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, return 0; storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0, - BPF_SK_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; @@ -131,7 +137,7 @@ int BPF_PROG(file_open, struct file *file) return 0; storage = bpf_inode_storage_get(&inode_storage_map, file->f_inode, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); + BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; From a367efa71b3f5a53281ca9772f8bf43166dfdf5f Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:45 +0000 Subject: [PATCH 28/66] bpf: Update selftests for local_storage to use vmlinux.h With the fixing of BTF pruning of embedded types being fixed, the test can be simplified to use vmlinux.h Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201106103747.2780972-8-kpsingh@chromium.org --- .../selftests/bpf/progs/local_storage.c | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index 09529e33be98..ef3822bc7542 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -4,9 +4,8 @@ * Copyright 2020 Google LLC. 
*/ +#include "vmlinux.h" #include -#include -#include #include #include @@ -36,23 +35,6 @@ struct { __type(value, struct dummy_storage); } sk_storage_map SEC(".maps"); -/* TODO Use vmlinux.h once BTF pruning for embedded types is fixed. - */ -struct sock {} __attribute__((preserve_access_index)); -struct sockaddr {} __attribute__((preserve_access_index)); -struct socket { - struct sock *sk; -} __attribute__((preserve_access_index)); - -struct inode {} __attribute__((preserve_access_index)); -struct dentry { - struct inode *d_inode; -} __attribute__((preserve_access_index)); -struct file { - struct inode *f_inode; -} __attribute__((preserve_access_index)); - - SEC("lsm/inode_unlink") int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) { From 9cde3beeadb311d4b435a7d28d5ab72bcc5de65d Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:46 +0000 Subject: [PATCH 29/66] bpf: Add tests for task_local_storage The test exercises the syscall based map operations by creating a pidfd for the current process. For verifying kernel / LSM functionality, the test implements a simple MAC policy which denies an executable from unlinking itself. The LSM program bprm_committed_creds sets a task_local_storage with a pointer to the inode. This is then used to detect if the task is trying to unlink itself in the inode_unlink LSM hook. The test copies /bin/rm to /tmp and executes it in a child thread with the intention of deleting itself. A successful test should prevent the the running executable from deleting itself. The bpf programs are also updated to call bpf_spin_{lock, unlock} to trigger the verfier checks for spin locks. The temporary file is cleaned up later in the test. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-9-kpsingh@chromium.org --- .../bpf/prog_tests/test_local_storage.c | 185 ++++++++++++++++-- .../selftests/bpf/progs/local_storage.c | 61 +++++- 2 files changed, 226 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index 91cd6f357246..4e7f6a4965f2 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -4,30 +4,161 @@ * Copyright (C) 2020 Google LLC. */ +#include +#include #include #include #include "local_storage.skel.h" #include "network_helpers.h" -int create_and_unlink_file(void) +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) { - char fname[PATH_MAX] = "/tmp/fileXXXXXX"; - int fd; + return syscall(__NR_pidfd_open, pid, flags); +} - fd = mkstemp(fname); - if (fd < 0) - return fd; +static inline ssize_t copy_file_range(int fd_in, loff_t *off_in, int fd_out, + loff_t *off_out, size_t len, + unsigned int flags) +{ + return syscall(__NR_copy_file_range, fd_in, off_in, fd_out, off_out, + len, flags); +} - close(fd); - unlink(fname); - return 0; +static unsigned int duration; + +#define TEST_STORAGE_VALUE 0xbeefdead + +struct storage { + void *inode; + unsigned int value; + /* Lock ensures that spin locked versions of local stoage operations + * also work, most operations in this tests are still single threaded + */ + struct bpf_spin_lock lock; +}; + +/* Copies an rm binary to a temp file. 
dest is a mkstemp template */ +static int copy_rm(char *dest) +{ + int fd_in, fd_out = -1, ret = 0; + struct stat stat; + + fd_in = open("/bin/rm", O_RDONLY); + if (fd_in < 0) + return -errno; + + fd_out = mkstemp(dest); + if (fd_out < 0) { + ret = -errno; + goto out; + } + + ret = fstat(fd_in, &stat); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = copy_file_range(fd_in, NULL, fd_out, NULL, stat.st_size, 0); + if (ret == -1) { + ret = -errno; + goto out; + } + + /* Set executable permission on the copied file */ + ret = chmod(dest, 0100); + if (ret == -1) + ret = -errno; + +out: + close(fd_in); + close(fd_out); + return ret; +} + +/* Fork and exec the provided rm binary and return the exit code of the + * forked process and its pid. + */ +static int run_self_unlink(int *monitored_pid, const char *rm_path) +{ + int child_pid, child_status, ret; + int null_fd; + + child_pid = fork(); + if (child_pid == 0) { + null_fd = open("/dev/null", O_WRONLY); + dup2(null_fd, STDOUT_FILENO); + dup2(null_fd, STDERR_FILENO); + close(null_fd); + + *monitored_pid = getpid(); + /* Use the copied /usr/bin/rm to delete itself + * /tmp/copy_of_rm /tmp/copy_of_rm. + */ + ret = execlp(rm_path, rm_path, rm_path, NULL); + if (ret) + exit(errno); + } else if (child_pid > 0) { + waitpid(child_pid, &child_status, 0); + return WEXITSTATUS(child_status); + } + + return -EINVAL; +} + +static bool check_syscall_operations(int map_fd, int obj_fd) +{ + struct storage val = { .value = TEST_STORAGE_VALUE, .lock = { 0 } }, + lookup_val = { .value = 0, .lock = { 0 } }; + int err; + + /* Looking up an existing element should fail initially */ + err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, + BPF_F_LOCK); + if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", + "err:%d errno:%d\n", err, errno)) + return false; + + /* Create a new element */ + err = bpf_map_update_elem(map_fd, &obj_fd, &val, + BPF_NOEXIST | BPF_F_LOCK); + if (CHECK(err < 0, "bpf_map_update_elem", "err:%d errno:%d\n", err, + errno)) + return false; + + /* Lookup the newly created element */ + err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, + BPF_F_LOCK); + if (CHECK(err < 0, "bpf_map_lookup_elem", "err:%d errno:%d", err, + errno)) + return false; + + /* Check the value of the newly created element */ + if (CHECK(lookup_val.value != val.value, "bpf_map_lookup_elem", + "value got = %x errno:%d", lookup_val.value, val.value)) + return false; + + err = bpf_map_delete_elem(map_fd, &obj_fd); + if (CHECK(err, "bpf_map_delete_elem()", "err:%d errno:%d\n", err, + errno)) + return false; + + /* The lookup should fail, now that the element has been deleted */ + err = bpf_map_lookup_elem_flags(map_fd, &obj_fd, &lookup_val, + BPF_F_LOCK); + if (CHECK(!err || errno != ENOENT, "bpf_map_lookup_elem", + "err:%d errno:%d\n", err, errno)) + return false; + + return true; } void test_test_local_storage(void) { + char tmp_exec_path[PATH_MAX] = "/tmp/copy_of_rmXXXXXX"; + int err, serv_sk = -1, task_fd = -1; struct local_storage *skel = NULL; - int err, duration = 0, serv_sk = -1; skel = local_storage__open_and_load(); if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) @@ -37,11 +168,36 @@ void test_test_local_storage(void) if (CHECK(err, "attach", "lsm attach failed: %d\n", err)) goto close_prog; + task_fd = sys_pidfd_open(getpid(), 0); + if (CHECK(task_fd < 0, "pidfd_open", + "failed to get pidfd err:%d, errno:%d", task_fd, errno)) + goto close_prog; + + if (!check_syscall_operations(bpf_map__fd(skel->maps.task_storage_map), + 
task_fd)) + goto close_prog; + + err = copy_rm(tmp_exec_path); + if (CHECK(err < 0, "copy_rm", "err %d errno %d\n", err, errno)) + goto close_prog; + + /* Sets skel->bss->monitored_pid to the pid of the forked child + * forks a child process that executes tmp_exec_path and tries to + * unlink its executable. This operation should be denied by the loaded + * LSM program. + */ + err = run_self_unlink(&skel->bss->monitored_pid, tmp_exec_path); + if (CHECK(err != EPERM, "run_self_unlink", "err %d want EPERM\n", err)) + goto close_prog_unlink; + + /* Set the process being monitored to be the current process */ skel->bss->monitored_pid = getpid(); - err = create_and_unlink_file(); - if (CHECK(err < 0, "exec_cmd", "err %d errno %d\n", err, errno)) - goto close_prog; + /* Remove the temporary created executable */ + err = unlink(tmp_exec_path); + if (CHECK(err != 0, "unlink", "unable to unlink %s: %d", tmp_exec_path, + errno)) + goto close_prog_unlink; CHECK(skel->data->inode_storage_result != 0, "inode_storage_result", "inode_local_storage not set\n"); @@ -55,6 +211,9 @@ void test_test_local_storage(void) close(serv_sk); +close_prog_unlink: + unlink(tmp_exec_path); close_prog: + close(task_fd); local_storage__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index ef3822bc7542..3e3de130f28f 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -17,41 +17,64 @@ int monitored_pid = 0; int inode_storage_result = -1; int sk_storage_result = -1; -struct dummy_storage { +struct local_storage { + struct inode *exec_inode; __u32 value; + struct bpf_spin_lock lock; }; struct { __uint(type, BPF_MAP_TYPE_INODE_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); - __type(value, struct dummy_storage); + __type(value, struct local_storage); } inode_storage_map SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_SK_STORAGE); __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE); __type(key, int); - __type(value, struct dummy_storage); + __type(value, struct local_storage); } sk_storage_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct local_storage); +} task_storage_map SEC(".maps"); + SEC("lsm/inode_unlink") int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; + bool is_self_unlink; int err; if (pid != monitored_pid) return 0; + storage = bpf_task_storage_get(&task_storage_map, + bpf_get_current_task_btf(), 0, 0); + if (storage) { + /* Don't let an executable delete itself */ + bpf_spin_lock(&storage->lock); + is_self_unlink = storage->exec_inode == victim->d_inode; + bpf_spin_unlock(&storage->lock); + if (is_self_unlink) + return -EPERM; + } + storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (!storage) return 0; + bpf_spin_lock(&storage->lock); if (storage->value != DUMMY_STORAGE_VALUE) inode_storage_result = -1; + bpf_spin_unlock(&storage->lock); err = bpf_inode_storage_delete(&inode_storage_map, victim->d_inode); if (!err) @@ -65,7 +88,7 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, int addrlen) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; int err; if (pid != 
monitored_pid) @@ -76,8 +99,10 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, if (!storage) return 0; + bpf_spin_lock(&storage->lock); if (storage->value != DUMMY_STORAGE_VALUE) sk_storage_result = -1; + bpf_spin_unlock(&storage->lock); err = bpf_sk_storage_delete(&sk_storage_map, sock->sk); if (!err) @@ -91,7 +116,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int protocol, int kern) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; if (pid != monitored_pid) return 0; @@ -101,7 +126,9 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, if (!storage) return 0; + bpf_spin_lock(&storage->lock); storage->value = DUMMY_STORAGE_VALUE; + bpf_spin_unlock(&storage->lock); return 0; } @@ -110,7 +137,7 @@ SEC("lsm/file_open") int BPF_PROG(file_open, struct file *file) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct dummy_storage *storage; + struct local_storage *storage; if (pid != monitored_pid) return 0; @@ -123,6 +150,26 @@ int BPF_PROG(file_open, struct file *file) if (!storage) return 0; + bpf_spin_lock(&storage->lock); storage->value = DUMMY_STORAGE_VALUE; + bpf_spin_unlock(&storage->lock); return 0; } + +/* This uses the local storage to remember the inode of the binary that a + * process was originally executing. + */ +SEC("lsm/bprm_committed_creds") +void BPF_PROG(exec, struct linux_binprm *bprm) +{ + struct local_storage *storage; + + storage = bpf_task_storage_get(&task_storage_map, + bpf_get_current_task_btf(), 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (storage) { + bpf_spin_lock(&storage->lock); + storage->exec_inode = bprm->file->f_inode; + bpf_spin_unlock(&storage->lock); + } +} From 4170bc6baa5446e1d85e0b7647ea54ba72aa85c4 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:47 +0000 Subject: [PATCH 30/66] bpf: Exercise syscall operations for inode and sk storage Use the check_syscall_operations added for task_local_storage to exercise syscall operations for other local storage maps: * Check the absence of an element for the given fd. * Create a new element, retrieve and compare its value. * Delete the element and check again for absence. 
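As a rough userspace sketch of that sequence (mirroring the test's
check_syscall_operations(); the struct layout and magic value follow the test, and
BPF_F_LOCK is required because the value embeds a struct bpf_spin_lock):

  #include <errno.h>
  #include <linux/bpf.h>
  #include <bpf/bpf.h>

  struct storage {
          void *inode;
          unsigned int value;
          struct bpf_spin_lock lock;
  };

  /* map_fd: a *_STORAGE map; obj_fd: the owning pidfd, file fd or socket fd */
  static int exercise_local_storage(int map_fd, int obj_fd)
  {
          struct storage val = { .value = 0xbeefdead };
          struct storage out = { .value = 0 };

          /* must start absent */
          if (!bpf_map_lookup_elem_flags(map_fd, &obj_fd, &out, BPF_F_LOCK) ||
              errno != ENOENT)
                  return -1;
          /* create a new element */
          if (bpf_map_update_elem(map_fd, &obj_fd, &val, BPF_NOEXIST | BPF_F_LOCK))
                  return -1;
          /* read it back and compare */
          if (bpf_map_lookup_elem_flags(map_fd, &obj_fd, &out, BPF_F_LOCK) ||
              out.value != val.value)
                  return -1;
          /* delete it and confirm it is gone again */
          if (bpf_map_delete_elem(map_fd, &obj_fd))
                  return -1;
          if (!bpf_map_lookup_elem_flags(map_fd, &obj_fd, &out, BPF_F_LOCK) ||
              errno != ENOENT)
                  return -1;
          return 0;
  }
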
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-10-kpsingh@chromium.org --- .../bpf/prog_tests/test_local_storage.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index 4e7f6a4965f2..5fda45982be0 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -157,7 +157,7 @@ static bool check_syscall_operations(int map_fd, int obj_fd) void test_test_local_storage(void) { char tmp_exec_path[PATH_MAX] = "/tmp/copy_of_rmXXXXXX"; - int err, serv_sk = -1, task_fd = -1; + int err, serv_sk = -1, task_fd = -1, rm_fd = -1; struct local_storage *skel = NULL; skel = local_storage__open_and_load(); @@ -181,6 +181,15 @@ void test_test_local_storage(void) if (CHECK(err < 0, "copy_rm", "err %d errno %d\n", err, errno)) goto close_prog; + rm_fd = open(tmp_exec_path, O_RDONLY); + if (CHECK(rm_fd < 0, "open", "failed to open %s err:%d, errno:%d", + tmp_exec_path, rm_fd, errno)) + goto close_prog; + + if (!check_syscall_operations(bpf_map__fd(skel->maps.inode_storage_map), + rm_fd)) + goto close_prog; + /* Sets skel->bss->monitored_pid to the pid of the forked child * forks a child process that executes tmp_exec_path and tries to * unlink its executable. This operation should be denied by the loaded @@ -209,11 +218,15 @@ void test_test_local_storage(void) CHECK(skel->data->sk_storage_result != 0, "sk_storage_result", "sk_local_storage not set\n"); - close(serv_sk); + if (!check_syscall_operations(bpf_map__fd(skel->maps.sk_storage_map), + serv_sk)) + goto close_prog; close_prog_unlink: unlink(tmp_exec_path); close_prog: + close(serv_sk); + close(rm_fd); close(task_fd); local_storage__destroy(skel); } From f055f355faf1991ef4e6b3c3517f8f2fc247805e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 6 Nov 2020 12:33:46 -0800 Subject: [PATCH 31/66] selftests/bpf: Fix selftest build with old libc pidfd_open was added in 2019. Some versions of libc library don't define it. Define it manually if it's not available. Reported-by: Sergei Iudin Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_local_storage.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index 5fda45982be0..fcca7ba1f368 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -12,6 +12,10 @@ #include "local_storage.skel.h" #include "network_helpers.h" +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif + static inline int sys_pidfd_open(pid_t pid, unsigned int flags) { return syscall(__NR_pidfd_open, pid, flags); From a10b4f9610431cf0d136530c12e48cba7c6391a4 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 6 Nov 2020 04:13:54 -0500 Subject: [PATCH 32/66] samples/bpf: Remove duplicate include in hbm The 'bpf/bpf.h' include in 'samples/bpf/hbm.c' is duplicated. 
Signed-off-by: Menglong Dong Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1604654034-52821-1-git-send-email-dong.menglong@zte.com.cn --- samples/bpf/hbm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index ff4c533dfac2..400e741a56eb 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -51,7 +51,6 @@ #include "cgroup_helpers.h" #include "hbm.h" #include "bpf_util.h" -#include #include bool outFlag = true; From 666475ccbf1dc99c1e61e47975d5fbf86d6236aa Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 16:10:50 +0800 Subject: [PATCH 33/66] bpf, btf: Remove the duplicate btf_ids.h include Remove duplicate btf_ids.h header which is included twice. Signed-off-by: Wang Qing Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1604736650-11197-1-git-send-email-wangqing@vivo.com --- kernel/bpf/btf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed7d02e8bc93..6324de8c59f7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,7 +22,6 @@ #include #include #include -#include #include /* BTF (BPF Type Format) is the meta data format which describes From f52b8fd332573106e60958617a3d2e30611ce1fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 6 Nov 2020 14:54:02 -0800 Subject: [PATCH 34/66] bpf: selftest: Use static globals in tcp_hdr_options and btf_skc_cls_ingress Some globals in the tcp_hdr_options test and btf_skc_cls_ingress test are not using static scope. This patch fixes it. Targeting bpf-next branch as an improvement since it currently does not break the build. Fixes: ad2f8eb0095e ("bpf: selftests: Tcp header options") Fixes: 9a856cae2217 ("bpf: selftest: Add test_btf_skc_cls_ingress") Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201106225402.4135741-1-kafai@fb.com --- .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c | 2 +- .../selftests/bpf/prog_tests/tcp_hdr_options.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c index 86ccf37e26b3..762f6a9da8b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c @@ -17,7 +17,7 @@ #include "test_btf_skc_cls_ingress.skel.h" static struct test_btf_skc_cls_ingress *skel; -struct sockaddr_in6 srv_sa6; +static struct sockaddr_in6 srv_sa6; static __u32 duration; #define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress" diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index c85174cdcb77..08d19cafd5e8 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -18,12 +18,12 @@ #define LO_ADDR6 "::1" #define CG_NAME "/tcpbpf-hdr-opt-test" -struct bpf_test_option exp_passive_estab_in; -struct bpf_test_option exp_active_estab_in; -struct bpf_test_option exp_passive_fin_in; -struct bpf_test_option exp_active_fin_in; -struct hdr_stg exp_passive_hdr_stg; -struct hdr_stg exp_active_hdr_stg = { .active = true, }; +static struct bpf_test_option exp_passive_estab_in; +static struct bpf_test_option exp_active_estab_in; +static struct bpf_test_option exp_passive_fin_in; +static struct bpf_test_option exp_active_fin_in; +static struct 
hdr_stg exp_passive_hdr_stg; +static struct hdr_stg exp_active_hdr_stg = { .active = true, }; static struct test_misc_tcp_hdr_options *misc_skel; static struct test_tcp_hdr_options *skel; From 951bb64621b8139c0cd99dcadc13e6510c08aa73 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:28 -0800 Subject: [PATCH 35/66] bpf: Add in-kernel split BTF support Adjust in-kernel BTF implementation to support a split BTF mode of operation. Changes are mostly mirroring libbpf split BTF changes, with the exception of start_id being 0 for in-kernel implementation due to simpler read-only mode. Otherwise, for split BTF logic, most of the logic of jumping to base BTF, where necessary, is encapsulated in few helper functions. Type numbering and string offset in a split BTF are logically continuing where base BTF ends, so most of the high-level logic is kept without changes. Type verification and size resolution is only doing an added resolution of new split BTF types and relies on already cached size and type resolution results in the base BTF. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201110011932.3201430-2-andrii@kernel.org --- kernel/bpf/btf.c | 171 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 119 insertions(+), 52 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6324de8c59f7..727c1c27053f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -203,12 +203,17 @@ struct btf { const char *strings; void *nohdr_data; struct btf_header hdr; - u32 nr_types; + u32 nr_types; /* includes VOID for base BTF */ u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; + + /* split BTF support */ + struct btf *base_btf; + u32 start_id; /* first type ID in this BTF (0 for base BTF) */ + u32 start_str_off; /* first string offset (0 for base BTF) */ }; enum verifier_phase { @@ -449,14 +454,27 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +static u32 btf_nr_types_total(const struct btf *btf) +{ + u32 total = 0; + + while (btf) { + total += btf->nr_types; + btf = btf->base_btf; + } + + return total; +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf_type *t; const char *tname; - u32 i; + u32 i, total; - for (i = 1; i <= btf->nr_types; i++) { - t = btf->types[i]; + total = btf_nr_types_total(btf); + for (i = 1; i < total; i++) { + t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) continue; @@ -599,8 +617,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return BTF_STR_OFFSET_VALID(offset) && - offset < btf->hdr.str_len; + if (!BTF_STR_OFFSET_VALID(offset)) + return false; + + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first, bool dot_ok) @@ -614,10 +638,22 @@ static bool __btf_name_char_ok(char c, bool first, bool dot_ok) return true; } +static const char *btf_str_by_offset(const struct btf *btf, u32 offset) +{ + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ - 
const char *src = &btf->strings[offset]; + const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true, dot_ok)) @@ -650,27 +686,28 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset) static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { + const char *name; + if (!offset) return "(anon)"; - else if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - else - return "(invalid-name-offset)"; + + name = btf_str_by_offset(btf, offset); + return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - - return NULL; + return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { - if (type_id > btf->nr_types) - return NULL; + while (type_id < btf->start_id) + btf = btf->base_btf; + type_id -= btf->start_id; + if (type_id >= btf->nr_types) + return NULL; return btf->types[type_id]; } @@ -1390,17 +1427,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; - /* < 2 because +1 for btf_void which is always in btf->types[0]. - * btf_void is not accounted in btf->nr_types because btf_void - * does not come from the BTF file. - */ - if (btf->types_size - btf->nr_types < 2) { + if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_TYPE) { + if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } @@ -1414,18 +1447,23 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) if (!new_types) return -ENOMEM; - if (btf->nr_types == 0) - new_types[0] = &btf_void; - else + if (btf->nr_types == 0) { + if (!btf->base_btf) { + /* lazily init VOID type */ + new_types[0] = &btf_void; + btf->nr_types++; + } + } else { memcpy(new_types, btf->types, - sizeof(*btf->types) * (btf->nr_types + 1)); + sizeof(*btf->types) * btf->nr_types); + } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } - btf->types[++(btf->nr_types)] = t; + btf->types[btf->nr_types++] = t; return 0; } @@ -1498,18 +1536,17 @@ static int env_resolve_init(struct btf_verifier_env *env) u32 *resolved_ids = NULL; u8 *visit_states = NULL; - /* +1 for btf_void */ - resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), + resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; - resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), + resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; - visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), + visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; @@ -1561,21 +1598,27 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { - return env->visit_states[type_id] == RESOLVED; + /* base BTF types should be resolved by now */ + if (type_id < env->btf->start_id) + return true; + + return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { + const struct btf *btf = env->btf; 
struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; - if (env->visit_states[type_id] != NOT_VISITED) + if (type_id < btf->start_id + || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; - env->visit_states[type_id] = VISITED; + env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; @@ -1605,6 +1648,7 @@ static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; + type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; @@ -1709,14 +1753,30 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } +static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_ids[type_id - btf->start_id]; +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { - *type_id = btf->resolved_ids[*type_id]; + *type_id = btf_resolved_type_id(btf, *type_id); return btf_type_by_id(btf, *type_id); } +static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_sizes[type_id - btf->start_id]; +} + const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { @@ -1731,7 +1791,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { @@ -1739,14 +1799,14 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size_type_id = btf->resolved_ids[size_type_id]; + size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else @@ -3798,7 +3858,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; - env->log_type_id = 1; + env->log_type_id = btf->base_btf ? 
btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; @@ -3825,8 +3885,8 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) - return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + return !btf_resolved_type_id(btf, type_id) && + !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { @@ -3846,7 +3906,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == - btf->resolved_sizes[type_id]); + btf_resolved_type_size(btf, type_id)); } return false; @@ -3888,7 +3948,8 @@ static int btf_resolve(struct btf_verifier_env *env, static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; - u32 type_id; + const struct btf_type *t; + u32 type_id, i; int err; err = env_resolve_init(env); @@ -3896,8 +3957,9 @@ static int btf_check_all_types(struct btf_verifier_env *env) return err; env->phase++; - for (type_id = 1; type_id <= btf->nr_types; type_id++) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) { + type_id = btf->start_id + i; + t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && @@ -3934,7 +3996,7 @@ static int btf_parse_type_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->type_len) { + if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } @@ -3961,13 +4023,18 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || - start[0] || end[-1]) { + btf->strings = start; + + if (btf->base_btf && !hdr->str_len) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } - - btf->strings = start; return 0; } @@ -4908,7 +4975,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) { - *bad_type = btf->types[0]; + *bad_type = btf_type_by_id(btf, 0); return -EINVAL; } if (btf_type_is_ptr(t)) From 5329722057d41aebc31e391907a501feaa42f7d9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:29 -0800 Subject: [PATCH 36/66] bpf: Assign ID to vmlinux BTF and return extra info for BTF in GET_OBJ_INFO Allocate ID for vmlinux BTF. This makes it visible when iterating over all BTF objects in the system. To allow distinguishing vmlinux BTF (and later kernel module BTF) from user-provided BTFs, expose extra kernel_btf flag, as well as BTF name ("vmlinux" for vmlinux BTF, will equal to module's name for module BTF). We might want to later allow specifying BTF name for user-provided BTFs as well, if that makes sense. But currently this is reserved only for in-kernel BTFs. Having in-kernel BTFs exposed IDs will allow to extend BPF APIs that require in-kernel BTF type with ability to specify BTF types from kernel modules, not just vmlinux BTF. This will be implemented in a follow up patch set for fentry/fexit/fmod_ret/lsm/etc. 
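For example, a userspace sketch (assuming uapi headers that already carry the new
name, name_len and kernel_btf fields) could iterate all BTF objects and pick out the
kernel-provided ones:

  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <linux/bpf.h>
  #include <bpf/bpf.h>

  int main(void)
  {
          __u32 id = 0, len;
          char name[64];

          while (!bpf_btf_get_next_id(id, &id)) {
                  struct bpf_btf_info info;
                  int fd = bpf_btf_get_fd_by_id(id);

                  if (fd < 0)
                          continue;
                  memset(&info, 0, sizeof(info));
                  memset(name, 0, sizeof(name));
                  info.name = (__u64)(unsigned long)name;
                  info.name_len = sizeof(name);
                  len = sizeof(info);
                  if (!bpf_obj_get_info_by_fd(fd, &info, &len) && info.kernel_btf)
                          printf("BTF id %u: %s, %u bytes\n",
                                 id, name, info.btf_size);
                  close(fd);
          }
          return 0;
  }
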
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201110011932.3201430-3-andrii@kernel.org --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/btf.c | 43 +++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9879d6793e90..162999b12790 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4466,6 +4466,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 727c1c27053f..856585db7aa7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -214,6 +214,8 @@ struct btf { struct btf *base_btf; u32 start_id; /* first type ID in this BTF (0 for base BTF) */ u32 start_str_off; /* first string offset (0 for base BTF) */ + char name[MODULE_NAME_LEN]; + bool kernel_btf; }; enum verifier_phase { @@ -4429,6 +4431,8 @@ struct btf *btf_parse_vmlinux(void) btf->data = __start_BTF; btf->data_size = __stop_BTF - __start_BTF; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "vmlinux"); err = btf_parse_hdr(env); if (err) @@ -4454,8 +4458,13 @@ struct btf *btf_parse_vmlinux(void) bpf_struct_ops_init(btf, log); - btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); + + err = btf_alloc_id(btf); + if (err) + goto errout; + + btf_verifier_env_free(env); return btf; errout: @@ -5553,7 +5562,9 @@ int btf_get_info_by_fd(const struct btf *btf, struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; - u32 uinfo_len; + char __user *uname; + u32 uinfo_len, uname_len, name_len; + int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; @@ -5570,11 +5581,37 @@ int btf_get_info_by_fd(const struct btf *btf, return -EFAULT; info.btf_size = btf->data_size; + info.kernel_btf = btf->kernel_btf; + + uname = u64_to_user_ptr(info.name); + uname_len = info.name_len; + if (!uname ^ !uname_len) + return -EINVAL; + + name_len = strlen(btf->name); + info.name_len = name_len; + + if (uname) { + if (uname_len >= name_len + 1) { + if (copy_to_user(uname, btf->name, name_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(uname, btf->name, uname_len - 1)) + return -EFAULT; + if (put_user(zero, uname + uname_len - 1)) + return -EFAULT; + /* let user-space know about too short buffer */ + ret = -ENOSPC; + } + } + if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; - return 0; + return ret; } int btf_get_fd_by_id(u32 id) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9879d6793e90..162999b12790 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4466,6 +4466,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { From 5f9ae91f7c0dbbc4195e2a6c8eedcaeb5b9e4cbb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:30 -0800 Subject: [PATCH 37/66] kbuild: Build kernel module BTFs if BTF is enabled and pahole supports it Detect if pahole supports split BTF generation, and generate BTF for each selected kernel module, if it does. 
This is exposed to Makefiles and C code as CONFIG_DEBUG_INFO_BTF_MODULES flag. Kernel module BTF has to be re-generated if either vmlinux's BTF changes or module's .ko changes. To achieve that, I needed a helper similar to if_changed, but that would allow to filter out vmlinux from the list of updated dependencies for .ko building. I've put it next to the only place that uses and needs it, but it might be a better idea to just add it along the other if_changed variants into scripts/Kbuild.include. Each kernel module's BTF deduplication is pretty fast, as it does only incremental BTF deduplication on top of already deduplicated vmlinux BTF. To show the added build time, I've first ran make only just built kernel (to establish the baseline) and then forced only BTF re-generation, without regenerating .ko files. The build was performed with -j60 parallelization on 56-core machine. The final time also includes bzImage building, so it's not a pure BTF overhead. $ time make -j60 ... make -j60 27.65s user 10.96s system 782% cpu 4.933 total $ touch ~/linux-build/default/vmlinux && time make -j60 ... make -j60 123.69s user 27.85s system 1566% cpu 9.675 total So 4.6 seconds real time, with noticeable part spent in compressed vmlinux and bzImage building. To show size savings, I've built my kernel configuration with about 700 kernel modules with full BTF per each kernel module (without deduplicating against vmlinux) and with split BTF against deduplicated vmlinux (approach in this patch). Below are top 10 modules with biggest BTF sizes. And total size of BTF data across all kernel modules. It shows that split BTF "compresses" 115MB down to 5MB total. And the biggest kernel modules get a downsize from 500-570KB down to 200-300KB. FULL BTF ======== $ for f in $(find . -name '*.ko'); do size -A -d $f | grep BTF | awk '{print $2}'; done | awk '{ s += $1 } END { print s }' 115710691 $ for f in $(find . -name '*.ko'); do printf "%s %d\n" $f $(size -A -d $f | grep BTF | awk '{print $2}'); done | sort -nr -k2 | head -n10 ./drivers/gpu/drm/i915/i915.ko 570570 ./drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.ko 520240 ./drivers/gpu/drm/radeon/radeon.ko 503849 ./drivers/infiniband/hw/mlx5/mlx5_ib.ko 491777 ./fs/xfs/xfs.ko 411544 ./drivers/net/ethernet/intel/i40e/i40e.ko 403904 ./drivers/net/ethernet/broadcom/bnx2x/bnx2x.ko 398754 ./drivers/infiniband/core/ib_core.ko 397224 ./fs/cifs/cifs.ko 386249 ./fs/nfsd/nfsd.ko 379738 SPLIT BTF ========= $ for f in $(find . -name '*.ko'); do size -A -d $f | grep BTF | awk '{print $2}'; done | awk '{ s += $1 } END { print s }' 5194047 $ for f in $(find . 
-name '*.ko'); do printf "%s %d\n" $f $(size -A -d $f | grep BTF | awk '{print $2}'); done | sort -nr -k2 | head -n10 ./drivers/gpu/drm/i915/i915.ko 293206 ./drivers/gpu/drm/radeon/radeon.ko 282103 ./fs/xfs/xfs.ko 222150 ./drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.ko 198503 ./drivers/infiniband/hw/mlx5/mlx5_ib.ko 198356 ./drivers/net/ethernet/broadcom/bnx2x/bnx2x.ko 113444 ./fs/cifs/cifs.ko 109379 ./arch/x86/kvm/kvm.ko 100225 ./drivers/gpu/drm/drm.ko 94827 ./drivers/infiniband/core/ib_core.ko 91188 Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201110011932.3201430-4-andrii@kernel.org --- lib/Kconfig.debug | 9 +++++++++ scripts/Makefile.modfinal | 20 ++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d7a7bc3b6098..1e78faaf20a5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -274,6 +274,15 @@ config DEBUG_INFO_BTF Turning this on expects presence of pahole tool, which will convert DWARF type info into equivalent deduplicated BTF type info. +config PAHOLE_HAS_SPLIT_BTF + def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119") + +config DEBUG_INFO_BTF_MODULES + def_bool y + depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF + help + Generate compact split BTF type information for kernel modules. + config GDB_SCRIPTS bool "Provide GDB scripts for kernel debugging" help diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index ae01baf96f4e..02b892421f7a 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -6,6 +6,7 @@ PHONY := __modfinal __modfinal: +include include/config/auto.conf include $(srctree)/scripts/Kbuild.include # for c_flags @@ -36,8 +37,23 @@ quiet_cmd_ld_ko_o = LD [M] $@ -T scripts/module.lds -o $@ $(filter %.o, $^); \ $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) -$(modules): %.ko: %.o %.mod.o scripts/module.lds FORCE - +$(call if_changed,ld_ko_o) +quiet_cmd_btf_ko = BTF [M] $@ + cmd_btf_ko = LLVM_OBJCOPY=$(OBJCOPY) $(PAHOLE) -J --btf_base vmlinux $@ + +# Same as newer-prereqs, but allows to exclude specified extra dependencies +newer_prereqs_except = $(filter-out $(PHONY) $(1),$?) + +# Same as if_changed, but allows to exclude specified extra dependencies +if_changed_except = $(if $(call newer_prereqs_except,$(2))$(cmd-check), \ + $(cmd); \ + printf '%s\n' 'cmd_$@ := $(make-cmd)' > $(dot-target).cmd, @:) + +# Re-generate module BTFs if either module's .ko or vmlinux changed +$(modules): %.ko: %.o %.mod.o scripts/module.lds vmlinux FORCE + +$(call if_changed_except,ld_ko_o,vmlinux) +ifdef CONFIG_DEBUG_INFO_BTF_MODULES + +$(if $(newer-prereqs),$(call cmd,btf_ko)) +endif targets += $(modules) $(modules:.ko=.mod.o) From 36e68442d1afd4f720704ee1ea8486331507e834 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:31 -0800 Subject: [PATCH 38/66] bpf: Load and verify kernel module BTFs Add kernel module listener that will load/validate and unload module BTF. Module BTFs gets ID generated for them, which makes it possible to iterate them with existing BTF iteration API. They are given their respective module's names, which will get reported through GET_OBJ_INFO API. They are also marked as in-kernel BTFs for tooling to distinguish them from user-provided BTFs. Also, similarly to vmlinux BTF, kernel module BTFs are exposed through sysfs as /sys/kernel/btf/. 
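As an aside (not part of this patch), a rough user-space sketch of consuming such a sysfs file could look as follows, assuming a libbpf new enough to provide btf__parse_split(); the module name "kvm" is only an example:

#include <stdio.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>

/* Hedged sketch: parse a module's split BTF on top of the vmlinux base BTF. */
int main(void)
{
	struct btf *base, *mod;

	base = btf__parse("/sys/kernel/btf/vmlinux", NULL);
	if (libbpf_get_error(base))
		return 1;

	mod = btf__parse_split("/sys/kernel/btf/kvm", base);
	if (libbpf_get_error(mod)) {
		btf__free(base);
		return 1;
	}

	/* For split BTF the count includes the base (vmlinux) types. */
	printf("kvm BTF: %u types\n", btf__get_nr_types(mod));
	btf__free(mod);
	btf__free(base);
	return 0;
}
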
This is convenient for user-space tools to inspect module BTF contents and dump their types with existing tools: [vmuser@archvm bpf]$ ls -la /sys/kernel/btf total 0 drwxr-xr-x 2 root root 0 Nov 4 19:46 . drwxr-xr-x 13 root root 0 Nov 4 19:46 .. ... -r--r--r-- 1 root root 888 Nov 4 19:46 irqbypass -r--r--r-- 1 root root 100225 Nov 4 19:46 kvm -r--r--r-- 1 root root 35401 Nov 4 19:46 kvm_intel -r--r--r-- 1 root root 120 Nov 4 19:46 pcspkr -r--r--r-- 1 root root 399 Nov 4 19:46 serio_raw -r--r--r-- 1 root root 4094095 Nov 4 19:46 vmlinux Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201110011932.3201430-5-andrii@kernel.org --- Documentation/ABI/testing/sysfs-kernel-btf | 8 + include/linux/bpf.h | 2 + include/linux/module.h | 4 + kernel/bpf/btf.c | 194 +++++++++++++++++++++ kernel/bpf/sysfs_btf.c | 2 +- kernel/module.c | 32 ++++ 6 files changed, 241 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf index 2c9744b2cd59..fe96efdc9b6c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-btf +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -15,3 +15,11 @@ Description: information with description of all internal kernel types. See Documentation/bpf/btf.rst for detailed description of format itself. + +What: /sys/kernel/btf/ +Date: Nov 2020 +KernelVersion: 5.11 +Contact: bpf@vger.kernel.org +Description: + Read-only binary attribute exposing kernel module's BTF type + information as an add-on to the kernel's BTF (/sys/kernel/btf/vmlinux). diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 73d5381a5d5c..581b2a2e78eb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,9 +36,11 @@ struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; +struct kobject; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +extern struct kobject *btf_kobj; typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); diff --git a/include/linux/module.h b/include/linux/module.h index a29187f7c360..20fce258ffba 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -475,6 +475,10 @@ struct module { unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + unsigned int btf_data_size; + void *btf_data; +#endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 856585db7aa7..0f1fd2669d69 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include /* BTF (BPF Type Format) is the meta data format which describes @@ -4476,6 +4478,75 @@ errout: return ERR_PTR(err); } +static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL, *base_btf; + int err; + + base_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(base_btf)) + return base_btf; + if (!base_btf) + return ERR_PTR(-EINVAL); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->base_btf = base_btf; + 
btf->start_id = base_btf->nr_types; + btf->start_str_off = base_btf->hdr.str_len; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "%s", module_name); + + btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN); + if (!btf->data) { + err = -ENOMEM; + goto errout; + } + memcpy(btf->data, data, data_size); + btf->data_size = data_size; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->data); + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; @@ -5651,3 +5722,126 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +struct btf_module { + struct list_head list; + struct module *module; + struct btf *btf; + struct bin_attribute *sysfs_attr; +}; + +static LIST_HEAD(btf_modules); +static DEFINE_MUTEX(btf_module_mutex); + +static ssize_t +btf_module_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + const struct btf *btf = bin_attr->private; + + memcpy(buf, btf->data + off, len); + return len; +} + +static int btf_module_notify(struct notifier_block *nb, unsigned long op, + void *module) +{ + struct btf_module *btf_mod, *tmp; + struct module *mod = module; + struct btf *btf; + int err = 0; + + if (mod->btf_data_size == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + goto out; + + switch (op) { + case MODULE_STATE_COMING: + btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); + if (!btf_mod) { + err = -ENOMEM; + goto out; + } + btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); + if (IS_ERR(btf)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); + kfree(btf_mod); + err = PTR_ERR(btf); + goto out; + } + err = btf_alloc_id(btf); + if (err) { + btf_free(btf); + kfree(btf_mod); + goto out; + } + + mutex_lock(&btf_module_mutex); + btf_mod->module = module; + btf_mod->btf = btf; + list_add(&btf_mod->list, &btf_modules); + mutex_unlock(&btf_module_mutex); + + if (IS_ENABLED(CONFIG_SYSFS)) { + struct bin_attribute *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + goto out; + + sysfs_bin_attr_init(attr); + attr->attr.name = btf->name; + attr->attr.mode = 0444; + attr->size = btf->data_size; + attr->private = btf; + attr->read = btf_module_read; + + err = sysfs_create_bin_file(btf_kobj, attr); + if (err) { + pr_warn("failed to register module [%s] BTF in sysfs: %d\n", + mod->name, err); + kfree(attr); + err = 0; + goto out; + } + + btf_mod->sysfs_attr = attr; + } + + break; + case MODULE_STATE_GOING: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + list_del(&btf_mod->list); + if (btf_mod->sysfs_attr) + sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + btf_put(btf_mod->btf); + kfree(btf_mod->sysfs_attr); + kfree(btf_mod); + break; + } + mutex_unlock(&btf_module_mutex); + break; + } +out: + return notifier_from_errno(err); +} + +static struct 
notifier_block btf_module_nb = { + .notifier_call = btf_module_notify, +}; + +static int __init btf_module_init(void) +{ + register_module_notifier(&btf_module_nb); + return 0; +} + +fs_initcall(btf_module_init); +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 11b3380887fa..ef6911aee3bb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -26,7 +26,7 @@ static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .read = btf_vmlinux_read, }; -static struct kobject *btf_kobj; +struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a7..f2996b02ab2e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -380,6 +380,35 @@ static void *section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } +/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ +static unsigned int find_any_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* + * Find a module section, or NULL. Fill in number of "objects" in section. + * Ignores SHF_ALLOC flag. + */ +static __maybe_unused void *any_section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_any_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. */ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -3250,6 +3279,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); +#endif #ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), From cecaf4a0f2dcbd7e76156f6d63748b84c176b95b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:32 -0800 Subject: [PATCH 39/66] tools/bpftool: Add support for in-kernel and named BTF in `btf show` Display vmlinux BTF name and kernel module names when listing available BTFs on the system. In human-readable output mode, module BTFs are reported with "name [module-name]", while vmlinux BTF will be reported as "name [vmlinux]". Square brackets are added by bpftool and follow kernel convention when displaying modules in human-readable text outputs. [vmuser@archvm bpf]$ sudo ../../../bpf/bpftool/bpftool btf s 1: name [vmlinux] size 4082281B 6: size 2365B prog_ids 8,6 map_ids 3 7: name [button] size 46895B 8: name [pcspkr] size 42328B 9: name [serio_raw] size 39375B 10: name [floppy] size 57185B 11: name [i2c_core] size 76186B 12: name [crc32c_intel] size 16036B 13: name [i2c_piix4] size 50497B 14: name [irqbypass] size 14124B 15: name [kvm] size 197985B 16: name [kvm_intel] size 123564B 17: name [cryptd] size 42466B 18: name [crypto_simd] size 17187B 19: name [glue_helper] size 39205B 20: name [aesni_intel] size 41034B 25: size 36150B pids bpftool(2519) In JSON mode, two fields (boolean "kernel" and string "name") are reported for each BTF object. 
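These values come from the name/name_len fields added to struct bpf_btf_info earlier in this series. As a hedged sketch (not part of this patch; the helper name and buffer size are made up), user space can fetch the name roughly like this, using libbpf's bpf_obj_get_info_by_fd() wrapper. Note that bpftool in the diff below first issues a query with a zeroed info struct and only passes a name buffer if the kernel reported a non-zero name_len, so it can tell whether the running kernel knows about these fields at all:

#include <string.h>
#include <bpf/bpf.h>

/* Sketch: retrieve a BTF object's name via BPF_OBJ_GET_INFO_BY_FD.
 * The caller provides the buffer; the kernel copies the name into it and
 * reports the actual name length back in info.name_len.
 */
static int get_btf_name(int btf_fd, char *buf, __u32 buf_sz)
{
	struct bpf_btf_info info;
	__u32 len = sizeof(info);

	memset(&info, 0, sizeof(info));
	info.name = (__u64)(unsigned long)buf;
	info.name_len = buf_sz;

	return bpf_obj_get_info_by_fd(btf_fd, &info, &len);
}
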
vmlinux BTF is reported with name "vmlinux" (kernel itself returns and empty name for vmlinux BTF). [vmuser@archvm bpf]$ sudo ../../../bpf/bpftool/bpftool btf s -jp [{ "id": 1, "size": 4082281, "prog_ids": [], "map_ids": [], "kernel": true, "name": "vmlinux" },{ "id": 6, "size": 2365, "prog_ids": [8,6 ], "map_ids": [3 ], "kernel": false },{ "id": 7, "size": 46895, "prog_ids": [], "map_ids": [], "kernel": true, "name": "button" },{ ... Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20201110011932.3201430-6-andrii@kernel.org --- tools/bpf/bpftool/btf.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index c96b56e8e3a4..ed5e97157241 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -742,9 +742,14 @@ show_btf_plain(struct bpf_btf_info *info, int fd, struct btf_attach_table *btf_map_table) { struct btf_attach_point *obj; + const char *name = u64_to_ptr(info->name); int n; printf("%u: ", info->id); + if (info->kernel_btf) + printf("name [%s] ", name); + else if (name && name[0]) + printf("name %s ", name); printf("size %uB", info->btf_size); n = 0; @@ -771,6 +776,7 @@ show_btf_json(struct bpf_btf_info *info, int fd, struct btf_attach_table *btf_map_table) { struct btf_attach_point *obj; + const char *name = u64_to_ptr(info->name); jsonw_start_object(json_wtr); /* btf object */ jsonw_uint_field(json_wtr, "id", info->id); @@ -796,6 +802,11 @@ show_btf_json(struct bpf_btf_info *info, int fd, emit_obj_refs_json(&refs_table, info->id, json_wtr); /* pids */ + jsonw_bool_field(json_wtr, "kernel", info->kernel_btf); + + if (name && name[0]) + jsonw_string_field(json_wtr, "name", name); + jsonw_end_object(json_wtr); /* btf object */ } @@ -803,15 +814,30 @@ static int show_btf(int fd, struct btf_attach_table *btf_prog_table, struct btf_attach_table *btf_map_table) { - struct bpf_btf_info info = {}; + struct bpf_btf_info info; __u32 len = sizeof(info); + char name[64]; int err; + memset(&info, 0, sizeof(info)); err = bpf_obj_get_info_by_fd(fd, &info, &len); if (err) { p_err("can't get BTF object info: %s", strerror(errno)); return -1; } + /* if kernel support emitting BTF object name, pass name pointer */ + if (info.name_len) { + memset(&info, 0, sizeof(info)); + info.name_len = sizeof(name); + info.name = ptr_to_u64(name); + len = sizeof(info); + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get BTF object info: %s", strerror(errno)); + return -1; + } + } if (json_output) show_btf_json(&info, fd, btf_prog_table, btf_map_table); From 58cfa49c2ba7f815adccc27a775e7cf8a8f7f539 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Nov 2020 09:50:12 +0800 Subject: [PATCH 40/66] selftest/bpf: Add missed ip6ip6 test back In comment 173ca26e9b51 ("samples/bpf: add comprehensive ipip, ipip6, ip6ip6 test") we added ip6ip6 test for bpf tunnel testing. But in commit 933a741e3b82 ("selftests/bpf: bpf tunnel test.") when we moved it to the current folder, we didn't add it. This patch add the ip6ip6 test back to bpf tunnel test. Update the ipip6's topology for both IPv4 and IPv6 testing. Since iperf test is removed as currect framework simplified it in purpose, I also removed unused tcp checkings in test_tunnel_kern.c. 
Fixes: 933a741e3b82 ("selftests/bpf: bpf tunnel test.") Signed-off-by: Hangbin Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201110015013.1570716-2-liuhangbin@gmail.com --- .../selftests/bpf/progs/test_tunnel_kern.c | 42 +++--------------- tools/testing/selftests/bpf/test_tunnel.sh | 43 +++++++++++++++++-- 2 files changed, 46 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index f48dbfe24ddc..a621b58ab079 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -528,12 +527,11 @@ int _ipip_set_tunnel(struct __sk_buff *skb) struct bpf_tunnel_key key = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); void *data_end = (void *)(long)skb->data_end; int ret; /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { + if (data + sizeof(*iph) > data_end) { ERROR(1); return TC_ACT_SHOT; } @@ -541,16 +539,6 @@ int _ipip_set_tunnel(struct __sk_buff *skb) key.tunnel_ttl = 64; if (iph->protocol == IPPROTO_ICMP) { key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - } else { - if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) - return TC_ACT_SHOT; - - if (tcp->dest == bpf_htons(5200)) - key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */ - else if (tcp->dest == bpf_htons(5201)) - key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */ - else - return TC_ACT_SHOT; } ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); @@ -585,19 +573,20 @@ int _ipip6_set_tunnel(struct __sk_buff *skb) struct bpf_tunnel_key key = {}; void *data = (void *)(long)skb->data; struct iphdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); void *data_end = (void *)(long)skb->data_end; int ret; /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { + if (data + sizeof(*iph) > data_end) { ERROR(1); return TC_ACT_SHOT; } __builtin_memset(&key, 0x0, sizeof(key)); - key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */ key.tunnel_ttl = 64; + if (iph->protocol == IPPROTO_ICMP) { + key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */ + } ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); @@ -634,35 +623,18 @@ int _ip6ip6_set_tunnel(struct __sk_buff *skb) struct bpf_tunnel_key key = {}; void *data = (void *)(long)skb->data; struct ipv6hdr *iph = data; - struct tcphdr *tcp = data + sizeof(*iph); void *data_end = (void *)(long)skb->data_end; int ret; /* single length check */ - if (data + sizeof(*iph) + sizeof(*tcp) > data_end) { + if (data + sizeof(*iph) > data_end) { ERROR(1); return TC_ACT_SHOT; } - key.remote_ipv6[0] = bpf_htonl(0x2401db00); key.tunnel_ttl = 64; - if (iph->nexthdr == 58 /* NEXTHDR_ICMP */) { - key.remote_ipv6[3] = bpf_htonl(1); - } else { - if (iph->nexthdr != 6 /* NEXTHDR_TCP */) { - ERROR(iph->nexthdr); - return TC_ACT_SHOT; - } - - if (tcp->dest == bpf_htons(5200)) { - key.remote_ipv6[3] = bpf_htonl(1); - } else if (tcp->dest == bpf_htons(5201)) { - key.remote_ipv6[3] = bpf_htonl(2); - } else { - ERROR(tcp->dest); - return TC_ACT_SHOT; - } + key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */ } ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index bd12ec97a44d..1ccbe804e8e1 100755 --- 
a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -24,12 +24,12 @@ # Root namespace with metadata-mode tunnel + BPF # Device names and addresses: # veth1 IP: 172.16.1.200, IPv6: 00::22 (underlay) -# tunnel dev 11, ex: gre11, IPv4: 10.1.1.200 (overlay) +# tunnel dev 11, ex: gre11, IPv4: 10.1.1.200, IPv6: 1::22 (overlay) # # Namespace at_ns0 with native tunnel # Device names and addresses: # veth0 IPv4: 172.16.1.100, IPv6: 00::11 (underlay) -# tunnel dev 00, ex: gre00, IPv4: 10.1.1.100 (overlay) +# tunnel dev 00, ex: gre00, IPv4: 10.1.1.100, IPv6: 1::11 (overlay) # # # End-to-end ping packet flow @@ -250,7 +250,7 @@ add_ipip_tunnel() ip addr add dev $DEV 10.1.1.200/24 } -add_ipip6tnl_tunnel() +add_ip6tnl_tunnel() { ip netns exec at_ns0 ip addr add ::11/96 dev veth0 ip netns exec at_ns0 ip link set dev veth0 up @@ -262,11 +262,13 @@ add_ipip6tnl_tunnel() ip link add dev $DEV_NS type $TYPE \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip addr add dev $DEV_NS 1::11/96 ip netns exec at_ns0 ip link set dev $DEV_NS up # root namespace ip link add dev $DEV type $TYPE external ip addr add dev $DEV 10.1.1.200/24 + ip addr add dev $DEV 1::22/96 ip link set dev $DEV up } @@ -534,7 +536,7 @@ test_ipip6() check $TYPE config_device - add_ipip6tnl_tunnel + add_ip6tnl_tunnel ip link set dev veth1 mtu 1500 attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel # underlay @@ -553,6 +555,34 @@ test_ipip6() echo -e ${GREEN}"PASS: $TYPE"${NC} } +test_ip6ip6() +{ + TYPE=ip6tnl + DEV_NS=ip6ip6tnl00 + DEV=ip6ip6tnl11 + ret=0 + + check $TYPE + config_device + add_ip6tnl_tunnel + ip link set dev veth1 mtu 1500 + attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel + # underlay + ping6 $PING_ARG ::11 + # ip6 over ip6 + ping6 $PING_ARG 1::11 + check_err $? + ip netns exec at_ns0 ping6 $PING_ARG 1::22 + check_err $? + cleanup + + if [ $ret -ne 0 ]; then + echo -e ${RED}"FAIL: ip6$TYPE"${NC} + return 1 + fi + echo -e ${GREEN}"PASS: ip6$TYPE"${NC} +} + setup_xfrm_tunnel() { auth=0x$(printf '1%.0s' {1..40}) @@ -646,6 +676,7 @@ cleanup() ip link del veth1 2> /dev/null ip link del ipip11 2> /dev/null ip link del ipip6tnl11 2> /dev/null + ip link del ip6ip6tnl11 2> /dev/null ip link del gretap11 2> /dev/null ip link del ip6gre11 2> /dev/null ip link del ip6gretap11 2> /dev/null @@ -742,6 +773,10 @@ bpf_tunnel_test() test_ipip6 errors=$(( $errors + $? )) + echo "Testing IP6IP6 tunnel..." + test_ip6ip6 + errors=$(( $errors + $? )) + echo "Testing IPSec tunnel..." test_xfrm_tunnel errors=$(( $errors + $? )) From e2215b0555ccd5ad25e260d6c949558b5796b3dc Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Nov 2020 09:50:13 +0800 Subject: [PATCH 41/66] samples/bpf: Remove unused test_ipip.sh The tcbpf2_kern.o and related kernel sections are moved to bpf selftest folder since b05cd7404323 ("samples/bpf: remove the bpf tunnel testsuite."). Remove this one as well. 
Fixes: b05cd7404323 ("samples/bpf: remove the bpf tunnel testsuite.") Signed-off-by: Hangbin Liu Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201110015013.1570716-3-liuhangbin@gmail.com --- samples/bpf/test_ipip.sh | 179 --------------------------------------- 1 file changed, 179 deletions(-) delete mode 100755 samples/bpf/test_ipip.sh diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh deleted file mode 100755 index 9e507c305c6e..000000000000 --- a/samples/bpf/test_ipip.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -function config_device { - ip netns add at_ns0 - ip netns add at_ns1 - ip netns add at_ns2 - ip link add veth0 type veth peer name veth0b - ip link add veth1 type veth peer name veth1b - ip link add veth2 type veth peer name veth2b - ip link set veth0b up - ip link set veth1b up - ip link set veth2b up - ip link set dev veth0b mtu 1500 - ip link set dev veth1b mtu 1500 - ip link set dev veth2b mtu 1500 - ip link set veth0 netns at_ns0 - ip link set veth1 netns at_ns1 - ip link set veth2 netns at_ns2 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1 - ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad - ip netns exec at_ns1 ip link set dev veth1 up - ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2 - ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad - ip netns exec at_ns2 ip link set dev veth2 up - ip link add br0 type bridge - ip link set br0 up - ip link set dev br0 mtu 1500 - ip link set veth0b master br0 - ip link set veth1b master br0 - ip link set veth2b master br0 -} - -function add_ipip_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 - - ip netns exec at_ns2 ip link add dev $DEV type ipip external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ipip6_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 - - ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 -} - -function add_ip6ip6_tunnel { - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64 - ip netns exec at_ns1 \ - ip link add dev $DEV_NS 
type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64 - ip netns exec at_ns1 ip link set dev $DEV_NS up - # same inner IP address in at_ns0 and at_ns1 - ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64 - - ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external - ip netns exec at_ns2 ip link set dev $DEV up - ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64 -} - -function attach_bpf { - DEV=$1 - SET_TUNNEL=$2 - GET_TUNNEL=$3 - ip netns exec at_ns2 tc qdisc add dev $DEV clsact - ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL - ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL -} - -function test_ipip { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ipip_tunnel - attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel - - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns2 ping -c 1 10.1.1.100 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 - cleanup -} - -# IPv4 over IPv6 tunnel -function test_ipip6 { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ipip6_tunnel - attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel - - ip netns exec at_ns0 ping -c 1 10.1.1.200 - ip netns exec at_ns2 ping -c 1 10.1.1.100 - ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 - ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 - cleanup -} - -# IPv6 over IPv6 tunnel -function test_ip6ip6 { - DEV_NS=ipip_std - DEV=ipip_bpf - config_device -# tcpdump -nei br0 & - cat /sys/kernel/debug/tracing/trace_pipe & - - add_ip6ip6_tunnel - attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel - - ip netns exec at_ns0 ping -6 -c 1 2601:646::2 - ip netns exec at_ns2 ping -6 -c 1 2601:646::1 - ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null - ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null - sleep 0.2 - # tcp check _same_ IP over different tunnels - ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200 - ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201 - cleanup -} - -function cleanup { - set +ex - pkill iperf - ip netns delete at_ns0 - ip netns delete at_ns1 - ip netns delete at_ns2 - ip link del veth0 - ip link del veth1 - ip link del veth2 - ip link del br0 - pkill tcpdump - pkill cat - set -ex -} - -cleanup -echo "Testing IP tunnels..." -test_ipip -test_ipip6 -test_ip6ip6 -echo "*** PASS ***" From 7112d127984bd7b0c8ded7973b358829f16735f5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 10 Nov 2020 20:06:45 -0800 Subject: [PATCH 42/66] bpf: Compile out btf_parse_module() if module BTF is not enabled Make sure btf_parse_module() is compiled out if module BTFs are not enabled. 
Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Reported-by: Stephen Rothwell Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201111040645.903494-1-andrii@kernel.org --- kernel/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0f1fd2669d69..6b2d508b33d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4478,6 +4478,8 @@ errout: return ERR_PTR(err); } +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) { struct btf_verifier_env *env = NULL; @@ -4547,6 +4549,8 @@ errout: return ERR_PTR(err); } +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; From c8a950d0d3b926a02c7b2e713850d38217cec3d1 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:05 +0100 Subject: [PATCH 43/66] tools: Factor HOSTCC, HOSTLD, HOSTAR definitions Several Makefiles in tools/ need to define the host toolchain variables. Move their definition to tools/scripts/Makefile.include Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/bpf/20201110164310.2600671-2-jean-philippe@linaro.org --- tools/bpf/resolve_btfids/Makefile | 9 --------- tools/build/Makefile | 4 ---- tools/objtool/Makefile | 9 --------- tools/perf/Makefile.perf | 4 ---- tools/power/acpi/Makefile.config | 1 - tools/scripts/Makefile.include | 10 ++++++++++ 6 files changed, 10 insertions(+), 27 deletions(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 66cb92136de4..bf656432ad73 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -18,15 +18,6 @@ else endif # always use the host compiler -ifneq ($(LLVM),) -HOSTAR ?= llvm-ar -HOSTCC ?= clang -HOSTLD ?= ld.lld -else -HOSTAR ?= ar -HOSTCC ?= gcc -HOSTLD ?= ld -endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) diff --git a/tools/build/Makefile b/tools/build/Makefile index 722f1700d96a..bae48e6fa995 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -15,10 +15,6 @@ endef $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,LD,$(CROSS_COMPILE)ld) -HOSTCC ?= gcc -HOSTLD ?= ld -HOSTAR ?= ar - export HOSTCC HOSTLD HOSTAR ifeq ($(V),1) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 4ea9a833dde7..5cdb19036d7f 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -3,15 +3,6 @@ include ../scripts/Makefile.include include ../scripts/Makefile.arch # always use the host compiler -ifneq ($(LLVM),) -HOSTAR ?= llvm-ar -HOSTCC ?= clang -HOSTLD ?= ld.lld -else -HOSTAR ?= ar -HOSTCC ?= gcc -HOSTLD ?= ld -endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 7ce3f2e8b9c7..62f3deb1d3a8 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -175,10 +175,6 @@ endef LD += $(EXTRA_LDFLAGS) -HOSTCC ?= gcc -HOSTLD ?= ld -HOSTAR ?= ar - PKG_CONFIG = $(CROSS_COMPILE)pkg-config LLVM_CONFIG ?= llvm-config diff --git a/tools/power/acpi/Makefile.config b/tools/power/acpi/Makefile.config index 54a2857c2510..331f6d30f472 100644 --- a/tools/power/acpi/Makefile.config +++ b/tools/power/acpi/Makefile.config @@ -54,7 +54,6 @@ INSTALL_SCRIPT = ${INSTALL_PROGRAM} CROSS = 
#/usr/i386-linux-uclibc/usr/bin/i386-uclibc- CROSS_COMPILE ?= $(CROSS) LD = $(CC) -HOSTCC = gcc # check if compiler option is supported cc-supports = ${shell if $(CC) ${1} -S -o /dev/null -x c /dev/null > /dev/null 2>&1; then echo "$(1)"; fi;} diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index a7974638561c..1358e89cdf7d 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -59,6 +59,16 @@ $(call allow-override,LD,$(CROSS_COMPILE)ld) $(call allow-override,CXX,$(CROSS_COMPILE)g++) $(call allow-override,STRIP,$(CROSS_COMPILE)strip) +ifneq ($(LLVM),) +HOSTAR ?= llvm-ar +HOSTCC ?= clang +HOSTLD ?= ld.lld +else +HOSTAR ?= ar +HOSTCC ?= gcc +HOSTLD ?= ld +endif + ifeq ($(CC_NO_CLANG), 1) EXTRA_WARNINGS += -Wstrict-aliasing=3 endif From 9e8929fdbb9c9026bd3a732e9ac7dc9617c86309 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:06 +0100 Subject: [PATCH 44/66] tools/bpftool: Force clean of out-of-tree build Cleaning a partial build can fail if the output directory for libbpf wasn't created: $ make -C tools/bpf/bpftool O=/tmp/bpf clean /bin/sh: line 0: cd: /tmp/bpf/libbpf/: No such file or directory tools/scripts/Makefile.include:17: *** output directory "/tmp/bpf/libbpf/" does not exist. Stop. make: *** [Makefile:36: /tmp/bpf/libbpf/libbpf.a-clean] Error 2 As a result make never gets around to clearing the leftover objects. Add the libbpf output directory as clean dependency to ensure clean always succeeds (similarly to the "descend" macro). The directory is later removed by the clean recipe. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201110164310.2600671-3-jean-philippe@linaro.org --- tools/bpf/bpftool/Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index f60e6ad3a1df..1358c093b812 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -27,11 +27,13 @@ LIBBPF = $(LIBBPF_PATH)libbpf.a BPFTOOL_VERSION ?= $(shell make -rR --no-print-directory -sC ../../.. kernelversion) -$(LIBBPF): FORCE - $(if $(LIBBPF_OUTPUT),@mkdir -p $(LIBBPF_OUTPUT)) +$(LIBBPF_OUTPUT): + $(QUIET_MKDIR)mkdir -p $@ + +$(LIBBPF): FORCE | $(LIBBPF_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) $(LIBBPF_OUTPUT)libbpf.a -$(LIBBPF)-clean: +$(LIBBPF)-clean: $(LIBBPF_OUTPUT) $(call QUIET_CLEAN, libbpf) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null From 8859b0da5aac28e4e9651c8971e7af344f8ffec1 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:07 +0100 Subject: [PATCH 45/66] tools/bpftool: Fix cross-build The bpftool build first creates an intermediate binary, executed on the host, to generate skeletons required by the final build. When cross-building bpftool for an architecture different from the host, the intermediate binary should be built using the host compiler (gcc) and the final bpftool using the cross compiler (e.g. aarch64-linux-gnu-gcc). Generate the intermediate objects into the bootstrap/ directory using the host toolchain. 
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201110164310.2600671-4-jean-philippe@linaro.org --- tools/bpf/bpftool/Makefile | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 1358c093b812..d566bced135e 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -19,24 +19,37 @@ BPF_DIR = $(srctree)/tools/lib/bpf/ ifneq ($(OUTPUT),) LIBBPF_OUTPUT = $(OUTPUT)/libbpf/ LIBBPF_PATH = $(LIBBPF_OUTPUT) + BOOTSTRAP_OUTPUT = $(OUTPUT)/bootstrap/ else + LIBBPF_OUTPUT = LIBBPF_PATH = $(BPF_DIR) + BOOTSTRAP_OUTPUT = $(CURDIR)/bootstrap/ endif LIBBPF = $(LIBBPF_PATH)libbpf.a +LIBBPF_BOOTSTRAP_OUTPUT = $(BOOTSTRAP_OUTPUT)libbpf/ +LIBBPF_BOOTSTRAP = $(LIBBPF_BOOTSTRAP_OUTPUT)libbpf.a BPFTOOL_VERSION ?= $(shell make -rR --no-print-directory -sC ../../.. kernelversion) -$(LIBBPF_OUTPUT): +$(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(LIBBPF_BOOTSTRAP_OUTPUT): $(QUIET_MKDIR)mkdir -p $@ $(LIBBPF): FORCE | $(LIBBPF_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) $(LIBBPF_OUTPUT)libbpf.a +$(LIBBPF_BOOTSTRAP): FORCE | $(LIBBPF_BOOTSTRAP_OUTPUT) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ + ARCH= CC=$(HOSTCC) LD=$(HOSTLD) $@ + $(LIBBPF)-clean: $(LIBBPF_OUTPUT) $(call QUIET_CLEAN, libbpf) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null +$(LIBBPF_BOOTSTRAP)-clean: $(LIBBPF_BOOTSTRAP_OUTPUT) + $(call QUIET_CLEAN, libbpf-bootstrap) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) clean >/dev/null + prefix ?= /usr/local bash_compdir ?= /usr/share/bash-completion/completions @@ -94,6 +107,7 @@ CFLAGS += -DCOMPAT_NEED_REALLOCARRAY endif LIBS = $(LIBBPF) -lelf -lz +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz ifeq ($(feature-libcap), 1) CFLAGS += -DUSE_LIBCAP LIBS += -lcap @@ -120,9 +134,9 @@ CFLAGS += -DHAVE_LIBBFD_SUPPORT SRCS += $(BFD_SRCS) endif -BPFTOOL_BOOTSTRAP := $(if $(OUTPUT),$(OUTPUT)bpftool-bootstrap,./bpftool-bootstrap) +BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool -BOOTSTRAP_OBJS = $(addprefix $(OUTPUT),main.o common.o json_writer.o gen.o btf.o) +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o) OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ @@ -169,12 +183,16 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c $(OUTPUT)feature.o: | zdep -$(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF) - $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(BOOTSTRAP_OBJS) $(LIBS) +$(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) + $(QUIET_LINK)$(HOSTCC) $(CFLAGS) $(LDFLAGS) -o $@ $(BOOTSTRAP_OBJS) \ + $(LIBS_BOOTSTRAP) $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS) +$(BOOTSTRAP_OUTPUT)%.o: %.c | $(BOOTSTRAP_OUTPUT) + $(QUIET_CC)$(HOSTCC) $(CFLAGS) -c -MMD -o $@ $< + $(OUTPUT)%.o: %.c $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD -o $@ $< @@ -182,11 +200,11 @@ feature-detect-clean: $(call QUIET_CLEAN, feature-detect) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null -clean: $(LIBBPF)-clean feature-detect-clean +clean: $(LIBBPF)-clean $(LIBBPF_BOOTSTRAP)-clean feature-detect-clean $(call QUIET_CLEAN, bpftool) $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d - $(Q)$(RM) -- $(BPFTOOL_BOOTSTRAP) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h - $(Q)$(RM) -r -- $(OUTPUT)libbpf/ + $(Q)$(RM) -- 
$(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h + $(Q)$(RM) -r -- $(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(call QUIET_CLEAN, core-gen) $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool $(Q)$(RM) -r -- $(OUTPUT)feature/ From 3290996e713315dfc9589e2c340a4c4997d90be8 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:08 +0100 Subject: [PATCH 46/66] tools/runqslower: Use Makefile.include Makefile.include defines variables such as OUTPUT and CC for out-of-tree build and cross-build. Include it into the runqslower Makefile and use its $(QUIET*) helpers. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201110164310.2600671-5-jean-philippe@linaro.org --- tools/bpf/runqslower/Makefile | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index fb1337d69868..bcc4a7396713 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +include ../../scripts/Makefile.include + OUTPUT := .output CLANG ?= clang LLC ?= llc @@ -21,10 +23,8 @@ VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ abs_out := $(abspath $(OUTPUT)) ifeq ($(V),1) Q = -msg = else Q = @ -msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; MAKEFLAGS += --no-print-directory submake_extras := feature_display=0 endif @@ -37,12 +37,11 @@ all: runqslower runqslower: $(OUTPUT)/runqslower clean: - $(call msg,CLEAN) + $(call QUIET_CLEAN, runqslower) $(Q)rm -rf $(OUTPUT) runqslower $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) - $(call msg,BINARY,$@) - $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ + $(QUIET_LINK)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o @@ -50,31 +49,26 @@ $(OUTPUT)/runqslower.o: runqslower.h $(OUTPUT)/runqslower.skel.h \ $(OUTPUT)/runqslower.bpf.o: $(OUTPUT)/vmlinux.h runqslower.h $(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(BPFTOOL) - $(call msg,GEN-SKEL,$@) - $(Q)$(BPFTOOL) gen skeleton $< > $@ + $(QUIET_GEN)$(BPFTOOL) gen skeleton $< > $@ $(OUTPUT)/%.bpf.o: %.bpf.c $(BPFOBJ) | $(OUTPUT) - $(call msg,BPF,$@) - $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + $(QUIET_GEN)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ -c $(filter %.c,$^) -o $@ && \ $(LLVM_STRIP) -g $@ $(OUTPUT)/%.o: %.c | $(OUTPUT) - $(call msg,CC,$@) - $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ $(OUTPUT): - $(call msg,MKDIR,$@) - $(Q)mkdir -p $(OUTPUT) + $(QUIET_MKDIR)mkdir -p $(OUTPUT) $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) - $(call msg,GEN,$@) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ echo "Couldn't find kernel BTF; set VMLINUX_BTF to" \ "specify its location." >&2; \ exit 1;\ fi - $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ + $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ From 85e59344d0790379e063bf3cd3dd0fe91ce3b505 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:09 +0100 Subject: [PATCH 47/66] tools/runqslower: Enable out-of-tree build Enable out-of-tree build for runqslower. Only set OUTPUT=.output if it wasn't already set by the user. 
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201110164310.2600671-6-jean-philippe@linaro.org --- tools/bpf/runqslower/Makefile | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index bcc4a7396713..0fc4d4046193 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -1,15 +1,18 @@ # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) include ../../scripts/Makefile.include -OUTPUT := .output +OUTPUT ?= $(abspath .output)/ + CLANG ?= clang LLC ?= llc LLVM_STRIP ?= llvm-strip -DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL_OUTPUT := $(OUTPUT)bpftool/ +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(abspath ../../lib/bpf) -BPFOBJ := $(OUTPUT)/libbpf.a -BPF_INCLUDE := $(OUTPUT) +BPFOBJ_OUTPUT := $(OUTPUT)libbpf/ +BPFOBJ := $(BPFOBJ_OUTPUT)libbpf.a +BPF_INCLUDE := $(BPFOBJ_OUTPUT) INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../lib) \ -I$(abspath ../../include/uapi) CFLAGS := -g -Wall @@ -20,7 +23,6 @@ VMLINUX_BTF_PATHS := /sys/kernel/btf/vmlinux /boot/vmlinux-$(KERNEL_REL) VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) -abs_out := $(abspath $(OUTPUT)) ifeq ($(V),1) Q = else @@ -38,7 +40,11 @@ runqslower: $(OUTPUT)/runqslower clean: $(call QUIET_CLEAN, runqslower) - $(Q)rm -rf $(OUTPUT) runqslower + $(Q)$(RM) -r $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT) + $(Q)$(RM) $(OUTPUT)*.o $(OUTPUT)*.d + $(Q)$(RM) $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h + $(Q)$(RM) $(OUTPUT)runqslower + $(Q)$(RM) -r .output $(OUTPUT)/runqslower: $(OUTPUT)/runqslower.o $(BPFOBJ) $(QUIET_LINK)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ @@ -59,8 +65,8 @@ $(OUTPUT)/%.bpf.o: %.bpf.c $(BPFOBJ) | $(OUTPUT) $(OUTPUT)/%.o: %.c | $(OUTPUT) $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ -$(OUTPUT): - $(QUIET_MKDIR)mkdir -p $(OUTPUT) +$(OUTPUT) $(BPFOBJ_OUTPUT) $(BPFTOOL_OUTPUT): + $(QUIET_MKDIR)mkdir -p $@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) $(Q)if [ ! -e "$(VMLINUX_BTF_PATH)" ] ; then \ @@ -70,10 +76,8 @@ $(OUTPUT)/vmlinux.h: $(VMLINUX_BTF_PATH) | $(OUTPUT) $(BPFTOOL) fi $(QUIET_GEN)$(BPFTOOL) btf dump file $(VMLINUX_BTF_PATH) format c > $@ -$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ - OUTPUT=$(abspath $(dir $@))/ $(abspath $@) +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ -$(DEFAULT_BPFTOOL): - $(Q)$(MAKE) $(submake_extras) -C ../bpftool \ - prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install +$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) From 2d9393fefb506fb8c8a483e5dc9bbd7774657b89 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:10 +0100 Subject: [PATCH 48/66] tools/runqslower: Build bpftool using HOSTCC When cross building runqslower for an other architecture, the intermediate bpftool used to generate a skeleton must be built using the host toolchain. Pass HOSTCC and HOSTLD, defined in Makefile.include, to the bpftool Makefile. 
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201110164310.2600671-7-jean-philippe@linaro.org --- tools/bpf/runqslower/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index 0fc4d4046193..4d5ca54fcd4c 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -80,4 +80,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OU $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(BPFOBJ_OUTPUT) $@ $(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) \ + CC=$(HOSTCC) LD=$(HOSTLD) From 0639e5e97ad9c58dd15dcf6f6ccf677cfba39f98 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 10 Nov 2020 17:43:11 +0100 Subject: [PATCH 49/66] tools/bpftool: Fix build slowdown Commit ba2fd563b740 ("tools/bpftool: Support passing BPFTOOL_VERSION to make") changed BPFTOOL_VERSION to a recursively expanded variable, forcing it to be recomputed on every expansion of CFLAGS and dramatically slowing down the bpftool build. Restore BPFTOOL_VERSION as a simply expanded variable, guarded by an ifeq(). Fixes: ba2fd563b740 ("tools/bpftool: Support passing BPFTOOL_VERSION to make") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201110164310.2600671-8-jean-philippe@linaro.org --- tools/bpf/bpftool/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index d566bced135e..804ade95929f 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -30,7 +30,9 @@ LIBBPF = $(LIBBPF_PATH)libbpf.a LIBBPF_BOOTSTRAP_OUTPUT = $(BOOTSTRAP_OUTPUT)libbpf/ LIBBPF_BOOTSTRAP = $(LIBBPF_BOOTSTRAP_OUTPUT)libbpf.a -BPFTOOL_VERSION ?= $(shell make -rR --no-print-directory -sC ../../.. kernelversion) +ifeq ($(BPFTOOL_VERSION),) +BPFTOOL_VERSION := $(shell make -rR --no-print-directory -sC ../../.. kernelversion) +endif $(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(LIBBPF_BOOTSTRAP_OUTPUT): $(QUIET_MKDIR)mkdir -p $@ From 09a3dac7b579e57e7ef2d875b9216c845ae8a0e5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 Nov 2020 16:19:19 -0800 Subject: [PATCH 50/66] bpf: Fix NULL dereference in bpf_task_storage In bpf_pid_task_storage_update_elem(), it missed to test the !task_storage_ptr(task) which then could trigger a NULL pointer exception in bpf_local_storage_update(). 
Fixes: 4cf1bc1f1045 ("bpf: Implement task local storage") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Tested-by: Roman Gushchin Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201112001919.2028357-1-kafai@fb.com --- kernel/bpf/bpf_task_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 39a45fba4fb0..4ef1959a78f2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -150,7 +150,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task) { + if (!task || !task_storage_ptr(task)) { err = -ENOENT; goto out; } From 6a59edd832e2e353809778859d2a4a2d6c94d011 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 12 Nov 2020 10:10:50 +0100 Subject: [PATCH 51/66] tools/bpf: Add bootstrap/ to .gitignore Commit 8859b0da5aac ("tools/bpftool: Fix cross-build") added a build-time bootstrap/ directory for bpftool, and removed bpftool-bootstrap. Update .gitignore accordingly. Reported-by: Andrii Nakryiko Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201112091049.3159055-1-jean-philippe@linaro.org --- tools/bpf/bpftool/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore index 3e601bcfd461..944cb4b7c95d 100644 --- a/tools/bpf/bpftool/.gitignore +++ b/tools/bpf/bpftool/.gitignore @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only *.d -/bpftool-bootstrap +/bootstrap/ /bpftool bpftool*.8 bpf-helpers.* From c36538798fc6c80bd8bdaddad803b0c86dc13d7c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 12 Nov 2020 10:10:52 +0100 Subject: [PATCH 52/66] tools/bpf: Always run the *-clean recipes Make $(LIBBPF)-clean and $(LIBBPF_BOOTSTRAP)-clean .PHONY targets, in case those files exist. And keep consistency within the Makefile by making the directory dependencies order-only. Suggested-by: Andrii Nakryiko Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201112091049.3159055-2-jean-philippe@linaro.org --- tools/bpf/bpftool/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 804ade95929f..f897cb5fb12d 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -44,11 +44,11 @@ $(LIBBPF_BOOTSTRAP): FORCE | $(LIBBPF_BOOTSTRAP_OUTPUT) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ ARCH= CC=$(HOSTCC) LD=$(HOSTLD) $@ -$(LIBBPF)-clean: $(LIBBPF_OUTPUT) +$(LIBBPF)-clean: FORCE | $(LIBBPF_OUTPUT) $(call QUIET_CLEAN, libbpf) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null -$(LIBBPF_BOOTSTRAP)-clean: $(LIBBPF_BOOTSTRAP_OUTPUT) +$(LIBBPF_BOOTSTRAP)-clean: FORCE | $(LIBBPF_BOOTSTRAP_OUTPUT) $(call QUIET_CLEAN, libbpf-bootstrap) $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) clean >/dev/null From 6d94e741a8ff818e5518da8257f5ca0aaed1f269 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 10 Nov 2020 19:12:11 -0800 Subject: [PATCH 53/66] bpf: Support for pointers beyond pkt_end. This patch adds the verifier support to recognize inlined branch conditions. The LLVM knows that the branch evaluates to the same value, but the verifier couldn't track it. Hence causing valid programs to be rejected. 
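For illustration only (this C is not taken from the patch or its selftests; names, header sizes, and section names are made up), the troublesome shape typically involves a small inlined helper that either returns a packet pointer or NULL, where LLVM knows the NULL test is the same condition as the data_end comparison and may emit that comparison again:

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <bpf/bpf_helpers.h>

/* Hedged sketch: after inlining get_iphdr(), the caller's NULL check can be
 * lowered to a second data_end comparison, i.e. the "subsequent if (r1 > r2)"
 * described above, which the verifier can now resolve instead of exploring
 * an impossible path.
 */
static __always_inline struct iphdr *get_iphdr(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	if (data + sizeof(struct ethhdr) + sizeof(struct iphdr) > data_end)
		return NULL;
	return data + sizeof(struct ethhdr);
}

SEC("classifier")
int pkt_end_example(struct __sk_buff *skb)
{
	struct iphdr *ip = get_iphdr(skb);

	if (!ip)
		return 0;
	return ip->protocol;
}

char _license[] SEC("license") = "GPL";
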
The potential LLVM workaround: https://reviews.llvm.org/D87428 can have undesired side effects, since LLVM doesn't know that skb->data/data_end are being compared. LLVM has to introduce extra boolean variable and use inline_asm trick to force easier for the verifier assembly. Instead teach the verifier to recognize that r1 = skb->data; r1 += 10; r2 = skb->data_end; if (r1 > r2) { here r1 points beyond packet_end and subsequent if (r1 > r2) // always evaluates to "true". } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201111031213.25109-2-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 129 +++++++++++++++++++++++++++++------ 2 files changed, 108 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e83ef6f6bf43..306869d4743b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,7 @@ struct bpf_reg_state { enum bpf_reg_type type; union { /* valid when type == PTR_TO_PACKET */ - u16 range; + int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10da26e55130..7b1f85aa9741 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2739,7 +2739,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_mem_access(env, regno, off, size, reg->range, + + err = reg->range < 0 ? -EINVAL : + __check_mem_access(env, regno, off, size, reg->range, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); @@ -4697,6 +4699,32 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +enum { + AT_PKT_END = -1, + BEYOND_PKT_END = -2, +}; + +static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open) +{ + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regn]; + + if (reg->type != PTR_TO_PACKET) + /* PTR_TO_PACKET_META is not supported yet */ + return; + + /* The 'reg' is pkt > pkt_end or pkt >= pkt_end. + * How far beyond pkt_end it goes is unknown. + * if (!range_open) it's the case of pkt >= pkt_end + * if (range_open) it's the case of pkt > pkt_end + * hence this pointer is at least 1 byte bigger than pkt_end + */ + if (range_open) + reg->range = BEYOND_PKT_END; + else + reg->range = AT_PKT_END; +} + static void release_reg_references(struct bpf_verifier_env *env, struct bpf_func_state *state, int ref_obj_id) @@ -6708,7 +6736,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) static void __find_good_pkt_pointers(struct bpf_func_state *state, struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, u16 new_range) + enum bpf_reg_type type, int new_range) { struct bpf_reg_state *reg; int i; @@ -6733,8 +6761,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, enum bpf_reg_type type, bool range_right_open) { - u16 new_range; - int i; + int new_range, i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -6985,6 +7012,67 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return is_branch64_taken(reg, val, opcode); } +static int flip_opcode(u32 opcode) +{ + /* How can we transform "a b" into "b a"? 
*/ + static const u8 opcode_flip[16] = { + /* these stay the same */ + [BPF_JEQ >> 4] = BPF_JEQ, + [BPF_JNE >> 4] = BPF_JNE, + [BPF_JSET >> 4] = BPF_JSET, + /* these swap "lesser" and "greater" (L and G in the opcodes) */ + [BPF_JGE >> 4] = BPF_JLE, + [BPF_JGT >> 4] = BPF_JLT, + [BPF_JLE >> 4] = BPF_JGE, + [BPF_JLT >> 4] = BPF_JGT, + [BPF_JSGE >> 4] = BPF_JSLE, + [BPF_JSGT >> 4] = BPF_JSLT, + [BPF_JSLE >> 4] = BPF_JSGE, + [BPF_JSLT >> 4] = BPF_JSGT + }; + return opcode_flip[opcode >> 4]; +} + +static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + u8 opcode) +{ + struct bpf_reg_state *pkt; + + if (src_reg->type == PTR_TO_PACKET_END) { + pkt = dst_reg; + } else if (dst_reg->type == PTR_TO_PACKET_END) { + pkt = src_reg; + opcode = flip_opcode(opcode); + } else { + return -1; + } + + if (pkt->range >= 0) + return -1; + + switch (opcode) { + case BPF_JLE: + /* pkt <= pkt_end */ + fallthrough; + case BPF_JGT: + /* pkt > pkt_end */ + if (pkt->range == BEYOND_PKT_END) + /* pkt has at last one extra byte beyond pkt_end */ + return opcode == BPF_JGT; + break; + case BPF_JLT: + /* pkt < pkt_end */ + fallthrough; + case BPF_JGE: + /* pkt >= pkt_end */ + if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END) + return opcode == BPF_JGE; + break; + } + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -7148,23 +7236,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, u64 val, u32 val32, u8 opcode, bool is_jmp32) { - /* How can we transform "a b" into "b a"? */ - static const u8 opcode_flip[16] = { - /* these stay the same */ - [BPF_JEQ >> 4] = BPF_JEQ, - [BPF_JNE >> 4] = BPF_JNE, - [BPF_JSET >> 4] = BPF_JSET, - /* these swap "lesser" and "greater" (L and G in the opcodes) */ - [BPF_JGE >> 4] = BPF_JLE, - [BPF_JGT >> 4] = BPF_JLT, - [BPF_JLE >> 4] = BPF_JGE, - [BPF_JLT >> 4] = BPF_JGT, - [BPF_JSGE >> 4] = BPF_JSLE, - [BPF_JSGT >> 4] = BPF_JSLT, - [BPF_JSLE >> 4] = BPF_JSGE, - [BPF_JSLT >> 4] = BPF_JSGT - }; - opcode = opcode_flip[opcode >> 4]; + opcode = flip_opcode(opcode); /* This uses zero as "not present in table"; luckily the zero opcode, * BPF_JA, can't get here. 
*/ @@ -7346,6 +7418,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(other_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7353,6 +7426,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end > pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, true); + mark_pkt_end(this_branch, insn->src_reg, false); } else { return false; } @@ -7365,6 +7439,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(this_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7372,6 +7447,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end < pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, false); + mark_pkt_end(other_branch, insn->src_reg, true); } else { return false; } @@ -7384,6 +7460,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(other_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7391,6 +7468,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, false); + mark_pkt_end(this_branch, insn->src_reg, true); } else { return false; } @@ -7403,6 +7481,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(this_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7410,6 +7489,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, true); + mark_pkt_end(other_branch, insn->src_reg, false); } else { return false; } @@ -7509,6 +7589,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (reg_is_pkt_pointer_any(dst_reg) && + reg_is_pkt_pointer_any(src_reg) && + !is_jmp32) { + pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); } if (pred >= 0) { @@ -7517,7 +7601,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X && !err) + if (BPF_SRC(insn->code) == BPF_X && !err && + !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; From 9cc873e85800ccde80aa2e4b2bae9f1b5fa4c478 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 10 Nov 2020 19:12:12 -0800 Subject: [PATCH 54/66] selftests/bpf: Add skb_pkt_end test Add 
a test that currently makes LLVM generate assembly code: $ llvm-objdump -S skb_pkt_end.o 0000000000000000 : ; if (skb_shorter(skb, ETH_IPV4_TCP_SIZE)) 0: 61 12 50 00 00 00 00 00 r2 = *(u32 *)(r1 + 80) 1: 61 14 4c 00 00 00 00 00 r4 = *(u32 *)(r1 + 76) 2: bf 43 00 00 00 00 00 00 r3 = r4 3: 07 03 00 00 36 00 00 00 r3 += 54 4: b7 01 00 00 00 00 00 00 r1 = 0 5: 2d 23 02 00 00 00 00 00 if r3 > r2 goto +2 6: 07 04 00 00 0e 00 00 00 r4 += 14 ; if (skb_shorter(skb, ETH_IPV4_TCP_SIZE)) 7: bf 41 00 00 00 00 00 00 r1 = r4 0000000000000040 : 8: b4 00 00 00 ff ff ff ff w0 = -1 ; if (!(ip = get_iphdr(skb))) 9: 2d 23 05 00 00 00 00 00 if r3 > r2 goto +5 ; proto = ip->protocol; 10: 71 12 09 00 00 00 00 00 r2 = *(u8 *)(r1 + 9) ; if (proto != IPPROTO_TCP) 11: 56 02 03 00 06 00 00 00 if w2 != 6 goto +3 ; if (tcp->dest != 0) 12: 69 12 16 00 00 00 00 00 r2 = *(u16 *)(r1 + 22) 13: 56 02 01 00 00 00 00 00 if w2 != 0 goto +1 ; return tcp->urg_ptr; 14: 69 10 26 00 00 00 00 00 r0 = *(u16 *)(r1 + 38) 0000000000000078 : ; } 15: 95 00 00 00 00 00 00 00 exit Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201111031213.25109-3-alexei.starovoitov@gmail.com --- .../bpf/prog_tests/test_skb_pkt_end.c | 41 ++++++++++++++ .../testing/selftests/bpf/progs/skb_pkt_end.c | 54 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c create mode 100644 tools/testing/selftests/bpf/progs/skb_pkt_end.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c b/tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c new file mode 100644 index 000000000000..cf1215531920 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_skb_pkt_end.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include +#include +#include "skb_pkt_end.skel.h" + +static int sanity_run(struct bpf_program *prog) +{ + __u32 duration, retval; + int err, prog_fd; + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + if (CHECK(err || retval != 123, "test_run", + "err %d errno %d retval %d duration %d\n", + err, errno, retval, duration)) + return -1; + return 0; +} + +void test_test_skb_pkt_end(void) +{ + struct skb_pkt_end *skb_pkt_end_skel = NULL; + __u32 duration = 0; + int err; + + skb_pkt_end_skel = skb_pkt_end__open_and_load(); + if (CHECK(!skb_pkt_end_skel, "skb_pkt_end_skel_load", "skb_pkt_end skeleton failed\n")) + goto cleanup; + + err = skb_pkt_end__attach(skb_pkt_end_skel); + if (CHECK(err, "skb_pkt_end_attach", "skb_pkt_end attach failed: %d\n", err)) + goto cleanup; + + if (sanity_run(skb_pkt_end_skel->progs.main_prog)) + goto cleanup; + +cleanup: + skb_pkt_end__destroy(skb_pkt_end_skel); +} diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c new file mode 100644 index 000000000000..cf6823f42e80 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +#define BPF_NO_PRESERVE_ACCESS_INDEX +#include +#include +#include + +#define NULL 0 +#define INLINE __always_inline + +#define skb_shorter(skb, len) ((void *)(long)(skb)->data + (len) > (void *)(long)skb->data_end) + +#define ETH_IPV4_TCP_SIZE (14 + sizeof(struct iphdr) + sizeof(struct tcphdr)) + +static INLINE struct iphdr *get_iphdr(struct __sk_buff *skb) +{ + struct iphdr *ip = NULL; + 
struct ethhdr *eth; + + if (skb_shorter(skb, ETH_IPV4_TCP_SIZE)) + goto out; + + eth = (void *)(long)skb->data; + ip = (void *)(eth + 1); + +out: + return ip; +} + +SEC("classifier/cls") +int main_prog(struct __sk_buff *skb) +{ + struct iphdr *ip = NULL; + struct tcphdr *tcp; + __u8 proto = 0; + + if (!(ip = get_iphdr(skb))) + goto out; + + proto = ip->protocol; + + if (proto != IPPROTO_TCP) + goto out; + + tcp = (void*)(ip + 1); + if (tcp->dest != 0) + goto out; + if (!tcp) + goto out; + + return tcp->urg_ptr; +out: + return -1; +} +char _license[] SEC("license") = "GPL"; From cb62d34019d9117bb94de6ed35959449d43d6055 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 10 Nov 2020 19:12:13 -0800 Subject: [PATCH 55/66] selftests/bpf: Add asm tests for pkt vs pkt_end comparison. Add few assembly tests for packet comparison. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201111031213.25109-4-alexei.starovoitov@gmail.com --- .../testing/selftests/bpf/verifier/ctx_skb.c | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/ctx_skb.c b/tools/testing/selftests/bpf/verifier/ctx_skb.c index 2e16b8e268f2..2022c0f2cd75 100644 --- a/tools/testing/selftests/bpf/verifier/ctx_skb.c +++ b/tools/testing/selftests/bpf/verifier/ctx_skb.c @@ -1089,3 +1089,45 @@ .errstr_unpriv = "R1 leaks addr", .result = REJECT, }, +{ + "pkt > pkt_end taken check", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, // 0. r2 = *(u32 *)(r1 + data_end) + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, // 1. r4 = *(u32 *)(r1 + data) + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_4), // 2. r3 = r4 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 42), // 3. r3 += 42 + BPF_MOV64_IMM(BPF_REG_1, 0), // 4. r1 = 0 + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 2), // 5. if r3 > r2 goto 8 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 14), // 6. r4 += 14 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_4), // 7. r1 = r4 + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 1), // 8. if r3 > r2 goto 10 + BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1, 9), // 9. r2 = *(u8 *)(r1 + 9) + BPF_MOV64_IMM(BPF_REG_0, 0), // 10. r0 = 0 + BPF_EXIT_INSN(), // 11. exit + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_SKB, +}, +{ + "pkt_end < pkt taken check", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, // 0. r2 = *(u32 *)(r1 + data_end) + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, // 1. r4 = *(u32 *)(r1 + data) + offsetof(struct __sk_buff, data)), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_4), // 2. r3 = r4 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 42), // 3. r3 += 42 + BPF_MOV64_IMM(BPF_REG_1, 0), // 4. r1 = 0 + BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 2), // 5. if r3 > r2 goto 8 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 14), // 6. r4 += 14 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_4), // 7. r1 = r4 + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, 1), // 8. if r2 < r3 goto 10 + BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1, 9), // 9. r2 = *(u8 *)(r1 + 9) + BPF_MOV64_IMM(BPF_REG_0, 0), // 10. r0 = 0 + BPF_EXIT_INSN(), // 11. exit + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SK_SKB, +}, From 9e838b02b0bb795793f12049307a354e28b5749c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Nov 2020 13:13:01 -0800 Subject: [PATCH 56/66] bpf: Folding omem_charge() into sk_storage_charge() sk_storage_charge() is the only user of omem_charge(). 
This patch simplifies it by folding omem_charge() into sk_storage_charge(). Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201112211301.2586255-1-kafai@fb.com --- net/core/bpf_sk_storage.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index c907f0dc7f87..001eac65e40f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -15,18 +15,6 @@ DEFINE_BPF_STORAGE_CACHE(sk_cache); -static int omem_charge(struct sock *sk, unsigned int size) -{ - /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { - atomic_add(size, &sk->sk_omem_alloc); - return 0; - } - - return -ENOMEM; -} - static struct bpf_local_storage_data * sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { @@ -316,7 +304,16 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { - return omem_charge(owner, size); + struct sock *sk = (struct sock *)owner; + + /* same check as in sock_kmalloc() */ + if (size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + atomic_add(size, &sk->sk_omem_alloc); + return 0; + } + + return -ENOMEM; } static void sk_storage_uncharge(struct bpf_local_storage_map *smap, From e794bfddb8b8521bd016e73b411d6b03149e9e66 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Nov 2020 13:13:07 -0800 Subject: [PATCH 57/66] bpf: Rename some functions in bpf_sk_storage Rename some of the functions currently prefixed with sk_storage to bpf_sk_storage. That will make the next patch have fewer prefix check and also bring the bpf_sk_storage.c to a more consistent function naming. 
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201112211307.2587021-1-kafai@fb.com --- net/core/bpf_sk_storage.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 001eac65e40f..fd416678f236 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -16,7 +16,7 @@ DEFINE_BPF_STORAGE_CACHE(sk_cache); static struct bpf_local_storage_data * -sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) +bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; @@ -29,11 +29,11 @@ sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit); } -static int sk_storage_delete(struct sock *sk, struct bpf_map *map) +static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) { struct bpf_local_storage_data *sdata; - sdata = sk_storage_lookup(sk, map, false); + sdata = bpf_sk_storage_lookup(sk, map, false); if (!sdata) return -ENOENT; @@ -82,7 +82,7 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void sk_storage_map_free(struct bpf_map *map) +static void bpf_sk_storage_map_free(struct bpf_map *map) { struct bpf_local_storage_map *smap; @@ -91,7 +91,7 @@ static void sk_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap); } -static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) +static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; @@ -118,7 +118,7 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = sk_storage_lookup(sock->sk, map, true); + sdata = bpf_sk_storage_lookup(sock->sk, map, true); sockfd_put(sock); return sdata ? 
sdata->data : NULL; } @@ -154,7 +154,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - err = sk_storage_delete(sock->sk, map); + err = bpf_sk_storage_del(sock->sk, map); sockfd_put(sock); return err; } @@ -260,7 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; - sdata = sk_storage_lookup(sk, map, true); + sdata = bpf_sk_storage_lookup(sk, map, true); if (sdata) return (unsigned long)sdata->data; @@ -293,7 +293,7 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; - err = sk_storage_delete(sk, map); + err = bpf_sk_storage_del(sk, map); sock_put(sk); return err; } @@ -301,8 +301,8 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } -static int sk_storage_charge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { struct sock *sk = (struct sock *)owner; @@ -316,8 +316,8 @@ static int sk_storage_charge(struct bpf_local_storage_map *smap, return -ENOMEM; } -static void sk_storage_uncharge(struct bpf_local_storage_map *smap, - void *owner, u32 size) +static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) { struct sock *sk = owner; @@ -325,7 +325,7 @@ static void sk_storage_uncharge(struct bpf_local_storage_map *smap, } static struct bpf_local_storage __rcu ** -sk_storage_ptr(void *owner) +bpf_sk_storage_ptr(void *owner) { struct sock *sk = owner; @@ -336,8 +336,8 @@ static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, - .map_alloc = sk_storage_map_alloc, - .map_free = sk_storage_map_free, + .map_alloc = bpf_sk_storage_map_alloc, + .map_free = bpf_sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, @@ -345,9 +345,9 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, - .map_local_storage_charge = sk_storage_charge, - .map_local_storage_uncharge = sk_storage_uncharge, - .map_owner_storage_ptr = sk_storage_ptr, + .map_local_storage_charge = bpf_sk_storage_charge, + .map_local_storage_uncharge = bpf_sk_storage_uncharge, + .map_owner_storage_ptr = bpf_sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { From 8e4597c627fb48f361e2a5b012202cb1b6cbcd5e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Nov 2020 13:13:13 -0800 Subject: [PATCH 58/66] bpf: Allow using bpf_sk_storage in FENTRY/FEXIT/RAW_TP This patch enables the FENTRY/FEXIT/RAW_TP tracing program to use the bpf_sk_storage_(get|delete) helper, so those tracing programs can access the sk's bpf_local_storage and the later selftest will show some examples. The bpf_sk_storage is currently used in bpf-tcp-cc, tc, cg sockops...etc which is running either in softirq or task context. This patch adds bpf_sk_storage_get_tracing_proto and bpf_sk_storage_delete_tracing_proto. 
They will check in runtime that the helpers can only be called when serving softirq or running in a task context. That should enable most common tracing use cases on sk. During the load time, the new tracing_allowed() function will ensure the tracing prog using the bpf_sk_storage_(get|delete) helper is not tracing any bpf_sk_storage*() function itself. The sk is passed as "void *" when calling into bpf_local_storage. This patch only allows tracing a kernel function. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201112211313.2587383-1-kafai@fb.com --- include/net/bpf_sk_storage.h | 2 + kernel/trace/bpf_trace.c | 5 +++ net/core/bpf_sk_storage.c | 74 ++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..0e85713f56df 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto bpf_sk_storage_get_tracing_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e4515b0f62a8..cfce60ad1cb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1735,6 +1736,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_tracing_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_tracing_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index fd416678f236..359908a7d3c1 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -378,6 +379,79 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. 
+ */ + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (!in_serving_softirq() && !in_task()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (!in_serving_softirq() && !in_task()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; From 53632e11194663b7d5b043a68648892e593dc102 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Nov 2020 13:13:20 -0800 Subject: [PATCH 59/66] bpf: selftest: Use bpf_sk_storage in FENTRY/FEXIT/RAW_TP This patch tests storing the task's related info into the bpf_sk_storage by fentry/fexit tracing at listen, accept, and connect. It also tests the raw_tp at inet_sock_set_state. A negative test is done by tracing the bpf_sk_storage_free() and using bpf_sk_storage_get() at the same time. It ensures this bpf program cannot load. 
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201112211320.2587537-1-kafai@fb.com --- .../bpf/prog_tests/sk_storage_tracing.c | 135 ++++++++++++++++++ .../bpf/progs/test_sk_storage_trace_itself.c | 29 ++++ .../bpf/progs/test_sk_storage_tracing.c | 95 ++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c create mode 100644 tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c create mode 100644 tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c diff --git a/tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c b/tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c new file mode 100644 index 000000000000..2b392590e8ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_storage_tracing.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include +#include "test_progs.h" +#include "network_helpers.h" +#include "test_sk_storage_trace_itself.skel.h" +#include "test_sk_storage_tracing.skel.h" + +#define LO_ADDR6 "::1" +#define TEST_COMM "test_progs" + +struct sk_stg { + __u32 pid; + __u32 last_notclose_state; + char comm[16]; +}; + +static struct test_sk_storage_tracing *skel; +static __u32 duration; +static pid_t my_pid; + +static int check_sk_stg(int sk_fd, __u32 expected_state) +{ + struct sk_stg sk_stg; + int err; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_stg_map), &sk_fd, + &sk_stg); + if (!ASSERT_OK(err, "map_lookup(sk_stg_map)")) + return -1; + + if (!ASSERT_EQ(sk_stg.last_notclose_state, expected_state, + "last_notclose_state")) + return -1; + + if (!ASSERT_EQ(sk_stg.pid, my_pid, "pid")) + return -1; + + if (!ASSERT_STREQ(sk_stg.comm, skel->bss->task_comm, "task_comm")) + return -1; + + return 0; +} + +static void do_test(void) +{ + int listen_fd = -1, passive_fd = -1, active_fd = -1, value = 1, err; + char abyte; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0); + if (CHECK(listen_fd == -1, "start_server", + "listen_fd:%d errno:%d\n", listen_fd, errno)) + return; + + active_fd = connect_to_fd(listen_fd, 0); + if (CHECK(active_fd == -1, "connect_to_fd", "active_fd:%d errno:%d\n", + active_fd, errno)) + goto out; + + err = bpf_map_update_elem(bpf_map__fd(skel->maps.del_sk_stg_map), + &active_fd, &value, 0); + if (!ASSERT_OK(err, "map_update(del_sk_stg_map)")) + goto out; + + passive_fd = accept(listen_fd, NULL, 0); + if (CHECK(passive_fd == -1, "accept", "passive_fd:%d errno:%d\n", + passive_fd, errno)) + goto out; + + shutdown(active_fd, SHUT_WR); + err = read(passive_fd, &abyte, 1); + if (!ASSERT_OK(err, "read(passive_fd)")) + goto out; + + shutdown(passive_fd, SHUT_WR); + err = read(active_fd, &abyte, 1); + if (!ASSERT_OK(err, "read(active_fd)")) + goto out; + + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.del_sk_stg_map), + &active_fd, &value); + if (!ASSERT_ERR(err, "map_lookup(del_sk_stg_map)")) + goto out; + + err = check_sk_stg(listen_fd, BPF_TCP_LISTEN); + if (!ASSERT_OK(err, "listen_fd sk_stg")) + goto out; + + err = check_sk_stg(active_fd, BPF_TCP_FIN_WAIT2); + if (!ASSERT_OK(err, "active_fd sk_stg")) + goto out; + + err = check_sk_stg(passive_fd, BPF_TCP_LAST_ACK); + ASSERT_OK(err, "passive_fd sk_stg"); + +out: + if (active_fd != -1) + close(active_fd); + if (passive_fd != -1) + close(passive_fd); + if (listen_fd != -1) + close(listen_fd); +} + +void test_sk_storage_tracing(void) +{ + struct 
test_sk_storage_trace_itself *skel_itself; + int err; + + my_pid = getpid(); + + skel_itself = test_sk_storage_trace_itself__open_and_load(); + + if (!ASSERT_NULL(skel_itself, "test_sk_storage_trace_itself")) { + test_sk_storage_trace_itself__destroy(skel_itself); + return; + } + + skel = test_sk_storage_tracing__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_sk_storage_tracing")) + return; + + err = test_sk_storage_tracing__attach(skel); + if (!ASSERT_OK(err, "test_sk_storage_tracing__attach")) { + test_sk_storage_tracing__destroy(skel); + return; + } + + do_test(); + + test_sk_storage_tracing__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c b/tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c new file mode 100644 index 000000000000..59ef72d02a61 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_storage_trace_itself.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + +SEC("fentry/bpf_sk_storage_free") +int BPF_PROG(trace_bpf_sk_storage_free, struct sock *sk) +{ + int *value; + + value = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + + if (value) + *value = 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c new file mode 100644 index 000000000000..8e94e5c080aa --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include +#include + +struct sk_stg { + __u32 pid; + __u32 last_notclose_state; + char comm[16]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct sk_stg); +} sk_stg_map SEC(".maps"); + +/* Testing delete */ +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} del_sk_stg_map SEC(".maps"); + +char task_comm[16] = ""; + +SEC("tp_btf/inet_sock_set_state") +int BPF_PROG(trace_inet_sock_set_state, struct sock *sk, int oldstate, + int newstate) +{ + struct sk_stg *stg; + + if (newstate == BPF_TCP_CLOSE) + return 0; + + stg = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!stg) + return 0; + + stg->last_notclose_state = newstate; + + bpf_sk_storage_delete(&del_sk_stg_map, sk); + + return 0; +} + +static void set_task_info(struct sock *sk) +{ + struct task_struct *task; + struct sk_stg *stg; + + stg = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!stg) + return; + + stg->pid = bpf_get_current_pid_tgid(); + + task = (struct task_struct *)bpf_get_current_task(); + bpf_core_read_str(&stg->comm, sizeof(stg->comm), &task->comm); + bpf_core_read_str(&task_comm, sizeof(task_comm), &task->comm); +} + +SEC("fentry/inet_csk_listen_start") +int BPF_PROG(trace_inet_csk_listen_start, struct sock *sk, int backlog) +{ + set_task_info(sk); + + return 0; +} + +SEC("fentry/tcp_connect") +int BPF_PROG(trace_tcp_connect, struct sock *sk) +{ + set_task_info(sk); + + return 0; +} + +SEC("fexit/inet_csk_accept") +int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, 
int *err, bool kern, + struct sock *accepted_sk) +{ + set_task_info(accepted_sk); + + return 0; +} + +char _license[] SEC("license") = "GPL"; From 423f16108c9d832bd96059d5c882c8ef6d76eb96 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:29 +0000 Subject: [PATCH 60/66] bpf: Augment the set of sleepable LSM hooks Update the set of sleepable hooks with the ones that do not trigger a warning with might_fault() when exercised with the correct kernel config options enabled, i.e. DEBUG_ATOMIC_SLEEP=y LOCKDEP=y PROVE_LOCKING=y This means that a sleepable LSM eBPF program can be attached to these LSM hooks. A new helper method bpf_lsm_is_sleepable_hook is added and the set is maintained locally in bpf_lsm.c Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201113005930.541956-2-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 7 ++++ kernel/bpf/bpf_lsm.c | 81 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 16 +------- 3 files changed, 89 insertions(+), 15 deletions(-) diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 73226181b744..0d1c33ace398 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -27,6 +27,8 @@ extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +bool bpf_lsm_is_sleepable_hook(u32 btf_id); + static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { @@ -54,6 +56,11 @@ void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ +static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e92c51bebb47..aed74b853415 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -72,6 +73,86 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +/* The set of hooks which are called without pagefaults disabled and are allowed + * to "sleep" and thus can be used for sleeable BPF programs. 
+ */ +BTF_SET_START(sleepable_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf) +BTF_ID(func, bpf_lsm_bpf_map) +BTF_ID(func, bpf_lsm_bpf_map_alloc_security) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bprm_check_security) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +BTF_ID(func, bpf_lsm_bprm_committing_creds) +BTF_ID(func, bpf_lsm_bprm_creds_for_exec) +BTF_ID(func, bpf_lsm_bprm_creds_from_file) +BTF_ID(func, bpf_lsm_capget) +BTF_ID(func, bpf_lsm_capset) +BTF_ID(func, bpf_lsm_cred_prepare) +BTF_ID(func, bpf_lsm_file_ioctl) +BTF_ID(func, bpf_lsm_file_lock) +BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_receive) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_ID(func, bpf_lsm_inode_create) +BTF_ID(func, bpf_lsm_inode_free_security) +BTF_ID(func, bpf_lsm_inode_getattr) +BTF_ID(func, bpf_lsm_inode_getxattr) +BTF_ID(func, bpf_lsm_inode_mknod) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_rename) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_symlink) +BTF_ID(func, bpf_lsm_inode_unlink) +BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernfs_init_security) +BTF_ID(func, bpf_lsm_key_free) +BTF_ID(func, bpf_lsm_mmap_file) +BTF_ID(func, bpf_lsm_netlink_send) +BTF_ID(func, bpf_lsm_path_notify) +BTF_ID(func, bpf_lsm_release_secctx) +BTF_ID(func, bpf_lsm_sb_alloc_security) +BTF_ID(func, bpf_lsm_sb_eat_lsm_opts) +BTF_ID(func, bpf_lsm_sb_kern_mount) +BTF_ID(func, bpf_lsm_sb_mount) +BTF_ID(func, bpf_lsm_sb_remount) +BTF_ID(func, bpf_lsm_sb_set_mnt_opts) +BTF_ID(func, bpf_lsm_sb_show_options) +BTF_ID(func, bpf_lsm_sb_statfs) +BTF_ID(func, bpf_lsm_sb_umount) +BTF_ID(func, bpf_lsm_settime) +BTF_ID(func, bpf_lsm_socket_accept) +BTF_ID(func, bpf_lsm_socket_bind) +BTF_ID(func, bpf_lsm_socket_connect) +BTF_ID(func, bpf_lsm_socket_create) +BTF_ID(func, bpf_lsm_socket_getpeername) +BTF_ID(func, bpf_lsm_socket_getpeersec_dgram) +BTF_ID(func, bpf_lsm_socket_getsockname) +BTF_ID(func, bpf_lsm_socket_getsockopt) +BTF_ID(func, bpf_lsm_socket_listen) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_recvmsg) +BTF_ID(func, bpf_lsm_socket_sendmsg) +BTF_ID(func, bpf_lsm_socket_shutdown) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_ID(func, bpf_lsm_syslog) +BTF_ID(func, bpf_lsm_task_alloc) +BTF_ID(func, bpf_lsm_task_getsecid) +BTF_ID(func, bpf_lsm_task_prctl) +BTF_ID(func, bpf_lsm_task_setscheduler) +BTF_ID(func, bpf_lsm_task_to_inode) +BTF_SET_END(sleepable_lsm_hooks) + +bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b1f85aa9741..fb2943ea715d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11562,20 +11562,6 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) return -EINVAL; } -/* non exhaustive list of sleepable bpf_lsm_*() functions */ -BTF_SET_START(btf_sleepable_lsm_hooks) -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_lsm_bprm_committed_creds) -#else -BTF_ID_UNUSED -#endif -BTF_SET_END(btf_sleepable_lsm_hooks) - -static int check_sleepable_lsm_hook(u32 btf_id) -{ - return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); -} - /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION 
list */ @@ -11797,7 +11783,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, /* LSM progs check that they are attached to bpf_lsm_*() funcs. * Only some of them are sleepable. */ - if (check_sleepable_lsm_hook(btf_id)) + if (bpf_lsm_is_sleepable_hook(btf_id)) ret = 0; break; default: From 6f100640ca5b2a2ff67b001c9fd3de21f7b12cf2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:30 +0000 Subject: [PATCH 61/66] bpf: Expose bpf_d_path helper to sleepable LSM hooks Sleepable hooks are never called from an NMI/interrupt context, so it is safe to use the bpf_d_path helper in LSM programs attaching to these hooks. The helper is not restricted to sleepable programs and merely uses the list of sleepable hooks as the initial subset of LSM hooks where it can be used. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201113005930.541956-3-kpsingh@chromium.org --- kernel/trace/bpf_trace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cfce60ad1cb5..02986c7b90eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include @@ -1179,7 +1181,11 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { - return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); + if (prog->type == BPF_PROG_TYPE_LSM) + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); + + return btf_id_set_contains(&btf_allowlist_d_path, + prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) From 8965398713d831f6b893805880c249e62e9059ae Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 13 Nov 2020 12:48:28 +0100 Subject: [PATCH 62/66] net: xdp: Introduce bulking for xdp tx return path XDP bulk APIs introduce a defer/flush mechanism to return pages belonging to the same xdp_mem_allocator object (identified via the mem.id field) in bulk to optimize I-cache and D-cache since xdp_return_frame is usually run inside the driver NAPI tx completion loop. The bulk queue size is set to 16 to be aligned to how XDP_REDIRECT bulking works. The bulk is flushed when it is full or when mem.id changes. xdp_frame_bulk is usually stored/allocated on the function call-stack to avoid locking penalties. Current implementation considers only page_pool memory model. 
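For illustration, the expected usage from a driver tx completion path is
roughly the following sketch (the driver ring structure and function are
hypothetical; only the xdp_frame_bulk API calls come from this patch):

  #include <linux/rcupdate.h>
  #include <net/xdp.h>

  /* hypothetical driver tx ring entry holding an xdp_frame */
  struct drv_tx_buf {
          struct xdp_frame *xdpf;
  };

  static void drv_xdp_tx_complete(struct drv_tx_buf *ring, int num_completed)
  {
          struct xdp_frame_bulk bq;
          int i;

          xdp_frame_bulk_init(&bq);

          rcu_read_lock(); /* needed for xdp_return_frame_bulk() */

          for (i = 0; i < num_completed; i++)
                  xdp_return_frame_bulk(ring[i].xdpf, &bq);

          /* flush frames still queued in the bulk before leaving */
          xdp_flush_frame_bulk(&bq);

          rcu_read_unlock();
  }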
Suggested-by: Jesper Dangaard Brouer Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Ilias Apalodimas Link: https://lore.kernel.org/bpf/e190c03eac71b20c8407ae0fc2c399eda7835f49.1605267335.git.lorenzo@kernel.org --- include/net/xdp.h | 17 +++++++++++++- net/core/xdp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 3814fb631d52..7d48b2ae217a 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,18 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +#define XDP_BULK_QUEUE_SIZE 16 +struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; +}; + +static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +{ + /* bq->count will be zero'ed when bq->xa gets updated */ + bq->xa = NULL; +} static inline struct skb_shared_info * xdp_get_shared_info_from_frame(struct xdp_frame *frame) @@ -194,6 +206,9 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq); /* When sending xdp_frame into the network stack, then there is no * return point callback, which is needed to release e.g. DMA-mapping @@ -245,6 +260,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); -#define DEV_MAP_BULK_SIZE 16 +#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE #endif /* __LINUX_NET_XDP_H__ */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 48aba933a5a8..bbaee7fdd44f 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -380,6 +380,65 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. 
+ */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + int i; + + if (unlikely(!xa)) + return; + + for (i = 0; i < bq->count; i++) { + struct page *page = virt_to_head_page(bq->q[i]); + + page_pool_put_full_page(xa->page_pool, page, false); + } + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + __xdp_return(xdpf->data, &xdpf->mem, false); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + void xdp_return_buff(struct xdp_buff *xdp) { __xdp_return(xdp->data, &xdp->rxq->mem, true); From 7886244736a4dbb49987f330772842130493e050 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 13 Nov 2020 12:48:29 +0100 Subject: [PATCH 63/66] net: page_pool: Add bulk support for ptr_ring Introduce the capability to batch page_pool ptr_ring refill since it is usually run inside the driver NAPI tx completion loop. Suggested-by: Jesper Dangaard Brouer Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Ilias Apalodimas Link: https://lore.kernel.org/bpf/08dd249c9522c001313f520796faa777c4089e1c.1605267335.git.lorenzo@kernel.org --- include/net/page_pool.h | 26 +++++++++++++++ net/core/page_pool.c | 70 +++++++++++++++++++++++++++++++++++------ net/core/xdp.c | 9 ++---- 3 files changed, 88 insertions(+), 17 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 81d7773f96cd..b5b195305346 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -152,6 +152,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); void page_pool_release_page(struct page_pool *pool, struct page *page); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -165,6 +167,11 @@ static inline void page_pool_release_page(struct page_pool *pool, struct page *page) { } + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} #endif void page_pool_put_page(struct page_pool *pool, struct page *page, @@ -215,4 +222,23 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } + +static inline void page_pool_ring_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); +} + +static inline void page_pool_ring_unlock(struct page_pool *pool) + __releases(&pool->ring.producer_lock) +{ + if 
(in_serving_softirq()) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ef98372facf6..f3c690b8c8e3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include #include +#include + #include #include #include @@ -362,8 +364,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -379,15 +382,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -405,9 +405,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; diff --git a/net/core/xdp.c b/net/core/xdp.c index bbaee7fdd44f..3d330ebda893 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -393,16 +393,11 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { struct xdp_mem_allocator *xa = bq->xa; - 
int i; - if (unlikely(!xa)) + if (unlikely(!xa || !bq->count)) return; - for (i = 0; i < bq->count; i++) { - struct page *page = virt_to_head_page(bq->q[i]); - - page_pool_put_full_page(xa->page_pool, page, false); - } + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ bq->count = 0; } From 2f9d09394d138be99050ad9eabe4d3ff13f79da4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 13 Nov 2020 12:48:30 +0100 Subject: [PATCH 64/66] net: mvneta: Add xdp tx return bulking support Convert mvneta driver to xdp_return_frame_bulk APIs. XDP_REDIRECT (upstream codepath): 275Kpps XDP_REDIRECT (upstream codepath + bulking APIs): 284Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/9af8014006d022fc0fec78cdaa71beb56999750d.1605267335.git.lorenzo@kernel.org --- drivers/net/ethernet/marvell/mvneta.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 54b0bf574c05..183530ed4d1d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1834,8 +1834,13 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp, struct netdev_queue *nq, bool napi) { unsigned int bytes_compl = 0, pkts_compl = 0; + struct xdp_frame_bulk bq; int i; + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + for (i = 0; i < num; i++) { struct mvneta_tx_buf *buf = &txq->buf[txq->txq_get_index]; struct mvneta_tx_desc *tx_desc = txq->descs + @@ -1857,9 +1862,12 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp, if (napi && buf->type == MVNETA_TYPE_XDP_TX) xdp_return_frame_rx_napi(buf->xdpf); else - xdp_return_frame(buf->xdpf); + xdp_return_frame_bulk(buf->xdpf, &bq); } } + xdp_flush_frame_bulk(&bq); + + rcu_read_unlock(); netdev_tx_completed_queue(nq, pkts_compl, bytes_compl); } From dbef19ccde5d83dd16646532f091b54a67c73cd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 13 Nov 2020 12:48:31 +0100 Subject: [PATCH 65/66] net: mvpp2: Add xdp tx return bulking support Convert mvpp2 driver to xdp_return_frame_bulk APIs. 
XDP_REDIRECT (upstream codepath): 1.79Mpps XDP_REDIRECT (upstream codepath + bulking APIs): 1.93Mpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Tested-by: Matteo Croce Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/0b38c295e58e8ce251ef6b4e2187a2f457f9f7a3.1605267335.git.lorenzo@kernel.org --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index f6616c8933ca..3069e192d773 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -2440,8 +2440,13 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, struct mvpp2_tx_queue *txq, struct mvpp2_txq_pcpu *txq_pcpu, int num) { + struct xdp_frame_bulk bq; int i; + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + for (i = 0; i < num; i++) { struct mvpp2_txq_pcpu_buf *tx_buf = txq_pcpu->buffs + txq_pcpu->txq_get_index; @@ -2454,10 +2459,13 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, dev_kfree_skb_any(tx_buf->skb); else if (tx_buf->type == MVPP2_TYPE_XDP_TX || tx_buf->type == MVPP2_TYPE_XDP_NDO) - xdp_return_frame(tx_buf->xdpf); + xdp_return_frame_bulk(tx_buf->xdpf, &bq); mvpp2_txq_inc_get(txq_pcpu); } + xdp_flush_frame_bulk(&bq); + + rcu_read_unlock(); } static inline struct mvpp2_rx_queue *mvpp2_get_rx_queue(struct mvpp2_port *port, From b87c57ae12dbecd50471b437e09e3f7dc916d8bc Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 13 Nov 2020 12:48:32 +0100 Subject: [PATCH 66/66] net: mlx5: Add xdp tx return bulking support Convert mlx5 driver to xdp_return_frame_bulk APIs. 
XDP_REDIRECT (upstream codepath): 8.9Mpps XDP_REDIRECT (upstream codepath + bulking APIs): 10.2Mpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Tested-by: Jesper Dangaard Brouer Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/250460319fd868b7b5668fc1deca74dd42813a90.1605267335.git.lorenzo@kernel.org --- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index ae90d533a350..2e3e78b0f333 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -366,7 +366,8 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_wqe_info *wi, u32 *xsk_frames, - bool recycle) + bool recycle, + struct xdp_frame_bulk *bq) { struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; u16 i; @@ -379,7 +380,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, /* XDP_TX from the XSK RQ and XDP_REDIRECT */ dma_unmap_single(sq->pdev, xdpi.frame.dma_addr, xdpi.frame.xdpf->len, DMA_TO_DEVICE); - xdp_return_frame(xdpi.frame.xdpf); + xdp_return_frame_bulk(xdpi.frame.xdpf, bq); break; case MLX5E_XDP_XMIT_MODE_PAGE: /* XDP_TX from the regular RQ */ @@ -397,12 +398,15 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) { + struct xdp_frame_bulk bq; struct mlx5e_xdpsq *sq; struct mlx5_cqe64 *cqe; u32 xsk_frames = 0; u16 sqcc; int i; + xdp_frame_bulk_init(&bq); + sq = container_of(cq, struct mlx5e_xdpsq, cq); if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) @@ -434,7 +438,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) sqcc += wi->num_wqebbs; - mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true); + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true, &bq); } while (!last_wqe); if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { @@ -447,6 +451,8 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) } } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + xdp_flush_frame_bulk(&bq); + if (xsk_frames) xsk_tx_completed(sq->xsk_pool, xsk_frames); @@ -463,8 +469,13 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) { + struct xdp_frame_bulk bq; u32 xsk_frames = 0; + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + while (sq->cc != sq->pc) { struct mlx5e_xdp_wqe_info *wi; u16 ci; @@ -474,9 +485,12 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) sq->cc += wi->num_wqebbs; - mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false); + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false, &bq); } + xdp_flush_frame_bulk(&bq); + rcu_read_unlock(); + if (xsk_frames) xsk_tx_completed(sq->xsk_pool, xsk_frames); }