From 7c88f2bf7840c0ea67ac2de2e293a653f62ea134 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jul 2017 07:05:28 +0200 Subject: [PATCH 0001/1322] tile: defconfig: Cleanup from old Kconfig options Remove old, dead Kconfig options (in order appearing in this commit): - CRYPTO_ZLIB: commit 110492183c4b ("crypto: compress - remove unused pcomp interface"); - IP_NF_TARGET_ULOG: commit d4da843e6fad ("netfilter: kill remnants of ulog targets"); Signed-off-by: Krzysztof Kozlowski Signed-off-by: Chris Metcalf --- arch/tile/configs/tilegx_defconfig | 1 - arch/tile/configs/tilepro_defconfig | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig index 0d925fa0f0c1..9f94435cc44f 100644 --- a/arch/tile/configs/tilegx_defconfig +++ b/arch/tile/configs/tilegx_defconfig @@ -409,5 +409,4 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=m CONFIG_CRYPTO_LZO=m diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig index 149d8e8eacb8..1c5bd4f8ffca 100644 --- a/arch/tile/configs/tilepro_defconfig +++ b/arch/tile/configs/tilepro_defconfig @@ -189,7 +189,6 @@ CONFIG_IP_NF_MATCH_ECN=m CONFIG_IP_NF_MATCH_TTL=m CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_ULOG=m CONFIG_IP_NF_MANGLE=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m @@ -521,7 +520,6 @@ CONFIG_CRYPTO_SEED=m CONFIG_CRYPTO_SERPENT=m CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_ZLIB=m CONFIG_CRYPTO_LZO=m CONFIG_CRC_CCITT=m CONFIG_CRC7=m From 637f23abca87d26e091e0d6647ec878d97d2c6cd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Jul 2017 10:33:02 +0300 Subject: [PATCH 0002/1322] tile: array underflow in setup_maxnodemem() My static checker correctly complains that we should have a lower bound on "node" to prevent an array underflow. Fixes: 867e359b97c9 ("arch/tile: core support for Tilera 32-bit chips.") Signed-off-by: Dan Carpenter Signed-off-by: Chris Metcalf --- arch/tile/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 443a70bccc1c..b1474e7d9afb 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -140,7 +140,7 @@ static int __init setup_maxnodemem(char *str) { char *endp; unsigned long long maxnodemem; - long node; + unsigned long node; node = str ? simple_strtoul(str, &endp, 0) : INT_MAX; if (node >= MAX_NUMNODES || *endp != ':') From 472b46c352c9ff0b6fa57dbf85d77c51901a3368 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 6 Aug 2017 18:44:27 +0200 Subject: [PATCH 0003/1322] uapi linux/kfd_ioctl.h: only use __u32 and __u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the kernel's __u32 and __u64 types, which are available both on Linux and on non-Linux platforms, instead of the C library's uint32_t and uint64_t types.
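As a rough illustration of the convention this change enforces (a sketch only; the struct name below is hypothetical and not taken from kfd_ioctl.h), a uapi ioctl argument block sticks to the fixed-width types that <linux/types.h> provides, so userspace builds do not depend on C library typedefs:

  /* Hypothetical example, not part of kfd_ioctl.h: a uapi-style
   * ioctl argument struct built only from kernel fixed-width types.
   */
  #include <linux/types.h>

  struct example_ioctl_args {
  	__u64 ring_base_address;	/* to kernel */
  	__u32 ring_size;		/* to kernel */
  	__u32 pad;			/* explicit padding keeps the layout stable */
  };

Because __u32 and __u64 mean the same thing to the kernel and to every userspace toolchain, the structure layout cannot drift with whichever C library the application is built against.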
Fixes user space compilation errors like: linux/kfd_ioctl.h:33:2: error: unknown type name ‘uint32_t’ uint32_t major_version; /* from KFD */ ^~~~~~~~ Signed-off-by: Mikko Rapeli Acked-by: Arnd Bergmann Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 168 ++++++++++++++++----------------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7b4567bacfc2..26283fefdf5f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -23,15 +23,15 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include #define KFD_IOCTL_MAJOR_VERSION 1 #define KFD_IOCTL_MINOR_VERSION 1 struct kfd_ioctl_get_version_args { - uint32_t major_version; /* from KFD */ - uint32_t minor_version; /* from KFD */ + __u32 major_version; /* from KFD */ + __u32 minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -43,36 +43,36 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - uint64_t ring_base_address; /* to KFD */ - uint64_t write_pointer_address; /* from KFD */ - uint64_t read_pointer_address; /* from KFD */ - uint64_t doorbell_offset; /* from KFD */ + __u64 ring_base_address; /* to KFD */ + __u64 write_pointer_address; /* from KFD */ + __u64 read_pointer_address; /* from KFD */ + __u64 doorbell_offset; /* from KFD */ - uint32_t ring_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t queue_type; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ - uint32_t queue_id; /* from KFD */ + __u32 ring_size; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 queue_type; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ + __u32 queue_id; /* from KFD */ - uint64_t eop_buffer_address; /* to KFD */ - uint64_t eop_buffer_size; /* to KFD */ - uint64_t ctx_save_restore_address; /* to KFD */ - uint64_t ctx_save_restore_size; /* to KFD */ + __u64 eop_buffer_address; /* to KFD */ + __u64 eop_buffer_size; /* to KFD */ + __u64 ctx_save_restore_address; /* to KFD */ + __u64 ctx_save_restore_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - uint32_t queue_id; /* to KFD */ - uint32_t pad; + __u32 queue_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_update_queue_args { - uint64_t ring_base_address; /* to KFD */ + __u64 ring_base_address; /* to KFD */ - uint32_t queue_id; /* to KFD */ - uint32_t ring_size; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 ring_size; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -80,13 +80,13 @@ struct kfd_ioctl_update_queue_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - uint64_t alternate_aperture_base; /* to KFD */ - uint64_t alternate_aperture_size; /* to KFD */ + __u64 alternate_aperture_base; /* to KFD */ + __u64 alternate_aperture_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t default_policy; /* to KFD */ - uint32_t alternate_policy; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 default_policy; /* to KFD */ + __u32 alternate_policy; /* to KFD */ + __u32 pad; }; /* @@ -97,26 +97,26 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - 
uint64_t gpu_clock_counter; /* from KFD */ - uint64_t cpu_clock_counter; /* from KFD */ - uint64_t system_clock_counter; /* from KFD */ - uint64_t system_clock_freq; /* from KFD */ + __u64 gpu_clock_counter; /* from KFD */ + __u64 cpu_clock_counter; /* from KFD */ + __u64 system_clock_counter; /* from KFD */ + __u64 system_clock_freq; /* from KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; #define NUM_OF_SUPPORTED_GPUS 7 struct kfd_process_device_apertures { - uint64_t lds_base; /* from KFD */ - uint64_t lds_limit; /* from KFD */ - uint64_t scratch_base; /* from KFD */ - uint64_t scratch_limit; /* from KFD */ - uint64_t gpuvm_base; /* from KFD */ - uint64_t gpuvm_limit; /* from KFD */ - uint32_t gpu_id; /* from KFD */ - uint32_t pad; + __u64 lds_base; /* from KFD */ + __u64 lds_limit; /* from KFD */ + __u64 scratch_base; /* from KFD */ + __u64 scratch_limit; /* from KFD */ + __u64 gpuvm_base; /* from KFD */ + __u64 gpuvm_limit; /* from KFD */ + __u32 gpu_id; /* from KFD */ + __u32 pad; }; struct kfd_ioctl_get_process_apertures_args { @@ -124,8 +124,8 @@ struct kfd_ioctl_get_process_apertures_args { process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - uint32_t num_of_nodes; - uint32_t pad; + __u32 num_of_nodes; + __u32 pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -133,25 +133,25 @@ struct kfd_ioctl_get_process_apertures_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_unregister_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_address_watch_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; /* Matching HSA_EVENTTYPE */ @@ -172,44 +172,44 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_SIGNAL_EVENT_LIMIT 256 struct kfd_ioctl_create_event_args { - uint64_t event_page_offset; /* from KFD */ - uint32_t event_trigger_data; /* from KFD - signal events only */ - uint32_t event_type; /* to KFD */ - uint32_t auto_reset; /* to KFD */ - uint32_t node_id; /* to KFD - only valid for certain + __u64 event_page_offset; /* from KFD */ + __u32 event_trigger_data; /* from KFD - signal events only */ + __u32 event_type; /* to KFD */ + __u32 auto_reset; /* to KFD */ + __u32 node_id; /* to KFD - only valid for certain event types */ - uint32_t event_id; /* from KFD */ - uint32_t event_slot_index; /* from KFD */ + __u32 event_id; /* from KFD */ + __u32 event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_set_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct 
kfd_ioctl_reset_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_memory_exception_failure { - uint32_t NotPresent; /* Page not present or supervisor privilege */ - uint32_t ReadOnly; /* Write access to a read-only page */ - uint32_t NoExecute; /* Execute access to a page marked NX */ - uint32_t pad; + __u32 NotPresent; /* Page not present or supervisor privilege */ + __u32 ReadOnly; /* Write access to a read-only page */ + __u32 NoExecute; /* Execute access to a page marked NX */ + __u32 pad; }; /* memory exception data*/ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - uint64_t va; - uint32_t gpu_id; - uint32_t pad; + __u64 va; + __u32 gpu_id; + __u32 pad; }; /* Event data*/ @@ -217,19 +217,19 @@ struct kfd_event_data { union { struct kfd_hsa_memory_exception_data memory_exception_data; }; /* From KFD */ - uint64_t kfd_event_data_ext; /* pointer to an extension structure + __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_wait_events_args { - uint64_t events_ptr; /* pointed to struct + __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - uint32_t num_events; /* to KFD */ - uint32_t wait_for_all; /* to KFD */ - uint32_t timeout; /* to KFD */ - uint32_t wait_result; /* from KFD */ + __u32 num_events; /* to KFD */ + __u32 wait_for_all; /* to KFD */ + __u32 timeout; /* to KFD */ + __u32 wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { From a33b2d0359a0aae25d5ac7a26b85a5682485ebbb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Aug 2016 17:46:23 -0700 Subject: [PATCH 0004/1322] selftests/seccomp: Add tests for basic ptrace actions This adds tests for using only ptrace to perform syscall changes, just to validate matching behavior between seccomp events and ptrace events. Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 41 +++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 73f5ea6778ce..e61b963f011b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1262,6 +1262,13 @@ TEST_F(TRACE_poke, getpid_runs_normally) # error "Do not know how to find your architecture's registers and syscalls" #endif +/* When the syscall return can't be changed, stub out the tests for it. */ +#ifdef SYSCALL_NUM_RET_SHARE_REG +# define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(-1, action) +#else +# define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(val, action) +#endif + /* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux). 
*/ @@ -1357,7 +1364,7 @@ void change_syscall(struct __test_metadata *_metadata, #ifdef SYSCALL_NUM_RET_SHARE_REG TH_LOG("Can't modify syscall return on this architecture"); #else - regs.SYSCALL_RET = 1; + regs.SYSCALL_RET = EPERM; #endif #ifdef HAVE_GETREGS @@ -1426,6 +1433,8 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, if (nr == __NR_getpid) change_syscall(_metadata, tracee, __NR_getppid); + if (nr == __NR_open) + change_syscall(_metadata, tracee, -1); } FIXTURE_DATA(TRACE_syscall) { @@ -1480,6 +1489,28 @@ FIXTURE_TEARDOWN(TRACE_syscall) free(self->prog.filter); } +TEST_F(TRACE_syscall, ptrace_syscall_redirected) +{ + /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ + teardown_trace_fixture(_metadata, self->tracer); + self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, + true); + + /* Tracer will redirect getpid to getppid. */ + EXPECT_NE(self->mypid, syscall(__NR_getpid)); +} + +TEST_F(TRACE_syscall, ptrace_syscall_dropped) +{ + /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ + teardown_trace_fixture(_metadata, self->tracer); + self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, + true); + + /* Tracer should skip the open syscall, resulting in EPERM. */ + EXPECT_SYSCALL_RETURN(EPERM, syscall(__NR_open)); +} + TEST_F(TRACE_syscall, syscall_allowed) { long ret; @@ -1520,13 +1551,8 @@ TEST_F(TRACE_syscall, syscall_dropped) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); ASSERT_EQ(0, ret); -#ifdef SYSCALL_NUM_RET_SHARE_REG - /* gettid has been skipped */ - EXPECT_EQ(-1, syscall(__NR_gettid)); -#else /* gettid has been skipped and an altered return value stored. */ - EXPECT_EQ(1, syscall(__NR_gettid)); -#endif + EXPECT_SYSCALL_RETURN(EPERM, syscall(__NR_gettid)); EXPECT_NE(self->mytid, syscall(__NR_gettid)); } @@ -1557,6 +1583,7 @@ TEST_F(TRACE_syscall, skip_after_RET_TRACE) ASSERT_EQ(0, ret); /* Tracer will redirect getpid to getppid, and we should see EPERM. */ + errno = 0; EXPECT_EQ(-1, syscall(__NR_getpid)); EXPECT_EQ(EPERM, errno); } From 967d7ba8415139107033770a40c7ec7dd17fbcb1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Dec 2015 11:39:36 -0800 Subject: [PATCH 0005/1322] selftests/seccomp: Add simple seccomp overhead benchmark This attempts to produce a comparison between native getpid() and a RET_ALLOW-filtered getpid(), to measure the overhead cost of using seccomp(). 
Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/Makefile | 18 +++- .../selftests/seccomp/seccomp_benchmark.c | 99 +++++++++++++++++++ 2 files changed, 112 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/seccomp/seccomp_benchmark.c diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile index aeb0c805f3ca..553d870b4ca9 100644 --- a/tools/testing/selftests/seccomp/Makefile +++ b/tools/testing/selftests/seccomp/Makefile @@ -1,8 +1,16 @@ -TEST_GEN_PROGS := seccomp_bpf -CFLAGS += -Wl,-no-as-needed -Wall -LDFLAGS += -lpthread +all: include ../lib.mk -$(TEST_GEN_PROGS): seccomp_bpf.c ../kselftest_harness.h - $(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ +.PHONY: all clean + +BINARIES := seccomp_bpf seccomp_benchmark +CFLAGS += -Wl,-no-as-needed -Wall + +seccomp_bpf: seccomp_bpf.c ../kselftest_harness.h + $(CC) $(CFLAGS) $(LDFLAGS) -lpthread $< -o $@ + +TEST_PROGS += $(BINARIES) +EXTRA_CLEAN := $(BINARIES) + +all: $(BINARIES) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c new file mode 100644 index 000000000000..5838c8697ec3 --- /dev/null +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -0,0 +1,99 @@ +/* + * Strictly speaking, this is not a test. But it can report during test + * runs so relative performace can be measured. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +unsigned long long timing(clockid_t clk_id, unsigned long long samples) +{ + pid_t pid, ret; + unsigned long long i; + struct timespec start, finish; + + pid = getpid(); + assert(clock_gettime(clk_id, &start) == 0); + for (i = 0; i < samples; i++) { + ret = syscall(__NR_getpid); + assert(pid == ret); + } + assert(clock_gettime(clk_id, &finish) == 0); + + i = finish.tv_sec - start.tv_sec; + i *= 1000000000; + i += finish.tv_nsec - start.tv_nsec; + + printf("%lu.%09lu - %lu.%09lu = %llu\n", + finish.tv_sec, finish.tv_nsec, + start.tv_sec, start.tv_nsec, + i); + + return i; +} + +unsigned long long calibrate(void) +{ + unsigned long long i; + + printf("Calibrating reasonable sample size...\n"); + + for (i = 5; ; i++) { + unsigned long long samples = 1 << i; + + /* Find something that takes more than 5 seconds to run. 
*/ + if (timing(CLOCK_REALTIME, samples) / 1000000000ULL > 5) + return samples; + } +} + +int main(int argc, char *argv[]) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + unsigned long long samples; + unsigned long long native, filtered; + + if (argc > 1) + samples = strtoull(argv[1], NULL, 0); + else + samples = calibrate(); + + printf("Benchmarking %llu samples...\n", samples); + + native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid native: %llu ns\n", native); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + assert(ret == 0); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + assert(ret == 0); + + filtered = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW: %llu ns\n", filtered); + + printf("Estimated seccomp overhead per syscall: %llu ns\n", + filtered - native); + + if (filtered == native) + printf("Trying running again with more samples.\n"); + + return 0; +} From f3f6e30669c048f47d51ea59df9946a91f551c4c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Aug 2017 14:08:39 -0700 Subject: [PATCH 0006/1322] selftests/seccomp: Refactor RET_ERRNO tests This refactors the errno tests (since they all use the same pattern for their filter) and adds a RET_DATA field ordering test. Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- tools/testing/selftests/seccomp/seccomp_bpf.c | 95 +++++++++++-------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e61b963f011b..2fb49d99588d 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -136,7 +136,7 @@ TEST(no_new_privs_support) } } -/* Tests kernel support by checking for a copy_from_user() fault on * NULL. */ +/* Tests kernel support by checking for a copy_from_user() fault on NULL. */ TEST(mode_filter_support) { long ret; @@ -541,26 +541,30 @@ TEST(arg_out_of_range) EXPECT_EQ(EINVAL, errno); } +#define ERRNO_FILTER(name, errno) \ + struct sock_filter _read_filter_##name[] = { \ + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, \ + offsetof(struct seccomp_data, nr)), \ + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), \ + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno), \ + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), \ + }; \ + struct sock_fprog prog_##name = { \ + .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \ + .filter = _read_filter_##name, \ + } + +/* Make sure basic errno values are correctly passed through a filter. */ TEST(ERRNO_valid) { - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; + ERRNO_FILTER(valid, E2BIG); long ret; pid_t parent = getppid(); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid); ASSERT_EQ(0, ret); EXPECT_EQ(parent, syscall(__NR_getppid)); @@ -568,26 +572,17 @@ TEST(ERRNO_valid) EXPECT_EQ(E2BIG, errno); } +/* Make sure an errno of zero is correctly handled by the arch code. 
*/ TEST(ERRNO_zero) { - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; + ERRNO_FILTER(zero, 0); long ret; pid_t parent = getppid(); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero); ASSERT_EQ(0, ret); EXPECT_EQ(parent, syscall(__NR_getppid)); @@ -595,26 +590,21 @@ TEST(ERRNO_zero) EXPECT_EQ(0, read(0, NULL, 0)); } +/* + * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller. + * This tests that the errno value gets capped correctly, fixed by + * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO"). + */ TEST(ERRNO_capped) { - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; + ERRNO_FILTER(capped, 4096); long ret; pid_t parent = getppid(); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped); ASSERT_EQ(0, ret); EXPECT_EQ(parent, syscall(__NR_getppid)); @@ -622,6 +612,37 @@ TEST(ERRNO_capped) EXPECT_EQ(4095, errno); } +/* + * Filters are processed in reverse order: last applied is executed first. + * Since only the SECCOMP_RET_ACTION mask is tested for return values, the + * SECCOMP_RET_DATA mask results will follow the most recently applied + * matching filter return (and not the lowest or highest value). + */ +TEST(ERRNO_order) +{ + ERRNO_FILTER(first, 11); + ERRNO_FILTER(second, 13); + ERRNO_FILTER(third, 12); + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(-1, read(0, NULL, 0)); + EXPECT_EQ(12, errno); +} + FIXTURE_DATA(TRAP) { struct sock_fprog prog; }; From deb4de8b31bc5bf21efb6ac31150a01a631cd647 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Aug 2017 15:00:40 -0700 Subject: [PATCH 0007/1322] seccomp: Provide matching filter for introspection Both the upcoming logging improvements and changes to RET_KILL will need to know which filter a given seccomp return value originated from. In order to delay logic processing of result until after the seccomp loop, this adds a single pointer assignment on matches. This will allow both log and RET_KILL logic to work off the filter rather than doing more expensive tests inside the time-critical run_filters loop. Running tight cycles of getpid() with filters attached shows no measurable difference in speed. 
Suggested-by: Tyler Hicks Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- kernel/seccomp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 98b59b5db90b..1f3347fc2605 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -171,10 +171,14 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters + * @match: stores struct seccomp_filter that resulted in the return value, + * unless filter returned SECCOMP_RET_ALLOW, in which case it will + * be unchanged. * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(const struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd, + struct seccomp_filter **match) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -198,8 +202,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { ret = cur_ret; + *match = f; + } } return ret; } @@ -566,6 +572,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_filter *match = NULL; int data; /* @@ -574,7 +581,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ rmb(); - filter_ret = seccomp_run_filters(sd); + filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION; @@ -638,6 +645,11 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_ALLOW: + /* + * Note that the "match" filter will always be NULL for + * this action since SECCOMP_RET_ALLOW is the starting + * state in seccomp_run_filters(). + */ return 0; case SECCOMP_RET_KILL: From 8e5f1ad116df6b0de65eac458d5e7c318d1c05af Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:52 +0000 Subject: [PATCH 0008/1322] seccomp: Sysctl to display available actions This patch creates a read-only sysctl containing an ordered list of seccomp actions that the kernel supports. The ordering, from left to right, is the lowest action value (kill) to the highest action value (allow). Currently, a read of the sysctl file would return "kill trap errno trace allow". The contents of this sysctl file can be useful for userspace code as well as the system administrator. The path to the sysctl is: /proc/sys/kernel/seccomp/actions_avail libseccomp and other userspace code can easily determine which actions the current kernel supports. The set of actions supported by the current kernel may be different than the set of action macros found in kernel headers that were installed where the userspace code was built. In addition, this sysctl will allow system administrators to know which actions are supported by the kernel and make it easier to configure exactly what seccomp logs through the audit subsystem. Support for this level of logging configuration will come in a future patch. 
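As a minimal sketch of the intended userspace usage (illustrative only, not part of this patch), a program can read the sysctl and check whether the running kernel reports a given action name:

  /* Illustrative sketch: check whether the running kernel lists a
   * seccomp action name in /proc/sys/kernel/seccomp/actions_avail.
   */
  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  static bool seccomp_action_avail(const char *name)
  {
  	char buf[256], *tok, *save;
  	FILE *f = fopen("/proc/sys/kernel/seccomp/actions_avail", "r");

  	if (!f)
  		return false;		/* sysctl missing: treat as unsupported */
  	if (!fgets(buf, sizeof(buf), f)) {
  		fclose(f);
  		return false;
  	}
  	fclose(f);

  	buf[strcspn(buf, "\n")] = '\0';
  	for (tok = strtok_r(buf, " ", &save); tok;
  	     tok = strtok_r(NULL, " ", &save))
  		if (!strcmp(tok, name))
  			return true;
  	return false;
  }

  int main(void)
  {
  	printf("trace supported: %d\n", seccomp_action_avail("trace"));
  	return 0;
  }

A process sandboxed away from /proc cannot use this file, which is what the SECCOMP_GET_ACTION_AVAIL operation added later in this series is for.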
Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- Documentation/sysctl/kernel.txt | 1 + .../userspace-api/seccomp_filter.rst | 16 ++++++ kernel/seccomp.c | 51 +++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index bac23c198360..995c42cf86ba 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -74,6 +74,7 @@ show up in /proc/sys/kernel: - reboot-cmd [ SPARC only ] - rtsig-max - rtsig-nr +- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst - sem - sem_next_id [ sysv ipc ] - sg-big-buff [ generic SCSI device (sg) ] diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index f71eb5ef1f2d..35fc7cbf1d95 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -169,7 +169,23 @@ The ``samples/seccomp/`` directory contains both an x86-specific example and a more generic example of a higher level macro interface for BPF program generation. +Sysctls +======= +Seccomp's sysctl files can be found in the ``/proc/sys/kernel/seccomp/`` +directory. Here's a description of each file in that directory: + +``actions_avail``: + A read-only ordered list of seccomp return values (refer to the + ``SECCOMP_RET_*`` macros above) in string form. The ordering, from + left-to-right, is the least permissive return value to the most + permissive return value. + + The list represents the set of seccomp return values supported + by the kernel. A userspace program may use this list to + determine whether the actions found in ``seccomp.h``, when the + program was built, differ from the set of actions actually + supported in the current running kernel.
Adding architecture support =========================== diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1f3347fc2605..5f19f41e4e50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include #include #include +#include #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include @@ -934,3 +936,52 @@ out: return ret; } #endif + +#ifdef CONFIG_SYSCTL + +/* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_TRAP_NAME "trap" +#define SECCOMP_RET_ERRNO_NAME "errno" +#define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_ALLOW_NAME "allow" + +static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_ALLOW_NAME; + +static struct ctl_path seccomp_sysctl_path[] = { + { .procname = "kernel", }, + { .procname = "seccomp", }, + { } +}; + +static struct ctl_table seccomp_sysctl_table[] = { + { + .procname = "actions_avail", + .data = (void *) &seccomp_actions_avail, + .maxlen = sizeof(seccomp_actions_avail), + .mode = 0444, + .proc_handler = proc_dostring, + }, + { } +}; + +static int __init seccomp_sysctl_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); + if (!hdr) + pr_warn("seccomp: sysctl registration failed\n"); + else + kmemleak_not_leak(hdr); + + return 0; +} + +device_initcall(seccomp_sysctl_init) + +#endif /* CONFIG_SYSCTL */ From d612b1fd8010d0d67b5287fe146b8b55bcbb8655 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:53 +0000 Subject: [PATCH 0009/1322] seccomp: Operation for checking if an action is available Userspace code that needs to check if the kernel supports a given action may not be able to use the /proc/sys/kernel/seccomp/actions_avail sysctl. The process may be running in a sandbox and, therefore, sufficient filesystem access may not be available. This patch adds an operation to the seccomp(2) syscall that allows userspace code to ask the kernel if a given action is available. If the action is supported by the kernel, 0 is returned. If the action is not supported by the kernel, -1 is returned with errno set to -EOPNOTSUPP. If this check is attempted on a kernel that doesn't support this new operation, -1 is returned with errno set to -EINVAL meaning that userspace code will have the ability to differentiate between the two error cases. Signed-off-by: Tyler Hicks Suggested-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 5 +-- kernel/seccomp.c | 26 ++++++++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 36 +++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0f238a43ff1e..aaad61cc46bc 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -11,8 +11,9 @@ #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ /* Valid operations for seccomp syscall. 
*/ -#define SECCOMP_SET_MODE_STRICT 0 -#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_SET_MODE_STRICT 0 +#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_GET_ACTION_AVAIL 2 /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f19f41e4e50..7a6089f66fed 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -808,6 +808,27 @@ static inline long seccomp_set_mode_filter(unsigned int flags, } #endif +static long seccomp_get_action_avail(const char __user *uaction) +{ + u32 action; + + if (copy_from_user(&action, uaction, sizeof(action))) + return -EFAULT; + + switch (action) { + case SECCOMP_RET_KILL: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + case SECCOMP_RET_ALLOW: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, const char __user *uargs) @@ -819,6 +840,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); + case SECCOMP_GET_ACTION_AVAIL: + if (flags != 0) + return -EINVAL; + + return seccomp_get_action_avail(uargs); default: return -EINVAL; } diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 2fb49d99588d..1f2888f6678b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1731,6 +1731,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) #define SECCOMP_SET_MODE_FILTER 1 #endif +#ifndef SECCOMP_GET_ACTION_AVAIL +#define SECCOMP_GET_ACTION_AVAIL 2 +#endif + #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif @@ -2469,6 +2473,38 @@ TEST(syscall_restart) _metadata->passed = 0; } +TEST(get_action_avail) +{ + __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, + SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, + SECCOMP_RET_ALLOW }; + __u32 unknown_action = 0x10000000U; + int i; + long ret; + + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + ASSERT_NE(EINVAL, errno) { + TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!"); + } + EXPECT_EQ(ret, 0); + + for (i = 0; i < ARRAY_SIZE(actions); i++) { + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]); + EXPECT_EQ(ret, 0) { + TH_LOG("Expected action (0x%X) not available!", + actions[i]); + } + } + + /* Check that an unknown action is handled properly (EOPNOTSUPP) */ + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, EOPNOTSUPP); +} + /* * TODO: * - add microbenchmarks From 0ddec0fc8900201c0897b87b762b7c420436662f Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:54 +0000 Subject: [PATCH 0010/1322] seccomp: Sysctl to configure actions that are allowed to be logged Adminstrators can write to this sysctl to set the seccomp actions that are allowed to be logged. Any actions not found in this sysctl will not be logged. For example, all SECCOMP_RET_KILL, SECCOMP_RET_TRAP, and SECCOMP_RET_ERRNO actions would be loggable if "kill trap errno" were written to the sysctl. SECCOMP_RET_TRACE actions would not be logged since its string representation ("trace") wasn't present in the sysctl value. 
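As a small sketch of that administrator-side usage (illustrative only, not part of this patch; the sysctl path is given just below), a privileged process could restrict logging to kill, trap, and errno actions like this:

  /* Illustrative sketch: limit seccomp audit logging to the kill,
   * trap, and errno actions. Requires CAP_SYS_ADMIN.
   */
  #include <stdio.h>

  int main(void)
  {
  	FILE *f = fopen("/proc/sys/kernel/seccomp/actions_logged", "w");

  	if (!f) {
  		perror("actions_logged");
  		return 1;
  	}
  	fputs("kill trap errno", f);
  	return fclose(f) ? 1 : 0;
  }

The equivalent root-shell one-liner is simply writing the string "kill trap errno" to that file.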
The path to the sysctl is: /proc/sys/kernel/seccomp/actions_logged The actions_avail sysctl can be read to discover the valid action names that can be written to the actions_logged sysctl with the exception of "allow". SECCOMP_RET_ALLOW actions cannot be configured for logging. The default setting for the sysctl is to allow all actions to be logged except SECCOMP_RET_ALLOW. While only SECCOMP_RET_KILL actions are currently logged, an upcoming patch will allow applications to request additional actions to be logged. There's one important exception to this sysctl. If a task is specifically being audited, meaning that an audit context has been allocated for the task, seccomp will log all actions other than SECCOMP_RET_ALLOW despite the value of actions_logged. This exception preserves the existing auditing behavior of tasks with an allocated audit context. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if audit_enabled && task-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- .../userspace-api/seccomp_filter.rst | 18 ++ include/linux/audit.h | 6 +- kernel/seccomp.c | 171 +++++++++++++++++- 3 files changed, 187 insertions(+), 8 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 35fc7cbf1d95..2d1d8ab04ac5 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -187,6 +187,24 @@ directory. Here's a description of each file in that directory: program was built, differs from the set of actions actually supported in the current running kernel. +``actions_logged``: + A read-write ordered list of seccomp return values (refer to the + ``SECCOMP_RET_*`` macros above) that are allowed to be logged. Writes + to the file do not need to be in ordered form but reads from the file + will be ordered in the same way as the actions_avail sysctl. + + It is important to note that the value of ``actions_logged`` does not + prevent certain actions from being logged when the audit subsystem is + configured to audit a task. If the action is not found in + ``actions_logged`` list, the final decision on whether to audit the + action for that task is ultimately left up to the audit subsystem to + decide for all seccomp return values other than ``SECCOMP_RET_ALLOW``. + + The ``allow`` string is not accepted in the ``actions_logged`` sysctl + as it is not possible to log ``SECCOMP_RET_ALLOW`` actions. Attempting + to write ``allow`` to the sysctl will result in an EINVAL being + returned. + Adding architecture support =========================== diff --git a/include/linux/audit.h b/include/linux/audit.h index 2150bdccfbab..8c30f06d639d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -314,11 +314,7 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (!audit_enabled) - return; - - /* Force a record to be reported if a signal was delivered. 
*/ - if (signr || unlikely(!audit_dummy_context())) + if (audit_enabled && unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7a6089f66fed..54357e361aea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -522,6 +522,45 @@ static void seccomp_send_sigsys(int syscall, int reason) } #endif /* CONFIG_SECCOMP_FILTER */ +/* For use with seccomp_actions_logged */ +#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_TRAP (1 << 2) +#define SECCOMP_LOG_ERRNO (1 << 3) +#define SECCOMP_LOG_TRACE (1 << 4) +#define SECCOMP_LOG_ALLOW (1 << 5) + +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + +static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +{ + bool log = false; + + switch (action) { + case SECCOMP_RET_ALLOW: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + break; + case SECCOMP_RET_KILL: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL; + } + + /* + * Force an audit message to be emitted when the action is RET_KILL and + * the action is allowed to be logged by the admin. + */ + if (log) + return __audit_seccomp(syscall, signr, action); + + /* + * Let the audit subsystem decide if the action should be audited based + * on whether the current task itself is being audited. + */ + return audit_seccomp(syscall, signr, action); +} + /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit @@ -547,7 +586,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); do_exit(SIGKILL); } @@ -656,7 +695,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - audit_seccomp(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action); /* Dump core only if this is the last remaining thread. 
*/ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -673,7 +712,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - audit_seccomp(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action); return -1; } #else @@ -978,6 +1017,127 @@ static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_ALLOW_NAME; +struct seccomp_log_name { + u32 log; + const char *name; +}; + +static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, + { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, + { } +}; + +static bool seccomp_names_from_actions_logged(char *names, size_t size, + u32 actions_logged) +{ + const struct seccomp_log_name *cur; + bool append_space = false; + + for (cur = seccomp_log_names; cur->name && size; cur++) { + ssize_t ret; + + if (!(actions_logged & cur->log)) + continue; + + if (append_space) { + ret = strscpy(names, " ", size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } else + append_space = true; + + ret = strscpy(names, cur->name, size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } + + return true; +} + +static bool seccomp_action_logged_from_name(u32 *action_logged, + const char *name) +{ + const struct seccomp_log_name *cur; + + for (cur = seccomp_log_names; cur->name; cur++) { + if (!strcmp(cur->name, name)) { + *action_logged = cur->log; + return true; + } + } + + return false; +} + +static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) +{ + char *name; + + *actions_logged = 0; + while ((name = strsep(&names, " ")) && *name) { + u32 action_logged = 0; + + if (!seccomp_action_logged_from_name(&action_logged, name)) + return false; + + *actions_logged |= action_logged; + } + + return true; +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + memset(names, 0, sizeof(names)); + + if (!write) { + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged)) + return -EINVAL; + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) { + u32 actions_logged; + + if (!seccomp_actions_logged_from_names(&actions_logged, + table.data)) + return -EINVAL; + + if (actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; + + seccomp_actions_logged = actions_logged; + } + + return 0; +} + static struct ctl_path seccomp_sysctl_path[] = { { .procname = "kernel", }, { .procname = "seccomp", }, @@ -992,6 +1152,11 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0444, .proc_handler = proc_dostring, }, + { + .procname = "actions_logged", + .mode = 0644, + .proc_handler = seccomp_actions_logged_handler, + }, { } }; From 2b7ea5b5b5799f2878ed454bb48032bed6d101d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:55 +0000 Subject: [PATCH 0011/1322] seccomp: Selftest for detection of filter flag support Userspace needs to be able to reliably detect the support of a filter flag. 
A good way of doing that is by attempting to enter filter mode, with the flag bit(s) in question set, and a NULL pointer for the args parameter of seccomp(2). EFAULT indicates that the flag is valid and EINVAL indicates that the flag is invalid. This patch adds a selftest that can be used to test this method of detection in userspace. Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 1f2888f6678b..abf708e09892 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1835,6 +1835,66 @@ TEST(seccomp_syscall_mode_lock) } } +/* + * Test detection of known and unknown filter flags. Userspace needs to be able + * to check if a filter flag is supported by the current kernel and a good way + * of doing that is by attempting to enter filter mode, with the flag bit in + * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates + * that the flag is valid and EINVAL indicates that the flag is invalid. + */ +TEST(detect_seccomp_filter_flags) +{ + unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC }; + unsigned int flag, all_flags; + int i; + long ret; + + /* Test detection of known-good filter flags */ + for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) { + flag = flags[i]; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!", + flag); + } + + all_flags |= flag; + } + + /* Test detection of all known-good filter flags */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, all_flags, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!", + all_flags); + } + + /* Test detection of an unknown filter flag */ + flag = -1; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!", + flag); + } + + /* + * Test detection of an unknown filter flag that may simply need to be + * added to this test + */ + flag = flags[ARRAY_SIZE(flags) - 1] << 1; + ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?", + flag); + } +} + TEST(TSYNC_first) { struct sock_filter filter[] = { From e66a39977985b1e69e17c4042cb290768eca9b02 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:56 +0000 Subject: [PATCH 0012/1322] seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW Add a new filter flag, SECCOMP_FILTER_FLAG_LOG, that enables logging for all actions except for SECCOMP_RET_ALLOW for the given filter. SECCOMP_RET_KILL actions are always logged, when "kill" is in the actions_logged sysctl, and SECCOMP_RET_ALLOW actions are never logged, regardless of this flag. This flag can be used to create noisy filters that result in all non-allowed actions to be logged. A process may have one noisy filter, which is loaded with this flag, as well as a quiet filter that's not loaded with this flag. 
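A minimal userspace sketch of that noisy-plus-quiet arrangement (illustrative only, not part of this patch; it carries the same fallback #define the selftests use when built against pre-flag headers):

  /* Illustrative sketch: install one quiet filter and one "noisy"
   * filter loaded with SECCOMP_FILTER_FLAG_LOG. Both filters simply
   * allow everything; a real filter would return other actions for
   * the syscalls it wants to restrict.
   */
  #include <unistd.h>
  #include <sys/prctl.h>
  #include <sys/syscall.h>
  #include <linux/filter.h>
  #include <linux/seccomp.h>

  #ifndef SECCOMP_FILTER_FLAG_LOG
  # define SECCOMP_FILTER_FLAG_LOG 2	/* value proposed by this patch */
  #endif

  int main(void)
  {
  	struct sock_filter allow[] = {
  		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
  	};
  	struct sock_fprog prog = {
  		.len = 1,
  		.filter = allow,
  	};

  	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
  		return 1;

  	/* Quiet filter: no flags. */
  	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog))
  		return 1;

  	/* Noisy filter: its non-allow results would be logged. */
  	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
  		       SECCOMP_FILTER_FLAG_LOG, &prog) ? 1 : 0;
  }

Only the second seccomp(2) call carries the flag, so only actions returned by that filter would be candidates for logging.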
This allows for the actions in a set of filters to be selectively conveyed to the admin. Since a system could have a large number of allocated seccomp_filter structs, struct packing was taken in consideration. On 64 bit x86, the new log member takes up one byte of an existing four byte hole in the struct. On 32 bit x86, the new log member creates a new four byte hole (unavoidable) and consumes one of those bytes. Unfortunately, the tests added for SECCOMP_FILTER_FLAG_LOG are not capable of inspecting the audit log to verify that the actions taken in the filter were logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- include/linux/seccomp.h | 3 +- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 26 +++++-- tools/testing/selftests/seccomp/seccomp_bpf.c | 69 ++++++++++++++++++- 4 files changed, 91 insertions(+), 8 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index ecc296c137cd..c8bef436b61d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,7 +3,8 @@ #include -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC) +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ + SECCOMP_FILTER_FLAG_LOG) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index aaad61cc46bc..19a611d0712e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -17,6 +17,7 @@ /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 +#define SECCOMP_FILTER_FLAG_LOG 2 /* * All BPF programs must return a 32-bit value. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54357e361aea..ed9fde418fc4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,7 @@ * get/put helpers should be used when accessing an instance * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. + * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @@ -59,6 +60,7 @@ */ struct seccomp_filter { refcount_t usage; + bool log; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -452,6 +454,10 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Set log flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_LOG) + filter->log = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. 
@@ -532,15 +538,22 @@ static void seccomp_send_sigsys(int syscall, int reason) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; -static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +static inline void seccomp_log(unsigned long syscall, long signr, u32 action, + bool requested) { bool log = false; switch (action) { case SECCOMP_RET_ALLOW: + break; case SECCOMP_RET_TRAP: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; + break; case SECCOMP_RET_ERRNO: + log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; + break; case SECCOMP_RET_TRACE: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; case SECCOMP_RET_KILL: default: @@ -548,8 +561,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action) } /* - * Force an audit message to be emitted when the action is RET_KILL and - * the action is allowed to be logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL or + * the FILTER_FLAG_LOG bit was set and the action is allowed to be + * logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -586,7 +600,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); do_exit(SIGKILL); } @@ -695,7 +709,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - seccomp_log(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -712,7 +726,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - seccomp_log(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action, match ? 
match->log : false); return -1; } #else diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index abf708e09892..1c8c22ce7740 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1739,6 +1739,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif +#ifndef SECCOMP_FILTER_FLAG_LOG +#define SECCOMP_FILTER_FLAG_LOG 2 +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -1844,7 +1848,8 @@ TEST(seccomp_syscall_mode_lock) */ TEST(detect_seccomp_filter_flags) { - unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC }; + unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, + SECCOMP_FILTER_FLAG_LOG }; unsigned int flag, all_flags; int i; long ret; @@ -2533,6 +2538,67 @@ TEST(syscall_restart) _metadata->passed = 0; } +TEST_SIGNAL(filter_flag_log, SIGSYS) +{ + struct sock_filter allow_filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter kill_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog allow_prog = { + .len = (unsigned short)ARRAY_SIZE(allow_filter), + .filter = allow_filter, + }; + struct sock_fprog kill_prog = { + .len = (unsigned short)ARRAY_SIZE(kill_filter), + .filter = kill_filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */ + ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG, + &allow_prog); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + EXPECT_NE(0, ret) { + TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!"); + } + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!"); + } + + /* Verify that a simple, permissive filter can be added with no flags */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog); + EXPECT_EQ(0, ret); + + /* See if the same filter can be added with the FILTER_FLAG_LOG flag */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, + &allow_prog); + ASSERT_NE(EINVAL, errno) { + TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!"); + } + EXPECT_EQ(0, ret); + + /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, + &kill_prog); + EXPECT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + TEST(get_action_avail) { __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, @@ -2573,6 +2639,7 @@ TEST(get_action_avail) * - endianness checking when appropriate * - 64-bit arg prodding * - arch value testing (x86 modes especially) + * - verify that FILTER_FLAG_LOG filters generate log messages * - ... */ From 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:57 +0000 Subject: [PATCH 0013/1322] seccomp: Action to log before allowing Add a new action, SECCOMP_RET_LOG, that logs a syscall before allowing the syscall. At the implementation level, this action is identical to the existing SECCOMP_RET_ALLOW action. 
However, it can be very useful when initially developing a seccomp filter for an application. The developer can set the default action to be SECCOMP_RET_LOG, maybe mark any obviously needed syscalls with SECCOMP_RET_ALLOW, and then put the application through its paces. A list of syscalls that triggered the default action (SECCOMP_RET_LOG) can be easily gleaned from the logs and that list can be used to build the syscall whitelist. Finally, the developer can change the default action to the desired value. This provides a more friendly experience than seeing the application get killed, then updating the filter and rebuilding the app, seeing the application get killed due to a different syscall, then updating the filter and rebuilding the app, etc. The functionality is similar to what's supported by the various LSMs. SELinux has permissive mode, AppArmor has complain mode, SMACK has bring-up mode, etc. SECCOMP_RET_LOG is given a lower value than SECCOMP_RET_ALLOW as allow while logging is slightly more restrictive than quietly allowing. Unfortunately, the tests added for SECCOMP_RET_LOG are not capable of inspecting the audit log to verify that the syscall was logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if action == RET_LOG && RET_LOG in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- .../userspace-api/seccomp_filter.rst | 9 ++ include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 23 ++++- tools/testing/selftests/seccomp/seccomp_bpf.c | 98 ++++++++++++++++++- 4 files changed, 125 insertions(+), 6 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 2d1d8ab04ac5..f4977357daf2 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -141,6 +141,15 @@ In precedence order, they are: allow use of ptrace, even of other sandboxed processes, without extreme care; ptracers can use this mechanism to escape.) +``SECCOMP_RET_LOG``: + Results in the system call being executed after it is logged. This + should be used by application developers to learn which syscalls their + application needs without having to iterate through multiple test and + development cycles to build the list. + + This action will only be logged if "log" is present in the + actions_logged sysctl string. + ``SECCOMP_RET_ALLOW``: Results in the system call being executed. diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 19a611d0712e..f94433263e4b 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -31,6 +31,7 @@ #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. 
*/ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ed9fde418fc4..59cde2ed3b92 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -533,10 +533,12 @@ static void seccomp_send_sigsys(int syscall, int reason) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) -#define SECCOMP_LOG_ALLOW (1 << 5) +#define SECCOMP_LOG_LOG (1 << 5) +#define SECCOMP_LOG_ALLOW (1 << 6) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | + SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) @@ -555,15 +557,18 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; + case SECCOMP_RET_LOG: + log = seccomp_actions_logged & SECCOMP_LOG_LOG; + break; case SECCOMP_RET_KILL: default: log = seccomp_actions_logged & SECCOMP_LOG_KILL; } /* - * Force an audit message to be emitted when the action is RET_KILL or - * the FILTER_FLAG_LOG bit was set and the action is allowed to be - * logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL, + * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is + * allowed to be logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -699,6 +704,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_LOG: + seccomp_log(this_syscall, 0, action, true); + return 0; + case SECCOMP_RET_ALLOW: /* * Note that the "match" filter will always be NULL for @@ -873,6 +882,7 @@ static long seccomp_get_action_avail(const char __user *uaction) case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: + case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: break; default: @@ -1023,12 +1033,14 @@ out: #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { @@ -1041,6 +1053,7 @@ static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, { } }; diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 1c8c22ce7740..7372958eccb5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -74,7 +74,12 @@ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#endif +#ifndef SECCOMP_RET_LOG +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#endif +#ifndef SECCOMP_RET_ACTION /* Masks for the return value sections. 
*/ #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU @@ -342,6 +347,28 @@ TEST(empty_prog) EXPECT_EQ(EINVAL, errno); } +TEST(log_all) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + /* getppid() should succeed and be logged (no check for logging) */ + EXPECT_EQ(parent, syscall(__NR_getppid)); +} + TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS) { struct sock_filter filter[] = { @@ -756,6 +783,7 @@ TEST_F(TRAP, handler) FIXTURE_DATA(precedence) { struct sock_fprog allow; + struct sock_fprog log; struct sock_fprog trace; struct sock_fprog error; struct sock_fprog trap; @@ -767,6 +795,13 @@ FIXTURE_SETUP(precedence) struct sock_filter allow_insns[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; + struct sock_filter log_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG), + }; struct sock_filter trace_insns[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), @@ -803,6 +838,7 @@ FIXTURE_SETUP(precedence) memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \ self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns) FILTER_ALLOC(allow); + FILTER_ALLOC(log); FILTER_ALLOC(trace); FILTER_ALLOC(error); FILTER_ALLOC(trap); @@ -813,6 +849,7 @@ FIXTURE_TEARDOWN(precedence) { #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter) FILTER_FREE(allow); + FILTER_FREE(log); FILTER_FREE(trace); FILTER_FREE(error); FILTER_FREE(trap); @@ -830,6 +867,8 @@ TEST_F(precedence, allow_ok) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -854,6 +893,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -885,6 +926,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); @@ -906,6 +949,8 @@ TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -931,6 +976,8 @@ TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS) ASSERT_EQ(0, 
ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -952,6 +999,8 @@ TEST_F(precedence, errno_is_third) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -970,6 +1019,8 @@ TEST_F(precedence, errno_is_third_in_any_order) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); @@ -992,6 +1043,8 @@ TEST_F(precedence, trace_is_fourth) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); /* Should work just fine. */ @@ -1013,12 +1066,54 @@ TEST_F(precedence, trace_is_fourth_in_any_order) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); /* Should work just fine. */ EXPECT_EQ(parent, syscall(__NR_getppid)); /* No ptracer */ EXPECT_EQ(-1, syscall(__NR_getpid)); } +TEST_F(precedence, log_is_fifth) +{ + pid_t mypid, parent; + long ret; + + mypid = getpid(); + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* Should also work just fine */ + EXPECT_EQ(mypid, syscall(__NR_getpid)); +} + +TEST_F(precedence, log_is_fifth_in_any_order) +{ + pid_t mypid, parent; + long ret; + + mypid = getpid(); + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* Should also work just fine */ + EXPECT_EQ(mypid, syscall(__NR_getpid)); +} + #ifndef PTRACE_O_TRACESECCOMP #define PTRACE_O_TRACESECCOMP 0x00000080 #endif @@ -2603,7 +2698,7 @@ TEST(get_action_avail) { __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, - SECCOMP_RET_ALLOW }; + SECCOMP_RET_LOG, SECCOMP_RET_ALLOW }; __u32 unknown_action = 0x10000000U; int i; long ret; @@ -2640,6 +2735,7 @@ TEST(get_action_avail) * - 64-bit arg prodding * - arch value testing (x86 modes especially) * - verify that FILTER_FLAG_LOG filters generate log messages + * - verify that RET_LOG generates log messages * - ... 
*/ From fd76875ca289a3d4722f266fd2d5532a27083903 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 12:53:18 -0700 Subject: [PATCH 0014/1322] seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD In preparation for adding SECCOMP_RET_KILL_PROCESS, rename SECCOMP_RET_KILL to the more accurate SECCOMP_RET_KILL_THREAD. The existing selftest values are intentionally left as SECCOMP_RET_KILL just to be sure we're exercising the alias. Signed-off-by: Kees Cook --- Documentation/networking/filter.txt | 2 +- .../userspace-api/seccomp_filter.rst | 4 +- include/uapi/linux/seccomp.h | 3 +- kernel/seccomp.c | 39 ++++++++++--------- samples/seccomp/bpf-direct.c | 4 +- samples/seccomp/bpf-helper.h | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 17 ++++---- 7 files changed, 39 insertions(+), 32 deletions(-) diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index b69b205501de..73aa0f12156d 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -337,7 +337,7 @@ Examples for low-level BPF: jeq #14, good /* __NR_rt_sigprocmask */ jeq #13, good /* __NR_rt_sigaction */ jeq #35, good /* __NR_nanosleep */ - bad: ret #0 /* SECCOMP_RET_KILL */ + bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ The above example code can be placed into a file (here called "foo"), and diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index f4977357daf2..d76396f2d8ed 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -87,11 +87,11 @@ Return values A seccomp filter may return any of the following values. If multiple filters exist, the return value for the evaluation of a given system call will always use the highest precedent value. (For example, -``SECCOMP_RET_KILL`` will always take precedence.) +``SECCOMP_RET_KILL_THREAD`` will always take precedence.) In precedence order, they are: -``SECCOMP_RET_KILL``: +``SECCOMP_RET_KILL_THREAD``: Results in the task exiting immediately without executing the system call. The exit status of the task (``status & 0x7f``) will be ``SIGSYS``, not ``SIGKILL``. diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index f94433263e4b..5a03f699eb17 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -27,7 +27,8 @@ * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 59cde2ed3b92..95ac54cff00f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. 
*/ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL; + return SECCOMP_RET_KILL_THREAD; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,15 +529,17 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 0) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | + SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | + SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, @@ -560,13 +562,13 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: - log = seccomp_actions_logged & SECCOMP_LOG_KILL; + log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; } /* - * Force an audit message to be emitted when the action is RET_KILL, + * Force an audit message to be emitted when the action is RET_KILL_*, * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is * allowed to be logged by the admin. */ @@ -605,7 +607,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -716,7 +718,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ return 0; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. 
*/ @@ -878,7 +880,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: @@ -1029,19 +1031,20 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ -#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" -static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " - SECCOMP_RET_TRAP_NAME " " - SECCOMP_RET_ERRNO_NAME " " - SECCOMP_RET_TRACE_NAME " " - SECCOMP_RET_LOG_NAME " " - SECCOMP_RET_ALLOW_NAME; +static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_THREAD_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " + SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { u32 log; @@ -1049,7 +1052,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { - { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c index 151ec3f52189..235ce3c49ee9 100644 --- a/samples/seccomp/bpf-direct.c +++ b/samples/seccomp/bpf-direct.c @@ -129,7 +129,7 @@ static int install_filter(void) /* Check that read is only using stdin. */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), /* Check that write is only using stdout */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), @@ -139,7 +139,7 @@ static int install_filter(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), }; struct sock_fprog prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 1d8de9edd858..83dbe79cbe2c 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -44,7 +44,7 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define ALLOW \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define DENY \ - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD) #define JUMP(labels, label) \ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ JUMP_JT, JUMP_JF) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 7372958eccb5..a3ba39a32449 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -68,15 +68,18 @@ #define SECCOMP_MODE_FILTER 2 #endif +#ifndef SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#endif #ifndef SECCOMP_RET_KILL -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ 
-#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ #endif #ifndef SECCOMP_RET_LOG -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #endif #ifndef SECCOMP_RET_ACTION @@ -2696,7 +2699,7 @@ TEST_SIGNAL(filter_flag_log, SIGSYS) TEST(get_action_avail) { - __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, + __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, SECCOMP_RET_LOG, SECCOMP_RET_ALLOW }; __u32 unknown_action = 0x10000000U; From 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:01:39 -0700 Subject: [PATCH 0015/1322] seccomp: Introduce SECCOMP_RET_KILL_PROCESS This introduces the BPF return value for SECCOMP_RET_KILL_PROCESS to kill an entire process. This cannot yet be reached by seccomp, but it changes the default-kill behavior (for unknown return values) from kill-thread to kill-process. Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 18 ++++++++++-------- kernel/seccomp.c | 22 ++++++++++++++++------ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 5a03f699eb17..7e77c92df78a 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -22,18 +22,20 @@ /* * All BPF programs must return a 32-bit value. * The bottom 16-bits are for optional return data. - * The upper 16-bits are ordered from least permissive values to most. + * The upper 16-bits are ordered from least permissive values to most, + * as a signed value (so 0x8000000 is negative). * * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ -#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ -#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ #define SECCOMP_RET_ACTION 0x7fff0000U diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 95ac54cff00f..5c7299b9d953 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. 
*/ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL_THREAD; + return SECCOMP_RET_KILL_PROCESS; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,14 +529,16 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL_THREAD (1 << 0) +#define SECCOMP_LOG_KILL_PROCESS (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 1) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | + SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | @@ -563,8 +565,11 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; case SECCOMP_RET_KILL_THREAD: - default: log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; + break; + case SECCOMP_RET_KILL_PROCESS: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; } /* @@ -719,10 +724,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_KILL_PROCESS: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (get_nr_threads(current) == 1) { + if (action == SECCOMP_RET_KILL_PROCESS || + get_nr_threads(current) == 1) { siginfo_t info; /* Show the original registers in the dump. */ @@ -731,7 +738,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - do_exit(SIGSYS); + if (action == SECCOMP_RET_KILL_PROCESS) + do_group_exit(SIGSYS); + else + do_exit(SIGSYS); } unreachable(); From 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:12:11 -0700 Subject: [PATCH 0016/1322] seccomp: Implement SECCOMP_RET_KILL_PROCESS action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, SECCOMP_RET_KILL_THREAD (neé SECCOMP_RET_KILL) kills the current thread. There have been a few requests for this to kill the entire process (the thread group). This cannot be just changed (discovered when adding coredump support since coredumping kills the entire process) because there are userspace programs depending on the thread-kill behavior. Instead, implement SECCOMP_RET_KILL_PROCESS, which is 0x80000000, and can be processed as "-1" by the kernel, below the existing RET_KILL that is ABI-set to "0". For userspace, SECCOMP_RET_ACTION_FULL is added to expand the mask to the signed bit. Old userspace using the SECCOMP_RET_ACTION mask will see SECCOMP_RET_KILL_PROCESS as 0 still, but this would only be visible when examining the siginfo in a core dump from a RET_KILL_*, where it will think it was thread-killed instead of process-killed. Attempts to introduce this behavior via other ways (filter flags, seccomp struct flags, masked RET_DATA bits) all come with weird side-effects and baggage. This change preserves the central behavioral expectations of the seccomp filter engine without putting too great a burden on changes needed in userspace to use the new action. 
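A minimal userspace sketch (not part of this patch) of what the mask change means: with the old SECCOMP_RET_ACTION mask the sign bit is stripped, so SECCOMP_RET_KILL_PROCESS (0x80000000) decodes to 0, i.e. SECCOMP_RET_KILL_THREAD, while the new SECCOMP_RET_ACTION_FULL mask keeps it distinct and, viewed as a signed value, orders it below every other action. The constants are copied from the uapi header as changed here; everything else is illustrative only.

  #include <stdint.h>
  #include <stdio.h>

  #define SECCOMP_RET_KILL_PROCESS 0x80000000U
  #define SECCOMP_RET_ACTION_FULL  0xffff0000U
  #define SECCOMP_RET_ACTION       0x7fff0000U

  int main(void)
  {
      uint32_t ret = SECCOMP_RET_KILL_PROCESS | 0x1234; /* action | data */

      /* Old mask drops the sign bit: looks like kill_thread (0). */
      printf("old mask:  0x%08x\n", ret & SECCOMP_RET_ACTION);
      /* Full mask preserves the whole 16-bit action field. */
      printf("full mask: 0x%08x\n", ret & SECCOMP_RET_ACTION_FULL);
      /* Signed view used for ordering: kill_process sorts below 0. */
      printf("signed:    %d\n", (int32_t)(ret & SECCOMP_RET_ACTION_FULL));
      return 0;
  }
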
The new action is discoverable by userspace through either the new actions_avail sysctl or through the SECCOMP_GET_ACTION_AVAIL seccomp operation. If used without checking for availability, old kernels will treat RET_KILL_PROCESS as RET_KILL_THREAD (since the old mask will produce RET_KILL_THREAD). Cc: Paul Moore Cc: Fabricio Voznika Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 7 ++++++- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 9 +++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index d76396f2d8ed..099c412951d6 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -87,10 +87,15 @@ Return values A seccomp filter may return any of the following values. If multiple filters exist, the return value for the evaluation of a given system call will always use the highest precedent value. (For example, -``SECCOMP_RET_KILL_THREAD`` will always take precedence.) +``SECCOMP_RET_KILL_PROCESS`` will always take precedence.) In precedence order, they are: +``SECCOMP_RET_KILL_PROCESS``: + Results in the entire process exiting immediately without executing + the system call. The exit status of the task (``status & 0x7f``) + will be ``SIGSYS``, not ``SIGKILL``. + ``SECCOMP_RET_KILL_THREAD``: Results in the task exiting immediately without executing the system call. The exit status of the task (``status & 0x7f``) will diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 7e77c92df78a..f6bc1dea3247 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -38,6 +38,7 @@ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ +#define SECCOMP_RET_ACTION_FULL 0xffff0000U #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5c7299b9d953..c24579dfa7a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -181,6 +181,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. 
*/ +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { @@ -206,7 +207,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { + if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } @@ -650,7 +651,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; - action = filter_ret & SECCOMP_RET_ACTION; + action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: @@ -890,6 +891,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { + case SECCOMP_RET_KILL_PROCESS: case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: @@ -1041,6 +1043,7 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" @@ -1049,6 +1052,7 @@ out: #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_PROCESS_NAME " " SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " @@ -1062,6 +1066,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, From f3e1821d9e1cc3fb434d7763001791dcd6720c90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:20:33 -0700 Subject: [PATCH 0017/1322] selftests/seccomp: Test thread vs process killing This verifies that SECCOMP_RET_KILL_PROCESS is higher priority than SECCOMP_RET_KILL_THREAD. (This also moves a bunch of defines up earlier in the file to use them earlier.) Signed-off-by: Kees Cook Reviewed-by: Tyler Hicks --- tools/testing/selftests/seccomp/seccomp_bpf.c | 228 +++++++++++++----- 1 file changed, 168 insertions(+), 60 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index a3ba39a32449..0683cd543cd5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -68,7 +68,17 @@ #define SECCOMP_MODE_FILTER 2 #endif -#ifndef SECCOMP_RET_KILL_THREAD +#ifndef SECCOMP_RET_ALLOW +struct seccomp_data { + int nr; + __u32 arch; + __u64 instruction_pointer; + __u64 args[6]; +}; +#endif + +#ifndef SECCOMP_RET_KILL_PROCESS +#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */ #define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ #endif #ifndef SECCOMP_RET_KILL @@ -82,17 +92,53 @@ #define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #endif -#ifndef SECCOMP_RET_ACTION -/* Masks for the return value sections. 
*/ -#define SECCOMP_RET_ACTION 0x7fff0000U -#define SECCOMP_RET_DATA 0x0000ffffU +#ifndef __NR_seccomp +# if defined(__i386__) +# define __NR_seccomp 354 +# elif defined(__x86_64__) +# define __NR_seccomp 317 +# elif defined(__arm__) +# define __NR_seccomp 383 +# elif defined(__aarch64__) +# define __NR_seccomp 277 +# elif defined(__hppa__) +# define __NR_seccomp 338 +# elif defined(__powerpc__) +# define __NR_seccomp 358 +# elif defined(__s390__) +# define __NR_seccomp 348 +# else +# warning "seccomp syscall number unknown for this architecture" +# define __NR_seccomp 0xffff +# endif +#endif -struct seccomp_data { - int nr; - __u32 arch; - __u64 instruction_pointer; - __u64 args[6]; -}; +#ifndef SECCOMP_SET_MODE_STRICT +#define SECCOMP_SET_MODE_STRICT 0 +#endif + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +#ifndef SECCOMP_GET_ACTION_AVAIL +#define SECCOMP_GET_ACTION_AVAIL 2 +#endif + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + +#ifndef SECCOMP_FILTER_FLAG_LOG +#define SECCOMP_FILTER_FLAG_LOG 2 +#endif + +#ifndef seccomp +int seccomp(unsigned int op, unsigned int flags, void *args) +{ + errno = 0; + return syscall(__NR_seccomp, op, flags, args); +} #endif #if __BYTE_ORDER == __LITTLE_ENDIAN @@ -550,6 +596,117 @@ TEST_SIGNAL(KILL_one_arg_six, SIGSYS) close(fd); } +/* This is a thread task to die via seccomp filter violation. */ +void *kill_thread(void *data) +{ + bool die = (bool)data; + + if (die) { + prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + return (void *)SIBLING_EXIT_FAILURE; + } + + return (void *)SIBLING_EXIT_UNKILLED; +} + +/* Prepare a thread that will kill itself or both of us. */ +void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process) +{ + pthread_t thread; + void *status; + /* Kill only when calling __NR_prctl. */ + struct sock_filter filter_thread[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog_thread = { + .len = (unsigned short)ARRAY_SIZE(filter_thread), + .filter = filter_thread, + }; + struct sock_filter filter_process[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog_process = { + .len = (unsigned short)ARRAY_SIZE(filter_process), + .filter = filter_process, + }; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, + kill_process ? &prog_process : &prog_thread)); + + /* + * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS + * flag cannot be downgraded by a new filter. + */ + ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread)); + + /* Start a thread that will exit immediately. */ + ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false)); + ASSERT_EQ(0, pthread_join(thread, &status)); + ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status); + + /* Start a thread that will die immediately. 
*/ + ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true)); + ASSERT_EQ(0, pthread_join(thread, &status)); + ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status); + + /* + * If we get here, only the spawned thread died. Let the parent know + * the whole process didn't die (i.e. this thread, the spawner, + * stayed running). + */ + exit(42); +} + +TEST(KILL_thread) +{ + int status; + pid_t child_pid; + + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + kill_thread_or_group(_metadata, false); + _exit(38); + } + + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + + /* If only the thread was killed, we'll see exit 42. */ + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(42, WEXITSTATUS(status)); +} + +TEST(KILL_process) +{ + int status; + pid_t child_pid; + + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + kill_thread_or_group(_metadata, true); + _exit(38); + } + + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + + /* If the entire process was killed, we'll see SIGSYS. */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(SIGSYS, WTERMSIG(status)); +} + /* TODO(wad) add 64-bit versus 32-bit arg tests. */ TEST(arg_out_of_range) { @@ -1800,55 +1957,6 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) EXPECT_NE(self->mypid, syscall(__NR_getpid)); } -#ifndef __NR_seccomp -# if defined(__i386__) -# define __NR_seccomp 354 -# elif defined(__x86_64__) -# define __NR_seccomp 317 -# elif defined(__arm__) -# define __NR_seccomp 383 -# elif defined(__aarch64__) -# define __NR_seccomp 277 -# elif defined(__hppa__) -# define __NR_seccomp 338 -# elif defined(__powerpc__) -# define __NR_seccomp 358 -# elif defined(__s390__) -# define __NR_seccomp 348 -# else -# warning "seccomp syscall number unknown for this architecture" -# define __NR_seccomp 0xffff -# endif -#endif - -#ifndef SECCOMP_SET_MODE_STRICT -#define SECCOMP_SET_MODE_STRICT 0 -#endif - -#ifndef SECCOMP_SET_MODE_FILTER -#define SECCOMP_SET_MODE_FILTER 1 -#endif - -#ifndef SECCOMP_GET_ACTION_AVAIL -#define SECCOMP_GET_ACTION_AVAIL 2 -#endif - -#ifndef SECCOMP_FILTER_FLAG_TSYNC -#define SECCOMP_FILTER_FLAG_TSYNC 1 -#endif - -#ifndef SECCOMP_FILTER_FLAG_LOG -#define SECCOMP_FILTER_FLAG_LOG 2 -#endif - -#ifndef seccomp -int seccomp(unsigned int op, unsigned int flags, void *args) -{ - errno = 0; - return syscall(__NR_seccomp, op, flags, args); -} -#endif - TEST(seccomp_syscall) { struct sock_filter filter[] = { From 6849243bf4c6155151b294e9f0e0dc9540d6f083 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Aug 2017 20:26:57 -0700 Subject: [PATCH 0018/1322] samples: Unrename SECCOMP_RET_KILL Since samples can still be built before header installs, avoid the cosmetic renaming of SECCOMP_RET_KILL to avoid build failures in -next. Cc: Stephen Rothwell Signed-off-by: Kees Cook --- samples/seccomp/bpf-direct.c | 4 ++-- samples/seccomp/bpf-helper.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c index 235ce3c49ee9..151ec3f52189 100644 --- a/samples/seccomp/bpf-direct.c +++ b/samples/seccomp/bpf-direct.c @@ -129,7 +129,7 @@ static int install_filter(void) /* Check that read is only using stdin. 
*/ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), /* Check that write is only using stdout */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), @@ -139,7 +139,7 @@ static int install_filter(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), }; struct sock_fprog prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 83dbe79cbe2c..1d8de9edd858 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -44,7 +44,7 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define ALLOW \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define DENY \ - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD) + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) #define JUMP(labels, label) \ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ JUMP_JT, JUMP_JF) From a4e89ffb59235fd11d27107dea3efa4562ac0a12 Mon Sep 17 00:00:00 2001 From: Matt Weber Date: Wed, 28 Jun 2017 11:14:29 -0500 Subject: [PATCH 0019/1322] powerpc/e6500: Update machine check for L1D cache err This patch updates the machine check handler of Linux kernel to handle the e6500 architecture case. In e6500 core, L1 Data Cache Write Shadow Mode (DCWS) register is not implemented but L1 data cache always runs in write shadow mode. So, on L1 data cache parity errors, hardware will automatically invalidate the data cache but will still log a machine check interrupt. Signed-off-by: Ronak Desai Signed-off-by: Matthew Weber Signed-off-by: Scott Wood --- arch/powerpc/kernel/traps.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 675d5d2bfcde..410352acfa38 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -393,6 +393,7 @@ static inline int check_io_access(struct pt_regs *regs) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); + unsigned long pvr = mfspr(SPRN_PVR); unsigned long reason = mcsr; int recoverable = 1; @@ -434,8 +435,15 @@ int machine_check_e500mc(struct pt_regs *regs) * may still get logged and cause a machine check. We should * only treat the non-write shadow case as non-recoverable. */ - if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) - recoverable = 0; + /* On e6500 core, L1 DCWS (Data cache write shadow mode) bit + * is not implemented but L1 data cache always runs in write + * shadow mode. Hence on data cache parity errors HW will + * automatically invalidate the L1 Data Cache. + */ + if (PVR_VER(pvr) != PVR_VER_E6500) { + if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) + recoverable = 0; + } } if (reason & MCSR_L2MMU_MHIT) { From 50dad5fb59d12dc9a5a0e96073ee1e5857ac45a2 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Wed, 30 Aug 2017 00:33:35 +0530 Subject: [PATCH 0020/1322] drm/amdkfd: remove memset before memcpy calling memcpy immediately after memset with the same region of memory makes memset redundant. 
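For reference, a standalone sketch of the pattern being removed (hypothetical struct and function names, not the amdkfd code): memset() zero-fills the destination and memcpy() then rewrites every one of those bytes, so the memset is pure overhead.

  #include <string.h>

  struct example_props {
      int a;
      int b;
  };

  static void copy_props(struct example_props *dst,
                         const struct example_props *src)
  {
      memset(dst, 0, sizeof(*dst));   /* redundant: overwritten below   */
      memcpy(dst, src, sizeof(*dst)); /* every byte of *dst rewritten   */
  }
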
Signed-off-by: Himanshu Jha Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 1cae95e2b13a..03bec765b03d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -143,7 +143,6 @@ int pqm_create_queue(struct process_queue_manager *pqm, int num_queues = 0; struct queue *cur; - memset(&q_properties, 0, sizeof(struct queue_properties)); memcpy(&q_properties, properties, sizeof(struct queue_properties)); q = NULL; kq = NULL; From 1fabbf781167052f4480bdcc627378a27e78a559 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 2 Sep 2017 15:00:25 +0300 Subject: [PATCH 0021/1322] drm/amdkfd: pass queue's mqd when destroying mqd In VI, the destroy mqd function needs to inquire fields present in the mqd structure. That's why we need to pass it to that function instead of NULL. Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 681b639f5133..0649dd43e780 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -183,7 +183,7 @@ static void uninitialize(struct kernel_queue *kq) { if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, - NULL, + kq->queue->mqd, false, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, From 3af75db12e3d3537bc01c06bbdc876cb17e82b35 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 24 Aug 2017 09:22:57 +0900 Subject: [PATCH 0022/1322] iio: adc: ti-ads1015: fix comparator polarity setting The comparator polarity field in config register is not correctly initialized as per the interrupt trigger setting. Because the bitfield definision is wrong and bit shifting is missed. 
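Roughly, the two mistakes can be sketched as follows (made-up macro names standing in for the driver's ADS1015_CFG_COMP_POL_* definitions; per the datasheet the latching-comparator bit is bit 2 and the polarity bit is bit 3 of the config register):

  #include <stdint.h>

  #define CFG_COMP_LAT_BIT   (1u << 2)  /* latching comparator    */
  #define CFG_COMP_POL_BIT   (1u << 3)  /* ALERT/RDY pin polarity */
  #define CFG_COMP_POL_SHIFT 3

  /*
   * Buggy: the mask was declared as bit 2 (colliding with the latching
   * bit) and the 0/1 polarity value was OR'd in without a shift, so it
   * lands in bit 0 and never reaches the polarity bit.
   */
  static uint16_t set_pol_buggy(uint16_t cfg, unsigned int active_high)
  {
      return cfg | active_high;
  }

  /* Fixed: use the real polarity bit and shift the value into place. */
  static uint16_t set_pol_fixed(uint16_t cfg, unsigned int active_high)
  {
      return (cfg & ~CFG_COMP_POL_BIT) |
             ((uint16_t)active_high << CFG_COMP_POL_SHIFT);
  }
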
Fixes: d9f39babd8ba ("iio: adc: ti-ads1015: add threshold event support") Cc: Daniel Baluta Cc: Jonathan Cameron Signed-off-by: Akinobu Mita Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads1015.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iio/adc/ti-ads1015.c b/drivers/iio/adc/ti-ads1015.c index d1210024f6bc..e0dc20488335 100644 --- a/drivers/iio/adc/ti-ads1015.c +++ b/drivers/iio/adc/ti-ads1015.c @@ -52,7 +52,7 @@ #define ADS1015_CFG_COMP_QUE_MASK GENMASK(1, 0) #define ADS1015_CFG_COMP_LAT_MASK BIT(2) -#define ADS1015_CFG_COMP_POL_MASK BIT(2) +#define ADS1015_CFG_COMP_POL_MASK BIT(3) #define ADS1015_CFG_COMP_MODE_MASK BIT(4) #define ADS1015_CFG_DR_MASK GENMASK(7, 5) #define ADS1015_CFG_MOD_MASK BIT(8) @@ -1017,10 +1017,12 @@ static int ads1015_probe(struct i2c_client *client, switch (irq_trig) { case IRQF_TRIGGER_LOW: - cfg_comp |= ADS1015_CFG_COMP_POL_LOW; + cfg_comp |= ADS1015_CFG_COMP_POL_LOW << + ADS1015_CFG_COMP_POL_SHIFT; break; case IRQF_TRIGGER_HIGH: - cfg_comp |= ADS1015_CFG_COMP_POL_HIGH; + cfg_comp |= ADS1015_CFG_COMP_POL_HIGH << + ADS1015_CFG_COMP_POL_SHIFT; break; default: return -EINVAL; From c65e3d6ef4bfdc4c8460509f08507cf7dc026974 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 29 Aug 2017 13:45:11 +0200 Subject: [PATCH 0023/1322] iio: magnetometer: st_magn: fix drdy line configuration for LIS3MDL Data-ready line in LIS3MDL is routed to drdy pin and it is not possible to select a different INT pin. st_sensors_set_dataready_irq() assumes that if drdy int address is not exported in register map, irq trigger is not supported by the sensor and hw_irq_trigger is always false. Based on this configuration st_sensors_irq_thread does not consume generated interrupt causing an unhandled irq. Fix this taking into account status register address in st_sensors_set_dataready_irq() Fixes: 90efe0556292 (iio: st_sensors: harden interrupt handling) Signed-off-by: Lorenzo Bianconi Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/common/st_sensors/st_sensors_core.c | 11 ++++++++++- drivers/iio/magnetometer/st_magn_core.c | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index d99bb1460fe2..02e833b14db0 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -463,8 +463,17 @@ int st_sensors_set_dataready_irq(struct iio_dev *indio_dev, bool enable) u8 drdy_mask; struct st_sensor_data *sdata = iio_priv(indio_dev); - if (!sdata->sensor_settings->drdy_irq.addr) + if (!sdata->sensor_settings->drdy_irq.addr) { + /* + * there are some devices (e.g. LIS3MDL) where drdy line is + * routed to a given pin and it is not possible to select a + * different one. Take into account irq status register + * to understand if irq trigger can be properly supported + */ + if (sdata->sensor_settings->drdy_irq.addr_stat_drdy) + sdata->hw_irq_trigger = enable; return 0; + } /* Enable/Disable the interrupt generator 1. 
*/ if (sdata->sensor_settings->drdy_irq.ig1.en_addr > 0) { diff --git a/drivers/iio/magnetometer/st_magn_core.c b/drivers/iio/magnetometer/st_magn_core.c index 703e77008652..2e36d746f5bc 100644 --- a/drivers/iio/magnetometer/st_magn_core.c +++ b/drivers/iio/magnetometer/st_magn_core.c @@ -315,6 +315,10 @@ static const struct st_sensor_settings st_magn_sensors_settings[] = { }, }, }, + .drdy_irq = { + /* drdy line is routed drdy pin */ + .addr_stat_drdy = ST_SENSORS_DEFAULT_STAT_ADDR, + }, .multi_read_bit = true, .bootime = 2, }, From 3664847d95e60a9a943858b7800f8484669740fc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 25 Aug 2017 10:40:02 -0700 Subject: [PATCH 0024/1322] md/raid5: fix a race condition in stripe batch We have a race condition in below scenario, say have 3 continuous stripes, sh1, sh2 and sh3, sh1 is the stripe_head of sh2 and sh3: CPU1 CPU2 CPU3 handle_stripe(sh3) stripe_add_to_batch_list(sh3) -> lock(sh2, sh3) -> lock batch_lock(sh1) -> add sh3 to batch_list of sh1 -> unlock batch_lock(sh1) clear_batch_ready(sh1) -> lock(sh1) and batch_lock(sh1) -> clear STRIPE_BATCH_READY for all stripes in batch_list -> unlock(sh1) and batch_lock(sh1) ->clear_batch_ready(sh3) -->test_and_clear_bit(STRIPE_BATCH_READY, sh3) --->return 0 as sh->batch == NULL -> sh3->batch_head = sh1 -> unlock (sh2, sh3) In CPU1, handle_stripe will continue handle sh3 even it's in batch stripe list of sh1. By moving sh3->batch_head assignment in to batch_lock, we make it impossible to clear STRIPE_BATCH_READY before batch_head is set. Thanks Stephane for helping debug this tricky issue. Reported-and-tested-by: Stephane Thiell Cc: stable@vger.kernel.org (v4.1+) Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 049a958d3c1e..90414a4e0d49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -811,6 +811,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -818,8 +826,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; From 50662499f9112ecced68d064846a2f1fd9640b66 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:18 +0200 Subject: [PATCH 0025/1322] ARM64: dts: meson-gx: Use correct mmc clock source 0 Now that the clock source 0 is properly described in the CCF, use it instead of assuming the default value (xtal) Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi | 6 +++--- arch/arm64/boot/dts/amlogic/meson-gxl.dtsi | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi index 52f1687e7a09..8f0c0cb02157 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi +++ 
b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi @@ -661,21 +661,21 @@ &sd_emmc_a { clocks = <&clkc CLKID_SD_EMMC_A>, - <&xtal>, + <&clkc CLKID_SD_EMMC_A_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_b { clocks = <&clkc CLKID_SD_EMMC_B>, - <&xtal>, + <&clkc CLKID_SD_EMMC_B_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_c { clocks = <&clkc CLKID_SD_EMMC_C>, - <&xtal>, + <&clkc CLKID_SD_EMMC_C_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi index d6876e64979e..829d84db5fc5 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi @@ -603,21 +603,21 @@ &sd_emmc_a { clocks = <&clkc CLKID_SD_EMMC_A>, - <&xtal>, + <&clkc CLKID_SD_EMMC_A_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_b { clocks = <&clkc CLKID_SD_EMMC_B>, - <&xtal>, + <&clkc CLKID_SD_EMMC_B_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; &sd_emmc_c { clocks = <&clkc CLKID_SD_EMMC_C>, - <&xtal>, + <&clkc CLKID_SD_EMMC_C_CLK0>, <&clkc CLKID_FCLK_DIV2>; clock-names = "core", "clkin0", "clkin1"; }; From 673ccaaccf32a044d961c3bac3dd63452bdfa86c Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:19 +0200 Subject: [PATCH 0026/1322] ARM64: dts: meson: remove cap-sd-highspeed from emmc nodes It does not make much sense to define cap-sd-highspeed in the emmc nodes Just remove it. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi | 1 - arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts | 1 - arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts | 1 - 11 files changed, 11 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi index c89010e56488..d4f9c5b550c7 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi @@ -215,7 +215,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 9697a7a79464..7dae6acd3c8c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -297,7 +297,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; max-frequency = <200000000>; non-removable; disable-wp; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts index 9c59c3c6d1b6..a690956d6c75 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts @@ -274,7 +274,6 @@ pinctrl-names = "default"; bus-width = <8>; - 
cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts index d147c853ab05..a12303becab4 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts @@ -274,7 +274,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; max-frequency = <200000000>; non-removable; disable-wp; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi index 81ffc689a5bf..d77e19591ee3 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi @@ -241,7 +241,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi index 346753fb6324..0262ef8d48e4 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi @@ -201,7 +201,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts index 2a5804ce7f4b..f779a985f923 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts @@ -144,7 +144,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <100000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts index 4c2ac7650fcd..21274a6c1b9b 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts @@ -231,7 +231,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi index f3eea8e89d12..8899121f79e1 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi @@ -137,7 +137,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts index 9b10c5f4f8c0..ff8a9f780485 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts @@ -196,7 +196,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts index 08f1dd69b679..470f72bb863c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxm-rbox-pro.dts @@ -220,7 +220,6 @@ pinctrl-names = "default"; bus-width = <8>; - cap-sd-highspeed; cap-mmc-highspeed; max-frequency = <200000000>; non-removable; From 
67e7607fcdf1fad10e9f183424e709c59713e45d Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:20 +0200 Subject: [PATCH 0027/1322] ARM64: dts: meson: add mmc clk gate pins Add the pinctrl to switch mmc clk pins in gpio (pulled down) mode. This is necessary to be able to gate the clk outside of the SoC while keeping it running in the controller Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- .../boot/dts/amlogic/meson-gx-p23x-q20x.dtsi | 9 +++-- .../boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 9 +++-- .../dts/amlogic/meson-gxbb-nexbox-a95x.dts | 11 ++++--- .../boot/dts/amlogic/meson-gxbb-odroidc2.dts | 8 +++-- .../boot/dts/amlogic/meson-gxbb-p20x.dtsi | 9 +++-- .../boot/dts/amlogic/meson-gxbb-vega-s95.dtsi | 9 +++-- arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi | 33 +++++++++++++++++++ .../meson-gxl-s905x-hwacom-amazetv.dts | 6 ++-- .../amlogic/meson-gxl-s905x-libretech-cc.dts | 6 ++-- .../amlogic/meson-gxl-s905x-nexbox-a95x.dts | 9 +++-- .../dts/amlogic/meson-gxl-s905x-p212.dtsi | 9 +++-- arch/arm64/boot/dts/amlogic/meson-gxl.dtsi | 33 +++++++++++++++++++ .../boot/dts/amlogic/meson-gxm-nexbox-a1.dts | 6 ++-- 13 files changed, 126 insertions(+), 31 deletions(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi index d4f9c5b550c7..4157987f4a3d 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gx-p23x-q20x.dtsi @@ -168,7 +168,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -194,7 +195,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -212,7 +214,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 7dae6acd3c8c..60d5f2da6916 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -250,7 +250,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>, <&sdio_irq_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -276,7 +277,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -294,7 +296,8 @@ &sd_emmc_c { status = "disabled"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; max-frequency = <200000000>; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts index a690956d6c75..38dfdde5c147 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nexbox-a95x.dts @@ -51,7 +51,7 @@ / { compatible = "nexbox,a95x", "amlogic,meson-gxbb"; model = "NEXBOX A95X"; - + aliases { serial0 = &uart_AO; }; @@ -232,7 
+232,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -253,7 +254,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -271,7 +273,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts index a12303becab4..1ffa1c238a72 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts @@ -50,7 +50,7 @@ / { compatible = "hardkernel,odroid-c2", "amlogic,meson-gxbb"; model = "Hardkernel ODROID-C2"; - + aliases { serial0 = &uart_AO; }; @@ -253,7 +253,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -271,7 +272,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; max-frequency = <200000000>; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi index d77e19591ee3..704b214e8894 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi @@ -194,7 +194,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -220,7 +221,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -238,7 +240,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi index 0262ef8d48e4..f2bc6dea1fc6 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-vega-s95.dtsi @@ -155,7 +155,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins &sdio_irq_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -181,7 +182,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -198,7 +200,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi index 8f0c0cb02157..af834cdbba79 
100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi @@ -392,6 +392,17 @@ }; }; + emmc_clk_gate_pins: emmc_clk_gate { + mux { + groups = "BOOT_8"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "BOOT_8"; + bias-pull-down; + }; + }; + nor_pins: nor { mux { groups = "nor_d", @@ -430,6 +441,17 @@ }; }; + sdcard_clk_gate_pins: sdcard_clk_gate { + mux { + groups = "CARD_2"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "CARD_2"; + bias-pull-down; + }; + }; + sdio_pins: sdio { mux { groups = "sdio_d0", @@ -442,6 +464,17 @@ }; }; + sdio_clk_gate_pins: sdio_clk_gate { + mux { + groups = "GPIOX_4"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "GPIOX_4"; + bias-pull-down; + }; + }; + sdio_irq_pins: sdio_irq { mux { groups = "sdio_irq"; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts index f779a985f923..977b4240f3c1 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-hwacom-amazetv.dts @@ -123,7 +123,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -141,7 +142,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts index 69ca14ac10fa..a014c052241e 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts @@ -197,7 +197,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -215,7 +216,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts index 21274a6c1b9b..1b8f32867aa1 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-nexbox-a95x.dts @@ -189,7 +189,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -210,7 +211,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -228,7 +230,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi index 8899121f79e1..129af9068814 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi +++ 
b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-p212.dtsi @@ -95,7 +95,8 @@ &sd_emmc_a { status = "okay"; pinctrl-0 = <&sdio_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdio_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; #address-cells = <1>; #size-cells = <0>; @@ -116,7 +117,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -134,7 +136,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi index 829d84db5fc5..d8dd3298b15c 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi @@ -281,6 +281,17 @@ }; }; + emmc_clk_gate_pins: emmc_clk_gate { + mux { + groups = "BOOT_8"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "BOOT_8"; + bias-pull-down; + }; + }; + nor_pins: nor { mux { groups = "nor_d", @@ -319,6 +330,17 @@ }; }; + sdcard_clk_gate_pins: sdcard_clk_gate { + mux { + groups = "CARD_2"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "CARD_2"; + bias-pull-down; + }; + }; + sdio_pins: sdio { mux { groups = "sdio_d0", @@ -331,6 +353,17 @@ }; }; + sdio_clk_gate_pins: sdio_clk_gate { + mux { + groups = "GPIOX_4"; + function = "gpio_periphs"; + }; + cfg-pull-down { + pins = "GPIOX_4"; + bias-pull-down; + }; + }; + sdio_irq_pins: sdio_irq { mux { groups = "sdio_irq"; diff --git a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts index ff8a9f780485..22c697732f66 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxm-nexbox-a1.dts @@ -175,7 +175,8 @@ &sd_emmc_b { status = "okay"; pinctrl-0 = <&sdcard_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&sdcard_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <4>; cap-sd-highspeed; @@ -193,7 +194,8 @@ &sd_emmc_c { status = "okay"; pinctrl-0 = <&emmc_pins>; - pinctrl-names = "default"; + pinctrl-1 = <&emmc_clk_gate_pins>; + pinctrl-names = "default", "clk-gate"; bus-width = <8>; cap-mmc-highspeed; From 42776561a1def5d96699574efd7c9cbbd2e0fbc4 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:21 +0200 Subject: [PATCH 0028/1322] ARM64: dts: meson-gxbb: nanopi-k2: add card regulator settle times Changing the card voltage on the nanopi-k2 is not instantaneous, especially when switching from 3.3v to 1.8v. It takes at least 3ms for the regulator to go from 3.3v to 1.8v.
Add margin to that to make sure we don't upset the sdcard during the voltage switch. Fixes: 9bc7ffb08daf ("arm64: dts: amlogic: Add NanoPi K2") Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 60d5f2da6916..acb6797756e5 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -107,6 +107,9 @@ states = <3300000 0>, <1800000 1>; + + regulator-settling-time-up-us = <100>; + regulator-settling-time-down-us = <5000>; }; wifi_32k: wifi-32k { From 8a5085c420d272af04552b2d2213471247fa86f2 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:22 +0200 Subject: [PATCH 0029/1322] ARM64: dts: meson-gxl: libretech-cc: add card regulator settle times Changing the card voltage on the cc is not instantaneous, especially when switching from 3.3v to 1.8v. It takes at least 30ms for the regulator to go from 3.3v to 1.8v. Add margin to that to make sure we don't upset the sdcard during the voltage switch. Fixes: 61ff2af9b278 ("ARM64: dts: fixup libretech cc definition") Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts index a014c052241e..7d252168c2fa 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts @@ -91,6 +91,9 @@ states = <3300000 0>, <1800000 1>; + + regulator-settling-time-up-us = <200>; + regulator-settling-time-down-us = <50000>; }; vddio_boot: regulator-vddio_boot { From 3cde63ebc85cea63806d86a690d04457c0347703 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:23 +0200 Subject: [PATCH 0030/1322] ARM64: dts: meson-gxl: libretech-cc: enable high speed modes Enable sdcard UHS modes up to SDR50. Unfortunately, it seems the PCB of the libretech-cc cannot handle SDR104 at 200MHz reliably. Also enable eMMC DDR52 mode. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts index 7d252168c2fa..64c54c92e214 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxl-s905x-libretech-cc.dts @@ -205,6 +205,9 @@ bus-width = <4>; cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; max-frequency = <100000000>; disable-wp; @@ -224,6 +227,7 @@ bus-width = <8>; cap-mmc-highspeed; + mmc-ddr-3_3v; max-frequency = <50000000>; non-removable; disable-wp; From 0f553358241a3346b7eef133d631e5bc2f067a15 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:24 +0200 Subject: [PATCH 0031/1322] ARM64: dts: meson-gxbb: p20x: enable sdcard UHS modes Enable sdcard UHS modes, up to SDR50, on p20x based boards. While the s905 supports SDR104 mode, it appears that the PCB of p20x based boards can't cope with a rate as high as 200MHz.
Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi index 704b214e8894..23c08c3afd0a 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-p20x.dtsi @@ -226,6 +226,9 @@ bus-width = <4>; cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; max-frequency = <100000000>; disable-wp; From c1429e20a5a9f578e0e3ddb551c8ea94e8d3ddb3 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:25 +0200 Subject: [PATCH 0032/1322] ARM64: dts: meson-gxbb: nanopi-k2: enable sdcard UHS modes Enable UHS modes, up to SDR50, on the nanopi-k2 SBC. Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index acb6797756e5..4c1320a93fef 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -285,6 +285,9 @@ bus-width = <4>; cap-sd-highspeed; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; max-frequency = <100000000>; disable-wp; From 485a308f05d843034b6e82f688704c44888aecde Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 31 Aug 2017 15:52:26 +0200 Subject: [PATCH 0033/1322] ARM64: dts: meson-gxbb: nanopi-k2: enable sdr104 mode SDR104 seems to be OK on the nanopi-k2 SBC so enable it Signed-off-by: Jerome Brunet Signed-off-by: Kevin Hilman --- arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts index 4c1320a93fef..4b17a76959b2 100644 --- a/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts +++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-nanopi-k2.dts @@ -288,7 +288,8 @@ sd-uhs-sdr12; sd-uhs-sdr25; sd-uhs-sdr50; - max-frequency = <100000000>; + sd-uhs-sdr104; + max-frequency = <200000000>; disable-wp; cd-gpios = <&gpio CARD_6 GPIO_ACTIVE_HIGH>; From 184a09eb9a2fe425e49c9538f1604b05ed33cfef Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Wed, 6 Sep 2017 11:02:35 +0800 Subject: [PATCH 0034/1322] md/raid5: preserve STRIPE_ON_UNPLUG_LIST in break_stripe_batch_list In release_stripe_plug(), if a stripe_head has its STRIPE_ON_UNPLUG_LIST set, it indicates that this stripe_head is already in the raid5_plug_cb list and release_stripe() would be called instead to drop a reference count. Otherwise, the STRIPE_ON_UNPLUG_LIST bit would be set for this stripe_head and it will get queued into the raid5_plug_cb list. Since break_stripe_batch_list() did not preserve STRIPE_ON_UNPLUG_LIST, A stripe could be re-added to plug list while it is still on that list in the following situation. If stripe_head A is added to another stripe_head B's batch list, in this case A will have its batch_head != NULL and be added into the plug list. After that, stripe_head B gets handled and called break_stripe_batch_list() to reset all the batched stripe_head(including A which is still on the plug list)'s state and reset their batch_head to NULL. 
Before the plug list gets processed, if another write request comes in and gets stripe_head A, A will have its batch_head == NULL (cleared by calling break_stripe_batch_list() on B) and be added to the plug list once again. Signed-off-by: Dennis Yang Cc: stable@vger.kernel.org (v4.1+) Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 90414a4e0d49..34c01e83395a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4605,7 +4605,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; From ce06760ba46b66dae50f2519ae76bd15e89b5710 Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Thu, 31 Aug 2017 15:50:03 -0700 Subject: [PATCH 0035/1322] HID: wacom: bits shifted too much for 9th and 10th buttons Cintiq 12 has 10 expresskey buttons. The bit shift for the last two buttons was off by 5. Fixes: c7f0522 ("HID: wacom: Slim down wacom_intuos_pad processing") Signed-off-by: Ping Cheng Tested-by: Matthieu Robin Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index bb17d7bbefd3..f8ddcaaa2ac6 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -567,8 +567,8 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) keys = data[9] & 0x07; } } else { - buttons = ((data[6] & 0x10) << 10) | - ((data[5] & 0x10) << 9) | + buttons = ((data[6] & 0x10) << 5) | + ((data[5] & 0x10) << 4) | ((data[6] & 0x0F) << 4) | (data[5] & 0x0F); } From 74aebed6dc13425233f2224668353cff7a112776 Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Mon, 28 Aug 2017 14:15:39 -0700 Subject: [PATCH 0036/1322] HID: wacom: leds: Don't try to control the EKR's read-only LEDs Commit a50aac7193f1 introduces 'led.groups' and adds EKR support for these groups. However, unlike the other devices with LEDs, the EKR's LEDs are read-only and we shouldn't attempt to control them in wacom_led_control(). See bug: https://sourceforge.net/p/linuxwacom/bugs/342/ Fixes: a50aac7193f1 ("HID: wacom: leds: dynamically allocate LED groups") Cc: stable # 4.9 Signed-off-by: Aaron Armstrong Skomra Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index e82a696a1d07..735bfbbcaa82 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -766,6 +766,9 @@ static int wacom_led_control(struct wacom *wacom) if (!wacom->led.groups) return -ENOTSUPP; + if (wacom->wacom_wac.features.type == REMOTE) + return -ENOTSUPP; + if (wacom->wacom_wac.pid) { /* wireless connected */ report_id = WAC_CMD_WL_LED_CONTROL; buf_size = 13; From a3ae552b5259e6ffd8d124a9fa9923283af2828a Mon Sep 17 00:00:00 2001 From: Nicholas Bishop Date: Mon, 4 Sep 2017 15:40:42 -0400 Subject: [PATCH 0037/1322] HID: add multi-input quirk for IDC6680 touchscreen The Ideacom 6680 touchscreen is found in the Dell Latitude 2100. It has two USB descriptors, the first of which has two input reports. The HID_QUIRK_MULTI_INPUT quirk is needed to keep the correct maximum value for ABS_X/ABS_Y (8191 instead of 65535).
Signed-off-by: Nicholas Bishop Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/usbhid/hid-quirks.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index b397a14ab970..11798b10e522 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -533,6 +533,7 @@ #define USB_VENDOR_ID_IDEACOM 0x1cb6 #define USB_DEVICE_ID_IDEACOM_IDC6650 0x6650 #define USB_DEVICE_ID_IDEACOM_IDC6651 0x6651 +#define USB_DEVICE_ID_IDEACOM_IDC6680 0x6680 #define USB_VENDOR_ID_ILITEK 0x222a #define USB_DEVICE_ID_ILITEK_MULTITOUCH 0x0001 diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index a83fa76655b9..f489a5cfcb48 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -99,6 +99,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0A4A, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE, HID_QUIRK_ALWAYS_POLL }, + { USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_C007, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_C077, HID_QUIRK_ALWAYS_POLL }, { USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_KEYBOARD_G710_PLUS, HID_QUIRK_NOGET }, From e57f4e67a0c73ca1c3afb68928b917906bd82eaf Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 22 Aug 2017 08:37:52 +0200 Subject: [PATCH 0038/1322] HID: multitouch: Fix system-control buttons not working Some laptops have system-control buttons (e.g. KEY_SLEEP) on the same interface as a hid-multitouch touch-pad. This commit fixes these buttons not working. Signed-off-by: Hans de Goede Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-multitouch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 440b999304a5..5609725df714 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -930,6 +930,7 @@ static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi, field->application != HID_DG_PEN && field->application != HID_DG_TOUCHPAD && field->application != HID_GD_KEYBOARD && + field->application != HID_GD_SYSTEM_CONTROL && field->application != HID_CP_CONSUMER_CONTROL && field->application != HID_GD_WIRELESS_RADIO_CTLS && !(field->application == HID_VD_ASUS_CUSTOM_MEDIA_KEYS && From b63c4c2718d641ba9bec888994f0cb0c23a1ef45 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 30 Aug 2017 15:13:25 -0700 Subject: [PATCH 0039/1322] HID: wacom: Properly report negative values from Intuos Pro 2 Bluetooth The wacom driver's IRQ handler for Bluetooth reports from the 2nd-gen Intuos Pro does not correctly process negative numbers. Values for tilt and rotation (which can go negative) are instead interpreted as unsigned and so jump to very large values when the data should be negative. This commit properly casts the data to ensure we report negative numbers when necessary. 
Fixes: 4922cd2 ("HID: wacom: Support 2nd-gen Intuos Pro's Bluetooth classic interface") Cc: stable@vger.kernel.org # v4.11 Signed-off-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index f8ddcaaa2ac6..1d9e32d2bc63 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1229,9 +1229,9 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) if (range) { input_report_abs(pen_input, ABS_X, get_unaligned_le16(&frame[1])); input_report_abs(pen_input, ABS_Y, get_unaligned_le16(&frame[3])); - input_report_abs(pen_input, ABS_TILT_X, frame[7]); - input_report_abs(pen_input, ABS_TILT_Y, frame[8]); - input_report_abs(pen_input, ABS_Z, get_unaligned_le16(&frame[9])); + input_report_abs(pen_input, ABS_TILT_X, (char)frame[7]); + input_report_abs(pen_input, ABS_TILT_Y, (char)frame[8]); + input_report_abs(pen_input, ABS_Z, (int16_t)get_unaligned_le16(&frame[9])); input_report_abs(pen_input, ABS_WHEEL, get_unaligned_le16(&frame[11])); } input_report_abs(pen_input, ABS_PRESSURE, get_unaligned_le16(&frame[5])); From d252f4a10fb9c8f7187c6c936ff530039f8cb799 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 30 Aug 2017 15:13:26 -0700 Subject: [PATCH 0040/1322] HID: wacom: Correct coordinate system of touchring and pen twist The MobileStudio Pro, Cintiq Pro, and 2nd-gen Intuos Pro devices use a different coordinate system for their touchring and pen twist than prior devices. Prior devices had zero aligned to the tablet's left and would increase clockwise. Userspace expects data from the kernel to be in this old coordinate space, so adjustments are necessary. While the coordinate system for pen twist is formally defined by the HID standard, no such definition existed for the touchring at the time these tablets were introduced. Future tablets are expected to report touchring data using the same "zero-up clockwise-increasing" coordinate system defined for twist. 
Fixes: 50066a042d ("HID: wacom: generic: Add support for height, tilt, and twist usages") Fixes: 4922cd26f0 ("HID: wacom: Support 2nd-gen Intuos Pro's Bluetooth classic interface") Fixes: 60a2218698 ("HID: wacom: generic: add support for touchring") Cc: stable@vger.kernel.org # 4.10, 4.11 Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 73 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 1d9e32d2bc63..78d0398904dc 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1227,11 +1227,17 @@ static void wacom_intuos_pro2_bt_pen(struct wacom_wac *wacom) continue; if (range) { + /* Fix rotation alignment: userspace expects zero at left */ + int16_t rotation = (int16_t)get_unaligned_le16(&frame[9]); + rotation += 1800/4; + if (rotation > 899) + rotation -= 1800; + input_report_abs(pen_input, ABS_X, get_unaligned_le16(&frame[1])); input_report_abs(pen_input, ABS_Y, get_unaligned_le16(&frame[3])); input_report_abs(pen_input, ABS_TILT_X, (char)frame[7]); input_report_abs(pen_input, ABS_TILT_Y, (char)frame[8]); - input_report_abs(pen_input, ABS_Z, (int16_t)get_unaligned_le16(&frame[9])); + input_report_abs(pen_input, ABS_Z, rotation); input_report_abs(pen_input, ABS_WHEEL, get_unaligned_le16(&frame[11])); } input_report_abs(pen_input, ABS_PRESSURE, get_unaligned_le16(&frame[5])); @@ -1319,12 +1325,19 @@ static void wacom_intuos_pro2_bt_pad(struct wacom_wac *wacom) unsigned char *data = wacom->data; int buttons = (data[282] << 1) | ((data[281] >> 6) & 0x01); - int ring = data[285]; - int prox = buttons | (ring & 0x80); + int ring = data[285] & 0x7F; + bool ringstatus = data[285] & 0x80; + bool prox = buttons || ringstatus; + + /* Fix touchring data: userspace expects 0 at left and increasing clockwise */ + ring = 71 - ring; + ring += 3*72/16; + if (ring > 71) + ring -= 72; wacom_report_numbered_buttons(pad_input, 9, buttons); - input_report_abs(pad_input, ABS_WHEEL, (ring & 0x80) ? (ring & 0x7f) : 0); + input_report_abs(pad_input, ABS_WHEEL, ringstatus ? ring : 0); input_report_key(pad_input, wacom->tool[1], prox ? 1 : 0); input_report_abs(pad_input, ABS_MISC, prox ? PAD_DEVICE_ID : 0); @@ -1616,6 +1629,20 @@ static int wacom_tpc_irq(struct wacom_wac *wacom, size_t len) return 0; } +static int wacom_offset_rotation(struct input_dev *input, struct hid_usage *usage, + int value, int num, int denom) +{ + struct input_absinfo *abs = &input->absinfo[usage->code]; + int range = (abs->maximum - abs->minimum + 1); + + value += num*range/denom; + if (value > abs->maximum) + value -= range; + else if (value < abs->minimum) + value += range; + return value; +} + int wacom_equivalent_usage(int usage) { if ((usage & HID_USAGE_PAGE) == WACOM_HID_UP_WACOMDIGITIZER) { @@ -1898,6 +1925,7 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field unsigned equivalent_usage = wacom_equivalent_usage(usage->hid); int i; bool is_touch_on = value; + bool do_report = false; /* * Avoid reporting this event and setting inrange_state if this usage @@ -1912,6 +1940,29 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field } switch (equivalent_usage) { + case WACOM_HID_WD_TOUCHRING: + /* + * Userspace expects touchrings to increase in value with + * clockwise gestures and have their zero point at the + * tablet's left. 
HID events "should" be clockwise- + * increasing and zero at top, though the MobileStudio + * Pro and 2nd-gen Intuos Pro don't do this... + */ + if (hdev->vendor == 0x56a && + (hdev->product == 0x34d || hdev->product == 0x34e || /* MobileStudio Pro */ + hdev->product == 0x357 || hdev->product == 0x358)) { /* Intuos Pro 2 */ + value = (field->logical_maximum - value); + + if (hdev->product == 0x357 || hdev->product == 0x358) + value = wacom_offset_rotation(input, usage, value, 3, 16); + else if (hdev->product == 0x34d || hdev->product == 0x34e) + value = wacom_offset_rotation(input, usage, value, 1, 2); + } + else { + value = wacom_offset_rotation(input, usage, value, 1, 4); + } + do_report = true; + break; case WACOM_HID_WD_TOUCHRINGSTATUS: if (!value) input_event(input, usage->type, usage->code, 0); @@ -1945,10 +1996,14 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field value, i); /* fall through*/ default: + do_report = true; + break; + } + + if (do_report) { input_event(input, usage->type, usage->code, value); if (value) wacom_wac->hid_data.pad_input_event_flag = true; - break; } } @@ -2089,6 +2144,14 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); wacom_wac->serial[0] |= (__u32)value; return; + case HID_DG_TWIST: + /* + * Userspace expects pen twist to have its zero point when + * the buttons/finger is on the tablet's left. HID values + * are zero when buttons are toward the top. + */ + value = wacom_offset_rotation(input, usage, value, 1, 4); + break; case WACOM_HID_WD_SENSE: wacom_wac->hid_data.sense_state = value; return; From 56d859e11aad4016e1cf864d65a0954d83120571 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 25 Aug 2017 18:06:04 -0400 Subject: [PATCH 0041/1322] HID: multitouch: support buttons and trackpoint on Lenovo X1 Tab Gen2 On the 2nd generation Lenovo Tablet only clickpad is working; the trackpoint and three mouse buttons do not work. hid_multitouch must export all inputs in order to get trackpoint and buttons to function. 
Signed-off-by: Pavel Tatashin Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-multitouch.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 11798b10e522..a98919199858 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -661,6 +661,7 @@ #define USB_DEVICE_ID_LENOVO_CBTKBD 0x6048 #define USB_DEVICE_ID_LENOVO_TPPRODOCK 0x6067 #define USB_DEVICE_ID_LENOVO_X1_COVER 0x6085 +#define USB_DEVICE_ID_LENOVO_X1_TAB 0x60a3 #define USB_VENDOR_ID_LG 0x1fd2 #define USB_DEVICE_ID_LG_MULTITOUCH 0x0064 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 5609725df714..9e8c4d2ba11d 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1420,6 +1420,12 @@ static const struct hid_device_id mt_devices[] = { USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP) }, + /* Lenovo X1 TAB Gen 2 */ + { .driver_data = MT_CLS_WIN_8_DUAL, + HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH_WIN_8, + USB_VENDOR_ID_LENOVO, + USB_DEVICE_ID_LENOVO_X1_TAB) }, + /* Anton devices */ { .driver_data = MT_CLS_EXPORT_ALL_INPUTS, MT_USB_DEVICE(USB_VENDOR_ID_ANTON, From fcaa4a07d2a4b541e91da7a55d8b3331f96d1865 Mon Sep 17 00:00:00 2001 From: Shrirang Bagul Date: Thu, 10 Aug 2017 17:54:01 +0800 Subject: [PATCH 0042/1322] HID: multitouch: Support ALPS PTP stick with pid 0x120A This patch adds ALPS PTP sticks with pid/device id 0x120A to the list of devices supported by hid-multitouch. Signed-off-by: Shrirang Bagul Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-multitouch.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index a98919199858..832897d4a849 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -75,6 +75,7 @@ #define USB_VENDOR_ID_ALPS_JP 0x044E #define HID_DEVICE_ID_ALPS_U1_DUAL 0x120B +#define HID_DEVICE_ID_ALPS_U1_PTP_2 0x120A #define HID_DEVICE_ID_ALPS_U1_DUAL_PTP 0x121F #define HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP 0x1220 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 9e8c4d2ba11d..c78625dceced 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1419,6 +1419,10 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP) }, + { .driver_data = MT_CLS_WIN_8_DUAL, + HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, + USB_VENDOR_ID_ALPS_JP, + HID_DEVICE_ID_ALPS_U1_PTP_2) }, /* Lenovo X1 TAB Gen 2 */ { .driver_data = MT_CLS_WIN_8_DUAL, From 23e4c67ae13da7549a54f4e8c0014f48b2ef5204 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 23:45:34 +0200 Subject: [PATCH 0043/1322] ata: avoid gcc-7 warning in ata_timing_quantize gcc-7 warns about the result of a constant multiplication used as a boolean: drivers/ata/libata-core.c: In function 'ata_timing_quantize': drivers/ata/libata-core.c:3164:30: warning: '*' in boolean context, suggest '&&' instead [-Wint-in-bool-context] This slightly rearranges the macro to simplify the code and avoid the warning at the same time. 
Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 1945a8ea2099..ee4c1ec9dca0 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3234,19 +3234,19 @@ static const struct ata_timing ata_timing[] = { }; #define ENOUGH(v, unit) (((v)-1)/(unit)+1) -#define EZ(v, unit) ((v)?ENOUGH(v, unit):0) +#define EZ(v, unit) ((v)?ENOUGH(((v) * 1000), unit):0) static void ata_timing_quantize(const struct ata_timing *t, struct ata_timing *q, int T, int UT) { - q->setup = EZ(t->setup * 1000, T); - q->act8b = EZ(t->act8b * 1000, T); - q->rec8b = EZ(t->rec8b * 1000, T); - q->cyc8b = EZ(t->cyc8b * 1000, T); - q->active = EZ(t->active * 1000, T); - q->recover = EZ(t->recover * 1000, T); - q->dmack_hold = EZ(t->dmack_hold * 1000, T); - q->cycle = EZ(t->cycle * 1000, T); - q->udma = EZ(t->udma * 1000, UT); + q->setup = EZ(t->setup, T); + q->act8b = EZ(t->act8b, T); + q->rec8b = EZ(t->rec8b, T); + q->cyc8b = EZ(t->cyc8b, T); + q->active = EZ(t->active, T); + q->recover = EZ(t->recover, T); + q->dmack_hold = EZ(t->dmack_hold, T); + q->cycle = EZ(t->cycle, T); + q->udma = EZ(t->udma, UT); } void ata_timing_merge(const struct ata_timing *a, const struct ata_timing *b, From 59cd827f26019ac790b2f34cbad478037f51c570 Mon Sep 17 00:00:00 2001 From: Matt Chen Date: Mon, 28 Aug 2017 14:57:54 +0800 Subject: [PATCH 0044/1322] iwlwifi: mvm: fix wowlan resume failed to load INIT ucode If we set disconnect on wowlan and run suspend/resume, will run into: ...snipped iwlwifi 0000:01:00.0: Failed to load firmware chunk! iwlwifi 0000:01:00.0: Could not load the [0] uCode section iwlwifi 0000:01:00.0: Failed to start INIT ucode: -110 iwlwifi 0000:01:00.0: Failed to run INIT ucode: -110 iwlwifi 0000:01:00.0: Failed to start RT ucode: -110 It is because we still keep IWL_MVM_STATUS_IN_HW_RESTART in __iwl_mvm_resume. When mac80211 starts the device as __iwl_mvm_mac_start(), we will miss iwl_mvm_restart_cleanup(mvm). Signed-off-by: Matt Chen Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 5de19ea10575..b205a7bfb828 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -2167,7 +2167,7 @@ out: * 1. We are not using a unified image * 2. We are using a unified image but had an error while exiting D3 */ - set_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status); + set_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, &mvm->status); set_bit(IWL_MVM_STATUS_D3_RECONFIG, &mvm->status); /* * When switching images we return 1, which causes mac80211 From 6110d9e5bdd15c4e60fb67f330fbf74681e7daf7 Mon Sep 17 00:00:00 2001 From: David Spinadel Date: Tue, 29 Aug 2017 13:56:02 +0300 Subject: [PATCH 0045/1322] iwlwifi: mvm: Flush non STA TX queues When starting wowlan mac80211 requests flush w/o vif and we ignore this request. As a result some packets stay stuck in the queue and it may end up with a queue hang. Allow the driver to flush queues even if station isn't specified. 
Signed-off-by: David Spinadel Signed-off-by: Luca Coelho --- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 15f2d826bb4b..64b0be73ea72 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -3975,6 +3975,43 @@ out_unlock: return ret; } +static void iwl_mvm_flush_no_vif(struct iwl_mvm *mvm, u32 queues, bool drop) +{ + if (drop) { + if (iwl_mvm_has_new_tx_api(mvm)) + /* TODO new tx api */ + WARN_ONCE(1, + "Need to implement flush TX queue\n"); + else + iwl_mvm_flush_tx_path(mvm, + iwl_mvm_flushable_queues(mvm) & queues, + 0); + } else { + if (iwl_mvm_has_new_tx_api(mvm)) { + struct ieee80211_sta *sta; + int i; + + mutex_lock(&mvm->mutex); + + for (i = 0; i < ARRAY_SIZE(mvm->fw_id_to_mac_id); i++) { + sta = rcu_dereference_protected( + mvm->fw_id_to_mac_id[i], + lockdep_is_held(&mvm->mutex)); + if (IS_ERR_OR_NULL(sta)) + continue; + + iwl_mvm_wait_sta_queues_empty(mvm, + iwl_mvm_sta_from_mac80211(sta)); + } + + mutex_unlock(&mvm->mutex); + } else { + iwl_trans_wait_tx_queues_empty(mvm->trans, + queues); + } + } +} + static void iwl_mvm_mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop) { @@ -3985,7 +4022,12 @@ static void iwl_mvm_mac_flush(struct ieee80211_hw *hw, int i; u32 msk = 0; - if (!vif || vif->type != NL80211_IFTYPE_STATION) + if (!vif) { + iwl_mvm_flush_no_vif(mvm, queues, drop); + return; + } + + if (vif->type != NL80211_IFTYPE_STATION) return; /* Make sure we're done with the deferred traffic before flushing */ From 0fe8bed6e37c259b85d123ef9667f972305c9d6b Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Thu, 31 Aug 2017 16:27:06 +0300 Subject: [PATCH 0046/1322] iwlwifi: mvm: send all non-bufferable frames on the probe queue AP interfaces now send all non-bufferable frames using the broadcast station. Thus allow them to use the probe queue and don't warn about it. Fixes: eb045e6e0389 ("iwlwifi: mvm: Avoid deferring non bufferable frames") Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 172b5e63d3fb..6f2e2af23219 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -564,8 +564,8 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: /* - * Handle legacy hostapd as well, where station will be added - * only just before sending the association response. + * Non-bufferable frames use the broadcast station, thus they + * use the probe queue. * Also take care of the case where we send a deauth to a * station that we don't have, or similarly an association * response (with non-success status) for a station we can't @@ -573,9 +573,9 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm, * Also, disassociate frames might happen, particular with * reason 7 ("Class 3 frame received from nonassociated STA"). 
*/ - if (ieee80211_is_probe_resp(fc) || ieee80211_is_auth(fc) || - ieee80211_is_deauth(fc) || ieee80211_is_assoc_resp(fc) || - ieee80211_is_disassoc(fc)) + if (ieee80211_is_mgmt(fc) && + (!ieee80211_is_bufferable_mmpdu(fc) || + ieee80211_is_deauth(fc) || ieee80211_is_disassoc(fc))) return mvm->probe_queue; if (info->hw_queue == info->control.vif->cab_queue) return mvmvif->cab_queue; From bd800e41a3de5c7e56b2fd27088bdaf5e228d227 Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Mon, 28 Aug 2017 11:51:05 +0300 Subject: [PATCH 0047/1322] iwlwifi: mvm: change state when queueing agg start work Add a new state to enum iwl_mvm_agg_state, which is used between queueing the work that starts tx aggregations and actually starting that work (changing to state IWL_AGG_STARTING). This solves a race where ieee80211_start_tx_ba_session is called a second time, before the work queued by the first run has a chance to change the agg_state. In this case the second call to ieee80211_start_tx_ba_session returns an error, and the fallback is to abort the ba session start. Fixes: 482e48440a0e ("iwlwifi: mvm: change open and close criteria of a BA session") Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 3 ++- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 6 ++++-- drivers/net/wireless/intel/iwlwifi/mvm/sta.h | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index ba7bd049d3d4..0fe723ca844e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -661,7 +661,8 @@ static void rs_tl_turn_on_agg(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta, (lq_sta->tx_agg_tid_en & BIT(tid)) && (tid_data->tx_count_last >= IWL_MVM_RS_AGG_START_THRESHOLD)) { IWL_DEBUG_RATE(mvm, "try to aggregate tid %d\n", tid); - rs_tl_turn_on_agg_for_tid(mvm, lq_sta, tid, sta); + if (rs_tl_turn_on_agg_for_tid(mvm, lq_sta, tid, sta) == 0) + tid_data->state = IWL_AGG_QUEUED; } } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 411a2055dc45..2dafe9bb4d8b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -2385,8 +2385,10 @@ int iwl_mvm_sta_tx_agg_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif, if (WARN_ON_ONCE(tid >= IWL_MAX_TID_COUNT)) return -EINVAL; - if (mvmsta->tid_data[tid].state != IWL_AGG_OFF) { - IWL_ERR(mvm, "Start AGG when state is not IWL_AGG_OFF %d!\n", + if (mvmsta->tid_data[tid].state != IWL_AGG_QUEUED && + mvmsta->tid_data[tid].state != IWL_AGG_OFF) { + IWL_ERR(mvm, + "Start AGG when state is not IWL_AGG_QUEUED or IWL_AGG_OFF %d!\n", mvmsta->tid_data[tid].state); return -ENXIO; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h index d13893806513..aedabe101cf0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h @@ -281,6 +281,7 @@ struct iwl_mvm_vif; * These states relate to a specific RA / TID. 
* * @IWL_AGG_OFF: aggregation is not used + * @IWL_AGG_QUEUED: aggregation start work has been queued * @IWL_AGG_STARTING: aggregation are starting (between start and oper) * @IWL_AGG_ON: aggregation session is up * @IWL_EMPTYING_HW_QUEUE_ADDBA: establishing a BA session - waiting for the @@ -290,6 +291,7 @@ struct iwl_mvm_vif; */ enum iwl_mvm_agg_state { IWL_AGG_OFF = 0, + IWL_AGG_QUEUED, IWL_AGG_STARTING, IWL_AGG_ON, IWL_EMPTYING_HW_QUEUE_ADDBA, From 8458e48ac7ad86a5ab7f3d1a8cacd9205a9a97ce Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Sun, 3 Sep 2017 16:04:38 +0300 Subject: [PATCH 0048/1322] iwlwifi: mvm: wake the correct mac80211 queue iwl_mvm_start_mac_queues() takes a bitmap of the queues to wake. When deferred tx is purged, set the bit of the hw_queue so the correct queue will be waken up. Fixes: 7e39a00d5931 ("iwlwifi: mvm: start mac queues when deferred tx frames are purged") Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 64b0be73ea72..3a6ce4222ff5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2563,7 +2563,7 @@ static void iwl_mvm_purge_deferred_tx_frames(struct iwl_mvm *mvm, * queues, so we should never get a second deferred * frame for the RA/TID. */ - iwl_mvm_start_mac_queues(mvm, info->hw_queue); + iwl_mvm_start_mac_queues(mvm, BIT(info->hw_queue)); ieee80211_free_txskb(mvm->hw, skb); } } From 97bce57bd7f96e1218751996f549a6e61f18cc8c Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 1 Sep 2017 17:59:15 +0300 Subject: [PATCH 0049/1322] iwlwifi: mvm: use IWL_HCMD_NOCOPY for MCAST_FILTER_CMD The MCAST_FILTER_CMD can get quite large when we have many mcast addresses to set (we support up to 255). 
So the command should be sent as NOCOPY to prevent a warning caused by too-long commands: WARNING: CPU: 0 PID: 9700 at /root/iwlwifi/stack-dev/drivers/net/wireless/intel/iwlwifi/pcie/tx.c:1550 iwl_pcie_enqueue_hcmd+0x8c7/0xb40 [iwlwifi] Command MCAST_FILTER_CMD (0x1d0) is too large (328 bytes) This fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196743 Cc: stable@vger.kernel.org Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 3a6ce4222ff5..635db63f972e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1546,6 +1546,11 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, struct iwl_mvm_mc_iter_data *data = _data; struct iwl_mvm *mvm = data->mvm; struct iwl_mcast_filter_cmd *cmd = mvm->mcast_filter_cmd; + struct iwl_host_cmd hcmd = { + .id = MCAST_FILTER_CMD, + .flags = CMD_ASYNC, + .dataflags[0] = IWL_HCMD_DFL_NOCOPY, + }; int ret, len; /* if we don't have free ports, mcast frames will be dropped */ @@ -1560,7 +1565,10 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, memcpy(cmd->bssid, vif->bss_conf.bssid, ETH_ALEN); len = roundup(sizeof(*cmd) + cmd->count * ETH_ALEN, 4); - ret = iwl_mvm_send_cmd_pdu(mvm, MCAST_FILTER_CMD, CMD_ASYNC, len, cmd); + hcmd.len[0] = len; + hcmd.data[0] = cmd; + + ret = iwl_mvm_send_cmd(mvm, &hcmd); if (ret) IWL_ERR(mvm, "mcast filter cmd error. ret=%d\n", ret); } From 61e7d91bcf7725b9fcd9cbfc5fa0e0f84f19e6de Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 1 Sep 2017 18:57:35 +0300 Subject: [PATCH 0050/1322] iwlwifi: mvm: handle FIF_ALLMULTI when setting multicast addresses We were ignoring the FIF_ALLMULTI flag when setting the multicast addresses with MCAST_FILTER_CMD. Check if this flag is set and enable pass_all accordingly. We also need to set the count to 0 if pass_all is enabled so we don't pass addresses to the firmware when not needed (as doing so causes an assert). This fixes https://bugzilla.kernel.org/show_bug.cgi?id=196741 Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 635db63f972e..3bcaa82f59b2 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1643,6 +1643,12 @@ static void iwl_mvm_configure_filter(struct ieee80211_hw *hw, if (!cmd) goto out; + if (changed_flags & FIF_ALLMULTI) + cmd->pass_all = !!(*total_flags & FIF_ALLMULTI); + + if (cmd->pass_all) + cmd->count = 0; + iwl_mvm_recalc_multicast(mvm); out: mutex_unlock(&mvm->mutex); From 3f497de997c7ed34ad8a90b64f1ca53a41d428b4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 2 Sep 2017 11:05:22 +0300 Subject: [PATCH 0051/1322] iwlwifi: mvm: initialize status in iwl_mvm_add_int_sta_common() We always need to initialize the status argument to the success case before calling iwl_mvm_send_cmd_status() or iwl_mvm_send_cmd_pdu_status() (which calls the former) otherwise we may get an uninitialized value back. In this case, we use ADD_STA_SUCCESS as success.
Fixes: 732d06e9d9cf ("iwlwifi: mvm: add station before allocating a queue") Reported by: Dan Carpenter Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 2dafe9bb4d8b..c4a343534c5e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -1285,7 +1285,7 @@ static int iwl_mvm_add_int_sta_common(struct iwl_mvm *mvm, { struct iwl_mvm_add_sta_cmd cmd; int ret; - u32 status; + u32 status = ADD_STA_SUCCESS; lockdep_assert_held(&mvm->mutex); From d460f1fb83a44833a09c8eaa34b30ce553cab8c5 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 2 Sep 2017 11:25:40 +0300 Subject: [PATCH 0052/1322] iwlwifi: mvm: set status before calling iwl_mvm_send_cmd_status() We always must set the status to what we consider success before calling iwl_mvm_send_cmd_status() (also iwl_mvm_send_cmd_pdu_status() which calls it). Fix a few places where initialization is missing. Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index 50983615dce6..774122fed454 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -555,7 +555,7 @@ static int iwl_mvm_lmac_scan_abort(struct iwl_mvm *mvm) struct iwl_host_cmd cmd = { .id = SCAN_OFFLOAD_ABORT_CMD, }; - u32 status; + u32 status = CAN_ABORT_STATUS; ret = iwl_mvm_send_cmd_status(mvm, &cmd, &status); if (ret) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index 8876c2abc440..4d907f60bce9 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -529,6 +529,7 @@ int iwl_mvm_ctdp_command(struct iwl_mvm *mvm, u32 op, u32 state) lockdep_assert_held(&mvm->mutex); + status = 0; ret = iwl_mvm_send_cmd_pdu_status(mvm, WIDE_ID(PHY_OPS_GROUP, CTDP_CONFIG_CMD), sizeof(cmd), &cmd, &status); From 5f90472c00ddf1e64c2865f71cced297bd5f80a2 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 4 Sep 2017 20:27:04 +0300 Subject: [PATCH 0053/1322] iwlwifi: mvm: fix reorder buffer for 9000 devices The condition to check if reorder buffer ran out of space is faulty, as it takes into account only the NSSN. In case the head SN was too far behind the reorder buffer should move forward, regardless of the NSSN status. This caused the driver to release packets out of order in some scenarios. Fixes: b915c10174fb ("iwlwifi: mvm: add reorder buffer per queue") Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 67ffd9774712..77f77bc5d083 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -672,11 +672,12 @@ static bool iwl_mvm_reorder(struct iwl_mvm *mvm, * If there was a significant jump in the nssn - adjust. 
* If the SN is smaller than the NSSN it might need to first go into * the reorder buffer, in which case we just release up to it and the - * rest of the function will take of storing it and releasing up to the - * nssn + * rest of the function will take care of storing it and releasing up to + * the nssn */ if (!iwl_mvm_is_sn_less(nssn, buffer->head_sn + buffer->buf_size, - buffer->buf_size)) { + buffer->buf_size) || + !ieee80211_sn_less(sn, buffer->head_sn + buffer->buf_size)) { u16 min_sn = ieee80211_sn_less(sn, nssn) ? sn : nssn; iwl_mvm_release_frames(mvm, sta, napi, buffer, min_sn); From cac72b990d34f4c70208998a86f910ba38253c94 Mon Sep 17 00:00:00 2001 From: Lyude Date: Sat, 22 Jul 2017 21:15:09 -0400 Subject: [PATCH 0054/1322] HID: rmi: Make sure the HID device is opened on resume So it looks like that suspend/resume has actually always been broken on hid-rmi. The fact it worked was a rather silly coincidence that was relying on the HID device to already be opened upon resume. This means that so long as anything was reading the /dev/input/eventX node for for an RMI device, it would suspend and resume correctly. As well, if nothing happened to be keeping the HID device away it would shut off, then the RMI driver would get confused on resume when it stopped responding and explode. So, call hid_hw_open() in rmi_post_resume() so we make sure that the device is alive before we try talking to it. This fixes RMI device suspend/resume over HID. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196851 [jkosina@suse.cz: removed useless hunk that was zero-initializing 'ret'] Signed-off-by: Lyude Cc: Andrew Duggan Cc: stable@vger.kernel.org Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-rmi.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-rmi.c b/drivers/hid/hid-rmi.c index 5b40c2614599..ef241d66562e 100644 --- a/drivers/hid/hid-rmi.c +++ b/drivers/hid/hid-rmi.c @@ -436,17 +436,24 @@ static int rmi_post_resume(struct hid_device *hdev) if (!(data->device_flags & RMI_DEVICE)) return 0; - ret = rmi_reset_attn_mode(hdev); + /* Make sure the HID device is ready to receive events */ + ret = hid_hw_open(hdev); if (ret) return ret; + ret = rmi_reset_attn_mode(hdev); + if (ret) + goto out; + ret = rmi_driver_resume(rmi_dev, false); if (ret) { hid_warn(hdev, "Failed to resume device: %d\n", ret); - return ret; + goto out; } - return 0; +out: + hid_hw_close(hdev); + return ret; } #endif /* CONFIG_PM */ From 01f5bbd17a8066b58dba9b5049fad504bce67322 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 7 Sep 2017 10:40:35 +0300 Subject: [PATCH 0055/1322] mmc: block: Fix incorrectly initialized requests mmc_init_request() depends on card->bouncesz so it must be calculated before blk_init_allocated_queue() starts allocating requests. 
Reported-by: Seraphime Kirkovski Fixes: 304419d8a7e9 ("mmc: core: Allocate per-request data using the..") Signed-off-by: Adrian Hunter Tested-by: Seraphime Kirkovski Cc: Signed-off-by: Ulf Hansson Tested-by: Pavel Machek --- drivers/mmc/core/queue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index affa7370ba82..74c663b1c0a7 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -242,6 +242,12 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; + /* + * mmc_init_request() depends on card->bouncesz so it must be calculated + * before blk_init_allocated_queue() starts allocating requests. + */ + card->bouncesz = mmc_queue_calc_bouncesz(host); + mq->card = card; mq->queue = blk_alloc_queue(GFP_KERNEL); if (!mq->queue) @@ -265,7 +271,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); - card->bouncesz = mmc_queue_calc_bouncesz(host); if (card->bouncesz) { blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); blk_queue_max_segments(mq->queue, card->bouncesz / 512); From b4f146f5faf85d21f5cd414531c3ced9c1b61e3c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 5 Sep 2017 20:27:50 +0200 Subject: [PATCH 0056/1322] mmc: host: fix typo after MMC_DEBUG move MMC_DEBUG was moved and one letter got strangely capitalized. Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 02179ed2a40d..8c15637178ff 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -5,7 +5,7 @@ comment "MMC/SD/SDIO Host Controller Drivers" config MMC_DEBUG - bool "MMC host drivers debugginG" + bool "MMC host drivers debugging" depends on MMC != n help This is an option for use by developers; most people should From b917c6d18c031cfce11ec35139033845f205b1d0 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Thu, 7 Sep 2017 13:24:17 +0200 Subject: [PATCH 0057/1322] mmc: cavium: Fix use-after-free in of_platform_device_destroy KASAN reported the following: [ 19.338655] ================================================================== [ 19.345946] BUG: KASAN: use-after-free in of_platform_device_destroy+0x88/0x100 [ 19.345966] Read of size 8 at addr fffffe01aa6f1468 by task systemd-udevd/264 [ 19.345983] CPU: 1 PID: 264 Comm: systemd-udevd Not tainted 4.13.0-jang+ #737 [ 19.345989] Hardware name: Cavium ThunderX CN81XX board (DT) [ 19.345995] Call trace: [ 19.346013] [] dump_backtrace+0x0/0x368 [ 19.346026] [] show_stack+0x24/0x30 [ 19.346040] [] dump_stack+0xa4/0xc8 [ 19.346057] [] print_address_description+0x68/0x258 [ 19.346070] [] kasan_report+0x238/0x2f8 [ 19.346082] [] __asan_load8+0x88/0xb8 [ 19.346098] [] of_platform_device_destroy+0x88/0x100 [ 19.346131] [] thunder_mmc_probe+0x314/0x550 [thunderx_mmc] [ 19.346147] [] pci_device_probe+0x158/0x1f8 [ 19.346162] [] driver_probe_device+0x394/0x5f8 [ 19.346174] [] __driver_attach+0x154/0x158 [ 19.346185] [] bus_for_each_dev+0xdc/0x140 [ 19.346196] [] driver_attach+0x38/0x48 [ 19.346207] [] bus_add_driver+0x290/0x3c8 [ 19.346219] [] driver_register+0xbc/0x1a0 [ 19.346232] [] __pci_register_driver+0xc4/0xd8 [ 19.346260] [] thunder_mmc_driver_init+0x24/0x10000 [thunderx_mmc] [ 19.346273] [] 
do_one_initcall+0x98/0x1c0 [ 19.346289] [] do_init_module+0xe0/0x2cc [ 19.346303] [] load_module+0x3238/0x35c0 [ 19.346318] [] SyS_finit_module+0x190/0x1a0 [ 19.346329] [] __sys_trace_return+0x0/0x4 This is caused by: platform_device_register() -> platform_device_unregister(to_platform_device(dev)) freeing struct device -> of_node_clear_flag(dev->of_node, ...) writing to the freed device The issue is solved by increasing the reference count before calling of_platform_device_destroy() so freeing the device is postponed after the call. Fixes: 8fb83b142823 ("mmc: cavium: Fix probing race with regulator") Signed-off-by: Jan Glauber Signed-off-by: Ulf Hansson --- drivers/mmc/host/cavium-thunderx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/cavium-thunderx.c b/drivers/mmc/host/cavium-thunderx.c index b9cc95998799..eee08d81b242 100644 --- a/drivers/mmc/host/cavium-thunderx.c +++ b/drivers/mmc/host/cavium-thunderx.c @@ -7,6 +7,7 @@ * * Copyright (C) 2016 Cavium Inc. */ +#include #include #include #include @@ -149,8 +150,11 @@ error: for (i = 0; i < CAVIUM_MAX_MMC; i++) { if (host->slot[i]) cvm_mmc_of_slot_remove(host->slot[i]); - if (host->slot_pdev[i]) + if (host->slot_pdev[i]) { + get_device(&host->slot_pdev[i]->dev); of_platform_device_destroy(&host->slot_pdev[i]->dev, NULL); + put_device(&host->slot_pdev[i]->dev); + } } clk_disable_unprepare(host->clk); return ret; From bfaa1ce809bbcd12b1399409ab1dbf0cdaba6e27 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Sep 2017 15:13:33 +0100 Subject: [PATCH 0058/1322] drm/amdkfd: check for null dev to avoid a null pointer dereference The call to kfd_device_by_id can potentially return null, so check that dev is null and return with -EINVAL to avoid a null pointer dereference. Detected by CoverityScan CID#1454629 ("Dereference null return value") Fixes: 5d71dbc3a588 ("drm/amdkfd: Implement image tiling mode support v2") Signed-off-by: Colin Ian King Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index e4a8c2e52cb2..660b3fbade41 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -892,6 +892,8 @@ static int kfd_ioctl_get_tile_config(struct file *filep, int err = 0; dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; dev->kfd2kgd->get_tile_config(dev->kgd, &config); From b0e07da3f5c8d069d186a7983ff64eaebf2ea230 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 11 Sep 2017 11:39:50 +0200 Subject: [PATCH 0059/1322] qxl: fix primary surface handling The atomic conversion of the qxl driver didn't got the primary surface handling completely right. It works in the common simple cases, but fails for example when changing the display resolution using xrandr or in multihead setups. The rules are simple: There is one primary surface. Before defining a new one you have to destroy the old one. This patch makes qxl_primary_atomic_update() destroy the primary surface before defining a new one. It fixes is_primary flag updates. It adds is_primary checks so we don't try to update the primary surface in case it already has the state we want it being in. 
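For illustration only, a stand-alone model of the one-primary rule described above (stub create/destroy helpers and made-up names, not the qxl code itself), showing the order the update hook now follows: tear down the old primary first, create the new one only if needed, and do nothing when the framebuffer did not actually change.

#include <stdbool.h>
#include <stdio.h>

struct bo { const char *name; bool is_primary; };

static void create_primary(struct bo *bo)
{
	printf("create primary on %s\n", bo->name);
	bo->is_primary = true;
}

static void destroy_primary(struct bo *bo)
{
	printf("destroy primary on %s\n", bo->name);
	bo->is_primary = false;
}

/* At most one primary surface exists: destroy the old one before
 * defining a new one, and skip the work when nothing changed. */
static void update_primary(struct bo *old_bo, struct bo *new_bo)
{
	if (old_bo == new_bo)
		return;
	if (old_bo && old_bo->is_primary)
		destroy_primary(old_bo);
	if (!new_bo->is_primary)
		create_primary(new_bo);
}

int main(void)
{
	struct bo a = { "bo-a", false }, b = { "bo-b", false };

	update_primary(NULL, &a);	/* first modeset */
	update_primary(&a, &b);		/* xrandr resolution change */
	update_primary(&b, &b);		/* no change, nothing to do */
	return 0;
}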
Fixes: 3538e80a869b ("drm: qxl: Atomic phase 1: Implement mode_set_nofb") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102338 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196777 Signed-off-by: Gerd Hoffmann Reviewed-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170911093950.22401-1-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_display.c | 36 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index 03fe182203ce..7babdd8f20fd 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -512,23 +512,25 @@ static void qxl_primary_atomic_update(struct drm_plane *plane, .y2 = qfb->base.height }; - if (!old_state->fb) { - qxl_io_log(qdev, - "create primary fb: %dx%d,%d,%d\n", - bo->surf.width, bo->surf.height, - bo->surf.stride, bo->surf.format); - - qxl_io_create_primary(qdev, 0, bo); - bo->is_primary = true; - return; - - } else { + if (old_state->fb) { qfb_old = to_qxl_framebuffer(old_state->fb); bo_old = gem_to_qxl_bo(qfb_old->obj); + } else { + bo_old = NULL; + } + + if (bo == bo_old) + return; + + if (bo_old && bo_old->is_primary) { + qxl_io_destroy_primary(qdev); bo_old->is_primary = false; } - bo->is_primary = true; + if (!bo->is_primary) { + qxl_io_create_primary(qdev, 0, bo); + bo->is_primary = true; + } qxl_draw_dirty_fb(qdev, qfb, bo, 0, 0, &norect, 1, 1); } @@ -537,13 +539,15 @@ static void qxl_primary_atomic_disable(struct drm_plane *plane, { struct qxl_device *qdev = plane->dev->dev_private; - if (old_state->fb) - { struct qxl_framebuffer *qfb = + if (old_state->fb) { + struct qxl_framebuffer *qfb = to_qxl_framebuffer(old_state->fb); struct qxl_bo *bo = gem_to_qxl_bo(qfb->obj); - qxl_io_destroy_primary(qdev); - bo->is_primary = false; + if (bo->is_primary) { + qxl_io_destroy_primary(qdev); + bo->is_primary = false; + } } } From bf2afee14e07de16d3cafc67edbfc2a3cc65e4bc Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 8 Sep 2017 10:37:35 +1000 Subject: [PATCH 0060/1322] cifs: check rsp for NULL before dereferencing in SMB2_open In SMB2_open there are several paths where the SendReceive2 call will return an error before it sets rsp_iov.iov_base thus leaving iov_base uninitialized. Thus we need to check rsp before we dereference it in the call to get_rfc1002_length(). A report of this issue was previously reported in http://www.spinics.net/lists/linux-cifs/msg12846.html RH-bugzilla : 1476151 Version 2 : * Lets properly initialize rsp_iov before we use it. Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky . 
Signed-off-by: Steve French Reported-by: Xiaoli Feng CC: Stable --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5531e7ee1210..69a751b038ab 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1634,7 +1634,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct cifs_tcon *tcon = oparms->tcon; struct cifs_ses *ses = tcon->ses; struct kvec iov[4]; - struct kvec rsp_iov; + struct kvec rsp_iov = {NULL, 0}; int resp_buftype; int uni_path_len; __le16 *copy_path = NULL; @@ -1763,7 +1763,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); - if (err_buf) + if (err_buf && rsp) *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, GFP_KERNEL); goto creat_exit; From 5a642e6bc49f59922e19ebd639e74f72753fc77b Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 8 Sep 2017 16:24:32 +0200 Subject: [PATCH 0061/1322] etnaviv: fix submit error path If the gpu submit fails, bail out to avoid accessing a potentially unititalized fence. CC: stable@vger.kernel.org #4.12+ Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index 6463fc2c736f..b95362186f9c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -445,8 +445,10 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, cmdbuf->user_size = ALIGN(args->stream_size, 8); ret = etnaviv_gpu_submit(gpu, submit, cmdbuf); - if (ret == 0) - cmdbuf = NULL; + if (ret) + goto out; + + cmdbuf = NULL; if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) { /* From 518417525f3652c12fb5fad6da4ade66c0072fa3 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 11 Sep 2017 15:29:31 +0200 Subject: [PATCH 0062/1322] etnaviv: fix gem object list corruption All manipulations of the gem_object list need to be protected by the list mutex, as GEM objects can be created and freed in parallel. This fixes a kernel memory corruption. CC: stable@vger.kernel.org Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 9a3bea738330..87b95eeedd9e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -551,12 +551,15 @@ static const struct etnaviv_gem_ops etnaviv_gem_shmem_ops = { void etnaviv_gem_free_object(struct drm_gem_object *obj) { struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); + struct etnaviv_drm_private *priv = obj->dev->dev_private; struct etnaviv_vram_mapping *mapping, *tmp; /* object should not be active */ WARN_ON(is_active(etnaviv_obj)); + mutex_lock(&priv->gem_lock); list_del(&etnaviv_obj->gem_node); + mutex_unlock(&priv->gem_lock); list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list, obj_node) { From fc3100d64f0ae383ae8d845989103da06d62763b Mon Sep 17 00:00:00 2001 From: Pu Hou Date: Tue, 5 Sep 2017 05:17:24 +0200 Subject: [PATCH 0063/1322] s390/perf: fix bug when creating per-thread event A per-thread event could not be created correctly like below: perf record --per-thread -e rB0000 -- sleep 1 Error: The sys_perf_event_open() syscall returned with 19 (No such device) for event (rB0000). 
/bin/dmesg may provide additional information. No CONFIG_PERF_EVENTS=y kernel support configured? This bug was introduced by: commit c311c797998c1e70eade463dd60b843da4f1a203 Author: Alexey Dobriyan Date: Mon May 8 15:56:15 2017 -0700 cpumask: make "nr_cpumask_bits" unsigned If a per-thread event is not attached to any CPU, the cpu field in struct perf_event is -1. The above commit converts the CPU number to unsigned int, which result in an illegal CPU number. Fixes: c311c797998c ("cpumask: make "nr_cpumask_bits" unsigned") Cc: # v4.12+ Cc: Alexey Dobriyan Acked-by: Heiko Carstens Signed-off-by: Pu Hou Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index c1bf75ffb875..7e1e40323b78 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,9 +823,12 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if ((unsigned int)event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) - return -ENODEV; + if (event->cpu >= 0) { + if ((unsigned int)event->cpu >= nr_cpumask_bits) + return -ENODEV; + if (!cpu_online(event->cpu)) + return -ENODEV; + } /* Force reset of idle/hv excludes regardless of what the * user requested. From 673514aff5d797b0cdb3f09097ae0e253b5667c3 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Tue, 12 Sep 2017 17:22:42 +0200 Subject: [PATCH 0064/1322] s390/dasd: fix race during dasd initialization Fix a panic in blk_mq_hctx_has_pending() that is caused by a racy call to blk_mq_run_hw_queues in a dasd function that might get called with the request queue not yet initialized during initialization. 
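As a rough sketch of the race (stand-in names, not the dasd structures): the handler may be invoked while the block device is still being brought up, so it has to tolerate a request queue that is not there yet.

#include <stdio.h>

struct block_dev { int *request_queue; };

static void run_hw_queues(int *q)
{
	printf("running hw queues (%d)\n", *q);
}

static void handle_state_change(struct block_dev *block)
{
	/* The check the patch adds: skip the run when initialization has
	 * not progressed far enough to provide a queue. */
	if (block->request_queue)
		run_hw_queues(block->request_queue);
	else
		printf("request queue not ready, skipping\n");
}

int main(void)
{
	static int queue = 1;
	struct block_dev dev = { NULL };

	handle_state_change(&dev);	/* during initialization */
	dev.request_queue = &queue;
	handle_state_change(&dev);	/* after setup has finished */
	return 0;
}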
Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index ea19b4ff87a2..29f35e29d480 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1644,7 +1644,9 @@ void dasd_generic_handle_state_change(struct dasd_device *device) dasd_schedule_device_bh(device); if (device->block) { dasd_schedule_block_bh(device->block); - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->request_queue) + blk_mq_run_hw_queues(device->block->request_queue, + true); } } EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); @@ -3759,7 +3761,9 @@ int dasd_generic_path_operational(struct dasd_device *device) dasd_schedule_device_bh(device); if (device->block) { dasd_schedule_block_bh(device->block); - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->request_queue) + blk_mq_run_hw_queues(device->block->request_queue, + true); } if (!device->stopped) @@ -4025,7 +4029,9 @@ int dasd_generic_restore_device(struct ccw_device *cdev) if (device->block) { dasd_schedule_block_bh(device->block); - blk_mq_run_hw_queues(device->block->request_queue, true); + if (device->block->request_queue) + blk_mq_run_hw_queues(device->block->request_queue, + true); } clear_bit(DASD_FLAG_SUSPENDED, &device->flags); From 8320caeeffdefec3b58b9d4a7ed8e1079492fe7b Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Fri, 8 Sep 2017 10:55:27 -0700 Subject: [PATCH 0065/1322] HID: i2c-hid: allocate hid buffers for real worst case The buffer allocation is not currently accounting for an extra byte for the report id. This can cause an out of bounds access in function i2c_hid_set_or_send_report() with reportID > 15. Cc: stable@vger.kernel.org Signed-off-by: Adrian Salido Reviewed-by: Benson Leung Signed-off-by: Guenter Roeck Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index 77396145d2d0..9145c2129a96 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -543,7 +543,8 @@ static int i2c_hid_alloc_buffers(struct i2c_hid *ihid, size_t report_size) { /* the worst case is computed from the set_report command with a * reportID > 15 and the maximum report length */ - int args_len = sizeof(__u8) + /* optional ReportID byte */ + int args_len = sizeof(__u8) + /* ReportID */ + sizeof(__u8) + /* optional ReportID byte */ sizeof(__u16) + /* data register */ sizeof(__u16) + /* size of the report */ report_size; /* report */ From 993f0d93f8538c15bd5c12a1a9fd74c777efea1b Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Thu, 7 Sep 2017 17:44:12 -0700 Subject: [PATCH 0066/1322] HID: wacom: generic: Send MSC_SERIAL and ABS_MISC when leaving prox The latest generation of pro devices (MobileStudio Pro, 2nd-gen Intuos Pro, Cintiq Pro) send a serial number of '0' whenever the pen is too far away for reliable communication. Userspace defines that a serial number of '0' is invalid, so we need to be careful not to actually forward this value. Additionally, since EMR ISDv4 devices do not support serial numbers or tool IDs, we'd like to not send these events if they aren't necessary. The existing code achieves these goals by adding a check for a non-zero serial number within the wacom_wac_pen_report function. 
The MSC_SERIAL and ABS_MISC events are only sent if the serial number is non-zero. This code fails, however when the pen for a pro device leaves proximity. When the pen leaves prox and the tablet sends a serial of 0, wacom_wac_pen_event dutifully clears the serial number. When wacom_wac_pen_report is called, it does not send either the MSC_SERIAL of the exiting tool nor an ABS_MISC event. This patch prevents the wacom_wac_pen_event function from clearing an already-set serial number. This ensures that we have the serial number handy when exiting proximity, but requires us to manually clear it afterwards to ensure the driver does not send stale data (e.g. when switching between AES pens that report a serial nubmer of 0 for the first few fully in-proximity packets). Fixes: f85c9dc678 ("HID: wacom: generic: Support tool ID and additional tool types") Cc: stable # v4.10 Signed-off-by: Ping Cheng Signed-off-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 78d0398904dc..e2ba36da6e20 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2141,8 +2141,10 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field wacom_wac->hid_data.tipswitch |= value; return; case HID_DG_TOOLSERIALNUMBER: - wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); - wacom_wac->serial[0] |= (__u32)value; + if (value) { + wacom_wac->serial[0] = (wacom_wac->serial[0] & ~0xFFFFFFFFULL); + wacom_wac->serial[0] |= (__u32)value; + } return; case HID_DG_TWIST: /* @@ -2156,15 +2158,17 @@ static void wacom_wac_pen_event(struct hid_device *hdev, struct hid_field *field wacom_wac->hid_data.sense_state = value; return; case WACOM_HID_WD_SERIALHI: - wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF); - wacom_wac->serial[0] |= ((__u64)value) << 32; - /* - * Non-USI EMR devices may contain additional tool type - * information here. See WACOM_HID_WD_TOOLTYPE case for - * more details. - */ - if (value >> 20 == 1) { - wacom_wac->id[0] |= value & 0xFFFFF; + if (value) { + wacom_wac->serial[0] = (wacom_wac->serial[0] & 0xFFFFFFFF); + wacom_wac->serial[0] |= ((__u64)value) << 32; + /* + * Non-USI EMR devices may contain additional tool type + * information here. See WACOM_HID_WD_TOOLTYPE case for + * more details. + */ + if (value >> 20 == 1) { + wacom_wac->id[0] |= value & 0xFFFFF; + } } return; case WACOM_HID_WD_TOOLTYPE: @@ -2279,6 +2283,7 @@ static void wacom_wac_pen_report(struct hid_device *hdev, if (!prox) { wacom_wac->tool[0] = 0; wacom_wac->id[0] = 0; + wacom_wac->serial[0] = 0; } } From 92380b572d95caf48f8424746aeee63c5a2b1922 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Thu, 7 Sep 2017 17:47:38 -0700 Subject: [PATCH 0067/1322] HID: wacom: generic: Clear ABS_MISC when tool leaves proximity The tool ID information sent in ABS_MISC is expected to be reset to 0 when a tool leaves proximity. Not doing this can cause problems if a tool is removed and then re-introduced. Kernel event filtering will prevent the (identical) ABS_MISC event from being sent when the tool re-enters proxmity. This can cause userspace to not properly set the tool ID. 
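To see why the reset matters, here is a toy model of the value filtering mentioned above (stand-alone C, not the input core's real implementation): an ABS value that did not change is simply not delivered again, so a stale non-zero tool ID left over from the previous proximity session is filtered out when the pen comes back.

#include <stdio.h>

/* Last value seen on the ABS_MISC axis; repeats are dropped, which is
 * the event filtering the text above refers to. */
static int last_misc = -1;

static void report_abs_misc(int value)
{
	if (value == last_misc) {
		printf("ABS_MISC %#x filtered (unchanged)\n", value);
		return;
	}
	last_misc = value;
	printf("ABS_MISC %#x delivered\n", value);
}

int main(void)
{
	int tool_id = 0x822;	/* arbitrary example tool ID */

	report_abs_misc(tool_id);	/* pen enters proximity */
	/* Old behaviour: prox-out without reporting 0 ... */
	report_abs_misc(tool_id);	/* ... so re-entry gets filtered */

	/* Fixed behaviour: report 0 on the way out, then the ID again. */
	report_abs_misc(0);
	report_abs_misc(tool_id);
	return 0;
}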
Fixes: f85c9dc678 ("HID: wacom: generic: Support tool ID and additional tool types") Cc: stable # v4.10 Signed-off-by: Ping Cheng Signed-off-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index e2ba36da6e20..aa692e28b2cd 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2272,7 +2272,7 @@ static void wacom_wac_pen_report(struct hid_device *hdev, input_report_key(input, wacom_wac->tool[0], prox); if (wacom_wac->serial[0]) { input_event(input, EV_MSC, MSC_SERIAL, wacom_wac->serial[0]); - input_report_abs(input, ABS_MISC, id); + input_report_abs(input, ABS_MISC, prox ? id : 0); } wacom_wac->hid_data.tipswitch = false; From 4cf97582b46f123a4b7cd88d999f1806c2eb4093 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 11 Sep 2017 17:43:56 +0200 Subject: [PATCH 0068/1322] drm/amdgpu: revert tile table update for oland MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several users have complained that the tile table update broke Oland support. Despite several attempts to fix it, the root cause is still unknown at this point and no solution is available. As it is not acceptable to leave a known regression breaking a major functionality in the kernel for several releases, let's just reverse this optimization for now. It can be implemented again later if and only if the breakage is understood and fixed. As there were no complaints for Hainan so far, only the Oland part of the offending commit is reverted. Optimization is preserved on Hainan, so this commit isn't an actual revert of the original. This fixes bug #194761: https://bugzilla.kernel.org/show_bug.cgi?id=194761 Reviewed-by: Marek Olšák Signed-off-by: Jean Delvare Fixes: f8d9422ef80c ("drm/amdgpu: update tile table for oland/hainan") Cc: Flora Cui Cc: Junwei Zhang Cc: Alex Deucher Cc: Marek Olšák Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 189 +++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index d228f5a99044..dbbe986f90f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -636,7 +636,194 @@ static void gfx_v6_0_tiling_mode_table_init(struct amdgpu_device *adev) NUM_BANKS(ADDR_SURF_2_BANK); for (reg_offset = 0; reg_offset < num_tile_mode_states; reg_offset++) WREG32(mmGB_TILE_MODE0 + reg_offset, tilemode[reg_offset]); - } else if (adev->asic_type == CHIP_OLAND || adev->asic_type == CHIP_HAINAN) { + } else if (adev->asic_type == CHIP_OLAND) { + tilemode[0] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[1] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_128B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[2] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + 
PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[3] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_128B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[4] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[5] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[6] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[7] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[8] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_LINEAR_ALIGNED) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[9] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[10] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[11] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[12] = MICRO_TILE_MODE(ADDR_SURF_DISPLAY_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + 
BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[13] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_64B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[14] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[15] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[16] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[17] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + TILE_SPLIT(split_equal_to_row_size) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[21] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_2) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[22] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_4) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_4); + tilemode[23] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_256B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_2) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[24] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_512B) | + NUM_BANKS(ADDR_SURF_16_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[25] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THIN1) | + PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | + TILE_SPLIT(ADDR_SURF_TILE_SPLIT_1KB) | + NUM_BANKS(ADDR_SURF_8_BANK) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_1); + for (reg_offset = 0; reg_offset < num_tile_mode_states; 
reg_offset++) + WREG32(mmGB_TILE_MODE0 + reg_offset, tilemode[reg_offset]); + } else if (adev->asic_type == CHIP_HAINAN) { tilemode[0] = MICRO_TILE_MODE(ADDR_SURF_DEPTH_MICRO_TILING) | ARRAY_MODE(ARRAY_2D_TILED_THIN1) | PIPE_CONFIG(ADDR_SURF_P2) | From b468b6a4969f9bdddb31d484f151bfa03fbee767 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:36 +0200 Subject: [PATCH 0069/1322] scsi: scsi_transport_fc: fix NULL pointer dereference in fc_bsg_job_timeout bsg-lib now embeddeds the job structure into the request, and req->special can't be used anymore. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 3c6bc0081fcb..ba9d70f8a6a1 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3571,7 +3571,7 @@ fc_vport_sched_delete(struct work_struct *work) static enum blk_eh_timer_return fc_bsg_job_timeout(struct request *req) { - struct bsg_job *job = (void *) req->special; + struct bsg_job *job = blk_mq_rq_to_pdu(req); struct Scsi_Host *shost = fc_bsg_to_shost(job); struct fc_rport *rport = fc_bsg_to_rport(job); struct fc_internal *i = to_fc_internal(shost->transportt); From fd536998314a9dc54f384fa16287321edb81602a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Sep 2017 22:00:57 +0200 Subject: [PATCH 0070/1322] scsi: acornscsi: fix build error A cleanup patch introduced a fatal typo from inbalanced curly braces: drivers/scsi/arm/acornscsi.c: In function 'acornscsi_host_reset': drivers/scsi/arm/acornscsi.c:2773:1: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] drivers/scsi/arm/acornscsi.c:2795:12: error: invalid storage class for function 'acornscsi_show_info' static int acornscsi_show_info(struct seq_file *m, struct Scsi_Host *instance) The same patch incorrectly changed the argument type of the reset handler, as shown by this warning: drivers/scsi/arm/acornscsi.c:2888:27: error: initialization of 'int (*)(struct scsi_cmnd *)' from incompatible pointer type 'int (*)(struct Scsi_Host *)' [-Werror=incompatible-pointer-types] .eh_host_reset_handler = acornscsi_host_reset, This removes one the extraneous opening brace and reverts the argument type change. [mkp: fixed checkpatch complaint] Fixes: 74fa80ee3fae ("scsi: acornscsi: move bus reset to host reset") Signed-off-by: Arnd Bergmann Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. 
Petersen --- drivers/scsi/arm/acornscsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/arm/acornscsi.c b/drivers/scsi/arm/acornscsi.c index 690816f3c6af..421fe869a11e 100644 --- a/drivers/scsi/arm/acornscsi.c +++ b/drivers/scsi/arm/acornscsi.c @@ -2725,9 +2725,9 @@ int acornscsi_abort(struct scsi_cmnd *SCpnt) * Params : SCpnt - command causing reset * Returns : one of SCSI_RESET_ macros */ -int acornscsi_host_reset(struct Scsi_Host *shpnt) +int acornscsi_host_reset(struct scsi_cmnd *SCpnt) { - AS_Host *host = (AS_Host *)shpnt->hostdata; + AS_Host *host = (AS_Host *)SCpnt->device->host->hostdata; struct scsi_cmnd *SCptr; host->stats.resets += 1; @@ -2741,7 +2741,7 @@ int acornscsi_host_reset(struct Scsi_Host *shpnt) printk(KERN_WARNING "acornscsi_reset: "); print_sbic_status(asr, ssr, host->scsi.phase); - for (devidx = 0; devidx < 9; devidx ++) { + for (devidx = 0; devidx < 9; devidx++) acornscsi_dumplog(host, devidx); } #endif From e785fa0a164aa11001cba931367c7f94ffaff888 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 13 Sep 2017 00:21:21 +0200 Subject: [PATCH 0071/1322] nl80211: check for the required netlink attributes presence nl80211_set_rekey_data() does not check if the required attributes NL80211_REKEY_DATA_{REPLAY_CTR,KEK,KCK} are present when processing NL80211_CMD_SET_REKEY_OFFLOAD request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attributes presence. This patch is based on the patch by bo Zhang. This fixes CVE-2017-12153. References: https://bugzilla.redhat.com/show_bug.cgi?id=1491046 Fixes: e5497d766ad ("cfg80211/nl80211: support GTK rekey offload") Cc: # v3.1-rc1 Reported-by: bo Zhang Signed-off-by: Vladis Dronov Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8023f480b..fbd5593e88cb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10903,6 +10903,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) From f7f3dc00f61261cdc9ccd8b886f21bc4dffd6fd9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Sep 2017 19:08:21 +0200 Subject: [PATCH 0072/1322] x86/cpu/AMD: Fix erratum 1076 (CPB bit) CPUID Fn8000_0007_EDX[CPB] is wrongly 0 on models up to B1. But they do support CPB (AMD's Core Performance Boosting cpufreq CPU feature), so fix that. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sherry Hurwitz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170907170821.16021-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9862e2cd6d93..d58184b7cd44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -763,6 +763,16 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } } +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + /* + * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects + * all up to and including B1. 
+ */ + if (c->x86_model <= 1 && c->x86_mask <= 1) + set_cpu_cap(c, X86_FEATURE_CPB); +} + static void init_amd(struct cpuinfo_x86 *c) { early_init_amd(c); @@ -791,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x17: init_amd_zn(c); break; } /* Enable workaround for FXSAVE leak */ From 0998b7a0befdf6e734032895ee639a5e6f88cc3f Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Thu, 14 Sep 2017 08:01:38 +0200 Subject: [PATCH 0073/1322] objtool: Fix memory leak in elf_create_rela_section() Let's free the allocated char array 'relaname' before returning, in order to avoid leaking memory. Signed-off-by: Martin Kepplinger Acked-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mingo.kernel.org@gmail.com Link: http://lkml.kernel.org/r/20170914060138.26472-1-martink@posteo.de Signed-off-by: Ingo Molnar --- tools/objtool/elf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 6e9f980a7d26..1e89a5f8bfc9 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -508,6 +508,7 @@ struct section *elf_create_rela_section(struct elf *elf, struct section *base) strcat(relaname, base->name); sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0); + free(relaname); if (!sec) return NULL; From df968c9329f6e5cf3596a0a54adb6f749747a746 Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Fri, 15 Sep 2017 02:15:05 -0500 Subject: [PATCH 0074/1322] objtool: Do not retrieve data from empty sections Binutils 2.29-9 in Debian return an error when elf_getdata is invoked on empty section (.note.GNU-stack in all kernel files), causing immediate failure of kernel build with: elf_getdata: can't manipulate null section As nothing is done with sections that have zero size, just do not retrieve their data at all. 
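The same rule can be reproduced in isolation with a small stand-alone libelf walker (a sketch, not objtool's read_sections(); build with gcc -lelf): only ask libelf for section data when sh_size is non-zero.

#include <fcntl.h>
#include <gelf.h>
#include <libelf.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	Elf_Scn *scn = NULL;
	GElf_Shdr sh;
	Elf *elf;
	int fd;

	if (argc != 2)
		return 1;
	if (elf_version(EV_CURRENT) == EV_NONE)
		return 1;

	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	elf = elf_begin(fd, ELF_C_READ, NULL);
	if (!elf) {
		close(fd);
		return 1;
	}

	while ((scn = elf_nextscn(elf, scn)) != NULL) {
		if (!gelf_getshdr(scn, &sh))
			continue;

		/* Empty sections such as .note.GNU-stack carry no data,
		 * so do not ask for it at all. */
		if (sh.sh_size == 0)
			continue;

		if (!elf_getdata(scn, NULL))
			fprintf(stderr, "elf_getdata failed\n");
	}

	elf_end(elf);
	close(fd);
	return 0;
}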
Signed-off-by: Petr Vandrovec Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2ce30a44349065b70d0f00e71e286dc0cbe745e6.1505459652.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/elf.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 1e89a5f8bfc9..b4cd8bc62521 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -175,19 +175,20 @@ static int read_sections(struct elf *elf) return -1; } - sec->data = elf_getdata(s, NULL); - if (!sec->data) { - WARN_ELF("elf_getdata"); - return -1; + if (sec->sh.sh_size != 0) { + sec->data = elf_getdata(s, NULL); + if (!sec->data) { + WARN_ELF("elf_getdata"); + return -1; + } + if (sec->data->d_off != 0 || + sec->data->d_size != sec->sh.sh_size) { + WARN("unexpected data attributes for %s", + sec->name); + return -1; + } } - - if (sec->data->d_off != 0 || - sec->data->d_size != sec->sh.sh_size) { - WARN("unexpected data attributes for %s", sec->name); - return -1; - } - - sec->len = sec->data->d_size; + sec->len = sec->sh.sh_size; } /* sanity check, one more call to elf_nextscn() should return NULL */ From 97dab2ae7e8473a821f72a039ead0f36b12ba22d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 15 Sep 2017 02:17:11 -0500 Subject: [PATCH 0075/1322] objtool: Fix object file corruption Arnd Bergmann reported that a randconfig build was failing with the following link error: built-in.o: member arch/x86/kernel/time.o in archive is not an object It turns out the link failed because the time.o file had been corrupted by objtool: nm: arch/x86/kernel/time.o: File format not recognized In certain rare cases, when a .o file's ORC table is very small, the .data section size doesn't change because it's page aligned. Because all the existing sections haven't changed size, libelf doesn't detect any section header changes, and so it doesn't update the section header table properly. Instead it writes junk in the section header entries for the new ORC sections. Make sure libelf properly updates the section header table by setting the ELF_F_DIRTY flag in the top level elf struct. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 627fce14809b ("objtool: Add ORC unwind table generation") Link: http://lkml.kernel.org/r/e650fd0f2d8a209d1409a9785deb101fdaed55fb.1505459813.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/elf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index b4cd8bc62521..24460155c82c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -563,6 +563,7 @@ int elf_write(struct elf *elf) struct section *sec; Elf_Scn *s; + /* Update section headers for changed sections: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->changed) { s = elf_getscn(elf->elf, sec->idx); @@ -570,13 +571,17 @@ int elf_write(struct elf *elf) WARN_ELF("elf_getscn"); return -1; } - if (!gelf_update_shdr (s, &sec->sh)) { + if (!gelf_update_shdr(s, &sec->sh)) { WARN_ELF("gelf_update_shdr"); return -1; } } } + /* Make sure the new section header entries get updated properly. */ + elf_flagelf(elf->elf, ELF_C_SET, ELF_F_DIRTY); + + /* Write all changes to the file. 
*/ if (elf_update(elf->elf, ELF_C_WRITE) < 0) { WARN_ELF("elf_update"); return -1; From 820608548737e315c6f93e3099b4e65bde062334 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 15 Sep 2017 11:55:27 -0400 Subject: [PATCH 0076/1322] drm/radeon: disable hard reset in hibernate for APUs Fixes a hibernation regression on APUs. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=191571 Fixes: 274ad65c9d02bdc (drm/radeon: hard reset r600 and newer GPU when hibernating.) Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 997131d58c7f..ffc10cadcf34 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1663,7 +1663,7 @@ int radeon_suspend_kms(struct drm_device *dev, bool suspend, radeon_agp_suspend(rdev); pci_save_state(dev->pdev); - if (freeze && rdev->family >= CHIP_CEDAR) { + if (freeze && rdev->family >= CHIP_CEDAR && !(rdev->flags & RADEON_IS_IGP)) { rdev->asic->asic_reset(rdev, true); pci_restore_state(dev->pdev); } else if (suspend) { From 9c95be0163a0a699ed7eda4727f485f8453580ef Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 13 Sep 2017 16:29:46 +0200 Subject: [PATCH 0077/1322] scsi: sd: Remove unnecessary condition in sd_read_block_limits() After series of changes around WRITE_SAME and UNMAP setup we ended up with leftover unnecessary condition. Remove it. Signed-off-by: Lukas Czerner Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 11c1738c2100..fb9f8b5f4673 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2915,8 +2915,6 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sd_config_discard(sdkp, SD_LBP_WS16); else if (sdkp->lbpws10) sd_config_discard(sdkp, SD_LBP_WS10); - else if (sdkp->lbpu && sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); else sd_config_discard(sdkp, SD_LBP_DISABLE); } From 4759df905a474d245752c9dc94288e779b8734dd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:15 +0200 Subject: [PATCH 0078/1322] scsi: sg: factor out sg_fill_request_table() Factor out sg_fill_request_table() for better readability. [mkp: typos, applied by hand] Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 61 +++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index cf0e71db9e51..1acdb6e03999 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -828,6 +828,40 @@ static int max_sectors_bytes(struct request_queue *q) return max_sectors << 9; } +static void +sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) +{ + Sg_request *srp; + int val; + unsigned int ms; + + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { + if (val > SG_MAX_QUEUE) + break; + memset(&rinfo[val], 0, SZ_SG_REQ_INFO); + rinfo[val].req_state = srp->done + 1; + rinfo[val].problem = + srp->header.masked_status & + srp->header.host_status & + srp->header.driver_status; + if (srp->done) + rinfo[val].duration = + srp->header.duration; + else { + ms = jiffies_to_msecs(jiffies); + rinfo[val].duration = + (ms > srp->header.duration) ? 
+ (ms - srp->header.duration) : 0; + } + rinfo[val].orphan = srp->orphan; + rinfo[val].sg_io_owned = srp->sg_io_owned; + rinfo[val].pack_id = srp->header.pack_id; + rinfo[val].usr_ptr = srp->header.usr_ptr; + val++; + } +} + static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) { @@ -1012,38 +1046,13 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return -EFAULT; else { sg_req_info_t *rinfo; - unsigned int ms; rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); - val = 0; - list_for_each_entry(srp, &sfp->rq_list, entry) { - if (val >= SG_MAX_QUEUE) - break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); - rinfo[val].req_state = srp->done + 1; - rinfo[val].problem = - srp->header.masked_status & - srp->header.host_status & - srp->header.driver_status; - if (srp->done) - rinfo[val].duration = - srp->header.duration; - else { - ms = jiffies_to_msecs(jiffies); - rinfo[val].duration = - (ms > srp->header.duration) ? - (ms - srp->header.duration) : 0; - } - rinfo[val].orphan = srp->orphan; - rinfo[val].sg_io_owned = srp->sg_io_owned; - rinfo[val].pack_id = srp->header.pack_id; - rinfo[val].usr_ptr = srp->header.usr_ptr; - val++; - } + sg_fill_request_table(sfp, rinfo); read_unlock_irqrestore(&sfp->rq_list_lock, iflags); result = __copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); From 3e0097499839e0fe3af380410eababe5a47c4cf9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:16 +0200 Subject: [PATCH 0079/1322] scsi: sg: fixup infoleak when using SG_GET_REQUEST_TABLE When calling SG_GET_REQUEST_TABLE ioctl only a half-filled table is returned; the remaining part will then contain stale kernel memory information. This patch zeroes out the entire table to avoid this issue. Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Eric Dumazet Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 1acdb6e03999..0419c2298eab 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -839,7 +839,6 @@ sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) list_for_each_entry(srp, &sfp->rq_list, entry) { if (val > SG_MAX_QUEUE) break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); rinfo[val].req_state = srp->done + 1; rinfo[val].problem = srp->header.masked_status & @@ -1047,8 +1046,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) else { sg_req_info_t *rinfo; - rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, - GFP_KERNEL); + rinfo = kzalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, + GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); From 6c92f7dbf25c36f35320e4ae0b508676410bac04 Mon Sep 17 00:00:00 2001 From: Dave Carroll Date: Fri, 15 Sep 2017 11:04:28 -0600 Subject: [PATCH 0080/1322] scsi: aacraid: Fix 2T+ drives on SmartIOC-2000 The logic for supporting large drives was previously tied to 4Kn support for SmartIOC-2000. As SmartIOC-2000 does not support volumes using 4Kn drives, use the intended option flag AAC_OPT_NEW_COMM_64 to determine support for volumes greater than 2T. Cc: Signed-off-by: Dave Carroll Reviewed-by: Christoph Hellwig Reviewed-by: Raghava Aditya Renukunta Signed-off-by: Martin K. 
Petersen --- drivers/scsi/aacraid/aachba.c | 12 ++++++------ drivers/scsi/aacraid/aacraid.h | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index a64285ab0728..af3e4d3f9735 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -699,13 +699,13 @@ static void _aac_probe_container1(void * context, struct fib * fibptr) int status; dresp = (struct aac_mount *) fib_data(fibptr); - if (!(fibptr->dev->supplement_adapter_info.supported_options2 & - AAC_OPTION_VARIABLE_BLOCK_SIZE)) + if (!aac_supports_2T(fibptr->dev)) { dresp->mnt[0].capacityhigh = 0; - if ((le32_to_cpu(dresp->status) != ST_OK) || - (le32_to_cpu(dresp->mnt[0].vol) != CT_NONE)) { - _aac_probe_container2(context, fibptr); - return; + if ((le32_to_cpu(dresp->status) == ST_OK) && + (le32_to_cpu(dresp->mnt[0].vol) != CT_NONE)) { + _aac_probe_container2(context, fibptr); + return; + } } scsicmd = (struct scsi_cmnd *) context; diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h index 92fabf2b0c24..403a639574e5 100644 --- a/drivers/scsi/aacraid/aacraid.h +++ b/drivers/scsi/aacraid/aacraid.h @@ -2701,6 +2701,11 @@ static inline int aac_is_src(struct aac_dev *dev) return 0; } +static inline int aac_supports_2T(struct aac_dev *dev) +{ + return (dev->adapter_info.options & AAC_OPT_NEW_COMM_64); +} + char * get_container_type(unsigned type); extern int numacb; extern char aac_driver_version[]; From 51ae253870f55e14ba2854ce9577ac2920efef0c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 21:29:13 +0200 Subject: [PATCH 0081/1322] xen: x86: mark xen_find_pt_base as __init gcc-4.6 causes a harmless link-time warning: WARNING: vmlinux.o(.text.unlikely+0x48e): Section mismatch in reference from the function xen_find_pt_base() to the function .init.text:m2p() The function xen_find_pt_base() references the function __init m2p(). This is often because xen_find_pt_base lacks a __init annotation or the annotation of m2p is wrong. Newer compilers inline this function, so it never shows up, but marking it __init is the right way to avoid the warning. Fixes: 70e61199559a ("xen: move p2m list if conflicting with e820 map") Signed-off-by: Arnd Bergmann Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 0422ee7e70b3..ddfeebc2b125 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2221,7 +2221,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) * not the first page table in the page table pool. * Iterate through the initial page tables to find the real page table base. */ -static phys_addr_t xen_find_pt_base(pmd_t *pmd) +static phys_addr_t __init xen_find_pt_base(pmd_t *pmd) { phys_addr_t pt_base, paddr; unsigned pmdidx; From fd7d56270b526ca3ed0c224362e3c64a0f86687a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 14 Sep 2017 11:42:17 +0200 Subject: [PATCH 0082/1322] fs/proc: Report eip/esp in /prod/PID/stat for coredumping Commit 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in /proc/PID/stat") stopped reporting eip/esp because it is racy and dangerous for executing tasks. The comment adds: As far as I know, there are no use programs that make any material use of these fields, so just get rid of them. 
However, existing userspace core-dump-handler applications (for example, minicoredumper) are using these fields since they provide an excellent cross-platform interface to these valuable pointers. So that commit introduced a user space visible regression. Partially revert the change and make the readout possible for tasks with the proper permissions and only if the target task has the PF_DUMPCORE flag set. Fixes: 0a1eb2d474ed ("fs/proc: Stop reporting eip and esp in> /proc/PID/stat") Reported-by: Marco Felsch Signed-off-by: John Ogness Reviewed-by: Andy Lutomirski Cc: Tycho Andersen Cc: Kees Cook Cc: Peter Zijlstra Cc: Brian Gerst Cc: stable@vger.kernel.org Cc: Tetsuo Handa Cc: Borislav Petkov Cc: Al Viro Cc: Linux API Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/87poatfwg6.fsf@linutronix.de Signed-off-by: Thomas Gleixner --- fs/proc/array.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/proc/array.c b/fs/proc/array.c index 88c355574aa0..525157ca25cb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -421,7 +422,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * esp and eip are intentionally zeroed out. There is no * non-racy way to read them without freezing the task. * Programs that need reliable values can use ptrace(2). + * + * The only exception is if the task is core dumping because + * a program is not able to use ptrace(2) in that case. It is + * safe because the task has stopped executing permanently. */ + if (permitted && (task->flags & PF_DUMPCORE)) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } } get_task_comm(tcomm, task); From 5c756065e47dc3e84b00577bd109f0a8e69903d7 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 6 Sep 2017 11:02:56 +0200 Subject: [PATCH 0083/1322] scsi: lpfc: Don't return internal MBXERR_ERROR code from probe function Internal error codes happen to be positive, thus the PCI driver core won't treat them as failure, but we do. This would cause a crash later on as lpfc_pci_remove_one() is called (e.g. as shutdown function). Fixes: 6d368e532168 ("[SCSI] lpfc 8.3.24: Add resource extent support") Signed-off-by: Stefano Brivio Reviewed-by: Johannes Thumshirn Acked-by: Dick Kennedy Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 7e7ae786121b..100bc4c8798d 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -6131,6 +6131,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) "Extents and RPI headers enabled.\n"); } mempool_free(mboxq, phba->mbox_mem_pool); + rc = -EIO; goto out_free_bsmbx; } From 4cb433e856bce5974ea035181cc8eb406496dccc Mon Sep 17 00:00:00 2001 From: Nikola Pajkovsky Date: Wed, 13 Sep 2017 10:46:17 +0200 Subject: [PATCH 0084/1322] scsi: aacraid: error: testing array offset 'bus' after use Fix possible indexing array of bound for &aac->hba_map[bus][cid], where bus and cid boundary check happens later. Fixes: 0d643ff3c353 ("scsi: aacraid: use aac_tmf_callback for reset fib") Signed-off-by: Nikola Pajkovsky Reviewed-by: Dave Carroll Signed-off-by: Martin K. 
Petersen --- drivers/scsi/aacraid/linit.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 87cc4a93e637..62beb2596466 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -906,12 +906,14 @@ static int aac_eh_dev_reset(struct scsi_cmnd *cmd) bus = aac_logical_to_phys(scmd_channel(cmd)); cid = scmd_id(cmd); - info = &aac->hba_map[bus][cid]; - if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS || - info->devtype != AAC_DEVTYPE_NATIVE_RAW) + + if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS) return FAILED; - if (info->reset_state > 0) + info = &aac->hba_map[bus][cid]; + + if (info->devtype != AAC_DEVTYPE_NATIVE_RAW && + info->reset_state > 0) return FAILED; pr_err("%s: Host adapter reset request. SCSI hang ?\n", @@ -962,12 +964,14 @@ static int aac_eh_target_reset(struct scsi_cmnd *cmd) bus = aac_logical_to_phys(scmd_channel(cmd)); cid = scmd_id(cmd); - info = &aac->hba_map[bus][cid]; - if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS || - info->devtype != AAC_DEVTYPE_NATIVE_RAW) + + if (bus >= AAC_MAX_BUSES || cid >= AAC_MAX_TARGETS) return FAILED; - if (info->reset_state > 0) + info = &aac->hba_map[bus][cid]; + + if (info->devtype != AAC_DEVTYPE_NATIVE_RAW && + info->reset_state > 0) return FAILED; pr_err("%s: Host adapter reset request. SCSI hang ?\n", From 9cb067ef8a10bb13112e4d1c0ea996ec96527422 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Sep 2017 23:29:03 +0200 Subject: [PATCH 0085/1322] genirq: Fix cpumask check in __irq_startup_managed() The result of cpumask_any_and() is invalid when result greater or equal nr_cpu_ids. The current check is checking for greater only. Fix it. Fixes: 761ea388e8c4 ("genirq: Handle managed irqs gracefully in irq_startup()") Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Tony Luck Cc: Chen Yu Cc: Marc Zyngier Cc: Alok Kataria Cc: Joerg Roedel Cc: "Rafael J. Wysocki" Cc: Steven Rostedt Cc: Christoph Hellwig Cc: Peter Zijlstra Cc: Borislav Petkov Cc: stable@vger.kernel.org Cc: Paolo Bonzini Cc: Rui Zhang Cc: "K. Y. Srinivasan" Cc: Arjan van de Ven Cc: Dan Williams Cc: Len Brown Link: http://lkml.kernel.org/r/20170913213152.272283444@linutronix.de --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f51b7b6d2451..6fc89fd93824 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) irqd_clr_managed_shutdown(d); - if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { + if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt From 6354a06cbaa8c49d8377a6cee3e7db399c23601c Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Wed, 13 Sep 2017 09:38:40 +0200 Subject: [PATCH 0086/1322] Revert "arm64: dts: rockchip: Add basic cpu frequencies for RK3368" This reverts commit 6f2dea1f5fdb73eb2e050d9ebe990121d557e519. Without accurate cpu regulators being set for boards this will wreak havoc when cpufreq-dt begins to set new frequencies without adjusting the core frequency. Additionally the rk3368 has an unsolved issue in that it has two separate cpu clusters with separate clock lines but only one cpu supply regulator for both clusters, which causes even more problems. 
While it seems that originally only one cluster was supposed to be active at a time (big or little), talking with real users of the hardware revealed that having all 8 cores accessible at 1.2GHz max is way more liked than having 4 cores at 1.5GHz max. Such an approach needs changes to cpufreq and/or opp though to control the two separate clock lines when setting both clusters to the same frequencies. In any case, having the OPPs in the dts at this point in time is undesireable, so remove them again for now. Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3368.dtsi | 72 +----------------------- 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index e0518b4bc6c2..19fbaa5e7bdd 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -113,8 +113,7 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x0>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; + #cooling-cells = <2>; /* min followed by max */ }; @@ -123,8 +122,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x1>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; }; cpu_l2: cpu@2 { @@ -132,8 +129,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x2>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; }; cpu_l3: cpu@3 { @@ -141,8 +136,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x3>; enable-method = "psci"; - clocks = <&cru ARMCLKL>; - operating-points-v2 = <&cluster0_opp>; }; cpu_b0: cpu@100 { @@ -150,8 +143,7 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x100>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; + #cooling-cells = <2>; /* min followed by max */ }; @@ -160,8 +152,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x101>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; }; cpu_b2: cpu@102 { @@ -169,8 +159,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x102>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; }; cpu_b3: cpu@103 { @@ -178,62 +166,6 @@ compatible = "arm,cortex-a53", "arm,armv8"; reg = <0x0 0x103>; enable-method = "psci"; - clocks = <&cru ARMCLKB>; - operating-points-v2 = <&cluster1_opp>; - }; - }; - - cluster0_opp: opp-table0 { - compatible = "operating-points-v2"; - opp-shared; - - opp00 { - opp-hz = /bits/ 64 <312000000>; - opp-microvolt = <950000>; - clock-latency-ns = <40000>; - }; - opp01 { - opp-hz = /bits/ 64 <408000000>; - opp-microvolt = <950000>; - }; - opp02 { - opp-hz = /bits/ 64 <600000000>; - opp-microvolt = <950000>; - }; - opp03 { - opp-hz = /bits/ 64 <816000000>; - opp-microvolt = <1025000>; - }; - opp04 { - opp-hz = /bits/ 64 <1008000000>; - opp-microvolt = <1125000>; - }; - }; - - cluster1_opp: opp-table1 { - compatible = "operating-points-v2"; - opp-shared; - - opp00 { - opp-hz = /bits/ 64 <312000000>; - opp-microvolt = <950000>; - clock-latency-ns = <40000>; - }; - opp01 { - opp-hz = /bits/ 64 <408000000>; - opp-microvolt = <950000>; - }; - opp02 { - opp-hz = /bits/ 64 <600000000>; - opp-microvolt = <950000>; - }; - opp03 { - opp-hz = /bits/ 64 <816000000>; - opp-microvolt = <975000>; - }; - opp04 { - opp-hz = /bits/ 64 <1008000000>; - opp-microvolt = <1050000>; }; }; From 
6d57339890c9a9dfcd4ba4f8931b911b962968e3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 17:08:16 +0200 Subject: [PATCH 0087/1322] dma-coherent: fix rmem_dma_device_init regression My recent bug fix introduced another bug, which caused rmem_dma_device_init to always fail, as rmem->priv is never set to anything. This restores the previous behavior, calling dma_init_coherent_memory() whenever ->priv is NULL. Fixes: d35b0996fef3 ("dma-coherent: fix dma_declare_coherent_memory() logic error") Reported-by: Roy Pledge Tested-by: Roy Pledge Signed-off-by: Arnd Bergmann Signed-off-by: Christoph Hellwig --- drivers/base/dma-coherent.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index a39b2166b145..744f64f43454 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -348,16 +348,15 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) struct dma_coherent_mem *mem = rmem->priv; int ret; - if (!mem) - return -ENODEV; - - ret = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); - - if (ret) { - pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return ret; + if (!mem) { + ret = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, + DMA_MEMORY_EXCLUSIVE, &mem); + if (ret) { + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return ret; + } } mem->use_dev_dma_pfn_offset = true; rmem->priv = mem; From ec11653b531099ddc08a8c7eb495ab83cae84e19 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Sep 2017 14:51:20 -0500 Subject: [PATCH 0088/1322] CIFS/SMB3: Update documentation to reflect SMB3 and various changes Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky --- Documentation/filesystems/cifs/AUTHORS | 5 ++ Documentation/filesystems/cifs/README | 81 ++++++++++++------------- Documentation/filesystems/cifs/TODO | 72 +++++++++++----------- Documentation/filesystems/cifs/cifs.txt | 24 +++++--- 4 files changed, 91 insertions(+), 91 deletions(-) diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS index c98800df677f..9f4f87e16240 100644 --- a/Documentation/filesystems/cifs/AUTHORS +++ b/Documentation/filesystems/cifs/AUTHORS @@ -41,6 +41,11 @@ Igor Mammedov (DFS support) Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) Scott Lovenberg Pavel Shilovsky (for great work adding SMB2 support, and various SMB3 features) +Aurelien Aptel (for DFS SMB3 work and some key bug fixes) +Ronnie Sahlberg (for SMB3 xattr work and bug fixes) +Shirish Pargaonkar (for many ACL patches over the years) +Sachin Prabhu (many bug fixes, including for reconnect, copy offload and security) + Test case and Bug Report contributors ------------------------------------- diff --git a/Documentation/filesystems/cifs/README b/Documentation/filesystems/cifs/README index a54788405429..a9da51553ba3 100644 --- a/Documentation/filesystems/cifs/README +++ b/Documentation/filesystems/cifs/README @@ -1,10 +1,14 @@ -The CIFS VFS support for Linux supports many advanced network filesystem -features such as hierarchical dfs like namespace, hardlinks, locking and more. 
+This module supports the SMB3 family of advanced network protocols (as well +as older dialects, originally called "CIFS" or SMB1). + +The CIFS VFS module for Linux supports many advanced network filesystem +features such as hierarchical DFS like namespace, hardlinks, locking and more. It was designed to comply with the SNIA CIFS Technical Reference (which supersedes the 1992 X/Open SMB Standard) as well as to perform best practice practical interoperability with Windows 2000, Windows XP, Samba and equivalent servers. This code was developed in participation with the Protocol Freedom -Information Foundation. +Information Foundation. CIFS and now SMB3 has now become a defacto +standard for interoperating between Macs and Windows and major NAS appliances. Please see http://protocolfreedom.org/ and @@ -15,30 +19,11 @@ for more details. For questions or bug reports please contact: sfrench@samba.org (sfrench@us.ibm.com) +See the project page at: https://wiki.samba.org/index.php/LinuxCIFS_utils + Build instructions: ================== -For Linux 2.4: -1) Get the kernel source (e.g.from http://www.kernel.org) -and download the cifs vfs source (see the project page -at http://us1.samba.org/samba/Linux_CIFS_client.html) -and change directory into the top of the kernel directory -then patch the kernel (e.g. "patch -p1 < cifs_24.patch") -to add the cifs vfs to your kernel configure options if -it has not already been added (e.g. current SuSE and UL -users do not need to apply the cifs_24.patch since the cifs vfs is -already in the kernel configure menu) and then -mkdir linux/fs/cifs and then copy the current cifs vfs files from -the cifs download to your kernel build directory e.g. - - cp /fs/cifs/* to /fs/cifs - -2) make menuconfig (or make xconfig) -3) select cifs from within the network filesystem choices -4) save and exit -5) make dep -6) make modules (or "make" if CIFS VFS not to be built as a module) - -For Linux 2.6: +For Linux: 1) Download the kernel (e.g. from http://www.kernel.org) and change directory into the top of the kernel directory tree (e.g. /usr/src/linux-2.5.73) @@ -61,16 +46,13 @@ would simply type "make install"). If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on the CIFS VFS web site) copy it to the same directory in which mount.smbfs and similar files reside (usually /sbin). Although the helper software is not -required, mount.cifs is recommended. Eventually the Samba 3.0 utility program -"net" may also be helpful since it may someday provide easier mount syntax for -users who are used to Windows e.g. - net use +required, mount.cifs is recommended. Most distros include a "cifs-utils" +package that includes this utility so it is recommended to install this. + Note that running the Winbind pam/nss module (logon service) on all of your Linux clients is useful in mapping Uids and Gids consistently across the domain to the proper network user. The mount.cifs mount helper can be -trivially built from Samba 3.0 or later source e.g. by executing: - - gcc samba/source/client/mount.cifs.c -o mount.cifs +found at cifs-utils.git on git.samba.org If cifs is built as a module, then the size and number of network buffers and maximum number of simultaneous requests to one server can be configured. @@ -79,6 +61,18 @@ Changing these from their defaults is not recommended. By executing modinfo on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made at module initialization time (by running insmod cifs.ko) can be seen. 
+Recommendations +=============== +To improve security the SMB2.1 dialect or later (usually will get SMB3) is now +the new default. To use old dialects (e.g. to mount Windows XP) use "vers=1.0" +on mount (or vers=2.0 for Windows Vista). Note that the CIFS (vers=1.0) is +much older and less secure than the default dialect SMB3 which includes +many advanced security features such as downgrade attack detection +and encrypted shares and stronger signing and authentication algorithms. +There are additional mount options that may be helpful for SMB3 to get +improved POSIX behavior (NB: can use vers=3.0 to force only SMB3, never 2.1): + "mfsymlinks" and "cifsacl" and "idsfromsid" + Allowing User Mounts ==================== To permit users to mount and unmount over directories they own is possible @@ -98,9 +92,7 @@ and execution of suid programs on the remote target would be enabled by default. This can be changed, as with nfs and other filesystems, by simply specifying "nosuid" among the mount options. For user mounts though to be able to pass the suid flag to mount requires rebuilding -mount.cifs with the following flag: - - gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs +mount.cifs with the following flag: CIFS_ALLOW_USR_SUID There is a corresponding manual page for cifs mounting in the Samba 3.0 and later source tree in docs/manpages/mount.cifs.8 @@ -189,18 +181,18 @@ applications running on the same server as Samba. Use instructions: ================ Once the CIFS VFS support is built into the kernel or installed as a module -(cifs.o), you can use mount syntax like the following to access Samba or Windows -servers: +(cifs.ko), you can use mount syntax like the following to access Samba or +Mac or Windows servers: - mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword + mount -t cifs //9.53.216.11/e$ /mnt -o username=myname,password=mypassword Before -o the option -v may be specified to make the mount.cifs mount helper display the mount steps more verbosely. After -o the following commonly used cifs vfs specific options are supported: - user= - pass= + username= + password= domain= Other cifs mount options are described below. Use of TCP names (in addition to @@ -246,13 +238,16 @@ the Server's registry. Samba starting with version 3.10 will allow such filenames (ie those which contain valid Linux characters, which normally would be forbidden for Windows/CIFS semantics) as long as the server is configured for Unix Extensions (and the client has not disabled -/proc/fs/cifs/LinuxExtensionsEnabled). - +/proc/fs/cifs/LinuxExtensionsEnabled). In addition the mount option +"mapposix" can be used on CIFS (vers=1.0) to force the mapping of +illegal Windows/NTFS/SMB characters to a remap range (this mount parm +is the default for SMB3). This remap ("mapposix") range is also +compatible with Mac (and "Services for Mac" on some older Windows). CIFS VFS Mount Options ====================== A partial list of the supported mount options follows: - user The user name to use when trying to establish + username The user name to use when trying to establish the CIFS session. password The user password. 
If the mount helper is installed, the user will be prompted for password diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO index 066ffddc3964..396ecfd6ff4a 100644 --- a/Documentation/filesystems/cifs/TODO +++ b/Documentation/filesystems/cifs/TODO @@ -1,4 +1,4 @@ -Version 2.03 August 1, 2014 +Version 2.04 September 13, 2017 A Partial List of Missing Features ================================== @@ -8,73 +8,69 @@ for visible, important contributions to this module. Here is a partial list of the known problems and missing features: a) SMB3 (and SMB3.02) missing optional features: - - RDMA + - RDMA (started) - multichannel (started) - directory leases (improved metadata caching) - T10 copy offload (copy chunk is only mechanism supported) - - encrypted shares b) improved sparse file support c) Directory entry caching relies on a 1 second timer, rather than -using FindNotify or equivalent. - (started) +using Directory Leases d) quota support (needs minor kernel change since quota calls to make it to network filesystems or deviceless filesystems) -e) improve support for very old servers (OS/2 and Win9x for example) -Including support for changing the time remotely (utimes command). +e) Better optimize open to reduce redundant opens (using reference +counts more) and to improve use of compounding in SMB3 to reduce +number of roundtrips. -f) hook lower into the sockets api (as NFS/SunRPC does) to avoid the -extra copy in/out of the socket buffers in some cases. - -g) Better optimize open (and pathbased setfilesize) to reduce the -oplock breaks coming from windows srv. Piggyback identical file -opens on top of each other by incrementing reference count rather -than resending (helps reduce server resource utilization and avoid -spurious oplock breaks). - -h) Add support for storing symlink info to Windows servers -in the Extended Attribute format their SFU clients would recognize. - -i) Finish inotify support so kde and gnome file list windows +f) Finish inotify support so kde and gnome file list windows will autorefresh (partially complete by Asser). Needs minor kernel vfs change to support removing D_NOTIFY on a file. -j) Add GUI tool to configure /proc/fs/cifs settings and for display of +g) Add GUI tool to configure /proc/fs/cifs settings and for display of the CIFS statistics (started) -k) implement support for security and trusted categories of xattrs +h) implement support for security and trusted categories of xattrs (requires minor protocol extension) to enable better support for SELINUX -l) Implement O_DIRECT flag on open (already supported on mount) +i) Implement O_DIRECT flag on open (already supported on mount) -m) Create UID mapping facility so server UIDs can be mapped on a per +j) Create UID mapping facility so server UIDs can be mapped on a per mount or a per server basis to client UIDs or nobody if no mapping -exists. This is helpful when Unix extensions are negotiated to -allow better permission checking when UIDs differ on the server -and client. Add new protocol request to the CIFS protocol -standard for asking the server for the corresponding name of a -particular uid. +exists. 
Also better integration with winbind for resolving SID owners -n) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too) +k) Add tools to take advantage of more smb3 specific ioctls and features -o) mount check for unmatched uids +l) encrypted file support -p) Add support for new vfs entry point for fallocate +m) improved stats gathering, tools (perhaps integration with nfsometer?) -q) Add tools to take advantage of cifs/smb3 specific ioctls and features -such as "CopyChunk" (fast server side file copy) +n) allow setting more NTFS/SMB3 file attributes remotely (currently limited to compressed +file attribute via chflags) and improve user space tools for managing and +viewing them. -r) encrypted file support +o) mount helper GUI (to simplify the various configuration options on mount) -s) improved stats gathering, tools (perhaps integration with nfsometer?) +p) autonegotiation of dialects (offering more than one dialect ie SMB3.02, +SMB3, SMB2.1 not just SMB3). -t) allow setting more NTFS/SMB3 file attributes remotely (currently limited to compressed -file attribute via chflags) +q) Allow mount.cifs to be more verbose in reporting errors with dialect +or unsupported feature errors. -u) mount helper GUI (to simplify the various configuration options on mount) +r) updating cifs documentation, and user guid. +s) Addressing bugs found by running a broader set of xfstests in standard +file system xfstest suite. + +t) split cifs and smb3 support into separate modules so legacy (and less +secure) CIFS dialect can be disabled in environments that don't need it +and simplify the code. + +u) Finish up SMB3.1.1 dialect support + +v) POSIX Extensions for SMB3.1.1 KNOWN BUGS ==================================== diff --git a/Documentation/filesystems/cifs/cifs.txt b/Documentation/filesystems/cifs/cifs.txt index 2fac91ac96cf..67756607246e 100644 --- a/Documentation/filesystems/cifs/cifs.txt +++ b/Documentation/filesystems/cifs/cifs.txt @@ -1,24 +1,28 @@ - This is the client VFS module for the Common Internet File System - (CIFS) protocol which is the successor to the Server Message Block + This is the client VFS module for the SMB3 NAS protocol as well + older dialects such as the Common Internet File System (CIFS) + protocol which was the successor to the Server Message Block (SMB) protocol, the native file sharing mechanism for most early PC operating systems. New and improved versions of CIFS are now called SMB2 and SMB3. These dialects are also supported by the CIFS VFS module. CIFS is fully supported by network - file servers such as Windows 2000, 2003, 2008 and 2012 + file servers such as Windows 2000, 2003, 2008, 2012 and 2016 as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems), so + server support for Linux and many other operating systems), Apple + systems, as well as most Network Attached Storage vendors, so this network filesystem client can mount to a wide variety of servers. 
The intent of this module is to provide the most advanced network - file system function for CIFS compliant servers, including better - POSIX compliance, secure per-user session establishment, high - performance safe distributed caching (oplock), optional packet + file system function for SMB3 compliant servers, including advanced + security features, excellent parallelized high performance i/o, better + POSIX compliance, secure per-user session establishment, encryption, + high performance safe distributed caching (leases/oplocks), optional packet signing, large files, Unicode support and other internationalization improvements. Since both Samba server and this filesystem client support - the CIFS Unix extensions, the combination can provide a reasonable - alternative to NFSv4 for fileserving in some Linux to Linux environments, - not just in Linux to Windows environments. + the CIFS Unix extensions (and in the future SMB3 POSIX extensions), + the combination can provide a reasonable alternative to other network and + cluster file systems for fileserving in some Linux to Linux environments, + not just in Linux to Windows (or Linux to Mac) environments. This filesystem has an mount utility (mount.cifs) that can be obtained from From 47061a24e2ee5bd8a40d473d47a5bd823fa0081f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:48 -0700 Subject: [PATCH 0089/1322] x86/mm: Factor out CR3-building code Current, the code that assembles a value to load into CR3 is open-coded everywhere. Factor it out into helpers build_cr3() and build_cr3_noflush(). This makes one semantic change: __get_current_cr3_fast() was wrong on SME systems. No one noticed because the only caller is in the VMX code, and there are no CPUs with both SME and VMX. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/ce350cf11e93e2842d14d0b95b0199c7d881f527.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 15 +++++++++++---- arch/x86/mm/tlb.c | 11 +++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 7ae318c340d9..a999ba6b721f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -286,6 +286,15 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } +static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) +{ + return __sme_pa(mm->pgd) | asid; +} + +static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) +{ + return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; +} /* * This can be used from process context to figure out what the value of @@ -296,10 +305,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, */ static inline unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); - - if (static_cpu_has(X86_FEATURE_PCID)) - cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); /* For now, be very restrictive about when this can be called. 
*/ VM_WARN_ON(in_nmi() || preemptible()); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1ab3821f9e26..93fe97cce581 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -126,8 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != - (__sme_pa(real_prev->pgd) | prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -172,7 +171,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, next_tlb_gen); - write_cr3(__sme_pa(next->pgd) | prev_asid); + write_cr3(build_cr3(next, prev_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } @@ -216,12 +215,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(__sme_pa(next->pgd) | new_asid); + write_cr3(build_cr3(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); + write_cr3(build_cr3_noflush(next, new_asid)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } @@ -265,7 +264,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ - write_cr3(cr3 & ~CR3_PCID_MASK); + write_cr3(build_cr3(mm, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); From 52a2af400c1075219b3f0ce5c96fc961da44018a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:49 -0700 Subject: [PATCH 0090/1322] x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code Putting the logical ASID into CR3's PCID bits directly means that we have two cases to consider separately: ASID == 0 and ASID != 0. This means that bugs that only hit in one of these cases trigger nondeterministically. There were some bugs like this in the past, and I think there's still one in current kernels. In particular, we have a number of ASID-unware code paths that save CR3, write some special value, and then restore CR3. This includes suspend/resume, hibernate, kexec, EFI, and maybe other things I've missed. This is currently dangerous: if ASID != 0, then this code sequence will leave garbage in the TLB tagged for ASID 0. We could potentially see corruption when switching back to ASID 0. In principle, an initialize_tlbstate_and_flush() call after these sequences would solve the problem, but EFI, at least, does not call this. (And it probably shouldn't -- initialize_tlbstate_and_flush() is rather expensive.) 
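As an illustration of the resulting convention (a sketch only, not part of the patch; the names here are made up), the logical ASID is simply shifted up by one when it is placed into the low 12 PCID bits of CR3, so PCID 0 is never used by ASID-aware code:

        /*
         * Sketch: logical ASID 0..4094 becomes hardware PCID 1..4095,
         * leaving PCID 0 to PCID-unaware code that loads a bare
         * page-table address into CR3.
         */
        #define EXAMPLE_CR3_NOFLUSH (1UL << 63)

        static unsigned long example_build_cr3(unsigned long pgd_pa,
                                               unsigned short asid, int noflush)
        {
                unsigned long cr3 = pgd_pa | (asid + 1UL); /* PCID field */

                if (noflush)
                        cr3 |= EXAMPLE_CR3_NOFLUSH;
                return cr3;
        }

With that convention, a "save CR3, load a PCID-0 value, restore CR3" sequence in PCID-unaware code can only pollute PCID 0, which no longer corresponds to any tracked ASID.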
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cdc14bbe5d3c3ef2a562be09a6368ffe9bd947a6.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index a999ba6b721f..c120b5db178a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -286,14 +286,31 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } +/* + * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID + * bits. This serves two purposes. It prevents a nasty situation in + * which PCID-unaware code saves CR3, loads some other value (with PCID + * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if + * the saved ASID was nonzero. It also means that any bugs involving + * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger + * deterministically. + */ + static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) { - return __sme_pa(mm->pgd) | asid; + if (static_cpu_has(X86_FEATURE_PCID)) { + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(mm->pgd) | (asid + 1); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(mm->pgd); + } } static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) { - return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH; + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; } /* From b8b7abaed7a49b350f8ba659ddc264b04931d581 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:50 -0700 Subject: [PATCH 0091/1322] x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier Otherwise we might have the PCID feature bit set during cpu_init(). This is just for robustness. I haven't seen any actual bugs here. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cba4671af755 ("x86/mm: Disable PCID on 32-bit kernels") Link: http://lkml.kernel.org/r/b16dae9d6b0db5d9801ddbebbfd83384097c61f3.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/bugs.c | 8 -------- arch/x86/kernel/cpu/common.c | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index db684880d74a..0af86d9242da 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,14 +21,6 @@ void __init check_bugs(void) { -#ifdef CONFIG_X86_32 - /* - * Regardless of whether PCID is enumerated, the SDM says - * that it can't be enabled in 32-bit mode. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); -#endif - identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 775f10100d7f..c9176bae7fd8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -904,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif } void __init early_cpu_init(void) From 4ba55e65f471d011d3ba2ac2022180ea0877d68e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:51 -0700 Subject: [PATCH 0092/1322] x86/mm/32: Load a sane CR3 before cpu_init() on secondary CPUs For unknown historical reasons (i.e. Borislav doesn't recall), 32-bit kernels invoke cpu_init() on secondary CPUs with initial_page_table loaded into CR3. Then they set current->active_mm to &init_mm and call enter_lazy_tlb() before fixing CR3. This means that the x86 TLB code gets invoked while CR3 is inconsistent, and, with the improved PCID sanity checks I added, we warn. Fix it by loading swapper_pg_dir (i.e. init_mm.pgd) earlier. Reported-by: Paul Menzel Reported-by: Pavel Machek Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 72c0098d92ce ("x86/mm: Reinitialize TLB state on hotplug and resume") Link: http://lkml.kernel.org/r/30cdfea504682ba3b9012e77717800a91c22097f.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0854ff169274..ad59edd84de7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -232,12 +232,6 @@ static void notrace start_secondary(void *unused) */ if (boot_cpu_has(X86_FEATURE_PCID)) __write_cr4(__read_cr4() | X86_CR4_PCIDE); - cpu_init(); - x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); - smp_callin(); - - enable_start_cpu0 = 0; #ifdef CONFIG_X86_32 /* switch away from the initial page table */ @@ -245,6 +239,13 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); #endif + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); + preempt_disable(); + smp_callin(); + + enable_start_cpu0 = 0; + /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* From bf29ed1567b67854dc13504f685c45a2ea9b2081 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:44 -0700 Subject: [PATCH 0093/1322] syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check Use CHECK_DATA_CORRUPTION instead of BUG_ON to provide more flexibility on address limit failures. By default, send a SIGKILL signal to kill the current process preventing exploitation of a bad address limit. Make the TIF_FSCHECK flag optional so ARM can use this function. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-2-git-send-email-keescook@chromium.org --- include/linux/syscalls.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 95606a2d556f..a78186d826d7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -221,22 +221,26 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#ifdef TIF_FSCHECK /* * Called before coming back to user-mode. 
Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. */ static inline void addr_limit_user_check(void) { - +#ifdef TIF_FSCHECK if (!test_thread_flag(TIF_FSCHECK)) return; - - BUG_ON(!segment_eq(get_fs(), USER_DS)); - clear_thread_flag(TIF_FSCHECK); -} #endif + if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS), + "Invalid address limit on user-mode return")) + force_sig(SIGKILL, current); + +#ifdef TIF_FSCHECK + clear_thread_flag(TIF_FSCHECK); +#endif +} + asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); From 2404269bc4e77a67875c8db6667be34c9913c96e Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:45 -0700 Subject: [PATCH 0094/1322] Revert "arm/syscalls: Check address limit on user-mode return" This reverts commit 73ac5d6a2b6ac3ae8d1e1818f3e9946f97489bc9. The work pending loop can call set_fs after addr_limit_user_check removed the _TIF_FSCHECK flag. This may happen at anytime based on how ARM handles alignment exceptions. It leads to an infinite loop condition. After discussion, it has been agreed that the generic approach is not tailored to the ARM architecture and any fix might not be complete. This patch will be replaced by an architecture specific implementation. The work flag approach will be kept for other architectures. Reported-by: Leonard Crestez Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-3-git-send-email-keescook@chromium.org --- arch/arm/include/asm/thread_info.h | 15 ++++++--------- arch/arm/include/asm/uaccess.h | 2 -- arch/arm/kernel/entry-common.S | 9 ++------- arch/arm/kernel/signal.c | 5 ----- 4 files changed, 8 insertions(+), 23 deletions(-) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 1d468b527b7b..776757d1604a 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -139,11 +139,10 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_FSCHECK 4 /* Check FS is USER_DS on return */ -#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ +#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -154,7 +153,6 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SYSCALL_TRACE (1 << 
TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) @@ -168,9 +166,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_FSCHECK) +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 87936dd5d151..0bf2347495f1 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -70,8 +70,6 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); } #define segment_eq(a, b) ((a) == (b)) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index ca3614dc6938..0b60adf4a5d9 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -49,9 +49,7 @@ ret_fast_syscall: UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK - bne fast_work_pending - tst r1, #_TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending /* perform architecture specific actions before user return */ @@ -77,15 +75,12 @@ ret_fast_syscall: str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK - bne fast_work_pending - tst r1, #_TIF_WORK_MASK + tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending UNWIND(.fnend ) ENDPROC(ret_fast_syscall) /* Slower path - fall through to work_pending */ -fast_work_pending: #endif tst r1, #_TIF_SYSCALL_WORK diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index e2de50bf8742..5814298ef0b7 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -614,10 +613,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) * Update the trace code with the current status. */ trace_hardirqs_off(); - - /* Check valid user FS if needed */ - addr_limit_user_check(); - do { if (likely(thread_flags & _TIF_NEED_RESCHED)) { schedule(); From e33f8d32677fa4f4f8996ef46748f86aac81ccff Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:46 -0700 Subject: [PATCH 0095/1322] arm/syscalls: Optimize address limit check Disable the generic address limit check in favor of an architecture specific optimized implementation. The generic implementation using pending work flags did not work well with ARM and alignment faults. The address limit is checked on each syscall return path to user-mode path as well as the irq user-mode return function. If the address limit was changed, a function is called to report data corruption (stopping the kernel or process based on configuration). The address limit check has to be done before any pending work because they can reset the address limit and the process is killed using a SIGKILL signal. 
For example the lkdtm address limit check does not work because the signal to kill the process will reset the user-mode address limit. Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Tested-by: Kees Cook Tested-by: Leonard Crestez Reviewed-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-4-git-send-email-keescook@chromium.org --- arch/arm/kernel/entry-common.S | 11 +++++++++++ arch/arm/kernel/signal.c | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 0b60adf4a5d9..99c908226065 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_AEABI #include #endif @@ -48,10 +49,14 @@ ret_fast_syscall: UNWIND(.fnstart ) UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending + /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr @@ -74,6 +79,9 @@ ret_fast_syscall: UNWIND(.cantunwind ) str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending @@ -106,6 +114,9 @@ ENTRY(ret_to_user) ret_slow_syscall: disable_irq_notrace @ disable interrupts ENTRY(ret_to_user_from_irq) + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] tst r1, #_TIF_WORK_MASK bne slow_work_pending diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 5814298ef0b7..b67ae12503f3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -673,3 +674,9 @@ struct page *get_signal_page(void) return page; } + +/* Defer to generic check */ +asmlinkage void addr_limit_check_failed(void) +{ + addr_limit_user_check(); +} From a2048e34d4655c06d31400646ae495bbfeb16b27 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:47 -0700 Subject: [PATCH 0096/1322] arm64/syscalls: Move address limit check in loop A bug was reported on ARM where set_fs might be called after it was checked on the work pending function. ARM64 is not affected by this bug but has a similar construct. In order to avoid any similar problems in the future, the addr_limit_user_check function is moved at the beginning of the loop. 
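The resulting structure of the work loop, sketched and simplified for illustration only (handle_pending_work() is a placeholder, not a real kernel function), is roughly:

        do {
                /*
                 * Check the address limit before handling any pending work,
                 * since that work (signal setup, tracing hooks, ...) may
                 * itself call set_fs() and change the limit again.
                 */
                addr_limit_user_check();

                if (thread_flags & _TIF_NEED_RESCHED)
                        schedule();
                else
                        handle_pending_work(regs, thread_flags);

                local_irq_disable();
                thread_flags = READ_ONCE(current_thread_info()->flags);
        } while (thread_flags & _TIF_WORK_MASK);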
Fixes: cf7de27ab351 ("arm64/syscalls: Check address limit on user-mode return") Reported-by: Leonard Crestez Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-5-git-send-email-keescook@chromium.org --- arch/arm64/kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c45214f8fb54..0bdc96c61bc0 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -751,10 +751,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, */ trace_hardirqs_off(); - /* Check valid user FS if needed */ - addr_limit_user_check(); - do { + /* Check valid user FS if needed */ + addr_limit_user_check(); + if (thread_flags & _TIF_NEED_RESCHED) { schedule(); } else { From 9764c02fcbad40001fd3f63558d918e4d519bb75 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Sep 2017 10:41:35 -0500 Subject: [PATCH 0097/1322] SMB3: Add support for multidialect negotiate (SMB2.1 and later) With the need to discourage use of less secure dialect, SMB1 (CIFS), we temporarily upgraded the dialect to SMB3 in 4.13, but since there are various servers which only support SMB2.1 (2.1 is more secure than CIFS/SMB1) but not optimal for a default dialect - add support for multidialect negotiation. cifs.ko will now request SMB2.1 or later (ie SMB2.1 or SMB3.0, SMB3.02) and the server will pick the latest most secure one it can support. In addition since we are sending multidialect negotiate, add support for secure negotiate to validate that a man in the middle didn't downgrade us. 
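For illustration (the server name and share below are placeholders), a plain

        mount -t cifs //server/share /mnt -o username=myname

now offers SMB2.1, SMB3.0 and SMB3.02 and lets the server choose the most secure dialect it supports, "vers=3" offers only the SMB3 family (3.0 and 3.02), and an explicit "vers=2.1" or "vers=3.0" still negotiates that single dialect as before.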
Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky CC: Stable # 4.13+ --- fs/cifs/cifsglob.h | 6 ++++ fs/cifs/connect.c | 24 +++++++++---- fs/cifs/smb2ops.c | 40 ++++++++++++++++++++++ fs/cifs/smb2pdu.c | 85 ++++++++++++++++++++++++++++++++++++++++------ fs/cifs/smb2pdu.h | 2 +- 5 files changed, 139 insertions(+), 18 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 808486c29f0d..de5b2e1fcce5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -188,6 +188,8 @@ enum smb_version { #ifdef CONFIG_CIFS_SMB311 Smb_311, #endif /* SMB311 */ + Smb_3any, + Smb_default, Smb_version_err }; @@ -1701,6 +1703,10 @@ extern struct smb_version_values smb20_values; #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; +#define SMBDEFAULT_VERSION_STRING "default" +extern struct smb_version_values smbdefault_values; +#define SMB3ANY_VERSION_STRING "3" +extern struct smb_version_values smb3any_values; #define SMB30_VERSION_STRING "3.0" extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5aa2d278ca84..8d38b22afb2b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -301,6 +301,8 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_311, SMB311_VERSION_STRING }, { Smb_311, ALT_SMB311_VERSION_STRING }, #endif /* SMB311 */ + { Smb_3any, SMB3ANY_VERSION_STRING }, + { Smb_default, SMBDEFAULT_VERSION_STRING }, { Smb_version_err, NULL } }; @@ -1148,6 +1150,14 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->vals = &smb311_values; break; #endif /* SMB311 */ + case Smb_3any: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb3any_values; + break; + case Smb_default: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smbdefault_values; + break; default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); return 1; @@ -1274,9 +1284,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->actimeo = CIFS_DEF_ACTIMEO; - /* FIXME: add autonegotiation for SMB3 or later rather than just SMB3 */ - vol->ops = &smb30_operations; /* both secure and accepted widely */ - vol->vals = &smb30_values; + /* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */ + vol->ops = &smb30_operations; + vol->vals = &smbdefault_values; vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; @@ -1988,11 +1998,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (got_version == false) pr_warn("No dialect specified on mount. Default has changed to " - "a more secure dialect, SMB3 (vers=3.0), from CIFS " + "a more secure dialect, SMB2.1 or later (e.g. SMB3), from CIFS " "(SMB1). To use the less secure SMB1 dialect to access " - "old servers which do not support SMB3 specify vers=1.0" - " on mount. 
For somewhat newer servers such as Windows " - "7 try vers=2.1.\n"); + "old servers which do not support SMB3 (or SMB2.1) specify vers=1.0" + " on mount.\n"); kfree(mountdata_copy); return 0; @@ -2133,6 +2142,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (vol->nosharesock) return 0; + /* BB update this for smb3any and default case */ if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index fb2934b9b97c..405a9bf13aac 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3110,6 +3110,46 @@ struct smb_version_values smb21_values = { .create_lease_size = sizeof(struct create_lease), }; +struct smb_version_values smb3any_values = { + .version_string = SMB3ANY_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; + +struct smb_version_values smbdefault_values = { + .version_string = SMBDEFAULT_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; + struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 69a751b038ab..5c16591a128e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -491,10 +491,25 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->hdr.sync_hdr.SessionId = 0; - req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); - - req->DialectCount = cpu_to_le16(1); /* One vers= at a time for now */ - inc_rfc1001_len(req, 2); + if (strcmp(ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) { + req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + req->DialectCount = cpu_to_le16(2); + inc_rfc1001_len(req, 4); + } else if (strcmp(ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + req->Dialects[1] = 
cpu_to_le16(SMB30_PROT_ID); + req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + req->DialectCount = cpu_to_le16(3); + inc_rfc1001_len(req, 6); + } else { + /* otherwise send specific dialect */ + req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); + req->DialectCount = cpu_to_le16(1); + inc_rfc1001_len(req, 2); + } /* only one of SMB2 signing flags may be set in SMB2 request */ if (ses->sign) @@ -528,16 +543,42 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) */ if (rc == -EOPNOTSUPP) { cifs_dbg(VFS, "Dialect not supported by server. Consider " - "specifying vers=1.0 or vers=2.1 on mount for accessing" + "specifying vers=1.0 or vers=2.0 on mount for accessing" " older servers\n"); goto neg_exit; } else if (rc != 0) goto neg_exit; + if (strcmp(ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) { + if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { + cifs_dbg(VFS, + "SMB2 dialect returned but not requested\n"); + return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { + cifs_dbg(VFS, + "SMB2.1 dialect returned but not requested\n"); + return -EIO; + } + } else if (strcmp(ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) { + cifs_dbg(VFS, + "SMB2 dialect returned but not requested\n"); + return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { + /* ops set to 3.0 by default for default so update */ + ses->server->ops = &smb21_operations; + } + } else if (rsp->DialectRevision != ses->server->vals->protocol_id) { + /* if requested single dialect ensure returned dialect matched */ + cifs_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", + cpu_to_le16(rsp->DialectRevision)); + return -EIO; + } + cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); - /* BB we may eventually want to match the negotiated vs. 
requested - dialect, even though we are only requesting one at a time */ if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) cifs_dbg(FYI, "negotiated smb2.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) @@ -558,6 +599,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } server->dialect = le16_to_cpu(rsp->DialectRevision); + /* BB: add check that dialect was valid given dialect(s) we asked for */ + /* SMB2 only has an extended negflavor */ server->negflavor = CIFS_NEGFLAVOR_EXTENDED; /* set it to the maximum buffer size value we can send with 1 credit */ @@ -606,6 +649,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) struct validate_negotiate_info_req vneg_inbuf; struct validate_negotiate_info_rsp *pneg_rsp; u32 rsplen; + u32 inbuflen; /* max of 4 dialects */ cifs_dbg(FYI, "validate negotiate\n"); @@ -634,9 +678,30 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) else vneg_inbuf.SecurityMode = 0; - vneg_inbuf.DialectCount = cpu_to_le16(1); - vneg_inbuf.Dialects[0] = - cpu_to_le16(tcon->ses->server->vals->protocol_id); + + if (strcmp(tcon->ses->server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) { + vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + vneg_inbuf.DialectCount = cpu_to_le16(2); + /* structure is big enough for 3 dialects, sending only 2 */ + inbuflen = sizeof(struct validate_negotiate_info_req) - 2; + } else if (strcmp(tcon->ses->server->vals->version_string, + SMBDEFAULT_VERSION_STRING) == 0) { + vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID); + vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + vneg_inbuf.DialectCount = cpu_to_le16(3); + /* structure is big enough for 3 dialects */ + inbuflen = sizeof(struct validate_negotiate_info_req); + } else { + /* otherwise specific dialect was requested */ + vneg_inbuf.Dialects[0] = + cpu_to_le16(tcon->ses->server->vals->protocol_id); + vneg_inbuf.DialectCount = cpu_to_le16(1); + /* structure is big enough for 3 dialects, sending only 1 */ + inbuflen = sizeof(struct validate_negotiate_info_req) - 4; + } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 393ed5f4e1b6..6c9653a130c8 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -716,7 +716,7 @@ struct validate_negotiate_info_req { __u8 Guid[SMB2_CLIENT_GUID_SIZE]; __le16 SecurityMode; __le16 DialectCount; - __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ + __le16 Dialects[3]; /* BB expand this if autonegotiate > 3 dialects */ } __packed; struct validate_negotiate_info_rsp { From 1368f155a972eab037f81b5dea82afd544227552 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 11:24:15 +0200 Subject: [PATCH 0098/1322] cifs: hide unused functions The newly added SMB2+ attribute support causes unused function warnings when CONFIG_CIFS_XATTR is disabled: fs/cifs/smb2ops.c:563:1: error: 'smb2_set_ea' defined but not used [-Werror=unused-function] smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, fs/cifs/smb2ops.c:513:1: error: 'smb2_query_eas' defined but not used [-Werror=unused-function] smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, This adds another #ifdef around the affected functions. 
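The failure mode is generic: -Werror promotes gcc's unused-function warning to a build error, so any static helper whose only callers are compiled out by a Kconfig symbol has to be fenced by the same symbol. A tiny standalone sketch of the pattern, with made-up names rather than the real cifs helpers:

#include <stdio.h>

#define CONFIG_FOO_XATTR 1      /* comment out to mimic CONFIG_CIFS_XATTR=n */

#ifdef CONFIG_FOO_XATTR
/* Compiled only when its sole caller below is also compiled. */
static int foo_query(int arg)
{
        return arg * 2;
}
#endif

int main(void)
{
#ifdef CONFIG_FOO_XATTR
        printf("%d\n", foo_query(21));
#endif
        return 0;
}

With the define commented out, a -Wall -Werror build still succeeds, which is the property the patch restores for CONFIG_CIFS_XATTR=n kernels.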
Fixes: 5517554e4313 ("cifs: Add support for writing attributes on SMB2+") Fixes: 95907fea4fd8 ("cifs: Add support for reading attributes on SMB2+") Signed-off-by: Arnd Bergmann Acked-by: Geert Uytterhoeven Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 405a9bf13aac..0dafdbae1f8c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -426,6 +426,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_XATTR static ssize_t move_smb2_ea_to_cifs(char *dst, size_t dst_size, struct smb2_file_full_ea_info *src, size_t src_size, @@ -613,6 +614,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +#endif static bool smb2_can_echo(struct TCP_Server_Info *server) From 94a9daeaece415ce37ccb413b49a580e68e3477b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Sep 2017 18:13:11 -0500 Subject: [PATCH 0099/1322] Update version of cifs module Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 30bf89b1fd9a..5a10e566f0e6 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.09" +#define CIFS_VERSION "2.10" #endif /* _CIFSFS_H */ From 8fce3dc5c5d6f6301f67311fa79f333902b58cea Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 21:42:59 +0200 Subject: [PATCH 0100/1322] clocksource/integrator: Fix section mismatch warning gcc-4.6 and older fail to inline integrator_clocksource_init, so they end up showing a harmless warning: WARNING: vmlinux.o(.text+0x4aa94c): Section mismatch in reference from the function integrator_clocksource_init() to the function .init.text:clocksource_mmio_init() The function integrator_clocksource_init() references the function __init clocksource_mmio_init(). This is often because integrator_clocksource_init lacks a __init annotation or the annotation of clocksource_mmio_init is wrong. Add the missing __init annotation that makes it build cleanly with all compilers. 
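For reference, the rule modpost enforces here: code that stays resident after boot must not reference functions placed in .init.text, because that section is discarded once init completes, and the usual fix is to propagate __init to the caller. A hedged kernel-style sketch with placeholder names, not the integrator driver's actual functions:

#include <linux/init.h>

static int __init helper_init(void)    /* placed in .init.text, freed after boot */
{
        return 0;
}

/*
 * The caller must be __init too: if the compiler chooses not to inline
 * helper_init(), a plain .text caller would reference discarded memory,
 * and modpost reports it as a section mismatch.
 */
static int __init board_timer_setup(void)
{
        return helper_init();
}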
Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Thierry Reding Cc: Linus Walleij Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/20170915194310.1170514-1-arnd@arndb.de --- drivers/clocksource/timer-integrator-ap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c index 2ff64d9d4fb3..62d24690ba02 100644 --- a/drivers/clocksource/timer-integrator-ap.c +++ b/drivers/clocksource/timer-integrator-ap.c @@ -36,8 +36,8 @@ static u64 notrace integrator_read_sched_clock(void) return -readl(sched_clk_base + TIMER_VALUE); } -static int integrator_clocksource_init(unsigned long inrate, - void __iomem *base) +static int __init integrator_clocksource_init(unsigned long inrate, + void __iomem *base) { u32 ctrl = TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC; unsigned long rate = inrate; From b8f3911610529ba531b99d1e109c7a40fb071f0a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 12 Sep 2017 15:10:35 +0200 Subject: [PATCH 0101/1322] mtd: spi-nor: Check consistency of the memory size extracted from the SFDP One field of the flash parameter table contains information about the flash device size. Most of the time the data extracted from this field is valid, but sometimes the BFPT section of the SFDP table is corrupted or invalid and this field is set to 0xffffffff, thus resulting in an integer overflow when setting params->size. Since NOR devices are anayway always smaller than 2^64 bytes, we can easily stop the BFPT parsing if the size reported in this table is invalid. Fixes: f384b352cbf0 ("mtd: spi-nor: parse Serial Flash Discoverable Parameters (SFDP) tables") Reported-by: Geert Uytterhoeven Signed-off-by: Boris Brezillon Tested-by: Geert Uytterhoeven Acked-by: Cyrille Pitchen --- drivers/mtd/spi-nor/spi-nor.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index cf1d4a15e10a..4425b0283725 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -2127,6 +2127,15 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor, params->size = bfpt.dwords[BFPT_DWORD(2)]; if (params->size & BIT(31)) { params->size &= ~BIT(31); + + /* + * Prevent overflows on params->size. Anyway, a NOR of 2^64 + * bits is unlikely to exist so this error probably means + * the BFPT we are reading is corrupted/wrong. + */ + if (params->size > 63) + return -EINVAL; + params->size = 1ULL << params->size; } else { params->size++; From bfa4133795e5a0badd402dd3f58b13b3cec64a4b Mon Sep 17 00:00:00 2001 From: Cyrille Pitchen Date: Wed, 6 Sep 2017 23:45:02 +0200 Subject: [PATCH 0102/1322] mtd: spi-nor: fix DMA unsafe buffer issue in spi_nor_read_sfdp() spi_nor_read_sfdp() calls nor->read() to read the SFDP data. When the m25p80 driver is used (pretty common case), nor->read() is then implemented by the m25p80_read() function, which is likely to initialize a 'struct spi_transfer' from its buf argument before appending this structure inside the 'struct spi_message' argument of spi_sync(). Besides the SPI sub-system states that both .tx_buf and .rx_buf members of 'struct spi_transfer' must point into dma-safe memory. However, two of the three calls of spi_nor_read_sfdp() were given pointers to stack allocated memory as buf argument, hence not in a dma-safe area. Hopefully, the third and last call of spi_nor_read_sfdp() was already given a kmalloc'ed buffer argument, hence dma-safe. 
So this patch fixes this issue by introducing a spi_nor_read_sfdp_dma_unsafe() function which simply wraps the existing spi_nor_read_sfdp() function and uses some kmalloc'ed memory as a bounce buffer. Fixes: f384b352cbf0 ("mtd: spi-nor: parse Serial Flash Discoverable Parameters (SFDP) tables") Reported-by: Geert Uytterhoeven Signed-off-by: Cyrille Pitchen Signed-off-by: Boris Brezillon --- drivers/mtd/spi-nor/spi-nor.c | 36 ++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 4425b0283725..19c000722cbc 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1784,7 +1784,7 @@ spi_nor_set_pp_settings(struct spi_nor_pp_command *pp, * @nor: pointer to a 'struct spi_nor' * @addr: offset in the SFDP area to start reading data from * @len: number of bytes to read - * @buf: buffer where the SFDP data are copied into + * @buf: buffer where the SFDP data are copied into (dma-safe memory) * * Whatever the actual numbers of bytes for address and dummy cycles are * for (Fast) Read commands, the Read SFDP (5Ah) instruction is always @@ -1829,6 +1829,36 @@ read_err: return ret; } +/** + * spi_nor_read_sfdp_dma_unsafe() - read Serial Flash Discoverable Parameters. + * @nor: pointer to a 'struct spi_nor' + * @addr: offset in the SFDP area to start reading data from + * @len: number of bytes to read + * @buf: buffer where the SFDP data are copied into + * + * Wrap spi_nor_read_sfdp() using a kmalloc'ed bounce buffer as @buf is now not + * guaranteed to be dma-safe. + * + * Return: -ENOMEM if kmalloc() fails, the return code of spi_nor_read_sfdp() + * otherwise. + */ +static int spi_nor_read_sfdp_dma_unsafe(struct spi_nor *nor, u32 addr, + size_t len, void *buf) +{ + void *dma_safe_buf; + int ret; + + dma_safe_buf = kmalloc(len, GFP_KERNEL); + if (!dma_safe_buf) + return -ENOMEM; + + ret = spi_nor_read_sfdp(nor, addr, len, dma_safe_buf); + memcpy(buf, dma_safe_buf, len); + kfree(dma_safe_buf); + + return ret; +} + struct sfdp_parameter_header { u8 id_lsb; u8 minor; @@ -2101,7 +2131,7 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor, bfpt_header->length * sizeof(u32)); addr = SFDP_PARAM_HEADER_PTP(bfpt_header); memset(&bfpt, 0, sizeof(bfpt)); - err = spi_nor_read_sfdp(nor, addr, len, &bfpt); + err = spi_nor_read_sfdp_dma_unsafe(nor, addr, len, &bfpt); if (err < 0) return err; @@ -2252,7 +2282,7 @@ static int spi_nor_parse_sfdp(struct spi_nor *nor, int i, err; /* Get the SFDP header. */ - err = spi_nor_read_sfdp(nor, 0, sizeof(header), &header); + err = spi_nor_read_sfdp_dma_unsafe(nor, 0, sizeof(header), &header); if (err < 0) return err; From 27563cd9f8f52f09523e061985917c38f302bd0c Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 14 Sep 2017 17:28:12 +0300 Subject: [PATCH 0103/1322] ARM: dts: at91: sama5d27_som1_ek: update pinmux/pinconf for LEDs and USB There are some changes from the prototype board concerning LEDs and USB pins: - USBB power enable and red LED pins are inverted. - The polarity of LEDs is inverted too. 
Signed-off-by: Ludovic Desroches Signed-off-by: Claudiu Beznea Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index 9c9088c99cc4..f13ef4fbab60 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -67,7 +67,7 @@ usb1: ohci@00400000 { num-ports = <3>; - atmel,vbus-gpio = <&pioA PIN_PA10 GPIO_ACTIVE_HIGH>; + atmel,vbus-gpio = <&pioA PIN_PA27 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb_default>; status = "okay"; @@ -330,7 +330,7 @@ }; pinctrl_led_gpio_default: led_gpio_default { - pinmux = , + pinmux = , , ; bias-pull-up; @@ -396,7 +396,7 @@ }; pinctrl_usb_default: usb_default { - pinmux = , + pinmux = , ; bias-disable; }; @@ -520,17 +520,17 @@ red { label = "red"; - gpios = <&pioA PIN_PA27 GPIO_ACTIVE_LOW>; + gpios = <&pioA PIN_PA10 GPIO_ACTIVE_HIGH>; }; green { label = "green"; - gpios = <&pioA PIN_PB1 GPIO_ACTIVE_LOW>; + gpios = <&pioA PIN_PB1 GPIO_ACTIVE_HIGH>; }; blue { label = "blue"; - gpios = <&pioA PIN_PA31 GPIO_ACTIVE_LOW>; + gpios = <&pioA PIN_PA31 GPIO_ACTIVE_HIGH>; linux,default-trigger = "heartbeat"; }; }; From 5f506faa0de810f07af9345826fd588f61bb3b2f Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 14 Sep 2017 17:28:13 +0300 Subject: [PATCH 0104/1322] ARM: dts: at91: sama5d27_som1_ek: fix typos Fix typos that prevent proper using of uart2 and uart4 devices. Signed-off-by: Ludovic Desroches Signed-off-by: Claudiu Beznea Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index f13ef4fbab60..be5cd913f274 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -120,7 +120,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mikrobus2_uart>; atmel,use-dma-rx; - atmel-use-dma-tx; + atmel,use-dma-tx; status = "okay"; }; @@ -178,7 +178,7 @@ uart4: serial@fc00c000 { atmel,use-dma-rx; atmel,use-dma-tx; - pinctrl-name = "default"; + pinctrl-names = "default"; pinctrl-0 = <&pinctrl_mikrobus1_uart>; status = "okay"; }; From e025a3ac3460275bf86a4c5d02857eee14db4247 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 14 Sep 2017 17:28:14 +0300 Subject: [PATCH 0105/1322] ARM: dts: at91: sama5d27_som1_ek: fix USB host vbus The USB host has 3 ports so we must specify the entries for each in the atmel,vbus-gpio property. The specified pin (PA27) is the vbus for USBB and not USBA. 
Signed-off-by: Nicolas Ferre [claudiu.beznea@microchip.com: change subject to match the desired prefix] Signed-off-by: Claudiu Beznea --- arch/arm/boot/dts/at91-sama5d27_som1_ek.dts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts index be5cd913f274..60cb084a8d92 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d27_som1_ek.dts @@ -67,7 +67,10 @@ usb1: ohci@00400000 { num-ports = <3>; - atmel,vbus-gpio = <&pioA PIN_PA27 GPIO_ACTIVE_HIGH>; + atmel,vbus-gpio = <0 /* &pioA PIN_PD20 GPIO_ACTIVE_HIGH */ + &pioA PIN_PA27 GPIO_ACTIVE_HIGH + 0 + >; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb_default>; status = "okay"; From 093d79f62a89f47d9b5fd0746768146d9696535c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 24 Aug 2017 13:44:54 +0200 Subject: [PATCH 0106/1322] ARM: at91: Replace uses of virt_to_phys with __pa_symbol The PM code wrongly uses virt_to_phys() instead of __pa_symbol() and was not updated by commit 64fc2a947a98 ("ARM: 8641/1: treewide: Replace uses of virt_to_phys with __pa_symbol") because it was not yet in tree. Signed-off-by: Alexandre Belloni Signed-off-by: Nicolas Ferre --- arch/arm/mach-at91/pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 5036f996e694..849014c01cf4 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -533,8 +533,8 @@ static void __init at91_pm_backup_init(void) } pm_bu->suspended = 0; - pm_bu->canary = virt_to_phys(&canary); - pm_bu->resume = virt_to_phys(cpu_resume); + pm_bu->canary = __pa_symbol(&canary); + pm_bu->resume = __pa_symbol(cpu_resume); return; From 2d48a237c8b61b457d146310d7e1e61224d0ca56 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 9 Sep 2017 06:02:46 +0200 Subject: [PATCH 0107/1322] ARC: reset: Only build on archs that have IOMEM This avoids the error: drivers/reset/reset-hsdk-v1.o: In function `hsdkv1_reset_probe': /home/thomas/git/linux/drivers/reset/reset-hsdk-v1.c:101: undefined reference to `devm_ioremap_resource' collect2: error: ld returned 1 exit status Signed-off-by: Thomas Meyer Signed-off-by: Philipp Zabel --- drivers/reset/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index e0c393214264..d063bd5373dd 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -36,6 +36,7 @@ config RESET_BERLIN config RESET_HSDK_V1 bool "HSDK v1 Reset Driver" + depends on HAS_IOMEM default n help This enables the reset controller driver for HSDK v1. From fc9655e65160936e32adae0d0e8aae25eb12c4e0 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Thu, 14 Sep 2017 12:49:41 +0200 Subject: [PATCH 0108/1322] ARC: reset: add missing DT binding documentation for HSDKv1 reset driver When applying the original patch [1], the DT binding docs were lost. This patch adds them back. 
[1] https://patchwork.kernel.org/patch/9852997/ Fixes: e0be864f1424 ("ARC: reset: introduce HSDKv1 reset driver") Signed-off-by: Eugeniy Paltsev Signed-off-by: Philipp Zabel --- .../bindings/reset/snps,hsdk-v1-reset.txt | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt new file mode 100644 index 000000000000..6a68146ee353 --- /dev/null +++ b/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt @@ -0,0 +1,28 @@ +Binding for the HSDK v1 reset controller + +This binding uses the common reset binding[1]. + +[1] Documentation/devicetree/bindings/reset/reset.txt + +Required properties: +- compatible: should be "snps,hsdk-v1.0-reset". +- reg: should always contain 2 pairs address - length: first for reset + configuration register and second for corresponding SW reset and status bits + register. +- #reset-cells: from common reset binding; Should always be set to 1. + +Example: + reset: reset@880 { + compatible = "snps,hsdk-v1.0-reset"; + #reset-cells = <1>; + reg = <0x8A0 0x4>, <0xFF0 0x4>; + }; + +Specifying reset lines connected to IP modules: + ethernet@.... { + .... + resets = <&reset HSDK_V1_ETH_RESET>; + .... + }; + +The index could be found in From 70e743e4cec3733dc13559f6184b35d358b9ef3f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 16:52:59 +0200 Subject: [PATCH 0109/1322] uwb: ensure that endpoint is interrupt hwarc_neep_init() assumes that endpoint 0 is interrupt, but there's no check for that, which results in a WARNING in USB core code, when a bad USB descriptor is provided from a device: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3 at drivers/usb/core/urb.c:449 usb_submit_urb+0xf8a/0x11d0 Modules linked in: CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.13.0+ #111 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event task: ffff88006bdc1a00 task.stack: ffff88006bde8000 RIP: 0010:usb_submit_urb+0xf8a/0x11d0 drivers/usb/core/urb.c:448 RSP: 0018:ffff88006bdee3c0 EFLAGS: 00010282 RAX: 0000000000000029 RBX: ffff8800672a7200 RCX: 0000000000000000 RDX: 0000000000000029 RSI: ffff88006c815c78 RDI: ffffed000d7bdc6a RBP: ffff88006bdee4c0 R08: fffffbfff0fe00ff R09: fffffbfff0fe00ff R10: 0000000000000018 R11: fffffbfff0fe00fe R12: 1ffff1000d7bdc7f R13: 0000000000000003 R14: 0000000000000001 R15: ffff88006b02cc90 FS: 0000000000000000(0000) GS:ffff88006c800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe4daddf000 CR3: 000000006add6000 CR4: 00000000000006f0 Call Trace: hwarc_neep_init+0x4ce/0x9c0 drivers/uwb/hwa-rc.c:710 uwb_rc_add+0x2fb/0x730 drivers/uwb/lc-rc.c:361 hwarc_probe+0x34e/0x9b0 drivers/uwb/hwa-rc.c:858 usb_probe_interface+0x351/0x8d0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_set_configuration+0x1064/0x1890 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 
drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4890 hub_port_connect_change drivers/usb/core/hub.c:4996 port_event drivers/usb/core/hub.c:5102 hub_event+0x23c8/0x37c0 drivers/usb/core/hub.c:5182 process_one_work+0x9fb/0x1570 kernel/workqueue.c:2097 worker_thread+0x1e4/0x1350 kernel/workqueue.c:2231 kthread+0x324/0x3f0 kernel/kthread.c:231 ret_from_fork+0x25/0x30 arch/x86/entry/entry_64.S:425 Code: 48 8b 85 30 ff ff ff 48 8d b8 98 00 00 00 e8 8e 93 07 ff 45 89 e8 44 89 f1 4c 89 fa 48 89 c6 48 c7 c7 a0 e5 55 86 e8 20 08 8f fd <0f> ff e9 9b f7 ff ff e8 4a 04 d6 fd e9 80 f7 ff ff e8 60 11 a6 ---[ end trace 55d741234124cfc3 ]--- Check that endpoint is interrupt. Found by syzkaller. Signed-off-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/hwa-rc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c index 35a1e777b449..9a53912bdfe9 100644 --- a/drivers/uwb/hwa-rc.c +++ b/drivers/uwb/hwa-rc.c @@ -825,6 +825,8 @@ static int hwarc_probe(struct usb_interface *iface, if (iface->cur_altsetting->desc.bNumEndpoints < 1) return -ENODEV; + if (!usb_endpoint_xfer_int(&iface->cur_altsetting->endpoint[0].desc)) + return -ENODEV; result = -ENOMEM; uwb_rc = uwb_rc_alloc(); From bbf26183b7a6236ba602f4d6a2f7cade35bba043 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 14:30:55 +0200 Subject: [PATCH 0110/1322] uwb: properly check kthread_run return value uwbd_start() calls kthread_run() and checks that the return value is not NULL. But the return value is not NULL in case kthread_run() fails, it takes the form of ERR_PTR(-EINTR). Use IS_ERR() instead. Also add a check to uwbd_stop(). Signed-off-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/uwbd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c index 01c20a260a8b..39dd4ef53c77 100644 --- a/drivers/uwb/uwbd.c +++ b/drivers/uwb/uwbd.c @@ -302,18 +302,22 @@ static int uwbd(void *param) /** Start the UWB daemon */ void uwbd_start(struct uwb_rc *rc) { - rc->uwbd.task = kthread_run(uwbd, rc, "uwbd"); - if (rc->uwbd.task == NULL) + struct task_struct *task = kthread_run(uwbd, rc, "uwbd"); + if (IS_ERR(task)) { + rc->uwbd.task = NULL; printk(KERN_ERR "UWB: Cannot start management daemon; " "UWB won't work\n"); - else + } else { + rc->uwbd.task = task; rc->uwbd.pid = rc->uwbd.task->pid; + } } /* Stop the UWB daemon and free any unprocessed events */ void uwbd_stop(struct uwb_rc *rc) { - kthread_stop(rc->uwbd.task); + if (rc->uwbd.task) + kthread_stop(rc->uwbd.task); uwbd_flush(rc); } From b2a542bbb3081dbd64acc8929c140d196664c406 Mon Sep 17 00:00:00 2001 From: Dmitry Fleytman Date: Tue, 5 Sep 2017 11:40:56 +0300 Subject: [PATCH 0111/1322] usb: Increase quirk delay for USB devices Commit e0429362ab15 ("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e") introduced quirk to workaround an issue with some Logitech webcams. 
The workaround is introducing delay for some USB operations. According to our testing, delay introduced by original commit is not long enough and in rare cases we still see issues described by the aforementioned commit. This patch increases delays introduced by original commit. Having this patch applied we do not see those problems anymore. Signed-off-by: Dmitry Fleytman Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 2 +- drivers/usb/core/hub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 4be52c602e9b..854c8d66cfbe 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -852,7 +852,7 @@ int usb_get_configuration(struct usb_device *dev) } if (dev->quirks & USB_QUIRK_DELAY_INIT) - msleep(100); + msleep(200); result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, bigbuffer, length); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 41eaf0b52518..b5c733613823 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4838,7 +4838,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, goto loop; if (udev->quirks & USB_QUIRK_DELAY_INIT) - msleep(1000); + msleep(2000); /* consecutive bus-powered hubs aren't reliable; they can * violate the voltage drop budget. if the new child has From 17b694a5476de09e119c517dcc3b71eff6835bdf Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 Sep 2017 13:58:31 +0200 Subject: [PATCH 0112/1322] mtd: nand: lpc32xx_mlc: Fix an error handling path in lpc32xx_nand_probe() If 'clk_prepare_enable()' fails, we must 'put' the corresponding clock. Fixes: 4d26f012ab59 ("mtd: nand: lpc32xx_mlc: Handle return value of clk_prepare_enable.") Signed-off-by: Christophe JAILLET Signed-off-by: Boris Brezillon --- drivers/mtd/nand/lpc32xx_mlc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/lpc32xx_mlc.c b/drivers/mtd/nand/lpc32xx_mlc.c index c3bb358ef01e..5796468db653 100644 --- a/drivers/mtd/nand/lpc32xx_mlc.c +++ b/drivers/mtd/nand/lpc32xx_mlc.c @@ -707,7 +707,7 @@ static int lpc32xx_nand_probe(struct platform_device *pdev) } res = clk_prepare_enable(host->clk); if (res) - goto err_exit1; + goto err_put_clk; nand_chip->cmd_ctrl = lpc32xx_nand_cmd_ctrl; nand_chip->dev_ready = lpc32xx_nand_device_ready; @@ -814,6 +814,7 @@ err_exit3: dma_release_channel(host->dma_chan); err_exit2: clk_disable_unprepare(host->clk); +err_put_clk: clk_put(host->clk); err_exit1: lpc32xx_wp_enable(host); From e580b8bc4316cbb8bbffb5ed7bf1e477064755ed Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 18 Sep 2017 09:40:12 +0100 Subject: [PATCH 0113/1322] arm64: efi: Don't include EFI fpsimd save/restore code in non-EFI kernels __efi_fpsimd_begin()/__efi_fpsimd_end() are for use when making EFI calls only, so using them in non-EFI kernels is not allowed. This patch compiles them out if CONFIG_EFI is not set. 
Acked-by: Ard Biesheuvel Signed-off-by: Dave Martin Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 3a68cf38a6b3..f444f374bd7b 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -321,6 +321,8 @@ void kernel_neon_end(void) } EXPORT_SYMBOL(kernel_neon_end); +#ifdef CONFIG_EFI + static DEFINE_PER_CPU(struct fpsimd_state, efi_fpsimd_state); static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); @@ -370,6 +372,8 @@ void __efi_fpsimd_end(void) kernel_neon_end(); } +#endif /* CONFIG_EFI */ + #endif /* CONFIG_KERNEL_MODE_NEON */ #ifdef CONFIG_CPU_PM From c73cc120a33e12e4e254b4b42bc613204ccb923b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 11:20:19 +0100 Subject: [PATCH 0114/1322] arm64: relax assembly code alignment from 16 byte to 4 byte Aarch64 instructions must be word aligned. The current 16 byte alignment is more than enough. Relax it into 4 byte alignment. Signed-off-by: Masahiro Yamada Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/linkage.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index 636c1bced7d4..1b266292f0be 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -1,7 +1,7 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H -#define __ALIGN .align 4 -#define __ALIGN_STR ".align 4" +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" #endif From 3d6a7b99e3fa29b92d6288487e057e0a596bd2b0 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Mon, 18 Sep 2017 11:20:20 +0100 Subject: [PATCH 0115/1322] arm64: ensure the kernel is compiled for LP64 The kernel needs to be compiled as a LP64 binary for ARM64, even when using a compiler that defaults to code-generation for the ILP32 ABI. Consequently, we need to explicitly pass '-mabi=lp64' (supported on gcc-4.9 and newer). 
Signed-off-by: Andrew Pinski Signed-off-by: Philipp Tomsich Signed-off-by: Christoph Muellner Signed-off-by: Yury Norov Reviewed-by: David Daney Signed-off-by: Catalin Marinas --- arch/arm64/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 9b41f1e3b1a0..939b310913cf 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -50,17 +50,22 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) +KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) +KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) + ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ AS += -EB LD += -EB +LDFLAGS += -maarch64linuxb UTS_MACHINE := aarch64_be else KBUILD_CPPFLAGS += -mlittle-endian CHECKFLAGS += -D__AARCH64EL__ AS += -EL LD += -EL +LDFLAGS += -maarch64linux UTS_MACHINE := aarch64 endif From 0a51fb7174f2b7866b4d7a4a5c23b685b674beb6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 9 Sep 2017 12:19:22 +0300 Subject: [PATCH 0116/1322] quota: add missing lock into __dquot_transfer() Lock dq_dqb_lock around dquot_decr_inodes() Signed-off-by: Konstantin Khlebnikov Fixes: 7b9ca4c61bc2 ("quota: Reduce contention on dq_data_lock") Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8381db9db6d9..50b0556a124f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1980,7 +1980,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0, &warn_to[cnt]); if (ret) { + spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); + spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } From 0ab0b271bf75073cb254b5ea0593aceae5a42bd3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 16 Sep 2017 22:32:12 +0200 Subject: [PATCH 0117/1322] isofs: fix build regression The new isofs_show_options() function fails to build when CONFIG_NLS is disabled: fs/isofs/inode.c: In function 'isofs_show_options': fs/isofs/inode.c:518:44: error: 'CONFIG_NLS_DEFAULT' undeclared (first use in this function) fs/isofs/inode.c:518:44: note: each undeclared identifier is reported only once for each function it appears in This adds a check for CONFIG_JOLIET (which selects NLS), matching the other uses of the iocharset handling in this file. 
Fixes: 6fecb86a44f5 ("isofs: Implement show_options") Signed-off-by: Arnd Bergmann Signed-off-by: Jan Kara --- fs/isofs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index db692f554158..447a24d77b89 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -514,9 +514,11 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_fmode != ISOFS_INVALID_MODE) seq_printf(m, ",fmode=%o", sbi->s_fmode); +#ifdef CONFIG_JOLIET if (sbi->s_nls_iocharset && strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); +#endif return 0; } From 056e4fc2018364ba01a23a1ced0ccbbdfa4520b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Sep 2017 21:23:13 +0200 Subject: [PATCH 0118/1322] staging: unisys/visorbus: add __init/__exit annotations gcc-4.6 causes a harmless warning about the init function: WARNING: vmlinux.o(.text+0xed62c2): Section mismatch in reference from the function init_unisys() to the function .init.text:visorutil_spar_detect() The function init_unisys() references the function __init visorutil_spar_detect(). This is often because init_unisys lacks a __init annotation or the annotation of visorutil_spar_detect is wrong. It appears that newer versions inline visorutil_spar_detect(), end up with an empty __init section. This marks the module entry points as __init and __exit respectively, which avoids the warning and slightly reduces the runtime code size. Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/staging/unisys/visorbus/visorchipset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/unisys/visorbus/visorchipset.c b/drivers/staging/unisys/visorbus/visorchipset.c index 74cce4f1a7bd..27ecf6fb49fd 100644 --- a/drivers/staging/unisys/visorbus/visorchipset.c +++ b/drivers/staging/unisys/visorbus/visorchipset.c @@ -1826,7 +1826,7 @@ static __init int visorutil_spar_detect(void) return 0; } -static int init_unisys(void) +static int __init init_unisys(void) { int result; @@ -1841,7 +1841,7 @@ static int init_unisys(void) return 0; }; -static void exit_unisys(void) +static void __exit exit_unisys(void) { acpi_bus_unregister_driver(&unisys_acpi_driver); } From a3563b09f132661d447f69224ef65fdec02f5c61 Mon Sep 17 00:00:00 2001 From: Arun Nagendran Date: Fri, 15 Sep 2017 12:30:32 -0400 Subject: [PATCH 0119/1322] staging: mt29f_spinand: Enable the read ECC before program the page Current program_page function did following operation: 1. read page (with ECC OFF) 2. modify the page 3. write the page (with ECC ON) For some case(buggy flash Chip), while read the page without ECC ON, we may read the page with bit flip error and modify that bad page without knowing the bit flip error on that page. also we re-calculate the hash for bad page and write it. This could bring potential in-consistency problem with Flash data. Verify this logic with GIGA DEVICE Part(GD5F2GQ4RCFIG): we see this in-conststency problem wit Giga Device and fix on this patch resovle that issue. 
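Stripped of driver detail, the hazard is a read-modify-write cycle whose read half bypasses ECC: a bit flip in the old page gets merged with the new data and written back, and the freshly computed ECC then vouches for corrupted contents. A hedged sketch of the corrected flow; read_page_ecc() and write_page_ecc() are illustrative stand-ins, not the spinand driver's real helpers:

#include <linux/types.h>

int read_page_ecc(unsigned int page, u8 *buf);          /* illustrative */
int write_page_ecc(unsigned int page, const u8 *buf);   /* illustrative */

static int program_page_rmw(unsigned int page, unsigned int offset,
                            const u8 *data, size_t len, u8 *wbuf)
{
        size_t i;
        int ret;

        /* 1. read the old page WITH ECC so existing bit flips are corrected */
        ret = read_page_ecc(page, wbuf);
        if (ret < 0)
                return ret;     /* uncorrectable: refuse to rewrite bad data */

        /* 2. merge: NAND programming can only clear bits, never set them */
        for (i = 0; i < len; i++)
                wbuf[offset + i] &= data[i];

        /* 3. write back; ECC is recomputed over the merged, corrected page */
        return write_page_ecc(page, wbuf);
}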
Signed-off-by: Arun Nagendran Signed-off-by: Greg Kroah-Hartman --- drivers/staging/mt29f_spinand/mt29f_spinand.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/staging/mt29f_spinand/mt29f_spinand.c b/drivers/staging/mt29f_spinand/mt29f_spinand.c index 13eaf16ecd16..87595c594b12 100644 --- a/drivers/staging/mt29f_spinand/mt29f_spinand.c +++ b/drivers/staging/mt29f_spinand/mt29f_spinand.c @@ -496,8 +496,12 @@ static int spinand_program_page(struct spi_device *spi_nand, if (!wbuf) return -ENOMEM; - enable_read_hw_ecc = 0; - spinand_read_page(spi_nand, page_id, 0, CACHE_BUF, wbuf); + enable_read_hw_ecc = 1; + retval = spinand_read_page(spi_nand, page_id, 0, CACHE_BUF, wbuf); + if (retval < 0) { + dev_err(&spi_nand->dev, "ecc error on read page!!!\n"); + return retval; + } for (i = offset, j = 0; i < len; i++, j++) wbuf[i] &= buf[j]; From e1bf28868ab0bfd1fc73dd8d5e642d88456c30e9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Sep 2017 14:55:06 +0100 Subject: [PATCH 0120/1322] staging: r8822be: fix null pointer dereferences with a null driver_adapter The call to _rtl_dbg_trace via macro HALMAC_RT_TRACE will trigger a null pointer deference on a null driver_adapter. Fix this by assigning driver_adapter earlier to halmac_adapter->driver_adapter before the tracing call so that a non-null driver_adapter is passed instead. I should have spotted these with an earlier patch I sent, but I overlooked these in the rather large CoverityScan logs. Detected by CoverityScan, CID#1454550, CID#1454554, CID#1454565, CID#1454591, CID#1454598 ("Explicit null dereferenced") Fixes: 938a0447f094 ("staging: r8822be: Add code for halmac sub-driver") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- .../rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c | 4 ++-- .../rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c index 5f84526cb5b5..edbf6af1c8b7 100644 --- a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c +++ b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_api_88xx.c @@ -2901,11 +2901,11 @@ halmac_update_datapack_88xx(struct halmac_adapter *halmac_adapter, if (halmac_adapter->fw_version.h2c_version < 4) return HALMAC_RET_FW_NO_SUPPORT; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "[TRACE]%s ==========>\n", __func__); - driver_adapter = halmac_adapter->driver_adapter; - HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "[TRACE]%s <==========\n", __func__); diff --git a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c index f33024e4d853..544f638ed3ef 100644 --- a/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c +++ b/drivers/staging/rtlwifi/halmac/halmac_88xx/halmac_func_88xx.c @@ -1618,10 +1618,11 @@ halmac_send_h2c_set_pwr_mode_88xx(struct halmac_adapter *halmac_adapter, void *driver_adapter = NULL; enum halmac_ret_status status = HALMAC_RET_SUCCESS; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "%s!!\n", __func__); - driver_adapter = halmac_adapter->driver_adapter; h2c_header = h2c_buff; h2c_cmd = h2c_header + HALMAC_H2C_CMD_HDR_SIZE_88XX; @@ -1713,10 +1714,11 @@ halmac_media_status_rpt_88xx(struct halmac_adapter 
*halmac_adapter, u8 op_mode, void *driver_adapter = NULL; enum halmac_ret_status status = HALMAC_RET_SUCCESS; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "halmac_send_h2c_set_pwr_mode_88xx!!\n"); - driver_adapter = halmac_adapter->driver_adapter; h2c_header = H2c_buff; h2c_cmd = h2c_header + HALMAC_H2C_CMD_HDR_SIZE_88XX; @@ -2143,10 +2145,11 @@ halmac_func_ctrl_ch_switch_88xx(struct halmac_adapter *halmac_adapter, enum halmac_cmd_process_status *process_status = &halmac_adapter->halmac_state.scan_state_set.process_status; + driver_adapter = halmac_adapter->driver_adapter; + HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "halmac_ctrl_ch_switch!!\n"); - driver_adapter = halmac_adapter->driver_adapter; halmac_api = (struct halmac_api *)halmac_adapter->halmac_api; if (halmac_transition_scan_state_88xx( @@ -2276,15 +2279,13 @@ enum halmac_ret_status halmac_send_h2c_update_bcn_parse_info_88xx( { u8 h2c_buff[HALMAC_H2C_CMD_SIZE_88XX] = {0}; u16 h2c_seq_mum = 0; - void *driver_adapter = NULL; + void *driver_adapter = halmac_adapter->driver_adapter; struct halmac_h2c_header_info h2c_header_info; enum halmac_ret_status status = HALMAC_RET_SUCCESS; HALMAC_RT_TRACE(driver_adapter, HALMAC_MSG_H2C, DBG_DMESG, "%s!!\n", __func__); - driver_adapter = halmac_adapter->driver_adapter; - UPDATE_BEACON_PARSING_INFO_SET_FUNC_EN(h2c_buff, bcn_ie_info->func_en); UPDATE_BEACON_PARSING_INFO_SET_SIZE_TH(h2c_buff, bcn_ie_info->size_th); UPDATE_BEACON_PARSING_INFO_SET_TIMEOUT(h2c_buff, bcn_ie_info->timeout); From b72703e26b9478da531144ce5c4552dd22f1103d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 6 Sep 2017 17:40:25 +0200 Subject: [PATCH 0121/1322] staging: pi433: Move limit check to switch default to kill warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2: drivers/staging/pi433/rf69.c: In function ‘rf69_set_dio_mapping’: drivers/staging/pi433/rf69.c:566: warning: ‘regaddr’ may be used uninitialized in this function drivers/staging/pi433/rf69.c:565: warning: ‘shift’ may be used uninitialized in this function drivers/staging/pi433/rf69.c:564: warning: ‘mask’ may be used uninitialized in this function While this is a false positive, it can easily be fixed by moving the limit check into the "default" case of the switch statement. 
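The warning it silences is easy to reproduce: even with a range check ahead of the switch, older gcc cannot prove that every case assigns the locals, so it flags them as possibly uninitialized; folding the bounds check into the default case makes the initialization visible on every path that continues. A standalone sketch with simplified register values, not the rf69 layout:

#include <stdio.h>

static int dio_to_reg(unsigned int dio, unsigned int value)
{
        unsigned int mask, shift, regaddr;

        switch (dio) {
        case 0: mask = 0xC0; shift = 6; regaddr = 0x25; break;
        case 1: mask = 0x30; shift = 4; regaddr = 0x26; break;
        case 2: mask = 0x0C; shift = 2; regaddr = 0x26; break;
        default:
                /* replaces a separate "dio > 2" check before the switch */
                return -1;
        }
        /* every path reaching this point has set mask, shift and regaddr */
        return (int)(regaddr << 8 | (mask & (value << shift)));
}

int main(void)
{
        int v = dio_to_reg(1, 3);

        if (v >= 0)
                printf("encoded: %#x\n", (unsigned int)v);
        printf("out of range: %d\n", dio_to_reg(7, 3));
        return 0;
}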
Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- drivers/staging/pi433/rf69.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/staging/pi433/rf69.c b/drivers/staging/pi433/rf69.c index c4b1b218ea38..290b419aa9dd 100644 --- a/drivers/staging/pi433/rf69.c +++ b/drivers/staging/pi433/rf69.c @@ -570,12 +570,6 @@ int rf69_set_dio_mapping(struct spi_device *spi, u8 DIONumber, u8 value) dev_dbg(&spi->dev, "set: DIO mapping"); #endif - // check DIO number - if (DIONumber > 5) { - dev_dbg(&spi->dev, "set: illegal input param"); - return -EINVAL; - } - switch (DIONumber) { case 0: mask=MASK_DIO0; shift=SHIFT_DIO0; regaddr=REG_DIOMAPPING1; break; case 1: mask=MASK_DIO1; shift=SHIFT_DIO1; regaddr=REG_DIOMAPPING1; break; @@ -583,6 +577,9 @@ int rf69_set_dio_mapping(struct spi_device *spi, u8 DIONumber, u8 value) case 3: mask=MASK_DIO3; shift=SHIFT_DIO3; regaddr=REG_DIOMAPPING1; break; case 4: mask=MASK_DIO4; shift=SHIFT_DIO4; regaddr=REG_DIOMAPPING2; break; case 5: mask=MASK_DIO5; shift=SHIFT_DIO5; regaddr=REG_DIOMAPPING2; break; + default: + dev_dbg(&spi->dev, "set: illegal input param"); + return -EINVAL; } // read reg From e5f5d0e20b6cecc0ebe6fc8e7df6f8823ad2d594 Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Tue, 5 Sep 2017 12:51:59 +0100 Subject: [PATCH 0122/1322] staging: speakup: fix speakup-r empty line lockup When cursor is at beginning of an empty or whitespace-only line and speakup-r typed, kernel locks up. This happens because deadlock of in input_event function over dev->event_lock, as demonstrated by lockdep logs. The reason for that is speakup simulates a down arrow - because cursor is at an empty line - while inside key press notifier handler which is ultimately triggered from input_event function. The simulated key press leads to input_event being called again, this time under its own context. So the spinlock is dev->event_lock is acquired while still being held. This patch ensures that key press is not simulated from inside key press notifier handler. Instead it delegates to cursor_timer. It starts the timer and passes RA_DOWN_ARROW as argument. When timer handler runs and sees RA_DOWN_ARROW, it will then call kbd_fakekey2(RA_DOWN_ARROW) which will correctly simulate the keypress inside timer context. When not inside key press notifier callback, the behaviour will remain the same as before this patch. 
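The deadlock shape is worth making explicit: input_event() takes dev->event_lock and then runs the keyboard notifier chain, so simulating a key press from inside the notifier re-enters input_event() and tries to take the same spinlock again on the same CPU. The fix defers the simulated key press to timer context, where the lock is no longer held. A much-simplified kernel-style sketch of that pattern; simulate_down_arrow() and the timer setup are illustrative, not speakup's exact code:

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <linux/notifier.h>

void simulate_down_arrow(void);         /* ends up back in input_event() */

static struct timer_list defer_timer;

/* timer (softirq) context: dev->event_lock is not held here */
static void defer_fake_key(unsigned long unused)
{
        simulate_down_arrow();
}

static int key_notifier(struct notifier_block *nb, unsigned long code, void *p)
{
        /*
         * Called from input_event() with dev->event_lock held: calling
         * simulate_down_arrow() directly would acquire that lock twice.
         * Hand the work to the timer instead.
         */
        mod_timer(&defer_timer, jiffies + msecs_to_jiffies(2));
        return NOTIFY_OK;
}

/* somewhere in init: setup_timer(&defer_timer, defer_fake_key, 0); */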
Signed-off-by: Okash Khawaja Reviewed-by: Samuel Thibault Signed-off-by: Greg Kroah-Hartman --- drivers/staging/speakup/main.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c index 67956e24779c..56f7be6af1f6 100644 --- a/drivers/staging/speakup/main.c +++ b/drivers/staging/speakup/main.c @@ -1376,6 +1376,8 @@ static void reset_highlight_buffers(struct vc_data *); static int read_all_key; +static int in_keyboard_notifier; + static void start_read_all_timer(struct vc_data *vc, int command); enum { @@ -1408,7 +1410,10 @@ static void read_all_doc(struct vc_data *vc) cursor_track = read_all_mode; spk_reset_index_count(0); if (get_sentence_buf(vc, 0) == -1) { - kbd_fakekey2(vc, RA_DOWN_ARROW); + del_timer(&cursor_timer); + if (!in_keyboard_notifier) + speakup_fake_down_arrow(); + start_read_all_timer(vc, RA_DOWN_ARROW); } else { say_sentence_num(0, 0); synth_insert_next_index(0); @@ -2212,8 +2217,10 @@ static int keyboard_notifier_call(struct notifier_block *nb, int ret = NOTIFY_OK; static int keycode; /* to hold the current keycode */ + in_keyboard_notifier = 1; + if (vc->vc_mode == KD_GRAPHICS) - return ret; + goto out; /* * First, determine whether we are handling a fake keypress on @@ -2225,7 +2232,7 @@ static int keyboard_notifier_call(struct notifier_block *nb, */ if (speakup_fake_key_pressed()) - return ret; + goto out; switch (code) { case KBD_KEYCODE: @@ -2266,6 +2273,8 @@ static int keyboard_notifier_call(struct notifier_block *nb, break; } } +out: + in_keyboard_notifier = 0; return ret; } From 974d4d03fc020af4fa4e9e72a86f0fefa37803c5 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Sun, 3 Sep 2017 19:06:31 +0200 Subject: [PATCH 0123/1322] staging: vchiq_2835_arm: Fix NULL ptr dereference in free_pagelist This fixes a NULL pointer dereference on RPi 2 with multi_v7_defconfig. The function page_address() could return NULL with enabled CONFIG_HIGHMEM. So fix this by using kmap() instead. 
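The underlying rule: page_address() only yields a valid pointer for pages with a permanent kernel mapping, so on a CONFIG_HIGHMEM kernel a highmem page returns NULL and the following memcpy() oopses; kmap() installs a temporary mapping on demand and simply returns the linear-map address for lowmem pages. A small sketch of the safe copy pattern, with copy_to_page() as an illustrative name:

#include <linux/highmem.h>
#include <linux/string.h>

/* write 'len' bytes at 'offset' into 'page'; safe for highmem pages too */
static void copy_to_page(struct page *page, unsigned int offset,
                         const void *src, size_t len)
{
        char *va = kmap(page);  /* never NULL; may sleep waiting for a slot */

        memcpy(va + offset, src, len);
        kunmap(page);
}

The same pairing applies to reads: every kmap() must be balanced by a kunmap() of the same page once the temporary mapping is no longer needed.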
Signed-off-by: Stefan Wahren Fixes: 71bad7f08641 ("staging: add bcm2708 vchiq driver") Cc: stable Signed-off-by: Greg Kroah-Hartman --- .../vc04_services/interface/vchiq_arm/vchiq_2835_arm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c index 0159ca4407d8..be08849175ea 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c @@ -612,18 +612,20 @@ free_pagelist(struct vchiq_pagelist_info *pagelistinfo, if (head_bytes > actual) head_bytes = actual; - memcpy((char *)page_address(pages[0]) + + memcpy((char *)kmap(pages[0]) + pagelist->offset, fragments, head_bytes); + kunmap(pages[0]); } if ((actual >= 0) && (head_bytes < actual) && (tail_bytes != 0)) { - memcpy((char *)page_address(pages[num_pages - 1]) + + memcpy((char *)kmap(pages[num_pages - 1]) + ((pagelist->offset + actual) & (PAGE_SIZE - 1) & ~(g_cache_line_size - 1)), fragments + g_cache_line_size, tail_bytes); + kunmap(pages[num_pages - 1]); } down(&g_free_fragments_mutex); From 55168470835688e5da5828cdcf1b1498d7baadb1 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Mon, 11 Sep 2017 10:45:12 +0300 Subject: [PATCH 0124/1322] usb: dwc3: ep0: fix DMA starvation by assigning req->trb on ep0 If we don't assign a TRB to ep0 requests, we won't be able to unmap the request later on resulting in starvation of DMA resources. Fixes: 4a71fcb8ac5f ("usb: dwc3: gadget: only unmap requests from DMA if mapped") Reported-by: Thinh Nguyen Tested-by: Thinh Nguyen Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/ep0.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 827e376bfa97..75e6cb044eb2 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -990,6 +990,8 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, DWC3_TRBCTL_CONTROL_DATA, true); + req->trb = &dwc->ep0_trb[dep->trb_enqueue - 1]; + /* Now prepare one extra TRB to align transfer size */ dwc3_ep0_prepare_one_trb(dep, dwc->bounce_addr, maxpacket - rem, @@ -1015,6 +1017,8 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, DWC3_TRBCTL_CONTROL_DATA, true); + req->trb = &dwc->ep0_trb[dep->trb_enqueue - 1]; + /* Now prepare one extra TRB to align transfer size */ dwc3_ep0_prepare_one_trb(dep, dwc->bounce_addr, 0, DWC3_TRBCTL_CONTROL_DATA, @@ -1029,6 +1033,9 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, dwc3_ep0_prepare_one_trb(dep, req->request.dma, req->request.length, DWC3_TRBCTL_CONTROL_DATA, false); + + req->trb = &dwc->ep0_trb[dep->trb_enqueue]; + ret = dwc3_ep0_start_trans(dep); } From 13541226dc056fa3f54417ce12f18ba711a1591c Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 31 Aug 2017 11:06:07 -0700 Subject: [PATCH 0125/1322] ARC: reset: remove the misleading v1 suffix all over There is no plan yet to do a v2 board. And even if we were to do it only some IPs would actually change, so it be best to add suffixes at that point, not now ! 
Signed-off-by: Vineet Gupta Signed-off-by: Philipp Zabel --- ...,hsdk-v1-reset.txt => snps,hsdk-reset.txt} | 8 ++-- MAINTAINERS | 6 +-- drivers/reset/Kconfig | 6 +-- drivers/reset/Makefile | 2 +- .../reset/{reset-hsdk-v1.c => reset-hsdk.c} | 44 +++++++++---------- include/dt-bindings/reset/snps,hsdk-reset.h | 17 +++++++ .../dt-bindings/reset/snps,hsdk-v1-reset.h | 17 ------- 7 files changed, 50 insertions(+), 50 deletions(-) rename Documentation/devicetree/bindings/reset/{snps,hsdk-v1-reset.txt => snps,hsdk-reset.txt} (74%) rename drivers/reset/{reset-hsdk-v1.c => reset-hsdk.c} (72%) create mode 100644 include/dt-bindings/reset/snps,hsdk-reset.h delete mode 100644 include/dt-bindings/reset/snps,hsdk-v1-reset.h diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt similarity index 74% rename from Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt rename to Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt index 6a68146ee353..830069b1c37c 100644 --- a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt +++ b/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt @@ -1,11 +1,11 @@ -Binding for the HSDK v1 reset controller +Binding for the Synopsys HSDK reset controller This binding uses the common reset binding[1]. [1] Documentation/devicetree/bindings/reset/reset.txt Required properties: -- compatible: should be "snps,hsdk-v1.0-reset". +- compatible: should be "snps,hsdk-reset". - reg: should always contain 2 pairs address - length: first for reset configuration register and second for corresponding SW reset and status bits register. @@ -13,7 +13,7 @@ Required properties: Example: reset: reset@880 { - compatible = "snps,hsdk-v1.0-reset"; + compatible = "snps,hsdk-reset"; #reset-cells = <1>; reg = <0x8A0 0x4>, <0xFF0 0x4>; }; @@ -25,4 +25,4 @@ Specifying reset lines connected to IP modules: .... }; -The index could be found in +The index could be found in diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..3b9887b3644e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12915,9 +12915,9 @@ F: drivers/mmc/host/dw_mmc* SYNOPSYS HSDK RESET CONTROLLER DRIVER M: Eugeniy Paltsev S: Supported -F: drivers/reset/reset-hsdk-v1.c -F: include/dt-bindings/reset/snps,hsdk-v1-reset.h -F: Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt +F: drivers/reset/reset-hsdk.c +F: include/dt-bindings/reset/snps,hsdk-reset.h +F: Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt SYSTEM CONFIGURATION (SYSCON) M: Lee Jones diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index d063bd5373dd..a7c7d5a8c089 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -34,12 +34,12 @@ config RESET_BERLIN help This enables the reset controller driver for Marvell Berlin SoCs. -config RESET_HSDK_V1 - bool "HSDK v1 Reset Driver" +config RESET_HSDK + bool "Synopsys HSDK Reset Driver" depends on HAS_IOMEM default n help - This enables the reset controller driver for HSDK v1. + This enables the reset controller driver for HSDK board. 
config RESET_IMX7 bool "i.MX7 Reset Driver" if COMPILE_TEST diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile index d368367110e5..af1c15c330b3 100644 --- a/drivers/reset/Makefile +++ b/drivers/reset/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_ARCH_TEGRA) += tegra/ obj-$(CONFIG_RESET_A10SR) += reset-a10sr.o obj-$(CONFIG_RESET_ATH79) += reset-ath79.o obj-$(CONFIG_RESET_BERLIN) += reset-berlin.o -obj-$(CONFIG_RESET_HSDK_V1) += reset-hsdk-v1.o +obj-$(CONFIG_RESET_HSDK) += reset-hsdk.o obj-$(CONFIG_RESET_IMX7) += reset-imx7.o obj-$(CONFIG_RESET_LANTIQ) += reset-lantiq.o obj-$(CONFIG_RESET_LPC18XX) += reset-lpc18xx.o diff --git a/drivers/reset/reset-hsdk-v1.c b/drivers/reset/reset-hsdk.c similarity index 72% rename from drivers/reset/reset-hsdk-v1.c rename to drivers/reset/reset-hsdk.c index bca13e4bf622..8bce391c6943 100644 --- a/drivers/reset/reset-hsdk-v1.c +++ b/drivers/reset/reset-hsdk.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2017 Synopsys. * - * Synopsys HSDKv1 SDP reset driver. + * Synopsys HSDK Development platform reset driver. * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without any @@ -18,9 +18,9 @@ #include #include -#define to_hsdkv1_rst(p) container_of((p), struct hsdkv1_rst, rcdev) +#define to_hsdk_rst(p) container_of((p), struct hsdk_rst, rcdev) -struct hsdkv1_rst { +struct hsdk_rst { void __iomem *regs_ctl; void __iomem *regs_rst; spinlock_t lock; @@ -49,12 +49,12 @@ static const u32 rst_map[] = { #define CGU_IP_SW_RESET_RESET BIT(0) #define SW_RESET_TIMEOUT 10000 -static void hsdkv1_reset_config(struct hsdkv1_rst *rst, unsigned long id) +static void hsdk_reset_config(struct hsdk_rst *rst, unsigned long id) { writel(rst_map[id], rst->regs_ctl + CGU_SYS_RST_CTRL); } -static int hsdkv1_reset_do(struct hsdkv1_rst *rst) +static int hsdk_reset_do(struct hsdk_rst *rst) { u32 reg; @@ -69,28 +69,28 @@ static int hsdkv1_reset_do(struct hsdkv1_rst *rst) !(reg & CGU_IP_SW_RESET_RESET), 5, SW_RESET_TIMEOUT); } -static int hsdkv1_reset_reset(struct reset_controller_dev *rcdev, +static int hsdk_reset_reset(struct reset_controller_dev *rcdev, unsigned long id) { - struct hsdkv1_rst *rst = to_hsdkv1_rst(rcdev); + struct hsdk_rst *rst = to_hsdk_rst(rcdev); unsigned long flags; int ret; spin_lock_irqsave(&rst->lock, flags); - hsdkv1_reset_config(rst, id); - ret = hsdkv1_reset_do(rst); + hsdk_reset_config(rst, id); + ret = hsdk_reset_do(rst); spin_unlock_irqrestore(&rst->lock, flags); return ret; } -static const struct reset_control_ops hsdkv1_reset_ops = { - .reset = hsdkv1_reset_reset, +static const struct reset_control_ops hsdk_reset_ops = { + .reset = hsdk_reset_reset, }; -static int hsdkv1_reset_probe(struct platform_device *pdev) +static int hsdk_reset_probe(struct platform_device *pdev) { - struct hsdkv1_rst *rst; + struct hsdk_rst *rst; struct resource *mem; rst = devm_kzalloc(&pdev->dev, sizeof(*rst), GFP_KERNEL); @@ -110,7 +110,7 @@ static int hsdkv1_reset_probe(struct platform_device *pdev) spin_lock_init(&rst->lock); rst->rcdev.owner = THIS_MODULE; - rst->rcdev.ops = &hsdkv1_reset_ops; + rst->rcdev.ops = &hsdk_reset_ops; rst->rcdev.of_node = pdev->dev.of_node; rst->rcdev.nr_resets = HSDK_MAX_RESETS; rst->rcdev.of_reset_n_cells = 1; @@ -118,20 +118,20 @@ static int hsdkv1_reset_probe(struct platform_device *pdev) return reset_controller_register(&rst->rcdev); } -static const struct of_device_id hsdkv1_reset_dt_match[] = { - { .compatible = "snps,hsdk-v1.0-reset" }, +static const struct of_device_id 
hsdk_reset_dt_match[] = { + { .compatible = "snps,hsdk-reset" }, { }, }; -static struct platform_driver hsdkv1_reset_driver = { - .probe = hsdkv1_reset_probe, +static struct platform_driver hsdk_reset_driver = { + .probe = hsdk_reset_probe, .driver = { - .name = "hsdk-v1.0-reset", - .of_match_table = hsdkv1_reset_dt_match, + .name = "hsdk-reset", + .of_match_table = hsdk_reset_dt_match, }, }; -builtin_platform_driver(hsdkv1_reset_driver); +builtin_platform_driver(hsdk_reset_driver); MODULE_AUTHOR("Eugeniy Paltsev "); -MODULE_DESCRIPTION("Synopsys HSDKv1 SDP reset driver"); +MODULE_DESCRIPTION("Synopsys HSDK SDP reset driver"); MODULE_LICENSE("GPL v2"); diff --git a/include/dt-bindings/reset/snps,hsdk-reset.h b/include/dt-bindings/reset/snps,hsdk-reset.h new file mode 100644 index 000000000000..e1a643e4bc91 --- /dev/null +++ b/include/dt-bindings/reset/snps,hsdk-reset.h @@ -0,0 +1,17 @@ +/** + * This header provides index for the HSDK reset controller. + */ +#ifndef _DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK +#define _DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK + +#define HSDK_APB_RESET 0 +#define HSDK_AXI_RESET 1 +#define HSDK_ETH_RESET 2 +#define HSDK_USB_RESET 3 +#define HSDK_SDIO_RESET 4 +#define HSDK_HDMI_RESET 5 +#define HSDK_GFX_RESET 6 +#define HSDK_DMAC_RESET 7 +#define HSDK_EBI_RESET 8 + +#endif /*_DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK*/ diff --git a/include/dt-bindings/reset/snps,hsdk-v1-reset.h b/include/dt-bindings/reset/snps,hsdk-v1-reset.h deleted file mode 100644 index d898c89b7123..000000000000 --- a/include/dt-bindings/reset/snps,hsdk-v1-reset.h +++ /dev/null @@ -1,17 +0,0 @@ -/** - * This header provides index for the HSDK v1 reset controller. - */ -#ifndef _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1 -#define _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1 - -#define HSDK_V1_APB_RESET 0 -#define HSDK_V1_AXI_RESET 1 -#define HSDK_V1_ETH_RESET 2 -#define HSDK_V1_USB_RESET 3 -#define HSDK_V1_SDIO_RESET 4 -#define HSDK_V1_HDMI_RESET 5 -#define HSDK_V1_GFX_RESET 6 -#define HSDK_V1_DMAC_RESET 7 -#define HSDK_V1_EBI_RESET 8 - -#endif /*_DT_BINDINGS_RESET_CONTROLLER_HSDK_V1*/ From 74378c5c8cdaf0ce9f65e67cbd0613286f2c3bad Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 5 Sep 2017 20:16:27 +0200 Subject: [PATCH 0126/1322] driver core: Fix link to device power management documentation Correct location as of commit 2728b2d2e5be4b82 (PM / core / docs: Convert sleep states API document to reST). Fixes: 2728b2d2e5be4b82 (PM / core / docs: Convert sleep states API document to reST) Signed-off-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/device.h b/include/linux/device.h index c6f27207dbe8..1d2607923a24 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -838,7 +838,7 @@ struct dev_links_info { * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. - * See Documentation/power/admin-guide/devices.rst for details. + * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. 
From 096a2c610df279d001a8f61faa3eb7f98ca6196b Mon Sep 17 00:00:00 2001 From: Rafael Wysocki Date: Fri, 8 Sep 2017 01:20:54 +0200 Subject: [PATCH 0127/1322] ACPI / PMIC: Add code reviewers to MAINTAINERS Andy and Mika review code changes under drivers/acpi/pmic/ on a regular basis and I rely on their help with that, so add them as code reviwewers for that part of the kernel. Signed-off-by: Rafael J. Wysocki Acked-by: Lee Jones Reviewed-by: Mika Westerberg Reviewed-by: Andy Shevchenko --- MAINTAINERS | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..fa4e916b72e9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -352,6 +352,18 @@ L: linux-acpi@vger.kernel.org S: Maintained F: drivers/acpi/arm64 +ACPI PMIC DRIVERS +M: "Rafael J. Wysocki" +M: Len Brown +R: Andy Shevchenko +R: Mika Westerberg +L: linux-acpi@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-acpi/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm +B: https://bugzilla.kernel.org +S: Supported +F: drivers/acpi/pmic/ + ACPI THERMAL DRIVER M: Zhang Rui L: linux-acpi@vger.kernel.org From 41ba8bd0829f5c210715ece8b0f699bed0da8eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Tue, 5 Sep 2017 23:14:29 +0200 Subject: [PATCH 0128/1322] PM / QoS: Use the correct variable to check the QoS request type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the actual function argument for the validation of the request type, instead of the type field in a fresh (supposedly zero-initialized) request structure. Signed-off-by: Jan H. Schönherr Signed-off-by: Rafael J. Wysocki --- drivers/base/power/qos.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index f850daeffba4..277d43a83f53 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -277,11 +277,11 @@ void dev_pm_qos_constraints_destroy(struct device *dev) mutex_unlock(&dev_pm_qos_sysfs_mtx); } -static bool dev_pm_qos_invalid_request(struct device *dev, - struct dev_pm_qos_request *req) +static bool dev_pm_qos_invalid_req_type(struct device *dev, + enum dev_pm_qos_req_type type) { - return !req || (req->type == DEV_PM_QOS_LATENCY_TOLERANCE - && !dev->power.set_latency_tolerance); + return type == DEV_PM_QOS_LATENCY_TOLERANCE && + !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, @@ -290,7 +290,7 @@ static int __dev_pm_qos_add_request(struct device *dev, { int ret = 0; - if (!dev || dev_pm_qos_invalid_request(dev, req)) + if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), From 73600b619bf8b9803a0610624a0e11f5ed8f761e Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Sat, 2 Sep 2017 10:49:38 +0200 Subject: [PATCH 0129/1322] mtd: nand: remove unused blockmask variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fix the following build warning: drivers/mtd/nand/nand_base.c:2671:30: attention : variable ‘blockmask’ set but not used [-Wunused-but-set-variable] Fixes: 0b4773fd1649 ("mtd: nand: Drop unused cached programming support") Signed-off-by: Corentin Labbe Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 
bcc8cef1c615..12edaae17d81 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2668,7 +2668,7 @@ static uint8_t *nand_fill_oob(struct mtd_info *mtd, uint8_t *oob, size_t len, static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { - int chipnr, realpage, page, blockmask, column; + int chipnr, realpage, page, column; struct nand_chip *chip = mtd_to_nand(mtd); uint32_t writelen = ops->len; @@ -2704,7 +2704,6 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, realpage = (int)(to >> chip->page_shift); page = realpage & chip->pagemask; - blockmask = (1 << (chip->phys_erase_shift - chip->page_shift)) - 1; /* Invalidate the page cache, when we write to the cached page */ if (to <= ((loff_t)chip->pagebuf << chip->page_shift) && From fe9c1c9555a529bf678148f719c1b9f1730f68a3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 14 Sep 2017 14:38:58 +0200 Subject: [PATCH 0130/1322] xen: don't compile pv-specific parts if XEN_PV isn't configured xenbus_client.c contains some functions specific for pv guests. Enclose them with #ifdef CONFIG_XEN_PV to avoid compiling them when they are not needed (e.g. on ARM). Signed-off-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus_client.c | 130 +++++++++++++++-------------- 1 file changed, 67 insertions(+), 63 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 82a8866758ee..a1c17000129b 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -519,64 +519,6 @@ static int __xenbus_map_ring(struct xenbus_device *dev, return err; } -static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, - grant_ref_t *gnt_refs, - unsigned int nr_grefs, - void **vaddr) -{ - struct xenbus_map_node *node; - struct vm_struct *area; - pte_t *ptes[XENBUS_MAX_RING_GRANTS]; - phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; - int err = GNTST_okay; - int i; - bool leaked; - - *vaddr = NULL; - - if (nr_grefs > XENBUS_MAX_RING_GRANTS) - return -EINVAL; - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - - area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); - if (!area) { - kfree(node); - return -ENOMEM; - } - - for (i = 0; i < nr_grefs; i++) - phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; - - err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, - phys_addrs, - GNTMAP_host_map | GNTMAP_contains_pte, - &leaked); - if (err) - goto failed; - - node->nr_handles = nr_grefs; - node->pv.area = area; - - spin_lock(&xenbus_valloc_lock); - list_add(&node->next, &xenbus_valloc_pages); - spin_unlock(&xenbus_valloc_lock); - - *vaddr = area->addr; - return 0; - -failed: - if (!leaked) - free_vm_area(area); - else - pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); - - kfree(node); - return err; -} - struct map_ring_valloc_hvm { unsigned int idx; @@ -725,6 +667,65 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) } EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); +#ifdef CONFIG_XEN_PV +static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node; + struct vm_struct *area; + pte_t *ptes[XENBUS_MAX_RING_GRANTS]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; + int err = GNTST_okay; + int i; + bool leaked; + + *vaddr = NULL; + + if (nr_grefs > XENBUS_MAX_RING_GRANTS) + return -EINVAL; + + node = kzalloc(sizeof(*node), 
GFP_KERNEL); + if (!node) + return -ENOMEM; + + area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes); + if (!area) { + kfree(node); + return -ENOMEM; + } + + for (i = 0; i < nr_grefs; i++) + phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; + + err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, + phys_addrs, + GNTMAP_host_map | GNTMAP_contains_pte, + &leaked); + if (err) + goto failed; + + node->nr_handles = nr_grefs; + node->pv.area = area; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = area->addr; + return 0; + +failed: + if (!leaked) + free_vm_area(area); + else + pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); + + kfree(node); + return err; +} + static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) { struct xenbus_map_node *node; @@ -788,6 +789,12 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) return err; } +static const struct xenbus_ring_ops ring_ops_pv = { + .map = xenbus_map_ring_valloc_pv, + .unmap = xenbus_unmap_ring_vfree_pv, +}; +#endif + struct unmap_ring_vfree_hvm { unsigned int idx; @@ -916,11 +923,6 @@ enum xenbus_state xenbus_read_driver_state(const char *path) } EXPORT_SYMBOL_GPL(xenbus_read_driver_state); -static const struct xenbus_ring_ops ring_ops_pv = { - .map = xenbus_map_ring_valloc_pv, - .unmap = xenbus_unmap_ring_vfree_pv, -}; - static const struct xenbus_ring_ops ring_ops_hvm = { .map = xenbus_map_ring_valloc_hvm, .unmap = xenbus_unmap_ring_vfree_hvm, @@ -928,8 +930,10 @@ static const struct xenbus_ring_ops ring_ops_hvm = { void __init xenbus_ring_ops_init(void) { +#ifdef CONFIG_XEN_PV if (!xen_feature(XENFEAT_auto_translated_physmap)) ring_ops = &ring_ops_pv; else +#endif ring_ops = &ring_ops_hvm; } From a931b9ce93841a5b66b709ba5a244276e345e63b Mon Sep 17 00:00:00 2001 From: Guneshwor Singh Date: Thu, 14 Sep 2017 17:49:40 +0530 Subject: [PATCH 0131/1322] ALSA: compress: Remove unused variable Commit 04c5d5a430fc ("ALSA: compress: Embed struct device") removed the statement that used 'str' but didn't remove the variable itself. So remove it. 
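For illustration only (this sketches the hazard, it is not new driver code; 'str' mirrors the removed variable and the format string is abbreviated):

	char str[16];				/* never written before use */

	/* Passing an uninitialized buffer to a "%s" conversion makes
	 * pr_debug() walk indeterminate stack bytes until it happens to
	 * find a NUL: garbage output at best, an out-of-bounds read at
	 * worst, hence the stable tag below.
	 */
	pr_debug("reg %s for device %s ...\n", str, compr->name);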
[Adding stable to Cc since pr_debug() may refer to the uninitialized buffer -- tiwai] Fixes: 04c5d5a430fc ("ALSA: compress: Embed struct device") Signed-off-by: Guneshwor Singh Cc: Signed-off-by: Takashi Iwai --- sound/core/compress_offload.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index fec1dfdb14ad..4490a699030b 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -948,14 +948,13 @@ static const struct file_operations snd_compr_file_ops = { static int snd_compress_dev_register(struct snd_device *device) { int ret = -EINVAL; - char str[16]; struct snd_compr *compr; if (snd_BUG_ON(!device || !device->device_data)) return -EBADFD; compr = device->device_data; - pr_debug("reg %s for device %s, direction %d\n", str, compr->name, + pr_debug("reg device %s, direction %d\n", compr->name, compr->direction); /* register compressed device */ ret = snd_register_device(SNDRV_DEVICE_TYPE_COMPRESS, From 1c363eaece2752c5f8b1b874cb4ae435de06aa66 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:56:13 +0200 Subject: [PATCH 0132/1322] android: binder: fix type mismatch warning Allowing binder to expose the 64-bit API on 32-bit kernels caused a build warning: drivers/android/binder.c: In function 'binder_transaction_buffer_release': drivers/android/binder.c:2220:15: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] fd_array = (u32 *)(parent_buffer + fda->parent_offset); ^ drivers/android/binder.c: In function 'binder_translate_fd_array': drivers/android/binder.c:2445:13: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] fd_array = (u32 *)(parent_buffer + fda->parent_offset); ^ drivers/android/binder.c: In function 'binder_fixup_parent': drivers/android/binder.c:2511:18: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] This adds extra type casts to avoid the warning. However, there is another problem with the Kconfig option: turning it on or off creates two incompatible ABI versions, a kernel that has this enabled cannot run user space that was built without it or vice versa. A better solution might be to leave the option hidden until the binder code is fixed to deal with both ABI versions. 
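To make the warning concrete, a minimal stand-alone sketch of the same construct (the names are stand-ins for the binder fields, not binder code):

#include <stdint.h>

/* On a 32-bit build the 64-bit offset promotes the whole sum to 64 bits,
 * so casting it straight to a pointer narrows the value and gcc emits
 * -Wint-to-pointer-cast.  Casting the offset down to uintptr_t first keeps
 * the arithmetic at pointer width, which is what the hunks below do.
 */
static uint32_t *fd_array_at(uintptr_t parent_buffer, uint64_t parent_offset)
{
	/* warns on 32-bit: (uint32_t *)(parent_buffer + parent_offset) */
	return (uint32_t *)(parent_buffer + (uintptr_t)parent_offset);
}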
Fixes: e8d2ed7db7c3 ("Revert "staging: Fix build issues with new binder API"") Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d055b3f2a207..72af95c9ea22 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2217,7 +2217,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, debug_id, (u64)fda->num_fds); continue; } - fd_array = (u32 *)(parent_buffer + fda->parent_offset); + fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); for (fd_index = 0; fd_index < fda->num_fds; fd_index++) task_close_fd(proc, fd_array[fd_index]); } break; @@ -2442,7 +2442,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, */ parent_buffer = parent->buffer - binder_alloc_get_user_buffer_offset(&target_proc->alloc); - fd_array = (u32 *)(parent_buffer + fda->parent_offset); + fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); @@ -2508,7 +2508,7 @@ static int binder_fixup_parent(struct binder_transaction *t, proc->pid, thread->pid); return -EINVAL; } - parent_buffer = (u8 *)(parent->buffer - + parent_buffer = (u8 *)((uintptr_t)parent->buffer - binder_alloc_get_user_buffer_offset( &target_proc->alloc)); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; From 52b81611f209da5f49019260522633e994e241b5 Mon Sep 17 00:00:00 2001 From: Xu YiPing Date: Tue, 5 Sep 2017 10:25:38 -0700 Subject: [PATCH 0133/1322] binder: fix an ret value override commit 372e3147df70 ("binder: guarantee txn complete / errors delivered in-order") incorrectly defined a local ret value. This ret value will be invalid when out of the if block Fixes: 372e3147df70 ("binder: refactor binder ref inc/dec for thread safety") Signed-off-by: Xu YiPing Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 72af95c9ea22..b257bb0e2cfe 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2326,7 +2326,6 @@ static int binder_translate_handle(struct flat_binder_object *fp, (u64)node->ptr); binder_node_unlock(node); } else { - int ret; struct binder_ref_data dest_rdata; binder_node_unlock(node); From d53bebdf4d779497b29e1aad26e19cac1d446f42 Mon Sep 17 00:00:00 2001 From: Xu YiPing Date: Tue, 5 Sep 2017 10:21:52 -0700 Subject: [PATCH 0134/1322] binder: fix memory corruption in binder_transaction binder commit 7a4408c6bd3e ("binder: make sure accesses to proc/thread are safe") made a change to enqueue tcomplete to thread->todo before enqueuing the transaction. However, in err_dead_proc_or_thread case, the tcomplete is directly freed, without dequeued. It may cause the thread->todo list to be corrupted. So, dequeue it before freeing. 
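The underlying pattern is plain variable shadowing; a minimal stand-alone sketch (illustrative names, not the binder code):

static int failing_call(void)
{
	return -22;			/* stand-in for a failing ref operation */
}

static int translate_sketch(void)
{
	int ret = 0;

	if (0) {
		/* success path */
	} else {
		int ret;		/* BUG: shadows the outer ret */

		ret = failing_call();	/* the error lands in the inner ret... */
		(void)ret;
	}

	return ret;			/* ...and is lost: the caller still sees 0 */
}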
Fixes: 7a4408c6bd3e ("binder: make sure accesses to proc/thread are safe") Signed-off-by: Xu YiPing Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index b257bb0e2cfe..ab34239a76ee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3082,6 +3082,7 @@ static void binder_transaction(struct binder_proc *proc, err_dead_proc_or_thread: return_error = BR_DEAD_REPLY; return_error_line = __LINE__; + binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: From 93dc1774d2a4c7a298d5cdf78cc8acdcb7b1428d Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 7 Sep 2017 15:37:30 +0200 Subject: [PATCH 0135/1322] auxdisplay: charlcd: properly restore atomic counter on error path Commit f4757af ("staging: panel: Fix single-open policy race condition") introduced in 3.19-rc1 attempted to fix a race condition on the open, but failed to properly do it and used to exit without restoring the semaphore. This results in -EBUSY being returned after the first open error until the module is reloaded or the system restarted (ie: consecutive to a dual open resulting in -EBUSY or to a permission error). [ Note for stable maintainers: the code moved from drivers/misc/panel.c to drivers/auxdisplay/{charlcd,panel}.c during 4.12. The patch easily applies there (modulo the renamed atomic counter) but I can provide a tested backport if desired. ] Fixes: f4757af85 # 3.19-rc1 Cc: stable@vger.kernel.org Cc: Mariusz Gorski Cc: Geert Uytterhoeven Cc: Miguel Ojeda Sandonis Signed-off-by: Willy Tarreau Signed-off-by: Greg Kroah-Hartman --- drivers/auxdisplay/charlcd.c | 11 +++++++++-- drivers/auxdisplay/panel.c | 13 ++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index cfeb049a01ef..642afd88870b 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -647,18 +647,25 @@ static ssize_t charlcd_write(struct file *file, const char __user *buf, static int charlcd_open(struct inode *inode, struct file *file) { struct charlcd_priv *priv = to_priv(the_charlcd); + int ret; + ret = -EBUSY; if (!atomic_dec_and_test(&charlcd_available)) - return -EBUSY; /* open only once at a time */ + goto fail; /* open only once at a time */ + ret = -EPERM; if (file->f_mode & FMODE_READ) /* device is write-only */ - return -EPERM; + goto fail; if (priv->must_clear) { charlcd_clear_display(&priv->lcd); priv->must_clear = false; } return nonseekable_open(inode, file); + + fail: + atomic_inc(&charlcd_available); + return ret; } static int charlcd_release(struct inode *inode, struct file *file) diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index df126dcdaf18..6911acd896d9 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1105,14 +1105,21 @@ static ssize_t keypad_read(struct file *file, static int keypad_open(struct inode *inode, struct file *file) { - if (!atomic_dec_and_test(&keypad_available)) - return -EBUSY; /* open only once at a time */ + int ret; + ret = -EBUSY; + if (!atomic_dec_and_test(&keypad_available)) + goto fail; /* open only once at a time */ + + ret = -EPERM; if (file->f_mode & FMODE_WRITE) /* device is read-only */ - return -EPERM; + goto fail; keypad_buflen = 0; /* flush the buffer on opening */ return 0; + fail: + atomic_inc(&keypad_available); + return ret; } static int 
keypad_release(struct inode *inode, struct file *file) From 38b0774c0598c7a54b8499d18c2b764c35dc94ab Mon Sep 17 00:00:00 2001 From: Guy Shapiro Date: Mon, 11 Sep 2017 11:00:11 +0200 Subject: [PATCH 0136/1322] nvmem: core: return EFBIG on out-of-range write When writing data that exceeds the nvmem size to a nvmem sysfs file using the sh redirection operator >, the shell hangs, trying to write the out-of-range bytes endlessly. Fix the problem by returning EFBIG described in man 2 write. Similar change was done for binary sysfs files on commit 0936896056365349afa867c16e9f9100a6707cbf Signed-off-by: Guy Shapiro Cc: linux-api@vger.kernel.org Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index de54c7f5048a..3866117bc285 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -135,7 +135,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj, /* Stop the user from writing */ if (pos >= nvmem->size) - return 0; + return -EFBIG; if (count < nvmem->word_size) return -EINVAL; From aad8d097c9224be264939fc6c02a5570ea094f60 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 Sep 2017 11:00:12 +0200 Subject: [PATCH 0137/1322] nvmem: add missing of_node_put() in of_nvmem_cell_get() of_get_next_parent() increments the refcount of the returned node. It should be put when done. Signed-off-by: Masahiro Yamada Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 3866117bc285..d12e5de78e70 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -789,6 +789,7 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, return ERR_PTR(-EINVAL); nvmem = __nvmem_device_get(nvmem_np, NULL, NULL); + of_node_put(nvmem_np); if (IS_ERR(nvmem)) return ERR_CAST(nvmem); From 6878e7de6af726de47f9f3bec649c3f49e786586 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 13 Sep 2017 16:29:48 -0700 Subject: [PATCH 0138/1322] driver core: suppress sending MODALIAS in UNBIND uevents The current udev rules cause modules to be loaded on all device events save for "remove". With the introduction of KOBJ_BIND/KOBJ_UNBIND this causes issues, as driver modules that have devices bound to their drivers get immediately reloaded, and it appears to the user that module unloading doe snot work. The standard udev matching rule is foillowing: ENV{MODALIAS}=="?*", RUN{builtin}+="kmod load $env{MODALIAS}" Given that MODALIAS data is not terribly useful for UNBIND event, let's zap it from the generated uevent environment until we get userspace updated with the correct udev rule that only loads modules on "add" event. 
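For illustration only (not part of this patch), a udev rule that keys module loading to the "add" action would look like:

ACTION=="add", ENV{MODALIAS}=="?*", RUN{builtin}+="kmod load $env{MODALIAS}"

Once userspace ships a rule along those lines, the suppression below can be revisited.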
Reported-by: Jakub Kicinski Tested-by: Jakub Kicinski Fixes: 1455cf8dbfd0 ("driver core: emit uevents when device is bound ...") Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- lib/kobject_uevent.c | 49 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index e590523ea476..f237a09a5862 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -294,6 +294,26 @@ static void cleanup_uevent_env(struct subprocess_info *info) } #endif +static void zap_modalias_env(struct kobj_uevent_env *env) +{ + static const char modalias_prefix[] = "MODALIAS="; + int i; + + for (i = 0; i < env->envp_idx;) { + if (strncmp(env->envp[i], modalias_prefix, + sizeof(modalias_prefix) - 1)) { + i++; + continue; + } + + if (i != env->envp_idx - 1) + memmove(&env->envp[i], &env->envp[i + 1], + sizeof(env->envp[i]) * env->envp_idx - 1); + + env->envp_idx--; + } +} + /** * kobject_uevent_env - send an uevent with environmental data * @@ -409,16 +429,29 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, } } - /* - * Mark "add" and "remove" events in the object to ensure proper - * events to userspace during automatic cleanup. If the object did - * send an "add" event, "remove" will automatically generated by - * the core, if not already done by the caller. - */ - if (action == KOBJ_ADD) + switch (action) { + case KOBJ_ADD: + /* + * Mark "add" event so we can make sure we deliver "remove" + * event to userspace during automatic cleanup. If + * the object did send an "add" event, "remove" will + * automatically generated by the core, if not already done + * by the caller. + */ kobj->state_add_uevent_sent = 1; - else if (action == KOBJ_REMOVE) + break; + + case KOBJ_REMOVE: kobj->state_remove_uevent_sent = 1; + break; + + case KOBJ_UNBIND: + zap_modalias_env(env); + break; + + default: + break; + } mutex_lock(&uevent_sock_mutex); /* we will send an event, so request a new sequence number */ From 452562abb5b76c14449dead2a7113f641893e8bc Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 7 Sep 2017 15:16:05 +0100 Subject: [PATCH 0139/1322] base: arch_topology: fix section mismatch build warnings Commit 2ef7a2953c81 ("arm, arm64: factorize common cpu capacity default code") introduced init_cpu_capacity_callback and init_cpu_capacity_notifier which are referenced from initcall and are missing __init{,data} annotations resulting the below section mismatch build warnings. "WARNING: vmlinux.o(.text+0xbab790): Section mismatch in reference from the function init_cpu_capacity_callback() to the variable .init.text:$x The function init_cpu_capacity_callback() references the variable __init $x. This is often because init_cpu_capacity_callback lacks a __init annotation or the annotation of $x is wrong." This patch fixes the above build warnings by adding the required annotations. 
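The rule being applied is the usual __init/__initdata pairing; a minimal sketch of the pattern (illustrative names, not the arch_topology code):

static int boot_only_data __initdata;	/* freed once booting finishes */

/* If the referencing function is not itself __init, it stays resident after
 * the init sections are discarded while still pointing at them; modpost
 * reports exactly that as a section mismatch.  Annotating the function
 * (and the notifier/work objects that reference it) __init/__initdata puts
 * caller and callee in the same lifetime, which is what this patch does for
 * the cpu-capacity parsing path.
 */
static int __init boot_only_setup(void)
{
	return boot_only_data;
}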
Fixes: 2ef7a2953c81 ("arm, arm64: factorize common cpu capacity default code") Cc: Juri Lelli Cc: stable Signed-off-by: Sudeep Holla Signed-off-by: Greg Kroah-Hartman --- drivers/base/arch_topology.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 41be9ff7d70a..6df7d6676a48 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -166,11 +166,11 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) } #ifdef CONFIG_CPU_FREQ -static cpumask_var_t cpus_to_visit; -static void parsing_done_workfn(struct work_struct *work); -static DECLARE_WORK(parsing_done_work, parsing_done_workfn); +static cpumask_var_t cpus_to_visit __initdata; +static void __init parsing_done_workfn(struct work_struct *work); +static __initdata DECLARE_WORK(parsing_done_work, parsing_done_workfn); -static int +static int __init init_cpu_capacity_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -206,7 +206,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, return 0; } -static struct notifier_block init_cpu_capacity_notifier = { +static struct notifier_block init_cpu_capacity_notifier __initdata = { .notifier_call = init_cpu_capacity_callback, }; @@ -232,7 +232,7 @@ static int __init register_cpufreq_notifier(void) } core_initcall(register_cpufreq_notifier); -static void parsing_done_workfn(struct work_struct *work) +static void __init parsing_done_workfn(struct work_struct *work) { cpufreq_unregister_notifier(&init_cpu_capacity_notifier, CPUFREQ_POLICY_NOTIFIER); From 9821786d7c90eee2f6852261893d9703801aae47 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 18 Sep 2017 17:39:12 +0300 Subject: [PATCH 0140/1322] usb: xhci: Free the right ring in xhci_add_endpoint() In the xhci_add_endpoint(), a new ring was allocated and saved at xhci_virt_ep->new_ring. Hence, when error happens, we need to free the allocated ring before returning error. Current code frees xhci_virt_ep->ring instead of the new_ring. This patch fixes this. Cc: Signed-off-by: Lu Baolu Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index b2ff1ff1a02f..ee198ea47f49 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1703,7 +1703,8 @@ static int xhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, if (xhci->quirks & XHCI_MTK_HOST) { ret = xhci_mtk_add_ep_quirk(hcd, udev, ep); if (ret < 0) { - xhci_free_endpoint_ring(xhci, virt_dev, ep_index); + xhci_ring_free(xhci, virt_dev->eps[ep_index].new_ring); + virt_dev->eps[ep_index].new_ring = NULL; return ret; } } From 5a838a13c9b4e5dd188b7a6eaeb894e9358ead0c Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:13 +0300 Subject: [PATCH 0141/1322] xhci: fix finding correct bus_state structure for USB 3.1 hosts xhci driver keeps a bus_state structure for each hcd (usb2 and usb3) The structure is picked based on hcd speed, but driver only compared for HCD_USB3 speed, returning the wrong bus_state for HCD_USB31 hosts. This caused null pointer dereference errors in bus_resume function. 
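The one-line fix relies on the ordering of the speed constants; a compact sketch of the selection (assuming the include/linux/usb/hcd.h encoding, in which HCD_USB31 is a larger value than HCD_USB3):

	/* bus_state[0] serves the USB3/USB3.1 roothub, bus_state[1] the USB2 one */
	if (hcd->speed >= HCD_USB3)	/* true for HCD_USB3 and HCD_USB31 */
		return 0;
	return 1;			/* the old '== HCD_USB3' sent USB 3.1 hosts here */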
Cc: Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 2abaa4d6d39d..e96e90d0fe9f 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1681,7 +1681,7 @@ struct xhci_bus_state { static inline unsigned int hcd_index(struct usb_hcd *hcd) { - if (hcd->speed == HCD_USB3) + if (hcd->speed >= HCD_USB3) return 0; else return 1; From 114ec3a6f9096d211a4aff4277793ba969a62c73 Mon Sep 17 00:00:00 2001 From: Jim Dickerson Date: Mon, 18 Sep 2017 17:39:14 +0300 Subject: [PATCH 0142/1322] usb: pci-quirks.c: Corrected timeout values used in handshake Servers were emitting failed handoff messages but were not waiting the full 1 second as designated in section 4.22.1 of the eXtensible Host Controller Interface specifications. The handshake was using wrong units so calls were made with milliseconds not microseconds. Comments referenced 5 seconds not 1 second as in specs. The wrong units were also corrected in a second handshake call. Cc: Signed-off-by: Jim Dickerson Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 658d9d1f9ea3..e02dbb1e1cdd 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -1022,7 +1022,7 @@ EXPORT_SYMBOL_GPL(usb_disable_xhci_ports); * * Takes care of the handoff between the Pre-OS (i.e. BIOS) and the OS. * It signals to the BIOS that the OS wants control of the host controller, - * and then waits 5 seconds for the BIOS to hand over control. + * and then waits 1 second for the BIOS to hand over control. * If we timeout, assume the BIOS is broken and take control anyway. */ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) @@ -1069,9 +1069,9 @@ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) if (val & XHCI_HC_BIOS_OWNED) { writel(val | XHCI_HC_OS_OWNED, base + ext_cap_offset); - /* Wait for 5 seconds with 10 microsecond polling interval */ + /* Wait for 1 second with 10 microsecond polling interval */ timeout = handshake(base + ext_cap_offset, XHCI_HC_BIOS_OWNED, - 0, 5000, 10); + 0, 1000000, 10); /* Assume a buggy BIOS and take HC ownership anyway */ if (timeout) { @@ -1100,7 +1100,7 @@ hc_init: * operational or runtime registers. Wait 5 seconds and no more. */ timeout = handshake(op_reg_base + XHCI_STS_OFFSET, XHCI_STS_CNR, 0, - 5000, 10); + 5000000, 10); /* Assume a buggy HC and start HC initialization anyway */ if (timeout) { val = readl(op_reg_base + XHCI_STS_OFFSET); From 76a14d7bf92960aac2f5450bfd23783bfa618be9 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:15 +0300 Subject: [PATCH 0143/1322] xhci: fix wrong endpoint ESIT value shown in tracing Read the endpiont ESIT from endpiont context using correct macro. 
Add a macro for reading the high bits of ESIT for Large ESIT Payload Capable hosts (LEC=1) Cc: # 4.12 Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index e96e90d0fe9f..7c87189e8110 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -735,6 +735,8 @@ struct xhci_ep_ctx { #define EP_MAXPSTREAMS(p) (((p) << 10) & EP_MAXPSTREAMS_MASK) /* Endpoint is set up with a Linear Stream Array (vs. Secondary Stream Array) */ #define EP_HAS_LSA (1 << 15) +/* hosts with LEC=1 use bits 31:24 as ESIT high bits. */ +#define CTX_TO_MAX_ESIT_PAYLOAD_HI(p) (((p) >> 24) & 0xff) /* ep_info2 bitmasks */ /* @@ -2540,8 +2542,8 @@ static inline const char *xhci_decode_ep_context(u32 info, u32 info2, u64 deq, u8 lsa; u8 hid; - esit = EP_MAX_ESIT_PAYLOAD_HI(info) << 16 | - EP_MAX_ESIT_PAYLOAD_LO(tx_info); + esit = CTX_TO_MAX_ESIT_PAYLOAD_HI(info) << 16 | + CTX_TO_MAX_ESIT_PAYLOAD(tx_info); ep_state = info & EP_STATE_MASK; max_pstr = info & EP_MAXPSTREAMS_MASK; From c6b8e79306f515b5483eb11076e0fbfc140434a8 Mon Sep 17 00:00:00 2001 From: Adam Wallis Date: Mon, 18 Sep 2017 17:39:16 +0300 Subject: [PATCH 0144/1322] usb: host: xhci-plat: allow sysdev to inherit from ACPI Commit 4c39d4b949d3 ("usb: xhci: use bus->sysdev for DMA configuration") updated the method determining DMA for XHCI from sysdev. However, this patch broke the ability to enumerate the FWNODE from parent ACPI devices from the child plat XHCI device. Currently, xhci_plat is not set up properly when the parent device is an ACPI node. The conditions that xhci_plat_probe should satisfy are 1. xhci_plat comes from firmware 2. xhci_plat is child of a device from firmware (dwc3-plat) 3. xhci_plat is grandchild of a pci device (dwc3-pci) Case 2 is covered when the child is an OF node (by checking sysdev->parent->of_node), however, an ACPI parent will return NULL in the of_node check and will thus not result in sysdev being set to sysdev->parent [ 17.591549] xhci-hcd: probe of xhci-hcd.6.auto failed with error -5 This change adds a check for ACPI to completely allow for condition 2. This is done by first checking if the parent node is of type ACPI (e.g., dwc3-plat) and set sysdev to sysdev->parent if either of the two following conditions are met: 1: If fwnode is empty (in the case that platform_device_add_properties was not called on the allocated platform device) 2: fwnode exists but is not of type ACPI (this would happen if platform_device_add_properties was called on the allocated device. Instead of type FWNODE_ACPI, you would end up with FWNODE_PDATA) Cc: stable@vger.kernel.org #4.12.x Cc: stable@vger.kernel.org #4.13.x Fixes: 4c39d4b949d3 ("usb: xhci: use bus->sysdev for DMA configuration") Tested-by: Thang Q. Nguyen Signed-off-by: Adam Wallis Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-plat.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index 163bafde709f..1cb6eaef4ae1 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -178,14 +178,18 @@ static int xhci_plat_probe(struct platform_device *pdev) * 2. xhci_plat is child of a device from firmware (dwc3-plat) * 3. 
xhci_plat is grandchild of a pci device (dwc3-pci) */ - sysdev = &pdev->dev; - if (sysdev->parent && !sysdev->of_node && sysdev->parent->of_node) - sysdev = sysdev->parent; + for (sysdev = &pdev->dev; sysdev; sysdev = sysdev->parent) { + if (is_of_node(sysdev->fwnode) || + is_acpi_device_node(sysdev->fwnode)) + break; #ifdef CONFIG_PCI - else if (sysdev->parent && sysdev->parent->parent && - sysdev->parent->parent->bus == &pci_bus_type) - sysdev = sysdev->parent->parent; + else if (sysdev->bus == &pci_bus_type) + break; #endif + } + + if (!sysdev) + sysdev = &pdev->dev; /* Try to set 64-bit DMA first */ if (WARN_ON(!sysdev->dma_mask)) From 4ec1cd3eeeee7ccc35681270da028dbc29ca7bbd Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:17 +0300 Subject: [PATCH 0145/1322] xhci: Fix sleeping with spin_lock_irq() held in ASmedia 1042A workaround The flow control workaround for ASM1042A xHC hosts sleeps between register polling. The workaround gets called in several places, among them with spin_lock_irq() held when xHC host is resumed or hoplug removed. This was noticed as kernel panics at resume on a Dell XPS15 9550 with TB16 thunderbolt dock. Avoid sleeping with spin_lock_irq() held, use udelay() instead The original workaround was added to 4.9 and 4.12 stable releases, this patch needs to be applied to those as well. Fixes: 9da5a1092b13 ("xhci: Bad Ethernet performance plugged in ASM1042A host") Cc: #4.9+ Reported-by: Jose Marino Tested-by: Jose Marino Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index e02dbb1e1cdd..6dda3623a276 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -447,7 +447,7 @@ static int usb_asmedia_wait_write(struct pci_dev *pdev) if ((value & ASMT_CONTROL_WRITE_BIT) == 0) return 0; - usleep_range(40, 60); + udelay(50); } dev_warn(&pdev->dev, "%s: check_write_ready timeout", __func__); From 7bea22b124d77845c85a62eaa29a85ba6cc2f899 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:18 +0300 Subject: [PATCH 0146/1322] xhci: set missing SuperSpeedPlus Link Protocol bit in roothub descriptor A SuperSpeedPlus roothub needs to have the Link Protocol (LP) bit set in the bmSublinkSpeedAttr[] entry of a SuperSpeedPlus descriptor. If the xhci controller has an optional Protocol Speed ID (PSI) table then that will be used as a base to create the roothub SuperSpeedPlus descriptor. The PSI table does not however necessary contain the LP bit so we need to set it manually. Check the psi speed and set LP bit if speed is 10Gbps or higher. 
We're not setting it for 5 to 10Gbps as USB 3.1 specification always mention SuperSpeedPlus for 10Gbps or higher, and some SSIC USB 3.0 speeds can be over 5Gbps, such as SSIC-G3B-L1 at 5830 Mbps Cc: # 4.6+ Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index ad89a6d4111b..4dba2cf73cbe 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -112,7 +112,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, /* If PSI table exists, add the custom speed attributes from it */ if (usb3_1 && xhci->usb3_rhub.psi_count) { - u32 ssp_cap_base, bm_attrib, psi; + u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp; int offset; ssp_cap_base = USB_DT_BOS_SIZE + USB_DT_USB_SS_CAP_SIZE; @@ -139,6 +139,15 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, for (i = 0; i < xhci->usb3_rhub.psi_count; i++) { psi = xhci->usb3_rhub.psi[i]; psi &= ~USB_SSP_SUBLINK_SPEED_RSVD; + psi_exp = XHCI_EXT_PORT_PSIE(psi); + psi_mant = XHCI_EXT_PORT_PSIM(psi); + + /* Shift to Gbps and set SSP Link BIT(14) if 10Gpbs */ + for (; psi_exp < 3; psi_exp++) + psi_mant /= 1000; + if (psi_mant >= 10) + psi |= BIT(14); + if ((psi & PLT_MASK) == PLT_SYM) { /* Symmetric, create SSA RX and TX from one PSI entry */ put_unaligned_le32(psi, &buf[offset]); From bcd6a7aa13800afc1418e6b29d944d882214939a Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 18 Sep 2017 17:39:19 +0300 Subject: [PATCH 0147/1322] Revert "xhci: Limit USB2 port wake support for AMD Promontory hosts" This reverts commit dec08194ffeccfa1cf085906b53d301930eae18f. Commit dec08194ffec ("xhci: Limit USB2 port wake support for AMD Promontory hosts") makes all high speed USB ports on ASUS PRIME B350M-A cease to function after enabling runtime PM. All boards with this chipsets will be affected, so revert the commit. 
The original patch was added to stable 4.9, 4.11 and 4.12 and needs to reverted from there as well Cc: # 4.9+ Signed-off-by: Kai-Heng Feng Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 3 --- drivers/usb/host/xhci-pci.c | 12 ------------ drivers/usb/host/xhci.h | 2 +- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 4dba2cf73cbe..da9158f171cb 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1515,9 +1515,6 @@ int xhci_bus_suspend(struct usb_hcd *hcd) t2 |= PORT_WKOC_E | PORT_WKCONN_E; t2 &= ~PORT_WKDISC_E; } - if ((xhci->quirks & XHCI_U2_DISABLE_WAKE) && - (hcd->speed < HCD_USB3)) - t2 &= ~PORT_WAKE_BITS; } else t2 &= ~PORT_WAKE_BITS; diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 8071c8fdd15e..76f392954733 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -54,11 +54,6 @@ #define PCI_DEVICE_ID_INTEL_APL_XHCI 0x5aa8 #define PCI_DEVICE_ID_INTEL_DNV_XHCI 0x19d0 -#define PCI_DEVICE_ID_AMD_PROMONTORYA_4 0x43b9 -#define PCI_DEVICE_ID_AMD_PROMONTORYA_3 0x43ba -#define PCI_DEVICE_ID_AMD_PROMONTORYA_2 0x43bb -#define PCI_DEVICE_ID_AMD_PROMONTORYA_1 0x43bc - #define PCI_DEVICE_ID_ASMEDIA_1042A_XHCI 0x1142 static const char hcd_name[] = "xhci_hcd"; @@ -142,13 +137,6 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) if (pdev->vendor == PCI_VENDOR_ID_AMD) xhci->quirks |= XHCI_TRUST_TX_LENGTH; - if ((pdev->vendor == PCI_VENDOR_ID_AMD) && - ((pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_4) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_3) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_2) || - (pdev->device == PCI_DEVICE_ID_AMD_PROMONTORYA_1))) - xhci->quirks |= XHCI_U2_DISABLE_WAKE; - if (pdev->vendor == PCI_VENDOR_ID_INTEL) { xhci->quirks |= XHCI_LPM_SUPPORT; xhci->quirks |= XHCI_INTEL_HOST; diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 7c87189e8110..2b48aa4f6b76 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1828,7 +1828,7 @@ struct xhci_hcd { /* For controller with a broken Port Disable implementation */ #define XHCI_BROKEN_PORT_PED (1 << 25) #define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26) -#define XHCI_U2_DISABLE_WAKE (1 << 27) +/* Reserved. It was XHCI_U2_DISABLE_WAKE */ #define XHCI_ASMEDIA_MODIFY_FLOWCONTROL (1 << 28) unsigned int num_active_eps; From bf563b01c2895a4bfd1a29cc5abc67fe706ecffd Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:42 +0200 Subject: [PATCH 0148/1322] driver core: platform: Don't read past the end of "driver_override" buffer When printing the driver_override parameter when it is 4095 and 4094 bytes long, the printing code would access invalid memory because we need count+1 bytes for printing. Reject driver_override values of these lengths in driver_override_store(). This is in close analogy to commit 4efe874aace5 ("PCI: Don't read past the end of sysfs "driver_override" buffer") from Sasha Levin. 
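For context, the show side is what imposes the limit; it does roughly the following (paraphrased, treat the details as illustrative):

	/* driver_override_show() formats into a single sysfs page: */
	len = sprintf(buf, "%s\n", pdev->driver_override);

With buf being one PAGE_SIZE page, the stored string, the appended '\n' and the terminating NUL all have to fit, so the store side below now rejects any write of PAGE_SIZE - 1 bytes or more instead of only rejecting writes larger than PATH_MAX.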
Fixes: 3d713e0e382e ("driver core: platform: add device binding path 'driver_override'") Cc: stable@vger.kernel.org # v3.17+ Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index d1bd99271066..9045c5f3734e 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -868,7 +868,8 @@ static ssize_t driver_override_store(struct device *dev, struct platform_device *pdev = to_platform_device(dev); char *driver_override, *old, *cp; - if (count > PATH_MAX) + /* We need to keep extra room for a newline */ + if (count >= (PAGE_SIZE - 1)) return -EINVAL; driver_override = kstrndup(buf, count, GFP_KERNEL); From b0ade85165b3caeb0cd908cffe5921a39f25c243 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 10 Sep 2017 13:41:41 +0200 Subject: [PATCH 0149/1322] netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If no spinlock debugging options (CONFIG_GENERIC_LOCKBREAK, CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_LOCK_ALLOC) are enabled on a UP platform (e.g. m68k defconfig), arch_spinlock_t is an empty struct, hence using ARRAY_SIZE(nf_nat_locks) causes a division by zero: net/netfilter/nf_nat_core.c: In function ‘nf_nat_setup_info’: net/netfilter/nf_nat_core.c:432: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘__nf_nat_cleanup_conntrack’: net/netfilter/nf_nat_core.c:535: warning: division by zero net/netfilter/nf_nat_core.c:537: warning: division by zero net/netfilter/nf_nat_core.c: In function ‘nf_nat_init’: net/netfilter/nf_nat_core.c:810: warning: division by zero net/netfilter/nf_nat_core.c:811: warning: division by zero net/netfilter/nf_nat_core.c:824: warning: division by zero Fix this by using the CONNTRACK_LOCKS definition instead. Suggested-by: Florian Westphal Fixes: 8073e960a03bf7b5 ("netfilter: nat: use keyed locks") Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f393a7086025..af8345fc4fbd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct, srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); @@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) unsigned int h; h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) @@ -807,8 +807,8 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. 
*/ nf_nat_htable_size = nf_conntrack_htable_size; - if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) - nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + if (nf_nat_htable_size < CONNTRACK_LOCKS) + nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -821,7 +821,7 @@ static int __init nf_nat_init(void) return ret; } - for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); nf_ct_helper_expectfn_register(&follow_master_nat); From 7f4f7dd4417d9efd038b14d39c70170db2e0baa0 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 11 Sep 2017 21:52:40 +0200 Subject: [PATCH 0150/1322] netfilter: ipset: ipset list may return wrong member count for set with timeout Simple testcase: $ ipset create test hash:ip timeout 5 $ ipset add test 1.2.3.4 $ ipset add test 1.2.2.2 $ sleep 5 $ ipset l Name: test Type: hash:ip Revision: 5 Header: family inet hashsize 1024 maxelem 65536 timeout 5 Size in memory: 296 References: 0 Number of entries: 2 Members: We return "Number of entries: 2" but no members are listed. That is because mtype_list runs "ip_set_timeout_expired" and does not list the expired entries, but set->elements is never upated (until mtype_gc cleans it up later). Reviewed-by: Joshua Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f236c0bc7b3f..51063d9ed0f7 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1041,12 +1041,24 @@ out: static int mtype_head(struct ip_set *set, struct sk_buff *skb) { - const struct htype *h = set->data; + struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u8 htable_bits; + /* If any members have expired, set->elements will be wrong + * mytype_expire function will update it with the right count. + * we do not hold set->lock here, so grab it first. + * set->elements can still be incorrect in the case of a huge set, + * because elements might time out during the listing. + */ + if (SET_WITH_TIMEOUT(set)) { + spin_lock_bh(&set->lock); + mtype_expire(set, h); + spin_unlock_bh(&set->lock); + } + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t) + set->ext_size; From 432219fd002ada95d2f45dd3a25c3f7c73f27864 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 2 Sep 2017 01:15:13 +0300 Subject: [PATCH 0151/1322] serial: sh-sci: document R8A77970 bindings R-Car V3M (R8A77970) SoC also has the R-Car gen3 compatible SCIF and HSCIF ports, so document the SoC specific bindings. Signed-off-by: Sergei Shtylyov Acked-by: Rob Herring Acked-by: Geert Uytterhoeven Acked-by: Simon Horman Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/renesas,sci-serial.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt index 4fc96946f81d..cf504d0380ae 100644 --- a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt +++ b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt @@ -41,6 +41,8 @@ Required properties: - "renesas,hscif-r8a7795" for R8A7795 (R-Car H3) HSCIF compatible UART. 
- "renesas,scif-r8a7796" for R8A7796 (R-Car M3-W) SCIF compatible UART. - "renesas,hscif-r8a7796" for R8A7796 (R-Car M3-W) HSCIF compatible UART. + - "renesas,scif-r8a77970" for R8A77970 (R-Car V3M) SCIF compatible UART. + - "renesas,hscif-r8a77970" for R8A77970 (R-Car V3M) HSCIF compatible UART. - "renesas,scif-r8a77995" for R8A77995 (R-Car D3) SCIF compatible UART. - "renesas,hscif-r8a77995" for R8A77995 (R-Car D3) HSCIF compatible UART. - "renesas,scifa-sh73a0" for SH73A0 (SH-Mobile AG5) SCIFA compatible UART. From 104583b504da8a3ad86d033b497cb6e58941a9a1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 12 Sep 2017 12:39:54 +0200 Subject: [PATCH 0152/1322] mxser: fix timeout calculation for low rates Paul reported, that low rates like B300 make the driver to hang in mxser_wait_until_sent. His debugging tackled the issue down to the info->timeout computation in mxser_set_baud. Obviously, ints are used there and they easily overflow with these low rates: B300 makes info->timeout to be -373. So switch all these types to unsigned as it ought to be. And use the u64 domain to perform the computation as in the worst case, we need 35 bits to store the computed value (before division). And use do_div not to break 32 bit kernels. [v2] make it actually build Signed-off-by: Jiri Slaby Cc: Paul Tested-by: Signed-off-by: Greg Kroah-Hartman --- drivers/tty/mxser.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 1c0c9553bc05..7dd38047ba23 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -246,11 +246,11 @@ struct mxser_port { unsigned char err_shadow; struct async_icount icount; /* kernel counters for 4 input interrupts */ - int timeout; + unsigned int timeout; int read_status_mask; int ignore_status_mask; - int xmit_fifo_size; + unsigned int xmit_fifo_size; int xmit_head; int xmit_tail; int xmit_cnt; @@ -572,8 +572,9 @@ static void mxser_dtr_rts(struct tty_port *port, int on) static int mxser_set_baud(struct tty_struct *tty, long newspd) { struct mxser_port *info = tty->driver_data; - int quot = 0, baud; + unsigned int quot = 0, baud; unsigned char cval; + u64 timeout; if (!info->ioaddr) return -1; @@ -594,8 +595,13 @@ static int mxser_set_baud(struct tty_struct *tty, long newspd) quot = 0; } - info->timeout = ((info->xmit_fifo_size * HZ * 10 * quot) / info->baud_base); - info->timeout += HZ / 50; /* Add .02 seconds of slop */ + /* + * worst case (128 * 1000 * 10 * 18432) needs 35 bits, so divide in the + * u64 domain + */ + timeout = (u64)info->xmit_fifo_size * HZ * 10 * quot; + do_div(timeout, info->baud_base); + info->timeout = timeout + HZ / 50; /* Add .02 seconds of slop */ if (quot) { info->MCR |= UART_MCR_DTR; From 0e5ec4140c4a8a38c4ff7293c018eb7da69a30db Mon Sep 17 00:00:00 2001 From: Russell Enderby Date: Tue, 5 Sep 2017 12:16:47 -0500 Subject: [PATCH 0153/1322] serial: bcm63xx: fix timing issue. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue where unprintable characters can occur or output is cut off over the serial uart / linux console depending on timing. Problem occurs when changing the serial baud rate when setting up the new console.The bcm63xx driver does a disable and flush of the uart tx fifo while there is data still in the tx fifo. If the tx fifo still has data it is trying to send out, we need to wait until it is empty before disabling and flushing the uart. 
When we now go to change the uart parameters including speed we check if there is data currently in the tx fifo.If there is was mdelay(10) and check again.If it tries 3 times and still has data in it we just continue and sacrifice the tx fifo buffer. A cleaner and more preferred approach would be to remove : - spin_lock_irqsave() - bcm_uart_disable() - bcm_uart_flush() However it is not clear if the author put those in to fix another underlying issue.As a result this solution is a safer approach. Output before the fix: [0.306000] 14e00520.serial: ttyS0 at MMIO 0x14e00520 (irq = 9, base_baud = 1687500) is a° 0.315000] console[ttyS0] enabled Output verified after the fix: [0.315000] 14e00520.serial: ttyS0 at MMIO 0x14e00520 (irq = 9, base_baud = 1687500) is a bcm63xx_uart [0.334000] console[ttyS0] enabled Signed-off-by: Russell Enderby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/bcm63xx_uart.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c index 583c9a0c7ecc..8c48c3784831 100644 --- a/drivers/tty/serial/bcm63xx_uart.c +++ b/drivers/tty/serial/bcm63xx_uart.c @@ -507,9 +507,14 @@ static void bcm_uart_set_termios(struct uart_port *port, { unsigned int ctl, baud, quot, ier; unsigned long flags; + int tries; spin_lock_irqsave(&port->lock, flags); + /* Drain the hot tub fully before we power it off for the winter. */ + for (tries = 3; !bcm_uart_tx_empty(port) && tries; tries--) + mdelay(10); + /* disable uart while changing speed */ bcm_uart_disable(port); bcm_uart_flush(port); From 9d7ee0e28da59b05647c3d2a7ad4076c16b1a6ef Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 4 Sep 2017 19:20:24 +0800 Subject: [PATCH 0154/1322] tty: serial: lpuart: avoid report NULL interrupt The current driver register irq in .startup() and free the irq in .shutdown(), then user will see the NULL interrupt output from 'cat /proc/interrupts' after the uart port test completed: ... 41: 515 0 0 0 GICv3 257 Level fsl-lpuart 42: 2 0 0 0 GICv3 258 Level ... It is better to register all the irqs during probe function via devm_request_irq() to avoid to call free_irq(). 
Signed-off-by: Fugang Duan Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 40 ++++++++++++++------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 849c1f9991ce..f0252184291e 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -1276,7 +1276,6 @@ static void rx_dma_timer_init(struct lpuart_port *sport) static int lpuart_startup(struct uart_port *port) { struct lpuart_port *sport = container_of(port, struct lpuart_port, port); - int ret; unsigned long flags; unsigned char temp; @@ -1291,11 +1290,6 @@ static int lpuart_startup(struct uart_port *port) sport->rxfifo_size = 0x1 << (((temp >> UARTPFIFO_RXSIZE_OFF) & UARTPFIFO_FIFOSIZE_MASK) + 1); - ret = devm_request_irq(port->dev, port->irq, lpuart_int, 0, - DRIVER_NAME, sport); - if (ret) - return ret; - spin_lock_irqsave(&sport->port.lock, flags); lpuart_setup_watermark(sport); @@ -1333,7 +1327,6 @@ static int lpuart_startup(struct uart_port *port) static int lpuart32_startup(struct uart_port *port) { struct lpuart_port *sport = container_of(port, struct lpuart_port, port); - int ret; unsigned long flags; unsigned long temp; @@ -1346,11 +1339,6 @@ static int lpuart32_startup(struct uart_port *port) sport->rxfifo_size = 0x1 << (((temp >> UARTFIFO_RXSIZE_OFF) & UARTFIFO_FIFOSIZE_MASK) - 1); - ret = devm_request_irq(port->dev, port->irq, lpuart32_int, 0, - DRIVER_NAME, sport); - if (ret) - return ret; - spin_lock_irqsave(&sport->port.lock, flags); lpuart32_setup_watermark(sport); @@ -1380,8 +1368,6 @@ static void lpuart_shutdown(struct uart_port *port) spin_unlock_irqrestore(&port->lock, flags); - devm_free_irq(port->dev, port->irq, sport); - if (sport->lpuart_dma_rx_use) { del_timer_sync(&sport->lpuart_timer); lpuart_dma_rx_free(&sport->port); @@ -1400,7 +1386,6 @@ static void lpuart_shutdown(struct uart_port *port) static void lpuart32_shutdown(struct uart_port *port) { - struct lpuart_port *sport = container_of(port, struct lpuart_port, port); unsigned long temp; unsigned long flags; @@ -1413,8 +1398,6 @@ static void lpuart32_shutdown(struct uart_port *port) lpuart32_write(port, temp, UARTCTRL); spin_unlock_irqrestore(&port->lock, flags); - - devm_free_irq(port->dev, port->irq, sport); } static void @@ -2212,16 +2195,22 @@ static int lpuart_probe(struct platform_device *pdev) platform_set_drvdata(pdev, &sport->port); - if (lpuart_is_32(sport)) + if (lpuart_is_32(sport)) { lpuart_reg.cons = LPUART32_CONSOLE; - else + ret = devm_request_irq(&pdev->dev, sport->port.irq, lpuart32_int, 0, + DRIVER_NAME, sport); + } else { lpuart_reg.cons = LPUART_CONSOLE; + ret = devm_request_irq(&pdev->dev, sport->port.irq, lpuart_int, 0, + DRIVER_NAME, sport); + } + + if (ret) + goto failed_irq_request; ret = uart_add_one_port(&lpuart_reg, &sport->port); - if (ret) { - clk_disable_unprepare(sport->clk); - return ret; - } + if (ret) + goto failed_attach_port; sport->dma_tx_chan = dma_request_slave_channel(sport->port.dev, "tx"); if (!sport->dma_tx_chan) @@ -2240,6 +2229,11 @@ static int lpuart_probe(struct platform_device *pdev) } return 0; + +failed_attach_port: +failed_irq_request: + clk_disable_unprepare(sport->clk); + return ret; } static int lpuart_remove(struct platform_device *pdev) From c91261437985d481c472639d4397931d77f5d4e9 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 2 Sep 2017 23:13:55 +0300 Subject: [PATCH 0155/1322] serial: sccnxp: Fix error handling in sccnxp_probe() 
sccnxp_probe() returns result of regulator_disable() that may lead to returning zero, while device is not properly initialized. Also the driver enables clocks, but it does not disable it. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sccnxp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/sccnxp.c b/drivers/tty/serial/sccnxp.c index cdd2f942317c..b9c7a904c1ea 100644 --- a/drivers/tty/serial/sccnxp.c +++ b/drivers/tty/serial/sccnxp.c @@ -889,7 +889,16 @@ static int sccnxp_probe(struct platform_device *pdev) goto err_out; uartclk = 0; } else { - clk_prepare_enable(clk); + ret = clk_prepare_enable(clk); + if (ret) + goto err_out; + + ret = devm_add_action_or_reset(&pdev->dev, + (void(*)(void *))clk_disable_unprepare, + clk); + if (ret) + goto err_out; + uartclk = clk_get_rate(clk); } @@ -988,7 +997,7 @@ static int sccnxp_probe(struct platform_device *pdev) uart_unregister_driver(&s->uart); err_out: if (!IS_ERR(s->regulator)) - return regulator_disable(s->regulator); + regulator_disable(s->regulator); return ret; } From 64cfcaed7b25f69d8b7a091a23961f50c6788a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20D=C3=ADaz?= Date: Fri, 7 Jul 2017 10:27:06 -0500 Subject: [PATCH 0156/1322] selftests: net: More graceful finding of `ip'. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ip tool might be provided by another package (such as Busybox), not necessarily implementing the -Version switch. Trying an actual usage (`ip link show') might be a better test that would work with all implementations of `ip'. Signed-off-by: Daniel Díaz Signed-off-by: Shuah Khan --- tools/testing/selftests/net/netdevice.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh index 4e00568d70c2..90cb903c3381 100755 --- a/tools/testing/selftests/net/netdevice.sh +++ b/tools/testing/selftests/net/netdevice.sh @@ -178,7 +178,7 @@ if [ "$(id -u)" -ne 0 ];then exit 0 fi -ip -Version 2>/dev/null >/dev/null +ip link show 2>/dev/null >/dev/null if [ $? -ne 0 ];then echo "SKIP: Could not run test without the ip tool" exit 0 From 96e5335859e30fa2bb2da8befc86292c3c1a73dd Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 11 Sep 2017 12:49:01 +0200 Subject: [PATCH 0157/1322] tools: fix testing/selftests/sigaltstack for s390x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On s390x the compilation of the file sas.c in directory tools/testing/selftests/sigaltstack fails with this error message: root@s35lp76 testing]# make selftests/sigaltstack/sas cc selftests/sigaltstack/sas.c -o selftests/sigaltstack/sas selftests/sigaltstack/sas.c: In function ‘my_usr1’: selftests/sigaltstack/sas.c:42:25: error: invalid register name for ‘sp’ register unsigned long sp asm("sp"); ^~ : recipe for target 'selftests/sigaltstack/sas' failed make: *** [selftests/sigaltstack/sas] Error 1 [root@s35lp76 testing]# On s390x the stack pointer is register r15, the register name "sp" is unknown. Make this line platform dependend and use register r15. 
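A stand-alone sketch of the platform-conditional register variable follows (illustrative only; it mirrors the idea of the fix rather than the selftest source, and the printf is just there to keep the example self-contained):

#include <stdio.h>

int main(void)
{
	/* On s390x the stack pointer is general register %r15; most other
	 * architectures accept the symbolic register name "sp".
	 */
#if defined(__s390x__)
	register unsigned long sp asm("%15");
#else
	register unsigned long sp asm("sp");
#endif

	printf("approximate stack pointer: %#lx\n", sp);
	return 0;
}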
With this patch the compilation and test succeeds: [root@s35lp76 testing]# ./selftests/sigaltstack/sas TAP version 13 ok 1 Initial sigaltstack state was SS_DISABLE # [RUN] signal USR1 ok 2 sigaltstack is disabled in sighandler # [RUN] switched to user ctx # [RUN] signal USR2 # [OK] Stack preserved ok 3 sigaltstack is still SS_AUTODISARM after signal Pass 3 Fail 0 Xfail 0 Xpass 0 Skip 0 Error 0 1..3 [root@s35lp76 testing]# Signed-off-by: Thomas Richter Signed-off-by: Shuah Khan --- tools/testing/selftests/sigaltstack/sas.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c index 7d406c3973ba..97bb150837df 100644 --- a/tools/testing/selftests/sigaltstack/sas.c +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -39,7 +39,11 @@ void my_usr1(int sig, siginfo_t *si, void *u) stack_t stk; struct stk_data *p; +#if __s390x__ + register unsigned long sp asm("%15"); +#else register unsigned long sp asm("sp"); +#endif if (sp < (unsigned long)sstack || sp >= (unsigned long)sstack + SIGSTKSZ) { From 172a8ca880f1c8b01bc4d1e0239bcb293ef65e0b Mon Sep 17 00:00:00 2001 From: Fathi Boudra Date: Thu, 29 Jun 2017 12:39:53 +0300 Subject: [PATCH 0158/1322] selftests: breakpoints: re-order TEST_GEN_PROGS targets breakpoint_test can fail on arm64 with older/unpatched glibc: breakpoint_test_arm64.c: In function 'run_test': breakpoint_test_arm64.c:170:25: error: 'TRAP_HWBKPT' undeclared (first use in this function) due to glibc missing several of the TRAP_* constants in the userspace definitions. Specifically TRAP_BRANCH and TRAP_HWBKPT. See https://sourceware.org/bugzilla/show_bug.cgi?id=21286 It prevents to build step_after_suspend_test afterward, since make won't continue. We still want to be able to build and run the test, independently of breakpoint_test_arm64 build failure. Re-order TEST_GEN_PROGS to be able to build step_after_suspend_test first. Signed-off-by: Fathi Boudra Signed-off-by: Shuah Khan --- tools/testing/selftests/breakpoints/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile index 6b214b7b10fb..247b0a1899d7 100644 --- a/tools/testing/selftests/breakpoints/Makefile +++ b/tools/testing/selftests/breakpoints/Makefile @@ -2,14 +2,14 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) +TEST_GEN_PROGS := step_after_suspend_test + ifeq ($(ARCH),x86) -TEST_GEN_PROGS := breakpoint_test +TEST_GEN_PROGS += breakpoint_test endif ifneq (,$(filter $(ARCH),aarch64 arm64)) -TEST_GEN_PROGS := breakpoint_test_arm64 +TEST_GEN_PROGS += breakpoint_test_arm64 endif -TEST_GEN_PROGS += step_after_suspend_test - include ../lib.mk From 67b2e30eb7b346db60afe91b0927738fb604d0a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20D=C3=ADaz?= Date: Thu, 17 Aug 2017 10:55:30 -0500 Subject: [PATCH 0159/1322] selftests: intel_pstate: build only on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These tests are only for x86, so don't try to build or run them on other architectures. 
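The run-time half of that guard can be sketched as a small shell check of the same shape (the exact skip message and the sed normalisation here are assumptions modelled on the diff below, not copied from it):

#!/bin/sh
# Skip gracefully when this is not an x86 machine.
arch=$(uname -m | sed -e 's/i.86/x86/' -e 's/x86_64/x86/')
if [ "$arch" != "x86" ]; then
	echo "$0 # Skipped: test can only run on x86 architectures."
	exit 0
fi
# ...the actual test body would continue here on x86...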
Signed-off-by: Daniel Díaz Signed-off-by: Shuah Khan --- tools/testing/selftests/intel_pstate/Makefile | 2 ++ tools/testing/selftests/intel_pstate/run.sh | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile index 849a90ffe8dd..a97e24edde39 100644 --- a/tools/testing/selftests/intel_pstate/Makefile +++ b/tools/testing/selftests/intel_pstate/Makefile @@ -1,7 +1,9 @@ CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE LDLIBS := $(LDLIBS) -lm +ifeq (,$(filter $(ARCH),x86)) TEST_GEN_FILES := msr aperf +endif TEST_PROGS := run.sh diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh index 7868c106b8b1..1b4b8302dfc2 100755 --- a/tools/testing/selftests/intel_pstate/run.sh +++ b/tools/testing/selftests/intel_pstate/run.sh @@ -29,6 +29,11 @@ EVALUATE_ONLY=0 +if ! uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ | grep -q x86; then + echo "$0 # Skipped: Test can only run on x86 architectures." + exit 0 +fi + max_cpus=$(($(nproc)-1)) # compile programs From 6f0003363a13be699ba945cfa4074193e04fbea5 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:18 +0200 Subject: [PATCH 0160/1322] selftests/intel_pstate: No need to compile test progs in the run script Both test programs are being compiled by make, so no need to compile both programs in the runner script. This resolves an error when installing all selftests via make install and run them in a different environemnt. Running tests in intel_pstate ======================================== ./run.sh: line 35: gcc: command not found Problem compiling aperf.c. Signed-off-by: Thomas Meyer Signed-off-by: Shuah Khan --- tools/testing/selftests/intel_pstate/run.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh index 1b4b8302dfc2..d3ab48f91cd6 100755 --- a/tools/testing/selftests/intel_pstate/run.sh +++ b/tools/testing/selftests/intel_pstate/run.sh @@ -36,12 +36,6 @@ fi max_cpus=$(($(nproc)-1)) -# compile programs -gcc aperf.c -Wall -D_GNU_SOURCE -o aperf -lm -[ $? -ne 0 ] && echo "Problem compiling aperf.c." && exit 1 -gcc -o msr msr.c -lm -[ $? -ne 0 ] && echo "Problem compiling msr.c." && exit 1 - function run_test () { file_ext=$1 From 56a268cd4a410081d477913d51e044039d3a8834 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 13:19:23 +0200 Subject: [PATCH 0161/1322] selftests/bpf: Make bpf_util work on uniprocessor systems The current implementation fails to work on uniprocessor systems. Fix the parser to also handle the uniprocessor case. Signed-off-by: Thomas Meyer Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Shuah Khan --- tools/testing/selftests/bpf/bpf_util.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 20ecbaa0d85d..6c53a8906eff 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -12,6 +12,7 @@ static inline unsigned int bpf_num_possible_cpus(void) unsigned int start, end, possible_cpus = 0; char buff[128]; FILE *fp; + int n; fp = fopen(fcpu, "r"); if (!fp) { @@ -20,17 +21,17 @@ static inline unsigned int bpf_num_possible_cpus(void) } while (fgets(buff, sizeof(buff), fp)) { - if (sscanf(buff, "%u-%u", &start, &end) == 2) { - possible_cpus = start == 0 ? 
end + 1 : 0; - break; + n = sscanf(buff, "%u-%u", &start, &end); + if (n == 0) { + printf("Failed to retrieve # possible CPUs!\n"); + exit(1); + } else if (n == 1) { + end = start; } + possible_cpus = start == 0 ? end + 1 : 0; + break; } - fclose(fp); - if (!possible_cpus) { - printf("Failed to retrieve # possible CPUs!\n"); - exit(1); - } return possible_cpus; } From 84c06566cfb84e5f6aed9582d4570df8406700c9 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:17 +0200 Subject: [PATCH 0162/1322] selftests/ftrace: multiple_kprobes: Also check for support The multiple_kprobes test case fails to check for KPROBE_EVENT support. Add the check to prevent a false test result. Signed-off-by: Thomas Meyer Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- .../testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc index 2a1cb9908746..a4fd4c851a5b 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc @@ -1,6 +1,8 @@ #!/bin/sh # description: Register/unregister many kprobe events +[ -f kprobe_events ] || exit_unsupported # this is configurable + # ftrace fentry skip size depends on the machine architecture. # Currently HAVE_KPROBES_ON_FTRACE defined on x86 and powerpc64le case `uname -m` in From 63ecc3d9436f8012e49dc846d6cb0a85a3433517 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 13 Sep 2017 19:30:51 -0600 Subject: [PATCH 0163/1322] udpv6: Fix the checksum computation when HW checksum does not apply While trying an ESP transport mode encryption for UDPv6 packets of datagram size 1436 with MTU 1500, checksum error was observed in the secondary fragment. This error occurs due to the UDP payload checksum being missed out when computing the full checksum for these packets in udp6_hwcsum_outgoing(). Fixes: d39d938c8228 ("ipv6: Introduce udpv6_send_skb()") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/ipv6/udp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2ecfb137297..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; From 265698d7e6132a2d41471135534f4f36ad15b09c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Sep 2017 22:46:36 +0200 Subject: [PATCH 0164/1322] nl80211: fix null-ptr dereference on invalid mesh configuration If TX rates are specified during mesh join, the channel must also be specified. Check the channel pointer to avoid a null pointer dereference if it isn't. 
Reported-by: Jouni Malinen Fixes: 8564e38206de ("cfg80211: add checks for beacon rate, extend to mesh") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fbd5593e88cb..690874293cfc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9987,6 +9987,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!setup.chandef.chan) + return -EINVAL; + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, &setup.beacon_rate); if (err) From 76cc0d3282d4b933fa144fa41fbc5318e0fdca24 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 12:00:07 +0800 Subject: [PATCH 0165/1322] ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header Now in ip6gre_header before packing the ipv6 header, it skb_push t->hlen which only includes encap_hlen + tun_hlen. It means greh and inner header would be over written by ipv6 stuff and ipv6h might have no chance to set up. Jianlin found this issue when using remote any on ip6_gre, the packets he captured on gre dev are truncated: 22:50:26.210866 Out ethertype IPv6 (0x86dd), length 120: truncated-ip6 -\ 8128 bytes missing!(flowlabel 0x92f40, hlim 0, next-header Options (0) \ payload length: 8192) ::1:2000:0 > ::1:0:86dd: HBH [trunc] ip-proto-128 \ 8184 It should also skb_push ipv6hdr so that ipv6h points to the right position to set ipv6 stuff up. This patch is to skb_push hlen + sizeof(*ipv6h) and also fix some indents in ip6gre_header. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..20f66f4c9460 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -940,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. From 7b4dc3c0da0d66e7b20a826c537d41bb73e4df54 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 18 Aug 2017 17:49:58 +0800 Subject: [PATCH 0166/1322] drm/i915/gvt: Fix incorrect PCI BARs reporting Looking at our virtual PCI device, we can see surprising Region 4 and Region 5. 00:10.0 VGA compatible controller: Intel Corporation Sky Lake Integrated Graphics (rev 06) (prog-if 00 [VGA controller]) .... 
Region 0: Memory at 140000000 (64-bit, non-prefetchable) [size=16M] Region 2: Memory at 180000000 (64-bit, prefetchable) [size=1G] Region 4: Memory at (32-bit, non-prefetchable) Region 5: Memory at (32-bit, non-prefetchable) Expansion ROM at febd6000 [disabled] [size=2K] The fact is that we only implemented BAR0 and BAR2. Surprising Region 4 and Region 5 are shown because we report their size as 0xffffffff. They should report size 0 instead. BTW, the physical GPU has a PIO BAR. GVTg hasn't implemented PIO access, so we ignored this BAR for vGPU device. v2: fix BAR size value calculation. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1458032 Signed-off-by: Changbin Du Cc: stable@vger.kernel.org Signed-off-by: Zhenyu Wang (cherry picked from commit f1751362d6357a90bc6e53176cec715ff2dbed74) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gvt/cfg_space.c | 113 ++++++++++++--------------- 1 file changed, 48 insertions(+), 65 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cfg_space.c b/drivers/gpu/drm/i915/gvt/cfg_space.c index 40af17ec6312..ff3154fe6588 100644 --- a/drivers/gpu/drm/i915/gvt/cfg_space.c +++ b/drivers/gpu/drm/i915/gvt/cfg_space.c @@ -197,78 +197,65 @@ static int emulate_pci_command_write(struct intel_vgpu *vgpu, static int emulate_pci_bar_write(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) { - unsigned int bar_index = - (rounddown(offset, 8) % PCI_BASE_ADDRESS_0) / 8; u32 new = *(u32 *)(p_data); bool lo = IS_ALIGNED(offset, 8); u64 size; int ret = 0; bool mmio_enabled = vgpu_cfg_space(vgpu)[PCI_COMMAND] & PCI_COMMAND_MEMORY; + struct intel_vgpu_pci_bar *bars = vgpu->cfg_space.bar; - if (WARN_ON(bar_index >= INTEL_GVT_PCI_BAR_MAX)) - return -EINVAL; - + /* + * Power-up software can determine how much address + * space the device requires by writing a value of + * all 1's to the register and then reading the value + * back. The device will return 0's in all don't-care + * address bits. + */ if (new == 0xffffffff) { - /* - * Power-up software can determine how much address - * space the device requires by writing a value of - * all 1's to the register and then reading the value - * back. The device will return 0's in all don't-care - * address bits. - */ - size = vgpu->cfg_space.bar[bar_index].size; - if (lo) { - new = rounddown(new, size); - } else { - u32 val = vgpu_cfg_space(vgpu)[rounddown(offset, 8)]; - /* for 32bit mode bar it returns all-0 in upper 32 - * bit, for 64bit mode bar it will calculate the - * size with lower 32bit and return the corresponding - * value + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + size = ~(bars[INTEL_GVT_PCI_BAR_GTTMMIO].size -1); + intel_vgpu_write_pci_bar(vgpu, offset, + size >> (lo ? 0 : 32), lo); + /* + * Untrap the BAR, since guest hasn't configured a + * valid GPA */ - if (val & PCI_BASE_ADDRESS_MEM_TYPE_64) - new &= (~(size-1)) >> 32; - else - new = 0; - } - /* - * Unmapp & untrap the BAR, since guest hasn't configured a - * valid GPA - */ - switch (bar_index) { - case INTEL_GVT_PCI_BAR_GTTMMIO: ret = trap_gttmmio(vgpu, false); break; - case INTEL_GVT_PCI_BAR_APERTURE: + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + size = ~(bars[INTEL_GVT_PCI_BAR_APERTURE].size -1); + intel_vgpu_write_pci_bar(vgpu, offset, + size >> (lo ? 
0 : 32), lo); ret = map_aperture(vgpu, false); break; + default: + /* Unimplemented BARs */ + intel_vgpu_write_pci_bar(vgpu, offset, 0x0, false); } - intel_vgpu_write_pci_bar(vgpu, offset, new, lo); } else { - /* - * Unmapp & untrap the old BAR first, since guest has - * re-configured the BAR - */ - switch (bar_index) { - case INTEL_GVT_PCI_BAR_GTTMMIO: - ret = trap_gttmmio(vgpu, false); + switch (offset) { + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + /* + * Untrap the old BAR first, since guest has + * re-configured the BAR + */ + trap_gttmmio(vgpu, false); + intel_vgpu_write_pci_bar(vgpu, offset, new, lo); + ret = trap_gttmmio(vgpu, mmio_enabled); break; - case INTEL_GVT_PCI_BAR_APERTURE: - ret = map_aperture(vgpu, false); + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + map_aperture(vgpu, false); + intel_vgpu_write_pci_bar(vgpu, offset, new, lo); + ret = map_aperture(vgpu, mmio_enabled); break; - } - intel_vgpu_write_pci_bar(vgpu, offset, new, lo); - /* Track the new BAR */ - if (mmio_enabled) { - switch (bar_index) { - case INTEL_GVT_PCI_BAR_GTTMMIO: - ret = trap_gttmmio(vgpu, true); - break; - case INTEL_GVT_PCI_BAR_APERTURE: - ret = map_aperture(vgpu, true); - break; - } + default: + intel_vgpu_write_pci_bar(vgpu, offset, new, lo); } } return ret; @@ -299,10 +286,7 @@ int intel_vgpu_emulate_cfg_write(struct intel_vgpu *vgpu, unsigned int offset, } switch (rounddown(offset, 4)) { - case PCI_BASE_ADDRESS_0: - case PCI_BASE_ADDRESS_1: - case PCI_BASE_ADDRESS_2: - case PCI_BASE_ADDRESS_3: + case PCI_BASE_ADDRESS_0 ... PCI_BASE_ADDRESS_5: if (WARN_ON(!IS_ALIGNED(offset, 4))) return -EINVAL; return emulate_pci_bar_write(vgpu, offset, p_data, bytes); @@ -344,7 +328,6 @@ void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu, struct intel_gvt *gvt = vgpu->gvt; const struct intel_gvt_device_info *info = &gvt->device_info; u16 *gmch_ctl; - int i; memcpy(vgpu_cfg_space(vgpu), gvt->firmware.cfg_space, info->cfg_space_size); @@ -371,13 +354,13 @@ void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu, */ memset(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_1, 0, 4); memset(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_3, 0, 4); + memset(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_4, 0, 8); memset(vgpu_cfg_space(vgpu) + INTEL_GVT_PCI_OPREGION, 0, 4); - for (i = 0; i < INTEL_GVT_MAX_BAR_NUM; i++) { - vgpu->cfg_space.bar[i].size = pci_resource_len( - gvt->dev_priv->drm.pdev, i * 2); - vgpu->cfg_space.bar[i].tracked = false; - } + vgpu->cfg_space.bar[INTEL_GVT_PCI_BAR_GTTMMIO].size = + pci_resource_len(gvt->dev_priv->drm.pdev, 0); + vgpu->cfg_space.bar[INTEL_GVT_PCI_BAR_APERTURE].size = + pci_resource_len(gvt->dev_priv->drm.pdev, 2); } /** From 814feed316e9d15b0ae869ca729370e7630b8d13 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Sep 2017 10:56:42 +0200 Subject: [PATCH 0167/1322] drm/i915: Fix an error handling in 'intel_framebuffer_init()' We should go through the error handling path to decrease the 'framebuffer_references' as done everywhere else in this function. 
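For context, the goto-unwind idiom being restored here can be sketched generically (invented names, not the intel_display code): any failure taken after the reference count was bumped must branch to the common label that drops it again instead of returning directly.

#include <linux/errno.h>
#include <linux/types.h>

struct example_fb {
	int references;
	int num_planes;
};

static bool example_plane_ok(const struct example_fb *fb, int plane);

static int example_fb_init(struct example_fb *fb)
{
	int i, ret = 0;

	fb->references++;		/* taken before any of the checks below */

	for (i = 0; i < fb->num_planes; i++) {
		if (!example_plane_ok(fb, i)) {
			ret = -EINVAL;
			goto err;	/* not a bare "return -EINVAL" */
		}
	}

	return 0;

err:
	fb->references--;		/* undo the reference on every failure path */
	return ret;
}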
Fixes: 2e2adb05736c ("drm/i915: Add render decompression support") Signed-off-by: Christophe JAILLET Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170910085642.13673-1-christophe.jaillet@wanadoo.fr (cherry picked from commit 37875d6b3af702425ce292de81b77faf94766616) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_display.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index f17275519484..00cd17c76fdc 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14030,7 +14030,7 @@ static int intel_framebuffer_init(struct intel_framebuffer *intel_fb, if (mode_cmd->handles[i] != mode_cmd->handles[0]) { DRM_DEBUG_KMS("bad plane %d handle\n", i); - return -EINVAL; + goto err; } stride_alignment = intel_fb_stride_alignment(fb, i); From ac73661c6222faef1a97ec44f424234f165e4710 Mon Sep 17 00:00:00 2001 From: "Lee, Shawn C" Date: Tue, 12 Sep 2017 11:36:30 +0800 Subject: [PATCH 0168/1322] drm/i915/bxt: set min brightness from VBT Min brightness value from vbt was missing for BXT platform. This setting have to refer backlight ic spec to restrict min backlight output. Without this restriction, driver would allow to configure lower brightness value and violate backlight ic requirement. Fixes: 0fb890c01349 ("drm/i915/bxt: BLC implementation") Cc: Jani Nikula Cc: Cooper Chiou Cc: Gary C Wang Signed-off-by: Shawn Lee Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1505187390-7039-1-git-send-email-shawn.c.lee@intel.com (cherry picked from commit c3881128cb672cf484a52fbb36b5daa9044d9168) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_panel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index a17b1de7d7e0..d4dd248ac9a8 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -1699,6 +1699,8 @@ bxt_setup_backlight(struct intel_connector *connector, enum pipe unused) if (!panel->backlight.max) return -ENODEV; + panel->backlight.min = get_backlight_min_vbt(connector); + val = bxt_get_backlight(connector); val = intel_panel_compute_brightness(connector, val); panel->backlight.level = clamp(val, panel->backlight.min, From abeae421b03d800d33894df7fbca6d00c70c358e Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Tue, 5 Sep 2017 15:14:31 +0530 Subject: [PATCH 0169/1322] Revert "drm/i915/bxt: Disable device ready before shutdown command" This reverts commit bbdf0b2ff32a ("drm/i915/bxt: Disable device ready before shutdown command"). Disable device ready before shutdown command was added previously to avoid a split screen issue seen on dual link DSI panels. As of now, dual link is not supported and will need some rework in the upstream code. For single link DSI panels, the change is not required. This will cause failure in sending SHUTDOWN packet during disable. Hence reverting the change. Will handle the change as part of dual link enabling in upstream. 
Fixes: bbdf0b2ff32a ("drm/i915/bxt: Disable device ready before shutdown command") Cc: # v4.12+ Signed-off-by: Uma Shankar Signed-off-by: Vidya Srinivas Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1504604671-17237-1-git-send-email-vidya.srinivas@intel.com (cherry picked from commit 33c8d8870c67faf3161898a56af98ac3c1c71450) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_dsi.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dsi.c b/drivers/gpu/drm/i915/intel_dsi.c index f0c11aec5ea5..7442891762be 100644 --- a/drivers/gpu/drm/i915/intel_dsi.c +++ b/drivers/gpu/drm/i915/intel_dsi.c @@ -892,8 +892,6 @@ static void intel_dsi_disable(struct intel_encoder *encoder, struct intel_crtc_state *old_crtc_state, struct drm_connector_state *old_conn_state) { - struct drm_device *dev = encoder->base.dev; - struct drm_i915_private *dev_priv = dev->dev_private; struct intel_dsi *intel_dsi = enc_to_intel_dsi(&encoder->base); enum port port; @@ -902,15 +900,6 @@ static void intel_dsi_disable(struct intel_encoder *encoder, intel_dsi_vbt_exec_sequence(intel_dsi, MIPI_SEQ_BACKLIGHT_OFF); intel_panel_disable_backlight(old_conn_state); - /* - * Disable Device ready before the port shutdown in order - * to avoid split screen - */ - if (IS_BROXTON(dev_priv)) { - for_each_dsi_port(port, intel_dsi->ports) - I915_WRITE(MIPI_DEVICE_READY(port), 0); - } - /* * According to the spec we should send SHUTDOWN before * MIPI_SEQ_DISPLAY_OFF only for v3+ VBTs, but field testing From 8c7a758873577f0d1bb3f879584338f73e6c6bc3 Mon Sep 17 00:00:00 2001 From: "Lee, Shawn C" Date: Wed, 13 Sep 2017 13:19:20 +0800 Subject: [PATCH 0170/1322] drm/i915/cnp: set min brightness from VBT Min brightness value from vbt was missing for CNP platform. This setting have to refer backlight ic spec to restrict min backlight output. Without this restriction, driver would allow to configure lower brightness value and violate backlight ic requirement. Fixes: 4c9f7086ac6d ("drm/i915/cnp: Backlight support for CNP.") Cc: Jani Nikula Signed-off-by: Shawn Lee Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/1505279961-16140-1-git-send-email-shawn.c.lee@intel.com (cherry picked from commit f44e354f857f207cd361269c5e38e1f96e0b616c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_panel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index d4dd248ac9a8..3b1c5d783ee7 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -1737,6 +1737,8 @@ cnp_setup_backlight(struct intel_connector *connector, enum pipe unused) if (!panel->backlight.max) return -ENODEV; + panel->backlight.min = get_backlight_min_vbt(connector); + val = bxt_get_backlight(connector); val = intel_panel_compute_brightness(connector, val); panel->backlight.level = clamp(val, panel->backlight.min, From 99df13b6ea811a63eeacb278d05a5b914ce28073 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Sep 2017 17:42:13 +0100 Subject: [PATCH 0171/1322] drm/i915: Remove unused 'in_vbl' from i915_get_crtc_scanoutpos() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1bf6ad622b9b ("drm/vblank: drop the mode argument from drm_calc_vbltimestamp_from_scanoutpos") removed the use of in_vbl, but did not remove the local variable. Do so now. 
Fixes: 1bf6ad622b9b ("drm/vblank: drop the mode argument from drm_calc_vbltimestamp_from_scanoutpos") Signed-off-by: Chris Wilson Cc: Ville Syrjälä Cc: Daniel Vetter Cc: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170914164213.18461-1-chris@chris-wilson.co.uk Reviewed-by: Ville Syrjälä (cherry picked from commit e01e71fc49d4c95090a04f898a3fe788c652a04b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_irq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index e21ce9c18b6e..b63893eeca73 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -839,7 +839,6 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, pipe); int position; int vbl_start, vbl_end, hsync_start, htotal, vtotal; - bool in_vbl = true; unsigned long irqflags; if (WARN_ON(!mode->crtc_clock)) { @@ -922,8 +921,6 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); - in_vbl = position >= vbl_start && position < vbl_end; - /* * While in vblank, position will be negative * counting up towards 0 at vbl_end. And outside From f2654a4781318dc7ab8d6cde66f1fa39eab980a9 Mon Sep 17 00:00:00 2001 From: Fahad Kunnathadi Date: Fri, 15 Sep 2017 12:01:58 +0530 Subject: [PATCH 0172/1322] net: phy: Fix mask value write on gmii2rgmii converter speed register To clear Speed Selection in MDIO control register(0x10), ie, clear bits 6 and 13 to zero while keeping other bits same. Before AND operation,The Mask value has to be perform with bitwise NOT operation (ie, ~ operator) This patch clears current speed selection before writing the new speed settings to gmii2rgmii converter Fixes: f411a6160bd4 ("net: phy: Add gmiitorgmii converter support") Signed-off-by: Fahad Kunnathadi Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/xilinx_gmii2rgmii.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c index d15dd3938ba8..2e5150b0b8d5 100644 --- a/drivers/net/phy/xilinx_gmii2rgmii.c +++ b/drivers/net/phy/xilinx_gmii2rgmii.c @@ -44,7 +44,7 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev) priv->phy_drv->read_status(phydev); val = mdiobus_read(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG); - val &= XILINX_GMII2RGMII_SPEED_MASK; + val &= ~XILINX_GMII2RGMII_SPEED_MASK; if (phydev->speed == SPEED_1000) val |= BMCR_SPEED1000; From 8c22dab03ad072e45060c299c70d02a4f6fc4aab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 15:58:33 +0800 Subject: [PATCH 0173/1322] ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline If ipv6 has been disabled from cmdline since kernel started, it makes no sense to allow users to create any ip6 tunnel. Otherwise, it could some potential problem. 
Jianlin found a kernel crash caused by this in ip6_gre when he set ipv6.disable=1 in grub: [ 209.588865] Unable to handle kernel paging request for data at address 0x00000080 [ 209.588872] Faulting instruction address: 0xc000000000a3aa6c [ 209.588879] Oops: Kernel access of bad area, sig: 11 [#1] [ 209.589062] NIP [c000000000a3aa6c] fib_rules_lookup+0x4c/0x260 [ 209.589071] LR [c000000000b9ad90] fib6_rule_lookup+0x50/0xb0 [ 209.589076] Call Trace: [ 209.589097] fib6_rule_lookup+0x50/0xb0 [ 209.589106] rt6_lookup+0xc4/0x110 [ 209.589116] ip6gre_tnl_link_config+0x214/0x2f0 [ip6_gre] [ 209.589125] ip6gre_newlink+0x138/0x3a0 [ip6_gre] [ 209.589134] rtnl_newlink+0x798/0xb80 [ 209.589142] rtnetlink_rcv_msg+0xec/0x390 [ 209.589151] netlink_rcv_skb+0x138/0x150 [ 209.589159] rtnetlink_rcv+0x48/0x70 [ 209.589169] netlink_unicast+0x538/0x640 [ 209.589175] netlink_sendmsg+0x40c/0x480 [ 209.589184] ___sys_sendmsg+0x384/0x4e0 [ 209.589194] SyS_sendmsg+0xd4/0x140 [ 209.589201] SyS_socketcall+0x3e0/0x4f0 [ 209.589209] system_call+0x38/0xe0 This patch is to return -EOPNOTSUPP in ip6_tunnel_init if ipv6 has been disabled from cmdline. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..f2f21c24915f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2259,6 +2259,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; From 3ff4cbec87da48b0ec1f7b6196607b034de0c680 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 16 Sep 2017 14:02:21 +0200 Subject: [PATCH 0174/1322] net/sched: cls_matchall: fix crash when used with classful qdisc this script, edited from Linux Advanced Routing and Traffic Control guide tc q a dev en0 root handle 1: htb default a tc c a dev en0 parent 1: classid 1:1 htb rate 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:a htb rate 5mbit ceil 6mbit burst 15k tc c a dev en0 parent 1:1 classid 1:b htb rate 1mbit ceil 6mbit burst 15k tc f a dev en0 parent 1:0 prio 1 $clsname $clsargs classid 1:b ping $address -c1 tc -s c s dev en0 classifies traffic to 1:b or 1:a, depending on whether the packet matches or not the pattern $clsargs of filter $clsname. However, when $clsname is 'matchall', a systematic crash can be observed in htb_classify(). HTB and classful qdiscs don't assign initial value to struct tcf_result, but then they expect it to contain valid values after filters have been run. Thus, current 'matchall' ignores the TCA_MATCHALL_CLASSID attribute, configured by user, and makes HTB (and classful qdiscs) dereference random pointers. By assigning head->res to *res in mall_classify(), before the actions are invoked, we fix this crash and enable TCA_MATCHALL_CLASSID functionality, that had no effect on 'matchall' classifier since its first introduction. BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1460213 Reported-by: Jiri Benc Fixes: b87f7936a932 ("net/sched: introduce Match-all classifier") Signed-off-by: Davide Caratti Acked-by: Yotam Gigi Signed-off-by: David S. 
Miller --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 21cc45caf842..eeac606c95ab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } From 51513748ddfe7a5d2812158a1e0570499b0c511c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 16 Sep 2017 13:10:06 -0700 Subject: [PATCH 0175/1322] Documentation: networking: fix ASCII art in switchdev.txt Fix ASCII art in Documentation/networking/switchdev.txt: Change non-ASCII "spaces" to ASCII spaces. Change 2 erroneous '+' characters in ASCII art to '-' (at the '*' characters below): line 32: +--+----+----+----+-*--+----+---+ +-----+-----+ line 41: +--------------+---*------------+ Signed-off-by: Randy Dunlap Acked-by: Pavel Machek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 64 +++++++++++++------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 5e40e1f68873..82236a17b5e6 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -13,42 +13,42 @@ an example setup using a data-center-class switch ASIC chip. Other setups with SR-IOV or soft switches, such as OVS, are possible. -                             User-space tools + User-space tools -       user space                   | -      +-------------------------------------------------------------------+ -       kernel                       | Netlink -                                    | -                     +--------------+-------------------------------+ -                     |         Network stack                        | -                     |           (Linux)                            | -                     |                                              | -                     +----------------------------------------------+ + user space | + +-------------------------------------------------------------------+ + kernel | Netlink + | + +--------------+-------------------------------+ + | Network stack | + | (Linux) | + | | + +----------------------------------------------+ sw1p2 sw1p4 sw1p6 -                      sw1p1  + sw1p3 +  sw1p5 +         eth1 -                        +    |    +    |    +    |            + -                        |    |    |    |    |    |            | -                     +--+----+----+----+-+--+----+---+  +-----+-----+ -                     |         Switch driver         |  |    mgmt   | -                     |        (this document)        |  |   driver  | -                     |                               |  |           | -                     +--------------+----------------+  +-----------+ -                                    | -       kernel                       | HW bus (eg PCI) -      +-------------------------------------------------------------------+ -       hardware                     | -                     +--------------+---+------------+ -                     |         Switch device (sw1)   | -                     |  +----+                       +--------+ -                     |  |    v offloaded data path   | mgmt port -                     |  |    |                       | -                     +--|----|----+----+----+----+---+ -      
                  |    |    |    |    |    | -                        +    +    +    +    +    + -                       p1   p2   p3   p4   p5   p6 + sw1p1 + sw1p3 + sw1p5 + eth1 + + | + | + | + + | | | | | | | + +--+----+----+----+----+----+---+ +-----+-----+ + | Switch driver | | mgmt | + | (this document) | | driver | + | | | | + +--------------+----------------+ +-----------+ + | + kernel | HW bus (eg PCI) + +-------------------------------------------------------------------+ + hardware | + +--------------+----------------+ + | Switch device (sw1) | + | +----+ +--------+ + | | v offloaded data path | mgmt port + | | | | + +--|----|----+----+----+----+---+ + | | | | | | + + + + + + + + p1 p2 p3 p4 p5 p6 -                             front-panel ports + front-panel ports Fig 1. From 6ce14f6416c84bd9c81777edf899b57ac5000c87 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Sep 2017 01:49:02 +0200 Subject: [PATCH 0176/1322] ACPI / watchdog: properly initialize resources We copy a local resource structure into a list, but only initialize some of its members, as pointed out by gcc-4.4: drivers/acpi/acpi_watchdog.c: In function 'acpi_watchdog_init': drivers/acpi/acpi_watchdog.c:105: error: 'res.child' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.sibling' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.parent' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.desc' may be used uninitialized in this function drivers/acpi/acpi_watchdog.c:105: error: 'res.name' may be used uninitialized in this function Newer compilers can presumably optimize the uninitialized access away entirely and don't warn at all, but rely on the kzalloc() to zero the structure first. This adds an explicit initialization to force consistent behavior. Fixes: 058dfc767008 (ACPI / watchdog: Add support for WDAT hardware watchdog) Signed-off-by: Arnd Bergmann Acked-by: Guenter Roeck Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_watchdog.c b/drivers/acpi/acpi_watchdog.c index bf22c29d2517..11b113f8e367 100644 --- a/drivers/acpi/acpi_watchdog.c +++ b/drivers/acpi/acpi_watchdog.c @@ -66,7 +66,7 @@ void __init acpi_watchdog_init(void) for (i = 0; i < wdat->entries; i++) { const struct acpi_generic_address *gas; struct resource_entry *rentry; - struct resource res; + struct resource res = {}; bool found; gas = &entries[i].register_region; From 1e3c5ec66119783440ed211ae527674651affa9b Mon Sep 17 00:00:00 2001 From: Sathya Perla Date: Mon, 18 Sep 2017 17:05:37 +0530 Subject: [PATCH 0177/1322] bnxt_en: check for ingress qdisc in flower offload Check for ingress-only qdisc for flower offload, as other qdiscs are not supported for flower offload. Suggested-by: Jiri Pirko Signed-off-by: Sathya Perla Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index ccd699fb2d70..7dd3d131043a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -750,6 +750,10 @@ int bnxt_tc_setup_flower(struct bnxt *bp, u16 src_fid, { int rc = 0; + if (!is_classid_clsact_ingress(cls_flower->common.classid) || + cls_flower->common.chain_index) + return -EOPNOTSUPP; + switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: rc = bnxt_tc_add_flow(bp, src_fid, cls_flower); From 582db7e0c4c2fc5bb4f932f268035883385e3692 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 18 Sep 2017 15:03:46 +0200 Subject: [PATCH 0178/1322] bpf: devmap: pass on return value of bpf_map_precharge_memlock If bpf_map_precharge_memlock in dev_map_alloc, -ENOMEM is returned regardless of the actual error produced by bpf_map_precharge_memlock. Fix it by passing on the error returned by bpf_map_precharge_memlock. Also return -EINVAL instead of -ENOMEM if the page count overflow check fails. This makes dev_map_alloc match the behavior of other bpf maps' alloc functions wrt. return values. Signed-off-by: Tobias Klauser Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/devmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 959c9a07f318..e093d9a2c4dd 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr) static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; + int err = -EINVAL; u64 cost; - int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + err = -ENOMEM; + /* A per cpu bitfield with a bit per possible net device */ dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr), __alignof__(unsigned long)); @@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) free_dtab: free_percpu(dtab->flush_needed); kfree(dtab); - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void dev_map_free(struct bpf_map *map) From 5e75fe3927559505682b0053b5745e3f42d66e8c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 18 Sep 2017 17:19:10 -0700 Subject: [PATCH 0179/1322] tools/testing/nvdimm: disable labels for nfit_test.1 Improve coverage of NVDIMM-N test scenarios by providing a test bus incapable of label operations. 
Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index d20791c3f499..bef419d4266d 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1527,9 +1527,6 @@ static void nfit_test1_setup(struct nfit_test *t) set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en); - set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en); - set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en); - set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en); } static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, From 4c7124413aa759b8ea0b90cd39177e525396e662 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 Sep 2017 11:05:16 -0700 Subject: [PATCH 0180/1322] tcp: remove two unused functions remove tcp_may_send_now and tcp_snd_test that are no longer used Fixes: 840a3cbe8969 ("tcp: remove forward retransmit feature") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..3bc910a9bfc6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -544,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); -bool tcp_may_send_now(struct sock *sk); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..517d737059d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1806,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. 
It is very much like * tcp_fragment() except that it may make several kinds of assumptions From 33a56086712561b8b9cdc881e0317f4c36861f72 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 18 Sep 2017 14:48:58 -0700 Subject: [PATCH 0181/1322] libnvdimm, namespace: fix btt claim class crash Maurice reports: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: holder_class_store+0x253/0x2b0 [libnvdimm] ...while trying to reconfigure an NVDIMM-N namespace into 'sector' / 'btt' mode. The crash points to this line: (gdb) li *(holder_class_store+0x253) 0x7773 is in holder_class_store (drivers/nvdimm/namespace_devs.c:1420). 1415 for (i = 0; i < nd_region->ndr_mappings; i++) { 1416 struct nd_mapping *nd_mapping = &nd_region->mapping[i]; 1417 struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); 1418 struct nd_namespace_index *nsindex; 1419 1420 nsindex = to_namespace_index(ndd, ndd->ns_current); ...where we are failing because ndd is NULL due to NVDIMM-N dimms not supporting labels. Long story short, default to the BTTv1 format in the label-less / NVDIMM-N case. Fixes: 14e494542636 ("libnvdimm, btt: BTT updates for UEFI 2.7 format") Cc: Cc: Vishal Verma Reported-by: Maurice A. Saldivar Tested-by: Maurice A. Saldivar Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 1427a386a033..3e4d1e7998da 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1417,6 +1417,15 @@ static int btt_claim_class(struct device *dev) struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_index *nsindex; + /* + * If any of the DIMMs do not support labels the only + * possible BTT format is v1. + */ + if (!ndd) { + loop_bitmask = 0; + break; + } + nsindex = to_namespace_index(ndd, ndd->ns_current); if (nsindex == NULL) loop_bitmask |= 1; From 47684e111f52fface17820d3ef84cc845b26070e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Sep 2017 13:10:15 -0700 Subject: [PATCH 0182/1322] Documentation: core-api: minor workqueue.rst cleanups Clean up workqueue.rst: - fix minor typos - put '@' after `` instead of preceding them (one place) - use "CPU" instead of "cpu" in text consistently - quote one function name Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Florian Mickler Signed-off-by: Tejun Heo --- Documentation/core-api/workqueue.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 3943b5bfa8cf..00a5ba51e63f 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -39,8 +39,8 @@ up. Although MT wq wasted a lot of resource, the level of concurrency provided was unsatisfactory. The limitation was common to both ST and MT wq albeit less severe on MT. Each wq maintained its own separate -worker pool. A MT wq could provide only one execution context per CPU -while a ST wq one for the whole system. Work items had to compete for +worker pool. An MT wq could provide only one execution context per CPU +while an ST wq one for the whole system. Work items had to compete for those very limited execution contexts leading to various problems including proneness to deadlocks around the single execution context. @@ -151,7 +151,7 @@ Application Programming Interface (API) ``alloc_workqueue()`` allocates a wq. 
The original ``create_*workqueue()`` functions are deprecated and scheduled for -removal. ``alloc_workqueue()`` takes three arguments - @``name``, +removal. ``alloc_workqueue()`` takes three arguments - ``@name``, ``@flags`` and ``@max_active``. ``@name`` is the name of the wq and also used as the name of the rescuer thread if there is one. @@ -197,7 +197,7 @@ resources, scheduled and executed. served by worker threads with elevated nice level. Note that normal and highpri worker-pools don't interact with - each other. Each maintain its separate pool of workers and + each other. Each maintains its separate pool of workers and implements concurrency management among its workers. ``WQ_CPU_INTENSIVE`` @@ -249,8 +249,8 @@ unbound worker-pools and only one work item could be active at any given time thus achieving the same ordering property as ST wq. In the current implementation the above configuration only guarantees -ST behavior within a given NUMA node. Instead alloc_ordered_queue should -be used to achieve system wide ST behavior. +ST behavior within a given NUMA node. Instead ``alloc_ordered_queue()`` should +be used to achieve system-wide ST behavior. Example Execution Scenarios From 80a921e207cea3dd18038501187078669a218dab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 18 Sep 2017 23:00:08 +0300 Subject: [PATCH 0183/1322] ata_piix: Add Fujitsu-Siemens Lifebook S6120 to short cable IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fujitsu-Siemens Lifebook S6120 misdetects the cable type for some drives. The problematic one in this case is an mSATA SSD hooked up via a mSATA->PATA bridge. With regular hard disks the detection seems to work correctly. Strangely an older Lifebook model (S6020) detects the cable as 80c with the mSATA SSD, even if using the exact same flex cable. Cc: Tejun Heo Signed-off-by: Ville Syrjälä Signed-off-by: Tejun Heo --- drivers/ata/ata_piix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c index 8401c3b5be92..b702c20fbc2b 100644 --- a/drivers/ata/ata_piix.c +++ b/drivers/ata/ata_piix.c @@ -492,6 +492,7 @@ static const struct ich_laptop ich_laptop[] = { { 0x27DF, 0x152D, 0x0778 }, /* ICH7 on unknown Intel */ { 0x24CA, 0x1025, 0x0061 }, /* ICH4 on ACER Aspire 2023WLMi */ { 0x24CA, 0x1025, 0x003d }, /* ICH4 on ACER TM290 */ + { 0x24CA, 0x10CF, 0x11AB }, /* ICH4M on Fujitsu-Siemens Lifebook S6120 */ { 0x266F, 0x1025, 0x0066 }, /* ICH6 on ACER Aspire 1694WLMi */ { 0x2653, 0x1043, 0x82D8 }, /* ICH6M on Asus Eee 701 */ { 0x27df, 0x104d, 0x900e }, /* ICH7 on Sony TZ-90 */ From 54640d238760a1a54dfebe039b49682522100186 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 18 Sep 2017 22:51:14 -0500 Subject: [PATCH 0184/1322] fcntl: Don't set si_code to SI_SIGIO when sig == SIGPOLL When fixing things to avoid ambiguous cases I had a thinko and included SIGPOLL/SIGIO in with all of the other signals that have signal specific si_codes. Which is completely wrong. Fix that. Reported-by: Vince Weaver Signed-off-by: "Eric W. Biederman" --- fs/fcntl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 0491da3b28c3..448a1119f0be 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -749,7 +749,7 @@ static void send_sigio_to_task(struct task_struct *p, * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. 
*/ - if (sig_specific_sicodes(signum)) + if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* From 129c6cda2de2a8ac44fab096152469999b727faf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Sep 2017 13:03:43 -0700 Subject: [PATCH 0185/1322] 8139too: revisit napi_complete_done() usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems we have to be more careful in napi_complete_done() use. This patch is not a revert, as it seems we can avoid bug that Ville reported by moving the napi_complete_done() test in the spinlock section. Many thanks to Ville for detective work and all tests. Fixes: 617f01211baf ("8139too: use napi_complete_done()") Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/8139too.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c index ca22f2898664..d24b47b8e0b2 100644 --- a/drivers/net/ethernet/realtek/8139too.c +++ b/drivers/net/ethernet/realtek/8139too.c @@ -2135,11 +2135,12 @@ static int rtl8139_poll(struct napi_struct *napi, int budget) if (likely(RTL_R16(IntrStatus) & RxAckBits)) work_done += rtl8139_rx(dev, tp, budget); - if (work_done < budget && napi_complete_done(napi, work_done)) { + if (work_done < budget) { unsigned long flags; spin_lock_irqsave(&tp->lock, flags); - RTL_W16_F(IntrMask, rtl8139_intr_mask); + if (napi_complete_done(napi, work_done)) + RTL_W16_F(IntrMask, rtl8139_intr_mask); spin_unlock_irqrestore(&tp->lock, flags); } spin_unlock(&tp->rx_lock); From 8ecb1a29e11e24c458ee4ee59447d0ddf8274589 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 18 Sep 2017 16:31:30 -0700 Subject: [PATCH 0186/1322] net: systemport: Fix 64-bit statistics dependency There are several problems with commit 10377ba7673d ("net: systemport: Support 64bit statistics", first one got fixed in 7095c973453e ("net: systemport: Fix 64-bit stats deadlock"). The second problem is that this specific code updates the stats64.tx_{packets,bytes} from ndo_get_stats64() and that is what we are returning to ethtool -S. If we are not running a tool that involves calling ndo_get_stats64(), then we won't get updated ethtool stats. The solution to this is to update the stats from both call sites, factoring that into a specific function, While at it, don't just check the sizeof() but also the type of the statistics in order to use the 64-bit stats seqlock. Fixes: 10377ba7673d ("net: systemport: Support 64bit statistics") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 52 +++++++++++++--------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index c3c53f6cd9e6..83eec9a8c275 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -432,6 +432,27 @@ static void bcm_sysport_update_mib_counters(struct bcm_sysport_priv *priv) netif_dbg(priv, hw, priv->netdev, "updated MIB counters\n"); } +static void bcm_sysport_update_tx_stats(struct bcm_sysport_priv *priv, + u64 *tx_bytes, u64 *tx_packets) +{ + struct bcm_sysport_tx_ring *ring; + u64 bytes = 0, packets = 0; + unsigned int start; + unsigned int q; + + for (q = 0; q < priv->netdev->num_tx_queues; q++) { + ring = &priv->tx_rings[q]; + do { + start = u64_stats_fetch_begin_irq(&priv->syncp); + bytes = ring->bytes; + packets = ring->packets; + } while (u64_stats_fetch_retry_irq(&priv->syncp, start)); + + *tx_bytes += bytes; + *tx_packets += packets; + } +} + static void bcm_sysport_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { @@ -439,11 +460,16 @@ static void bcm_sysport_get_stats(struct net_device *dev, struct bcm_sysport_stats64 *stats64 = &priv->stats64; struct u64_stats_sync *syncp = &priv->syncp; struct bcm_sysport_tx_ring *ring; + u64 tx_bytes = 0, tx_packets = 0; unsigned int start; int i, j; - if (netif_running(dev)) + if (netif_running(dev)) { bcm_sysport_update_mib_counters(priv); + bcm_sysport_update_tx_stats(priv, &tx_bytes, &tx_packets); + stats64->tx_bytes = tx_bytes; + stats64->tx_packets = tx_packets; + } for (i = 0, j = 0; i < BCM_SYSPORT_STATS_LEN; i++) { const struct bcm_sysport_stats *s; @@ -461,12 +487,13 @@ static void bcm_sysport_get_stats(struct net_device *dev, continue; p += s->stat_offset; - if (s->stat_sizeof == sizeof(u64)) + if (s->stat_sizeof == sizeof(u64) && + s->type == BCM_SYSPORT_STAT_NETDEV64) { do { start = u64_stats_fetch_begin_irq(syncp); data[i] = *(u64 *)p; } while (u64_stats_fetch_retry_irq(syncp, start)); - else + } else data[i] = *(u32 *)p; j++; } @@ -1716,27 +1743,12 @@ static void bcm_sysport_get_stats64(struct net_device *dev, { struct bcm_sysport_priv *priv = netdev_priv(dev); struct bcm_sysport_stats64 *stats64 = &priv->stats64; - struct bcm_sysport_tx_ring *ring; - u64 tx_packets = 0, tx_bytes = 0; unsigned int start; - unsigned int q; netdev_stats_to_stats64(stats, &dev->stats); - for (q = 0; q < dev->num_tx_queues; q++) { - ring = &priv->tx_rings[q]; - do { - start = u64_stats_fetch_begin_irq(&priv->syncp); - tx_bytes = ring->bytes; - tx_packets = ring->packets; - } while (u64_stats_fetch_retry_irq(&priv->syncp, start)); - - stats->tx_bytes += tx_bytes; - stats->tx_packets += tx_packets; - } - - stats64->tx_bytes = stats->tx_bytes; - stats64->tx_packets = stats->tx_packets; + bcm_sysport_update_tx_stats(priv, &stats->tx_bytes, + &stats->tx_packets); do { start = u64_stats_fetch_begin_irq(&priv->syncp); From c8b850241559d6bad7e640075e282a83a4ad7ead Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 18 Sep 2017 12:25:22 +0200 Subject: [PATCH 0187/1322] s390/scm_blk: consistently use blk_status_t as error type Fix these warnings found by sparse: drivers/s390/block/scm_blk.c:257:24: warning: incorrect type in assignment (different base types) drivers/s390/block/scm_blk.c:257:24: expected int [signed] drivers/s390/block/scm_blk.c:257:24: got restricted blk_status_t [usertype] error drivers/s390/block/scm_blk.c:420:33: 
warning: incorrect type in argument 2 (different base types) drivers/s390/block/scm_blk.c:420:33: expected restricted blk_status_t [usertype] error drivers/s390/block/scm_blk.c:420:33: got int [signed] Signed-off-by: Sebastian Ott Reported-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 2e7fd966c515..eb51893c74a4 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -249,7 +249,7 @@ static void scm_request_requeue(struct scm_request *scmrq) static void scm_request_finish(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; - int *error; + blk_status_t *error; int i; for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) { @@ -415,7 +415,7 @@ void scm_blk_irq(struct scm_device *scmdev, void *data, blk_status_t error) static void scm_blk_request_done(struct request *req) { - int *error = blk_mq_rq_to_pdu(req); + blk_status_t *error = blk_mq_rq_to_pdu(req); blk_mq_end_request(req, *error); } @@ -450,7 +450,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) atomic_set(&bdev->queued_reqs, 0); bdev->tag_set.ops = &scm_mq_ops; - bdev->tag_set.cmd_size = sizeof(int); + bdev->tag_set.cmd_size = sizeof(blk_status_t); bdev->tag_set.nr_hw_queues = nr_requests; bdev->tag_set.queue_depth = nr_requests_per_io * nr_requests; bdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; From 55fb7347579017bdb09550b4e8019447340b7004 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 14 Sep 2017 13:55:22 +0200 Subject: [PATCH 0188/1322] s390/cio: recover from bad paths In some situations we don't receive notification from firmware that a previously unusable channelpath is usable again. Schedule recovery for devices that return from path verification without using all potentially usable paths. The recovery thread will periodically trigger a path verification on the affected devices. 
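Put differently, a path counts as broken when it is physically available and logically enabled yet did not end up in the mask of paths actually in use after verification. A minimal stand-alone sketch of that bitmask test (illustration only; in the driver change below the masks come from the subchannel's pmcw/schib fields):

    static u8 broken_paths(u8 pam, u8 opm, u8 vpm)
    {
    	/* usable per hardware (pam) and software (opm), but not used (vpm) */
    	return (pam & opm) ^ vpm;
    }

Recovery is only scheduled when this mask is non-zero and differs from the mask recorded at the previous verification, so a path that stays broken does not retrigger recovery forever.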
Signed-off-by: Sebastian Ott Suggested-by: Peter Oberparleiter Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 12 +++++++++--- drivers/s390/cio/device.h | 1 + drivers/s390/cio/device_fsm.c | 12 ++++++++++++ drivers/s390/cio/io_sch.h | 2 ++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 489b583f263d..e5c32f4b5287 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1225,10 +1225,16 @@ static int device_is_disconnected(struct ccw_device *cdev) static int recovery_check(struct device *dev, void *data) { struct ccw_device *cdev = to_ccwdev(dev); + struct subchannel *sch; int *redo = data; spin_lock_irq(cdev->ccwlock); switch (cdev->private->state) { + case DEV_STATE_ONLINE: + sch = to_subchannel(cdev->dev.parent); + if ((sch->schib.pmcw.pam & sch->opm) == sch->vpm) + break; + /* fall through */ case DEV_STATE_DISCONNECTED: CIO_MSG_EVENT(3, "recovery: trigger 0.%x.%04x\n", cdev->private->dev_id.ssid, @@ -1260,7 +1266,7 @@ static void recovery_work_func(struct work_struct *unused) } spin_unlock_irq(&recovery_lock); } else - CIO_MSG_EVENT(4, "recovery: end\n"); + CIO_MSG_EVENT(3, "recovery: end\n"); } static DECLARE_WORK(recovery_work, recovery_work_func); @@ -1274,11 +1280,11 @@ static void recovery_func(unsigned long data) schedule_work(&recovery_work); } -static void ccw_device_schedule_recovery(void) +void ccw_device_schedule_recovery(void) { unsigned long flags; - CIO_MSG_EVENT(4, "recovery: schedule\n"); + CIO_MSG_EVENT(3, "recovery: schedule\n"); spin_lock_irqsave(&recovery_lock, flags); if (!timer_pending(&recovery_timer) || (recovery_phase != 0)) { recovery_phase = 0; diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h index ec497af99dd8..69cb70f080a5 100644 --- a/drivers/s390/cio/device.h +++ b/drivers/s390/cio/device.h @@ -134,6 +134,7 @@ void ccw_device_set_disconnected(struct ccw_device *cdev); void ccw_device_set_notoper(struct ccw_device *cdev); void ccw_device_set_timeout(struct ccw_device *, int); +void ccw_device_schedule_recovery(void); /* Channel measurement facility related */ void retry_set_schib(struct ccw_device *cdev); diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 12016e32e519..f98ea674c3d8 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -476,6 +476,17 @@ static void create_fake_irb(struct irb *irb, int type) } } +static void ccw_device_handle_broken_paths(struct ccw_device *cdev) +{ + struct subchannel *sch = to_subchannel(cdev->dev.parent); + u8 broken_paths = (sch->schib.pmcw.pam & sch->opm) ^ sch->vpm; + + if (broken_paths && (cdev->private->path_broken_mask != broken_paths)) + ccw_device_schedule_recovery(); + + cdev->private->path_broken_mask = broken_paths; +} + void ccw_device_verify_done(struct ccw_device *cdev, int err) { struct subchannel *sch; @@ -508,6 +519,7 @@ callback: memset(&cdev->private->irb, 0, sizeof(struct irb)); } ccw_device_report_path_events(cdev); + ccw_device_handle_broken_paths(cdev); break; case -ETIME: case -EUSERS: diff --git a/drivers/s390/cio/io_sch.h b/drivers/s390/cio/io_sch.h index 220f49145b2f..9a1b56b2df3e 100644 --- a/drivers/s390/cio/io_sch.h +++ b/drivers/s390/cio/io_sch.h @@ -131,6 +131,8 @@ struct ccw_device_private { not operable */ u8 path_gone_mask; /* mask of paths, that became unavailable */ u8 path_new_mask; /* mask of paths, that became available */ + u8 path_broken_mask; /* mask of paths, 
which were found to be + unusable */ struct { unsigned int fast:1; /* post with "channel end" */ unsigned int repall:1; /* report every interrupt status */ From 91c575b335766effa6103eba42a82aea560c365f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 18 Sep 2017 16:10:35 +0200 Subject: [PATCH 0189/1322] s390/mm: make pmdp_invalidate() do invalidation only Commit 227be799c39a ("s390/mm: uninline pmdp_xxx functions from pgtable.h") inadvertently changed the behavior of pmdp_invalidate(), so that it now clears the pmd instead of just marking it as invalid. Fix this by restoring the original behavior. A possible impact of the misbehaving pmdp_invalidate() would be the MADV_DONTNEED races (see commits ced10803 and 58ceeb6b), although we should not have any negative impact on the related dirty/young flags, since those flags are not set by the hardware on s390. Fixes: 227be799c39a ("s390/mm: uninline pmdp_xxx functions from pgtable.h") Cc: # v4.6+ Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index dce708e061ea..20e75a2ca93a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1507,7 +1507,9 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, static inline void pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + + pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT From ba385c0594e723d41790ecfb12c610e6f90c7785 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 18 Sep 2017 16:51:51 +0200 Subject: [PATCH 0190/1322] s390/mm: fix write access check in gup_huge_pmd() The check for the _SEGMENT_ENTRY_PROTECT bit in gup_huge_pmd() is the wrong way around. It must not be set for write==1, and not be checked for write==0. Fix this similar to how it was fixed for ptes long time ago in commit 25591b070336 ("[S390] fix get_user_pages_fast"). One impact of this bug would be unnecessarily using the gup slow path for write==0 on r/w mappings. A potentially more severe impact would be that gup_huge_pmd() will succeed for write==1 on r/o mappings. Cc: Signed-off-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 8ecc25e760fa..98ffe3ee9411 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -56,13 +56,12 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask, result; struct page *head, *page; + unsigned long mask; int refs; - result = write ? 0 : _SEGMENT_ENTRY_PROTECT; - mask = result | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != result) + mask = (write ? 
_SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; + if ((pmd_val(pmd) & mask) != 0) return 0; VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); From 95e2a3b3ef177730019e3799917193595133b275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Sat, 16 Sep 2017 22:12:24 +0200 Subject: [PATCH 0191/1322] Revert "KVM: Don't accept obviously wrong gsi values via KVM_IRQFD" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 36ae3c0a36b7456432fedce38ae2f7bd3e01a563. The commit broke compilation on !CONFIG_HAVE_KVM_IRQ_ROUTING. Also, there may be cases with CONFIG_HAVE_KVM_IRQ_ROUTING, where larger gsi values make sense. As the commit was meant as an early indicator to user space that something is wrong, reverting just restores the previous behavior where overly large values are ignored when encountered (without any direct feedback). Reported-by: Abdul Haleem Signed-off-by: Jan H. Schönherr Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- virt/kvm/eventfd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c608ab495282..f2ac53ab8243 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -565,8 +565,6 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; - if (args->gsi >= KVM_MAX_IRQ_ROUTES) - return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) return kvm_irqfd_deassign(kvm, args); From 55e001aabb826c96f09e0a440bdcbce620189dbc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 12:17:36 +0200 Subject: [PATCH 0192/1322] fpga: altera-cvp: remove DRIVER_ATTR() usage It's better to be explicit and use the DRIVER_ATTR_RW() macro when defining a driver's sysfs file. This is part of a series to drop DRIVER_ATTR() from the tree entirely. Cc: linux-fpga@vger.kernel.org Reviewed-by: Moritz Fischer Acked-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/altera-cvp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/fpga/altera-cvp.c b/drivers/fpga/altera-cvp.c index 08629ee69d11..00e73d28077c 100644 --- a/drivers/fpga/altera-cvp.c +++ b/drivers/fpga/altera-cvp.c @@ -361,12 +361,12 @@ static const struct fpga_manager_ops altera_cvp_ops = { .write_complete = altera_cvp_write_complete, }; -static ssize_t show_chkcfg(struct device_driver *dev, char *buf) +static ssize_t chkcfg_show(struct device_driver *dev, char *buf) { return snprintf(buf, 3, "%d\n", altera_cvp_chkcfg); } -static ssize_t store_chkcfg(struct device_driver *drv, const char *buf, +static ssize_t chkcfg_store(struct device_driver *drv, const char *buf, size_t count) { int ret; @@ -378,7 +378,7 @@ static ssize_t store_chkcfg(struct device_driver *drv, const char *buf, return count; } -static DRIVER_ATTR(chkcfg, 0600, show_chkcfg, store_chkcfg); +static DRIVER_ATTR_RW(chkcfg); static int altera_cvp_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id); From 850fdec8d2fd1eebfa003fea39bec08cd69b6155 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 18 Sep 2017 12:17:57 +0200 Subject: [PATCH 0193/1322] driver core: remove DRIVER_ATTR DRIVER_ATTR is no longer in use, and driver authors should be using DRIVER_ATTR_RW() or DRIVER_ATTR_RO() or DRIVER_ATTR_WO() instead in order to always get the permissions correct. So remove it so that no one can use it anymore. 
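For drivers that still carry the old macro, the conversion is mechanical. A sketch of the expected pattern (the debug attribute and debug_level variable are placeholders), relying on the <name>_show()/<name>_store() naming that the _RW/_RO/_WO macros assume:

    /* before (no longer available):
     *	static DRIVER_ATTR(debug, 0644, show_debug, store_debug);
     */
    static int debug_level;		/* example driver state */

    static ssize_t debug_show(struct device_driver *drv, char *buf)
    {
    	return sprintf(buf, "%d\n", debug_level);
    }

    static ssize_t debug_store(struct device_driver *drv, const char *buf,
    			   size_t count)
    {
    	if (kstrtoint(buf, 0, &debug_level))
    		return -EINVAL;
    	return count;
    }

    static DRIVER_ATTR_RW(debug);	/* defines driver_attr_debug with 0644 permissions */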
Acked-by: Alan Tull Reviewed-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-model/driver.txt | 7 ++++--- Documentation/filesystems/sysfs.txt | 3 ++- include/linux/device.h | 2 -- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt index 4421135826a2..d661e6f7e6a0 100644 --- a/Documentation/driver-model/driver.txt +++ b/Documentation/driver-model/driver.txt @@ -196,12 +196,13 @@ struct driver_attribute { }; Device drivers can export attributes via their sysfs directories. -Drivers can declare attributes using a DRIVER_ATTR macro that works -identically to the DEVICE_ATTR macro. +Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO +macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO +macros. Example: -DRIVER_ATTR(debug,0644,show_debug,store_debug); +DRIVER_ATTR_RW(debug); This is equivalent to declaring: diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 24da7b32c489..9a3658cc399e 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -366,7 +366,8 @@ struct driver_attribute { Declaring: -DRIVER_ATTR(_name, _mode, _show, _store) +DRIVER_ATTR_RO(_name) +DRIVER_ATTR_RW(_name) Creation/Removal: diff --git a/include/linux/device.h b/include/linux/device.h index c6f27207dbe8..2bc70ddda09b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -307,8 +307,6 @@ struct driver_attribute { size_t count); }; -#define DRIVER_ATTR(_name, _mode, _show, _store) \ - struct driver_attribute driver_attr_##_name = __ATTR(_name, _mode, _show, _store) #define DRIVER_ATTR_RW(_name) \ struct driver_attribute driver_attr_##_name = __ATTR_RW(_name) #define DRIVER_ATTR_RO(_name) \ From a4aaeccc7a91e24cc17f72a7eb2f586f5c3d811d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 10 Sep 2017 13:43:37 -0700 Subject: [PATCH 0194/1322] iommu: Add missing dependencies parisc:allmodconfig, xtensa:allmodconfig, and possibly others generate the following Kconfig warning. warning: (IPMMU_VMSA && ARM_SMMU && ARM_SMMU_V3 && QCOM_IOMMU) selects IOMMU_IO_PGTABLE_LPAE which has unmet direct dependencies (IOMMU_SUPPORT && HAS_DMA && (ARM || ARM64 || COMPILE_TEST && !GENERIC_ATOMIC64)) IOMMU_IO_PGTABLE_LPAE depends on (COMPILE_TEST && !GENERIC_ATOMIC64), so any configuration option selecting it needs to have the same dependencies. Signed-off-by: Guenter Roeck Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 49bd2ab8c507..8530ef78925d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -278,7 +278,7 @@ config EXYNOS_IOMMU_DEBUG config IPMMU_VMSA bool "Renesas VMSA-compatible IPMMU" depends on ARM || IOMMU_DMA - depends on ARCH_RENESAS || COMPILE_TEST + depends on ARCH_RENESAS || (COMPILE_TEST && !GENERIC_ATOMIC64) select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU @@ -373,7 +373,7 @@ config MTK_IOMMU_V1 config QCOM_IOMMU # Note: iommu drivers cannot (yet?) 
be built as modules bool "Qualcomm IOMMU Support" - depends on ARCH_QCOM || COMPILE_TEST + depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64) select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU From 3bd71e18c5803c23014df962bdd74af1b34697fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 22:10:21 +0200 Subject: [PATCH 0195/1322] iommu/vt-d: Fix harmless section mismatch warning Building with gcc-4.6 results in this warning due to dmar_table_print_dmar_entry being inlined as in newer compiler versions: WARNING: vmlinux.o(.text+0x5c8bee): Section mismatch in reference from the function dmar_walk_remapping_entries() to the function .init.text:dmar_table_print_dmar_entry() The function dmar_walk_remapping_entries() references the function __init dmar_table_print_dmar_entry(). This is often because dmar_walk_remapping_entries lacks a __init annotation or the annotation of dmar_table_print_dmar_entry is wrong. This removes the __init annotation to avoid the warning. On compilers that don't show the warning today, this should have no impact since the function gets inlined anyway. Signed-off-by: Arnd Bergmann Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index ca5ebaeafd6a..57c920c1372d 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -497,7 +497,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) #define dmar_parse_one_rhsa dmar_res_noop #endif -static void __init +static void dmar_table_print_dmar_entry(struct acpi_dmar_header *header) { struct acpi_dmar_hardware_unit *drhd; From 5baf6bb0fd2388742a0846cc7bcacee6dec78235 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 14 Sep 2017 14:01:00 +0200 Subject: [PATCH 0196/1322] drm/exynos: Fix locking in the suspend/resume paths Commit 48a92916729b ("drm/exynos: use drm_for_each_connector_iter()") replaced unsafe drm_for_each_connector() with drm_for_each_connector_iter() and removed surrounding drm_modeset_lock calls. However, that lock was there not only to protect unsafe drm_for_each_connector(), but it was also required to be held by the dpms code which was called from the loop body. This patch restores those drm_modeset_lock calls to fix broken suspend and resume of Exynos DRM subsystem in v4.13 kernel. 
Fixes: 48a92916729b ("drm/exynos: use drm_for_each_connector_iter()") CC: stable@vger.kernel.org # v4.13 Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index b1f7299600f0..7f3cfc5dd320 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -174,6 +174,7 @@ static int exynos_drm_suspend(struct device *dev) if (pm_runtime_suspended(dev) || !drm_dev) return 0; + drm_modeset_lock_all(drm_dev); drm_connector_list_iter_begin(drm_dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { int old_dpms = connector->dpms; @@ -185,6 +186,7 @@ static int exynos_drm_suspend(struct device *dev) connector->dpms = old_dpms; } drm_connector_list_iter_end(&conn_iter); + drm_modeset_unlock_all(drm_dev); return 0; } @@ -198,6 +200,7 @@ static int exynos_drm_resume(struct device *dev) if (pm_runtime_suspended(dev) || !drm_dev) return 0; + drm_modeset_lock_all(drm_dev); drm_connector_list_iter_begin(drm_dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->funcs->dpms) { @@ -208,6 +211,7 @@ static int exynos_drm_resume(struct device *dev) } } drm_connector_list_iter_end(&conn_iter); + drm_modeset_unlock_all(drm_dev); return 0; } From 6e8edf8a7d8d98e1734ff41e5c69439419319402 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 14 Sep 2017 14:01:01 +0200 Subject: [PATCH 0197/1322] drm/exynos: Fix suspend/resume support Commit 7d902c05b480 ("drm: Nuke drm_atomic_helper_connector_dpms") removed drm_atomic_helper_connector_dpms() helper saying that it was a dead code. It was however indirectly used by Exynos DRM driver for implementing suspend/resume support. To fix this regression (after that patch Exynos DRM suspend/resume functions became no-ops and hardware fails to suspend), this patch rewrites them with drm_atomic_helper_suspend/resume() helpers. 
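For context, the general shape of the helper-based suspend/resume path looks roughly like the sketch below (condensed; the foo_* names and the place where the atomic state is stashed are placeholders, while the driver change below keeps the state in its private struct):

    static struct drm_atomic_state *foo_suspend_state;

    static int foo_drm_suspend(struct device *dev)
    {
    	struct drm_device *drm = dev_get_drvdata(dev);

    	drm_kms_helper_poll_disable(drm);
    	foo_suspend_state = drm_atomic_helper_suspend(drm);
    	if (IS_ERR(foo_suspend_state)) {
    		drm_kms_helper_poll_enable(drm);
    		return PTR_ERR(foo_suspend_state);
    	}
    	return 0;
    }

    static int foo_drm_resume(struct device *dev)
    {
    	struct drm_device *drm = dev_get_drvdata(dev);

    	drm_atomic_helper_resume(drm, foo_suspend_state);
    	drm_kms_helper_poll_enable(drm);
    	return 0;
    }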
Fixes: 7d902c05b480 ("drm: Nuke drm_atomic_helper_connector_dpms") Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 40 +++++++---------------- drivers/gpu/drm/exynos/exynos_drm_drv.h | 1 + drivers/gpu/drm/exynos/exynos_drm_fbdev.c | 20 ++++++++++++ drivers/gpu/drm/exynos/exynos_drm_fbdev.h | 10 ++++++ 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index 7f3cfc5dd320..e651a58c18cf 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -168,25 +168,19 @@ static struct drm_driver exynos_drm_driver = { static int exynos_drm_suspend(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct drm_connector *connector; - struct drm_connector_list_iter conn_iter; + struct exynos_drm_private *private = drm_dev->dev_private; if (pm_runtime_suspended(dev) || !drm_dev) return 0; - drm_modeset_lock_all(drm_dev); - drm_connector_list_iter_begin(drm_dev, &conn_iter); - drm_for_each_connector_iter(connector, &conn_iter) { - int old_dpms = connector->dpms; - - if (connector->funcs->dpms) - connector->funcs->dpms(connector, DRM_MODE_DPMS_OFF); - - /* Set the old mode back to the connector for resume */ - connector->dpms = old_dpms; + drm_kms_helper_poll_disable(drm_dev); + exynos_drm_fbdev_suspend(drm_dev); + private->suspend_state = drm_atomic_helper_suspend(drm_dev); + if (IS_ERR(private->suspend_state)) { + exynos_drm_fbdev_resume(drm_dev); + drm_kms_helper_poll_enable(drm_dev); + return PTR_ERR(private->suspend_state); } - drm_connector_list_iter_end(&conn_iter); - drm_modeset_unlock_all(drm_dev); return 0; } @@ -194,24 +188,14 @@ static int exynos_drm_suspend(struct device *dev) static int exynos_drm_resume(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct drm_connector *connector; - struct drm_connector_list_iter conn_iter; + struct exynos_drm_private *private = drm_dev->dev_private; if (pm_runtime_suspended(dev) || !drm_dev) return 0; - drm_modeset_lock_all(drm_dev); - drm_connector_list_iter_begin(drm_dev, &conn_iter); - drm_for_each_connector_iter(connector, &conn_iter) { - if (connector->funcs->dpms) { - int dpms = connector->dpms; - - connector->dpms = DRM_MODE_DPMS_OFF; - connector->funcs->dpms(connector, dpms); - } - } - drm_connector_list_iter_end(&conn_iter); - drm_modeset_unlock_all(drm_dev); + drm_atomic_helper_resume(drm_dev, private->suspend_state); + exynos_drm_fbdev_resume(drm_dev); + drm_kms_helper_poll_enable(drm_dev); return 0; } diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.h b/drivers/gpu/drm/exynos/exynos_drm_drv.h index cf131c2aa23e..f8bae4cb4823 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.h +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.h @@ -202,6 +202,7 @@ struct drm_exynos_file_private { */ struct exynos_drm_private { struct drm_fb_helper *fb_helper; + struct drm_atomic_state *suspend_state; struct device *dma_dev; void *mapping; diff --git a/drivers/gpu/drm/exynos/exynos_drm_fbdev.c b/drivers/gpu/drm/exynos/exynos_drm_fbdev.c index c3a068409b48..dfb66ecf417b 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fbdev.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fbdev.c @@ -18,6 +18,8 @@ #include #include +#include + #include "exynos_drm_drv.h" #include "exynos_drm_fb.h" #include "exynos_drm_fbdev.h" @@ -285,3 +287,21 @@ void exynos_drm_output_poll_changed(struct drm_device *dev) 
drm_fb_helper_hotplug_event(fb_helper); } + +void exynos_drm_fbdev_suspend(struct drm_device *dev) +{ + struct exynos_drm_private *private = dev->dev_private; + + console_lock(); + drm_fb_helper_set_suspend(private->fb_helper, 1); + console_unlock(); +} + +void exynos_drm_fbdev_resume(struct drm_device *dev) +{ + struct exynos_drm_private *private = dev->dev_private; + + console_lock(); + drm_fb_helper_set_suspend(private->fb_helper, 0); + console_unlock(); +} diff --git a/drivers/gpu/drm/exynos/exynos_drm_fbdev.h b/drivers/gpu/drm/exynos/exynos_drm_fbdev.h index 330eef87f718..645d1bb7f665 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fbdev.h +++ b/drivers/gpu/drm/exynos/exynos_drm_fbdev.h @@ -21,6 +21,8 @@ int exynos_drm_fbdev_init(struct drm_device *dev); void exynos_drm_fbdev_fini(struct drm_device *dev); void exynos_drm_fbdev_restore_mode(struct drm_device *dev); void exynos_drm_output_poll_changed(struct drm_device *dev); +void exynos_drm_fbdev_suspend(struct drm_device *drm); +void exynos_drm_fbdev_resume(struct drm_device *drm); #else @@ -39,6 +41,14 @@ static inline void exynos_drm_fbdev_restore_mode(struct drm_device *dev) #define exynos_drm_output_poll_changed (NULL) +static inline void exynos_drm_fbdev_suspend(struct drm_device *drm) +{ +} + +static inline void exynos_drm_fbdev_resume(struct drm_device *drm) +{ +} + #endif #endif From 9ac30ef6d8ec3533d0feec53e268c4fa32ea700c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:19:55 +0200 Subject: [PATCH 0198/1322] drm: exynos: include linux/irq.h I ran into a build error on x86: drivers/gpu/drm/exynos/exynos5433_drm_decon.c: In function 'decon_conf_irq': drivers/gpu/drm/exynos/exynos5433_drm_decon.c:706:2: error: implicit declaration of function 'irq_set_status_flags'; did you mean 'dquot_state_flag'? [-Werror=implicit-function-declaration] irq_set_status_flags(irq, IRQ_NOAUTOEN); Adding the missing include fixes the error. Fixes: b37d53a0382c ("drm/exynos/decon5433: move TE handling to DECON") Signed-off-by: Arnd Bergmann Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos5433_drm_decon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c index 730b8d9db187..6be5b53c3b27 100644 --- a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include From d6500149bc4fddc5a91cd1a0c31b38fa36bff3ee Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Mon, 18 Sep 2017 18:45:01 +0800 Subject: [PATCH 0199/1322] KVM: x86: Fix the NULL pointer parameter in check_cr_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Routine check_cr_write() will trigger emulator_get_cpuid()-> kvm_cpuid() to get maxphyaddr, and NULL is passed as values for ebx/ecx/edx. This is problematic because kvm_cpuid() will dereference these pointers. 
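The rule being violated is a generic one: a CPUID-style helper that unconditionally writes through all four output pointers must never be handed NULL, so callers should pass throwaway locals for outputs they do not care about. A trivial illustration (not the KVM code itself):

    static void cpuid_like(unsigned int *eax, unsigned int *ebx,
    		       unsigned int *ecx, unsigned int *edx)
    {
    	*eax = *ebx = *ecx = *edx = 0;	/* crashes if any pointer is NULL */
    }

    static unsigned int query_maxphyaddr(void)
    {
    	unsigned int eax = 0x80000008, ebx, ecx = 0, edx;

    	cpuid_like(&eax, &ebx, &ecx, &edx);	/* locals, never NULL */
    	return eax & 0xff;
    }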
Fixes: d1cd3ce90044 ("KVM: MMU: check guest CR3 reserved bits based on its physical address width.") Reported-by: Jim Mattson Signed-off-by: Yu Zhang Reviewed-by: David Hildenbrand Reviewed-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/kvm/emulate.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 16bf6655aa85..15f527b44aa7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4102,10 +4102,12 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if (efer & EFER_LMA) { u64 maxphyaddr; - u32 eax = 0x80000008; + u32 eax, ebx, ecx, edx; - if (ctxt->ops->get_cpuid(ctxt, &eax, NULL, NULL, - NULL, false)) + eax = 0x80000008; + ecx = 0; + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, + &edx, false)) maxphyaddr = eax & 0xff; else maxphyaddr = 36; From dc91f2eb1a4021eb6705c15e474942f84ab9b211 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:49 +0800 Subject: [PATCH 0200/1322] KVM: VMX: do not change SN bit in vmx_update_pi_irte() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In kvm_vcpu_trigger_posted_interrupt() and pi_pre_block(), KVM assumes that PI notification events should not be suppressed when the target vCPU is not blocked. vmx_update_pi_irte() sets the SN field before changing an interrupt from posting to remapping, but it does not check the vCPU mode. Therefore, the change of SN field may break above the assumption. Besides, I don't see reasons to suppress notification events here, so remove the changes of SN field to avoid race condition. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06c0c6d0541e..7328c8c0ea3b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11911,12 +11911,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", From 5753743fa5108b8f98bd61e40dc63f641b26c768 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:50 +0800 Subject: [PATCH 0201/1322] KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)) in kvm_vcpu_trigger_posted_interrupt() intends to detect the violation of invariant that VT-d PI notification event is not suppressed when vcpu is in the guest mode. Because the two checks for the target vcpu mode and the target suppress field cannot be performed atomically, the target vcpu mode may change in between. If that does happen, WARN_ON_ONCE() here may raise false alarms. As the previous patch fixed the real invariant breaker, remove this WARN_ON_ONCE() to avoid false alarms, and document the allowed cases instead. 
Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7328c8c0ea3b..0726ca7a1b02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5077,21 +5077,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. + * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; From 0555ac4333d7da7cff83d5b06bd3679a3e1ef584 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Mon, 18 Sep 2017 16:35:32 -0600 Subject: [PATCH 0202/1322] xen, arm64: drop dummy lookup_address() This is unused, and conflicts with the definition that we'll add for XPFO. Signed-off-by: Tycho Andersen Reviewed-by: Julien Grall CC: Boris Ostrovsky CC: Juergen Gross CC: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- include/xen/arm/page.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/xen/arm/page.h b/include/xen/arm/page.h index 415dbc6e43fd..6adc2a955340 100644 --- a/include/xen/arm/page.h +++ b/include/xen/arm/page.h @@ -84,16 +84,6 @@ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) BUG(); } -/* TODO: this shouldn't be here but it is because the frontend drivers - * are using it (its rolled in headers) even though we won't hit the code path. - * So for right now just punt with this. 
- */ -static inline pte_t *lookup_address(unsigned long address, unsigned int *level) -{ - BUG(); - return NULL; -} - extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); From 986a5f70173626eea9863fee6d6e029f0f2bc361 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:34:34 +0200 Subject: [PATCH 0203/1322] iommu/qcom: Depend on HAS_DMA to fix compile error If NO_DMA=y: warning: (IPMMU_VMSA && ARM_SMMU && ARM_SMMU_V3 && QCOM_IOMMU) selects IOMMU_IO_PGTABLE_LPAE which has unmet direct dependencies (IOMMU_SUPPORT && HAS_DMA && (ARM || ARM64 || COMPILE_TEST && !GENERIC_ATOMIC64)) and drivers/iommu/io-pgtable-arm.o: In function `__arm_lpae_sync_pte': io-pgtable-arm.c:(.text+0x206): undefined reference to `bad_dma_ops' drivers/iommu/io-pgtable-arm.o: In function `__arm_lpae_free_pages': io-pgtable-arm.c:(.text+0x6a6): undefined reference to `bad_dma_ops' drivers/iommu/io-pgtable-arm.o: In function `__arm_lpae_alloc_pages': io-pgtable-arm.c:(.text+0x812): undefined reference to `bad_dma_ops' io-pgtable-arm.c:(.text+0x81c): undefined reference to `bad_dma_ops' io-pgtable-arm.c:(.text+0x862): undefined reference to `bad_dma_ops' drivers/iommu/io-pgtable-arm.o: In function `arm_lpae_run_tests': io-pgtable-arm.c:(.init.text+0x86): undefined reference to `alloc_io_pgtable_ops' io-pgtable-arm.c:(.init.text+0x47c): undefined reference to `free_io_pgtable_ops' drivers/iommu/qcom_iommu.o: In function `qcom_iommu_init_domain': qcom_iommu.c:(.text+0x1ce): undefined reference to `alloc_io_pgtable_ops' drivers/iommu/qcom_iommu.o: In function `qcom_iommu_domain_free': qcom_iommu.c:(.text+0x754): undefined reference to `free_io_pgtable_ops' QCOM_IOMMU selects IOMMU_IO_PGTABLE_LPAE, which bypasses its dependency on HAS_DMA. Make QCOM_IOMMU depend on HAS_DMA to fix this. Fixes: 0ae349a0f33fb040 ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 8530ef78925d..f3a21343e636 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -374,6 +374,7 @@ config QCOM_IOMMU # Note: iommu drivers cannot (yet?) be built as modules bool "Qualcomm IOMMU Support" depends on ARCH_QCOM || (COMPILE_TEST && !GENERIC_ATOMIC64) + depends on HAS_DMA select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select ARM_DMA_USE_IOMMU From d03d5d530a46c9def981a2a4ec98f6ec2e9b7878 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 8 Jul 2017 20:16:29 +0100 Subject: [PATCH 0204/1322] MAINTAINERS: add Macchiatobin maintainers entry Add a maintainers entry for the Macchiatobin board. 
Signed-off-by: Russell King Signed-off-by: Gregory CLEMENT --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..531ba1acae3e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8253,6 +8253,12 @@ L: libertas-dev@lists.infradead.org S: Orphan F: drivers/net/wireless/marvell/libertas/ +MARVELL MACCHIATOBIN SUPPORT +M: Russell King +L: linux-arm-kernel@lists.infradead.org +S: Maintained +F: arch/arm64/boot/dts/marvell/armada-8040-mcbin.dts + MARVELL MV643XX ETHERNET DRIVER M: Sebastian Hesselbarth L: netdev@vger.kernel.org From 9e7460fc325dad06d2066abdbc1f4dd49456f9a4 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Fri, 15 Sep 2017 10:50:07 +0300 Subject: [PATCH 0205/1322] arm64: dt marvell: Fix AP806 system controller size Extend the container size to 0x2000 to include the gpio controller at offset 0x1040. While at it, add start address notation to the gpio node name to match its 'offset' property. Fixes: 63dac0f4924b ("arm64: dts: marvell: add gpio support for Armada 7K/8K") Cc: Signed-off-by: Baruch Siach Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-ap806.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi index 4d360713ed12..30d48ecf46e0 100644 --- a/arch/arm64/boot/dts/marvell/armada-ap806.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-ap806.dtsi @@ -254,7 +254,7 @@ ap_syscon: system-controller@6f4000 { compatible = "syscon", "simple-mfd"; - reg = <0x6f4000 0x1000>; + reg = <0x6f4000 0x2000>; ap_clk: clock { compatible = "marvell,ap806-clock"; @@ -265,7 +265,7 @@ compatible = "marvell,ap806-pinctrl"; }; - ap_gpio: gpio { + ap_gpio: gpio@1040 { compatible = "marvell,armada-8k-gpio"; offset = <0x1040>; ngpios = <20>; From bd7a3fe770ebd8391d1c7d072ff88e9e76d063eb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Sep 2017 15:07:17 +0200 Subject: [PATCH 0206/1322] USB: fix out-of-bounds in usb_set_configuration Andrey Konovalov reported a possible out-of-bounds problem for a USB interface association descriptor. He writes: It seems there's no proper size check of a USB_DT_INTERFACE_ASSOCIATION descriptor. It's only checked that the size is >= 2 in usb_parse_configuration(), so find_iad() might do out-of-bounds access to intf_assoc->bInterfaceCount. And he's right, we don't check for crazy descriptors of this type very well, so resolve this problem. Yet another issue found by syzkaller... 
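The broader lesson for descriptor parsing is that bLength is device-controlled and must be checked against both the remaining buffer and the size of the structure it is about to be cast to. A simplified, self-contained sketch of that pattern (not the usb_parse_configuration() code itself; the USB_DT_* constants are the ones from linux/usb/ch9.h, including the size macro this patch introduces):

    static void walk_descriptors(const unsigned char *buf, size_t buflen)
    {
    	const unsigned char *p = buf, *end = buf + buflen;

    	while (end - p >= 2) {		/* need bLength and bDescriptorType */
    		unsigned char len  = p[0];
    		unsigned char type = p[1];

    		if (len < 2 || len > end - p)
    			break;		/* malformed descriptor: stop parsing */

    		if (type == USB_DT_INTERFACE_ASSOCIATION &&
    		    len >= USB_DT_INTERFACE_ASSOCIATION_SIZE) {
    			/* only now is it safe to cast and read bInterfaceCount */
    		}
    		p += len;
    	}
    }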
Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 14 +++++++++++--- include/uapi/linux/usb/ch9.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 854c8d66cfbe..68b54bd88d1e 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -643,15 +643,23 @@ static int usb_parse_configuration(struct usb_device *dev, int cfgidx, } else if (header->bDescriptorType == USB_DT_INTERFACE_ASSOCIATION) { + struct usb_interface_assoc_descriptor *d; + + d = (struct usb_interface_assoc_descriptor *)header; + if (d->bLength < USB_DT_INTERFACE_ASSOCIATION_SIZE) { + dev_warn(ddev, + "config %d has an invalid interface association descriptor of length %d, skipping\n", + cfgno, d->bLength); + continue; + } + if (iad_num == USB_MAXIADS) { dev_warn(ddev, "found more Interface " "Association Descriptors " "than allocated for in " "configuration %d\n", cfgno); } else { - config->intf_assoc[iad_num] = - (struct usb_interface_assoc_descriptor - *)header; + config->intf_assoc[iad_num] = d; iad_num++; } diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index ce1169af39d7..2a5d63040a0b 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -780,6 +780,7 @@ struct usb_interface_assoc_descriptor { __u8 iFunction; } __attribute__ ((packed)); +#define USB_DT_INTERFACE_ASSOCIATION_SIZE 8 /*-------------------------------------------------------------------------*/ From 8dd33bcb7050dd6f8c1432732f930932c9d3a33e Mon Sep 17 00:00:00 2001 From: Bo Yan Date: Mon, 18 Sep 2017 10:03:35 -0700 Subject: [PATCH 0207/1322] tracing: Erase irqsoff trace with empty write One convenient way to erase the trace is "echo > trace". However, this is currently broken if the current tracer is the irqsoff tracer. This is because the irqsoff tracer uses max_buffer as the default trace buffer. Set the max_buffer as the one to be cleared when it's the trace buffer currently in use. Link: http://lkml.kernel.org/r/1505754215-29411-1-git-send-email-byan@nvidia.com Cc: Cc: stable@vger.kernel.org Fixes: 4acd4d00f ("tracing: give easy way to clear trace buffer") Signed-off-by: Bo Yan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5360b7aec57a..a7fb136da891 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { From c7b3ae0bd2ca658c7a71c49901d08c590294fac9 Mon Sep 17 00:00:00 2001 From: "Ziqian SUN (Zamir)" Date: Mon, 11 Sep 2017 14:26:35 +0800 Subject: [PATCH 0208/1322] tracing: Ignore mmiotrace from kernel commandline The mmiotrace tracer cannot be enabled with ftrace=mmiotrace on the kernel command line.
With this patch, noboot is added to the tracer struct, and when the system boots with a tracer that has noboot=true, it will print out a warning message and continue booting. Link: http://lkml.kernel.org/r/1505111195-31942-1-git-send-email-zsun@redhat.com Signed-off-by: Ziqian SUN (Zamir) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 7 +++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_mmiotrace.c | 1 + 3 files changed, 10 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a7fb136da891..d3ca35f38803 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5364,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && t->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + t->name); + goto out; + } + /* Some tracers are only allowed for the top level buffer */ if (!trace_ok_for_array(t, tr)) { ret = -EINVAL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fb5d54d0d1b3..652c682707cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,8 @@ struct tracer { #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif + /* True if tracer cannot be enabled in kernel param */ + bool noboot; }; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index cd7480d0a201..dca78fc48439 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly = .close = mmio_close, .read = mmio_read, .print_line = mmio_print_line, + .noboot = true, }; __init static int init_mmio_trace(void) From aa767cfb75341a6b93742f4d7d1589ac2532c29e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Sep 2017 22:07:50 +0200 Subject: [PATCH 0209/1322] of: provide inline helper for of_find_device_by_node The ipmmu-vmsa driver fails in compile-testing on non-OF platforms: drivers/iommu/ipmmu-vmsa.o: In function `ipmmu_of_xlate': ipmmu-vmsa.c:(.text+0x740): undefined reference to `of_find_device_by_node' It would be reasonable to assume that this interface works but returns failure on non-OF builds, like it does on machines that have been booted in another way, so this adds another inline function helper.
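With the stub in place, callers can keep a single code path and simply handle the NULL return; a sketch of the calling pattern this enables (illustrative only; on OF builds the returned platform device is reference-counted, so real callers also drop that reference when done):

    static int bind_to_of_node(struct device_node *np)
    {
    	struct platform_device *pdev = of_find_device_by_node(np);

    	if (!pdev)
    		return -ENODEV;	/* no match, or kernel built without CONFIG_OF */

    	/* ... use &pdev->dev ... */
    	return 0;
    }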
Fixes: 7b2d59611fef ("iommu/ipmmu-vmsa: Replace local utlb code with fwspec ids") Signed-off-by: Arnd Bergmann Signed-off-by: Rob Herring --- include/linux/of_platform.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index e0d1946270f3..fb908e598348 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -57,7 +57,14 @@ extern const struct of_device_id of_default_bus_match_table[]; extern struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); +#ifdef CONFIG_OF extern struct platform_device *of_find_device_by_node(struct device_node *np); +#else +static inline struct platform_device *of_find_device_by_node(struct device_node *np) +{ + return NULL; +} +#endif /* Platform devices and busses creation */ extern struct platform_device *of_platform_device_create(struct device_node *np, From a6899e900509bb2442239d6198dc8bc64f8437ec Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 14 Sep 2017 23:39:38 +0200 Subject: [PATCH 0210/1322] dt-bindings: fix vendor prefix for Abracon Commit 446810f2dd41 ("of: add vendor prefix for Abracon Corporation") claimed that "abcn" was used as the vendor prefix while in fact "abracon" was used in the subsequent commits. It is also the only prefix used in the tree. Signed-off-by: Alexandre Belloni [robh: fix alphabetical order] Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/vendor-prefixes.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 1ea1fd4232ab..1afd298eddd7 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -3,8 +3,8 @@ Device tree binding vendor prefix registry. Keep list in alphabetical order. This isn't an exhaustive list, but you should add new prefixes to it before using them to avoid name-space collisions. -abcn Abracon Corporation abilis Abilis Systems +abracon Abracon Corporation actions Actions Semiconductor Co., Ltd. active-semi Active-Semi International Inc ad Avionic Design GmbH From bb4e6ff01ac356f82327d980e45fee8a65491328 Mon Sep 17 00:00:00 2001 From: Nickey Yang Date: Mon, 18 Sep 2017 17:05:37 +0800 Subject: [PATCH 0211/1322] arm64: dts: rockchip: Correct MIPI DPHY PLL clock on rk3399 There is a further gate between the mipidphy reference clock and the actual ref-clock input to the dsi host, making the clock hierarchy look like clk_24m --> Gate11[14] --> clk_mipidphy_ref --> Gate21[0] --> clk_dphy_pll Fix the clock reference so that the whole clock subtree gets enabled when the dsi host needs it.
Signed-off-by: Nickey Yang [amended commit message] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index d79e9b3265b9..6aa43fd47148 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1629,7 +1629,7 @@ compatible = "rockchip,rk3399-mipi-dsi", "snps,dw-mipi-dsi"; reg = <0x0 0xff960000 0x0 0x8000>; interrupts = ; - clocks = <&cru SCLK_MIPIDPHY_REF>, <&cru PCLK_MIPI_DSI0>, + clocks = <&cru SCLK_DPHY_PLL>, <&cru PCLK_MIPI_DSI0>, <&cru SCLK_DPHY_TX0_CFG>; clock-names = "ref", "pclk", "phy_cfg"; power-domains = <&power RK3399_PD_VIO>; From 3993491bf27117782bee05debc6a6afa51d61760 Mon Sep 17 00:00:00 2001 From: Ariel Elior Date: Tue, 19 Sep 2017 12:54:34 +0300 Subject: [PATCH 0212/1322] MAINTAINERS: Remove Yuval Mintz from maintainers list Remove Yuval from maintaining the bnx2x & qed* modules as he is no longer working for the company. Thanks Yuval for your huge contributions and tireless efforts over the many years and various companies. Ariel Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..955f034fd523 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2853,7 +2853,6 @@ S: Supported F: drivers/scsi/bnx2i/ BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER -M: Yuval Mintz M: Ariel Elior M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org @@ -11047,7 +11046,6 @@ S: Supported F: drivers/scsi/qedi/ QLOGIC QL4xxx ETHERNET DRIVER -M: Yuval Mintz M: Ariel Elior M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org From 4ae7c364b9320063504db78834fabe59d16f85bf Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 28 Aug 2017 15:18:16 +0200 Subject: [PATCH 0213/1322] ARM: dts: nokia n900: drop unneeded/undocumented parts of the dts Sakari mentioned that some parts of the dts are not needed and do not have proper documentation, yet. As the camera works without them, remove them for now. Signed-off-by: Pavel Machek Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-n900.dts | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts index 26c20e1167b9..4acd32a1c4ef 100644 --- a/arch/arm/boot/dts/omap3-n900.dts +++ b/arch/arm/boot/dts/omap3-n900.dts @@ -144,15 +144,6 @@ io-channel-names = "temp", "bsi", "vbat"; }; - rear_camera: camera@0 { - compatible = "linux,camera"; - - module { - model = "TCM8341MD"; - sensor = <&cam1>; - }; - }; - pwm9: dmtimer-pwm { compatible = "ti,omap-dmtimer-pwm"; #pwm-cells = <3>; @@ -189,10 +180,8 @@ clock-lanes = <1>; data-lanes = <0>; lane-polarity = <0 0>; - clock-inv = <0>; /* Select strobe = <1> for back camera, <0> for front camera */ strobe = <1>; - crc = <0>; }; }; }; From 06480f8cf559001c9eb49b4e9d822e13ad1cc5c4 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 6 Sep 2017 16:03:58 +0530 Subject: [PATCH 0214/1322] ARM: OMAP2+: dra7xx: Set OPT_CLKS_IN_RESET flag for gpio1 gpio1 soft reset fails in the kexec path as the optional clock is not enabled hence enable the HWMOD_CONTROL_OPT_CLKS_IN_RESET flag for gpio1 hwmod. 
Signed-off-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_hwmod_7xx_data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c index f040244c57e7..2f4f7002f38d 100644 --- a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c +++ b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c @@ -839,6 +839,7 @@ static struct omap_hwmod dra7xx_gpio1_hwmod = { .name = "gpio1", .class = &dra7xx_gpio_hwmod_class, .clkdm_name = "wkupaon_clkdm", + .flags = HWMOD_CONTROL_OPT_CLKS_IN_RESET, .main_clk = "wkupaon_iclk_mux", .prcm = { .omap4 = { From 8aed1026ccfe9cf5772c62bde35bc101ead9308c Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 6 Sep 2017 19:09:32 +0530 Subject: [PATCH 0215/1322] ARM: dts: dra7: Set a default parent to mcasp3_ahclkx_mux Assign a default parent to mcasp3_ahclkx_mux clock using the assigned-clock-parents property. This is helpful in cases like kexec where in the clock parent can be something other than the value at reset. Suggested-by: Tero Kristo Signed-off-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/dra7xx-clocks.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/dra7xx-clocks.dtsi b/arch/arm/boot/dts/dra7xx-clocks.dtsi index cf229dfabf61..e62b62875cba 100644 --- a/arch/arm/boot/dts/dra7xx-clocks.dtsi +++ b/arch/arm/boot/dts/dra7xx-clocks.dtsi @@ -1817,6 +1817,8 @@ clocks = <&abe_24m_fclk>, <&abe_sys_clk_div>, <&func_24m_clk>, <&atl_clkin3_ck>, <&atl_clkin2_ck>, <&atl_clkin1_ck>, <&atl_clkin0_ck>, <&sys_clkin2>, <&ref_clkin0_ck>, <&ref_clkin1_ck>, <&ref_clkin2_ck>, <&ref_clkin3_ck>, <&mlb_clk>, <&mlbp_clk>; ti,bit-shift = <24>; reg = <0x1868>; + assigned-clocks = <&mcasp3_ahclkx_mux>; + assigned-clock-parents = <&abe_24m_fclk>; }; mcasp3_aux_gfclk_mux: mcasp3_aux_gfclk_mux@1868 { From 20547dfd85f5baaf27ca01b32570bd6bfd7b209c Mon Sep 17 00:00:00 2001 From: "H. 
Nikolaus Schaller" Date: Fri, 8 Sep 2017 21:15:51 +0200 Subject: [PATCH 0216/1322] ARM: OMAP2+: hsmmc: fix logic to call either omap_hsmmc_init or omap_hsmmc_late_init but not both MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With 4.13 kernel I get this boot message: [    1.051727] ------------[ cut here ]------------ [    1.051818] WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x54/0x74 [    1.051849] sysfs: cannot create duplicate filename '/devices/platform/omap_hsmmc.2' [    1.051879] Modules linked in: [    1.051971] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-letux+ #1360 [    1.052001] Hardware name: Generic OMAP3 (Flattened Device Tree) [    1.052062] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [    1.052124] [] (show_stack) from [] (dump_stack+0x98/0xd0) [    1.052185] [] (dump_stack) from [] (__warn+0xd0/0x100) [    1.052215] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [    1.052276] [] (warn_slowpath_fmt) from [] (sysfs_warn_dup+0x54/0x74) [    1.052337] [] (sysfs_warn_dup) from [] (sysfs_create_dir_ns+0x74/0x84) [    1.052398] [] (sysfs_create_dir_ns) from [] (kobject_add_internal+0xd0/0x294) [    1.052429] [] (kobject_add_internal) from [] (kobject_add+0x6c/0x8c) [    1.052490] [] (kobject_add) from [] (device_add+0xe4/0x510) [    1.052551] [] (device_add) from [] (platform_device_add+0x130/0x1c0) [    1.052612] [] (platform_device_add) from [] (omap_hsmmc_late_init+0x3c/0x60) [    1.052673] [] (omap_hsmmc_late_init) from [] (omap3_pandora_legacy_init+0x24/0xb4) [    1.052734] [] (omap3_pandora_legacy_init) from [] (pdata_quirks_check+0x30/0x3c) [    1.052795] [] (pdata_quirks_check) from [] (omap_generic_init+0xc/0x18) [    1.052856] [] (omap_generic_init) from [] (customize_machine+0x1c/0x28) [    1.052917] [] (customize_machine) from [] (do_one_initcall+0xa8/0x150) [    1.052947] [] (do_one_initcall) from [] (kernel_init_freeable+0x110/0x1d4) [    1.053009] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x10c) [    1.053070] [] (kernel_init) from [] (ret_from_fork+0x14/0x24) [    1.055023] ---[ end trace 44e490b09ac4ab88 ]--- This can be traced down to the calls of omap_hsmmc_init(pandora_mmc3); omap_hsmmc_late_init(pandora_mmc3); in omap3_pandora_legacy_init(). It turns out that both funcions disagree how to decide if the other one was alredy called. Signed-off-by: H. Nikolaus Schaller Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/hsmmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-omap2/hsmmc.c b/arch/arm/mach-omap2/hsmmc.c index 5b614388d72f..6d28aa20a7d3 100644 --- a/arch/arm/mach-omap2/hsmmc.c +++ b/arch/arm/mach-omap2/hsmmc.c @@ -58,10 +58,10 @@ void omap_hsmmc_late_init(struct omap2_hsmmc_info *c) struct platform_device *pdev; int res; - if (omap_hsmmc_done != 1) + if (omap_hsmmc_done) return; - omap_hsmmc_done++; + omap_hsmmc_done = 1; for (; c->mmc; c++) { pdev = c->pdev; From cddfae253c875076750a03bd05ba5b1569e6876e Mon Sep 17 00:00:00 2001 From: Suniel Mahesh Date: Mon, 11 Sep 2017 12:00:16 +0530 Subject: [PATCH 0217/1322] ARM: dts: am33xx: Add spi alias to match SOC schematics Linux bus numbers should match the numbers defined by the chip manufacturer. This patch add's spi aliases to achieve that bus naming convention. 
Signed-off-by: Suniel Mahesh Signed-off-by: Karthik Tummala Tested-by: Karthik Tummala Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am33xx.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 7d7ca054c557..e58fab8aec5d 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -36,6 +36,8 @@ phy1 = &usb1_phy; ethernet0 = &cpsw_emac0; ethernet1 = &cpsw_emac1; + spi0 = &spi0; + spi1 = &spi1; }; cpus { From 4afa616ce937f88d9a69a71b8c561551596a81e3 Mon Sep 17 00:00:00 2001 From: Yogesh Siraswar Date: Thu, 14 Sep 2017 14:30:07 -0500 Subject: [PATCH 0218/1322] ARM: dts: am43xx-epos-evm: Remove extra CPSW EMAC entry On am438x EPOS boards there is only one ethernet port, so remove the extra port definition. This removes boot log warnings during PHY detection. Signed-off-by: Yogesh Siraswar Signed-off-by: Andrew F. Davis Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am43x-epos-evm.dts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/am43x-epos-evm.dts b/arch/arm/boot/dts/am43x-epos-evm.dts index 9d276af7c539..081fa68b6f98 100644 --- a/arch/arm/boot/dts/am43x-epos-evm.dts +++ b/arch/arm/boot/dts/am43x-epos-evm.dts @@ -388,6 +388,7 @@ pinctrl-0 = <&cpsw_default>; pinctrl-1 = <&cpsw_sleep>; status = "okay"; + slaves = <1>; }; &davinci_mdio { @@ -402,11 +403,6 @@ phy-mode = "rmii"; }; -&cpsw_emac1 { - phy_id = <&davinci_mdio>, <1>; - phy-mode = "rmii"; -}; - &phy_sel { rmii-clock-ext; }; From 29a0cfbf91ba997591535a4f7246835ce8328141 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 18 Sep 2017 12:21:37 +0200 Subject: [PATCH 0219/1322] libceph: don't allow bidirectional swap of pg-upmap-items This reverts most of commit f53b7665c8ce ("libceph: upmap semantic changes"). We need to prevent duplicates in the final result. For example, we can currently take [1,2,3] and apply [(1,2)] and get [2,2,3], or take [1,2,3] and apply [(3,2)] and get [1,2,2]. The rest of the system is not prepared to handle duplicates in the result set like this. The reverted piece was intended to allow [1,2,3] and [(1,2),(2,1)] to get [2,1,3] to reorder primaries. First, this bidirectional swap is hard to implement in a way that also prevents dups. For example, [1,2,3] and [(1,4),(2,3),(3,4)] would give [4,3,4], but if we just drop the last step we'd have [4,3,3], which is also invalid, etc. Simpler to just not handle bidirectional swaps. In practice, they are not needed: if you just want to choose a different primary then use primary_affinity, or pg_upmap (not pg_upmap_items). Cc: stable@vger.kernel.org # 4.13 Link: http://tracker.ceph.com/issues/21410 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f358d0bfa76b..79d14d70b7ea 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2445,19 +2445,34 @@ static void apply_upmap(struct ceph_osdmap *osdmap, pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - for (i = 0; i < raw->size; i++) { - for (j = 0; j < pg->pg_upmap_items.len; j++) { - int from = pg->pg_upmap_items.from_to[j][0]; - int to = pg->pg_upmap_items.from_to[j][1]; + /* + * Note: this approach does not allow a bidirectional swap, + * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. 
+ */ + for (i = 0; i < pg->pg_upmap_items.len; i++) { + int from = pg->pg_upmap_items.from_to[i][0]; + int to = pg->pg_upmap_items.from_to[i][1]; + int pos = -1; + bool exists = false; - if (from == raw->osds[i]) { - if (!(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) - raw->osds[i] = to; + /* make sure replacement doesn't already appear */ + for (j = 0; j < raw->size; j++) { + int osd = raw->osds[j]; + + if (osd == to) { + exists = true; break; } + /* ignore mapping if target is marked out */ + if (osd == from && pos < 0 && + !(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) { + pos = j; + } } + if (!exists && pos >= 0) + raw->osds[pos] = to; } } } From 3fad4cdac235c5b13227d0c09854c689ae62c70b Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Sat, 16 Sep 2017 01:59:41 +0800 Subject: [PATCH 0220/1322] irqchip/gic-v3: Iterate over possible CPUs by for_each_possible_cpu() get_cpu_number() doesn't use existing helper to iterate over possible CPUs, It will cause an error in case of discontinuous @cpu_possible_mask such as 0b11110001, which can result from a core having failed to come up on a SMP machine. Fixed by using existing helper for_each_possible_cpu(). Signed-off-by: zijun_hu Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 519149ec9053..b5df99c6f680 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1042,7 +1042,7 @@ static int get_cpu_number(struct device_node *dn) { const __be32 *cell; u64 hwid; - int i; + int cpu; cell = of_get_property(dn, "reg", NULL); if (!cell) @@ -1056,9 +1056,9 @@ static int get_cpu_number(struct device_node *dn) if (hwid & ~MPIDR_HWID_BITMASK) return -1; - for (i = 0; i < num_possible_cpus(); i++) - if (cpu_logical_map(i) == hwid) - return i; + for_each_possible_cpu(cpu) + if (cpu_logical_map(cpu) == hwid) + return cpu; return -1; } From 6c09ffd027255d97c536aa4ac51a90973adfedc6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 22:08:23 +0200 Subject: [PATCH 0221/1322] irqchip/gic-v4: Fix building with ancient gcc gcc-4.5 and earlier don't like named initializers for anonymous union members: drivers/irqchip/irq-gic-v4.c: In function 'its_map_vlpi': drivers/irqchip/irq-gic-v4.c:176:3: error: unknown field 'map' specified in initializer drivers/irqchip/irq-gic-v4.c:176:3: error: missing braces around initializer drivers/irqchip/irq-gic-v4.c:176:3: error: (near initialization for 'info.') drivers/irqchip/irq-gic-v4.c: In function 'its_get_vlpi': drivers/irqchip/irq-gic-v4.c:192:3: error: unknown field 'map' specified in initializer drivers/irqchip/irq-gic-v4.c:192:3: error: missing braces around initializer drivers/irqchip/irq-gic-v4.c:192:3: error: (near initialization for 'info.') drivers/irqchip/irq-gic-v4.c: In function 'its_prop_update_vlpi': drivers/irqchip/irq-gic-v4.c:208:3: error: unknown field 'config' specified in initializer drivers/irqchip/irq-gic-v4.c:208:3: error: missing braces around initializer drivers/irqchip/irq-gic-v4.c:208:3: error: (near initialization for 'info.') drivers/irqchip/irq-gic-v4.c:208:3: error: initialization makes pointer from integer without a cast This is fairly easy to work around, by using extra curly braces. 
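For readers who have not hit this with old toolchains, the shape of the workaround is easy to show with a reduced stand-in struct; this is a minimal sketch with hypothetical names, not the kernel's struct its_cmd_info itself:

    /* Stand-in for a struct with an anonymous union member. */
    struct cmd_info {
            int cmd_type;
            union {
                    void *map;
                    unsigned char config;
            };
    };

    static char payload[16];

    /* Named initializer for an anonymous union member: fine with C11 and
     * modern gcc, but rejected by gcc-4.5 and earlier.
     */
    struct cmd_info broken_on_old_gcc = {
            .cmd_type = 1,
            .map = payload,
    };

    /* Extra braces turn the union into a positional sub-initializer, which
     * every gcc version accepts; this is the form the patch below adopts.
     */
    struct cmd_info portable = {
            .cmd_type = 1,
            {
                    .map = payload,
            },
    };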
Signed-off-by: Arnd Bergmann Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v4.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 2370e6d9e603..cd0bcc3b7e33 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -173,7 +173,9 @@ int its_map_vlpi(int irq, struct its_vlpi_map *map) { struct its_cmd_info info = { .cmd_type = MAP_VLPI, - .map = map, + { + .map = map, + }, }; /* @@ -189,7 +191,9 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map) { struct its_cmd_info info = { .cmd_type = GET_VLPI, - .map = map, + { + .map = map, + }, }; return irq_set_vcpu_affinity(irq, &info); @@ -205,7 +209,9 @@ int its_prop_update_vlpi(int irq, u8 config, bool inv) { struct its_cmd_info info = { .cmd_type = inv ? PROP_UPDATE_AND_INV_VLPI : PROP_UPDATE_VLPI, - .config = config, + { + .config = config, + }, }; return irq_set_vcpu_affinity(irq, &info); From 90019f8fcd71bf71653690329e32f41489e96122 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 5 Sep 2017 11:28:46 -0700 Subject: [PATCH 0222/1322] irqchip.mips-gic: Fix shared interrupt mask writes The write_gic_smask() & write_gic_rmask() functions take a shared interrupt number as a parameter, but we're incorrectly providing them a bitmask with the shared interrupt's bit set. This effectively means that we mask or unmask the shared interrupt (1 << intr) rather than the intended shared interrupt intr. Fixes: 68898c8765f4 ("irqchip: mips-gic: Drop gic_(re)set_mask() functions") Cc: Jason Cooper Cc: Marc Zyngier Cc: Ralf Baechle Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 6e52a88bbd9e..40159ac12ac8 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -169,7 +169,7 @@ static void gic_mask_irq(struct irq_data *d) { unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); - write_gic_rmask(BIT(intr)); + write_gic_rmask(intr); gic_clear_pcpu_masks(intr); } @@ -179,7 +179,7 @@ static void gic_unmask_irq(struct irq_data *d) unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); unsigned int cpu; - write_gic_smask(BIT(intr)); + write_gic_smask(intr); gic_clear_pcpu_masks(intr); cpu = cpumask_first_and(affinity, cpu_online_mask); @@ -767,7 +767,7 @@ static int __init gic_of_init(struct device_node *node, for (i = 0; i < gic_shared_intrs; i++) { change_gic_pol(i, GIC_POL_ACTIVE_HIGH); change_gic_trig(i, GIC_TRIG_LEVEL); - write_gic_rmask(BIT(i)); + write_gic_rmask(i); } for (i = 0; i < gic_vpes; i++) { From 717e6f2893eb35ce6728c3cacdc297b78d371b31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Sep 2017 12:10:08 +0800 Subject: [PATCH 0223/1322] ceph: avoid panic in create_session_open_msg() if utsname() returns NULL utsname() can return NULL while a process is exiting. The kernel releases file locks during process exit, and we send a request to the mds when releasing a file lock, so it's possible that we open an mds session while the process is exiting. utsname() is called in create_session_open_msg(). 
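The fix below caches the hostname once, in ordinary process context, and relies on init_utsname() for the kernel release string. A rough sketch of that pattern, with hypothetical names rather than the real ceph_mds_client fields:

    /* Illustrative sketch only, not the ceph code itself. */
    #include <linux/utsname.h>
    #include <linux/string.h>

    static char cached_nodename[__NEW_UTS_LEN + 1];

    /* Called from ordinary process context (e.g. client init), where the
     * caller's namespace pointers are still valid.
     */
    static void cache_nodename(void)
    {
            strncpy(cached_nodename, utsname()->nodename,
                    sizeof(cached_nodename) - 1);
    }

    /* Safe to use even from an exiting task: no dependency on the caller's
     * UTS namespace at the time the session message is built.
     */
    static const char *session_nodename(void)
    {
            return cached_nodename;
    }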
Link: http://tracker.ceph.com/issues/21275 Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton [idryomov@gmail.com: drop utsname.h include from mds_client.c] Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 7 ++++--- fs/ceph/mds_client.h | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9dd6b836ac9e..84edfc60d87a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "super.h" @@ -884,8 +883,8 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 void *p; const char* metadata[][2] = { - {"hostname", utsname()->nodename}, - {"kernel_version", utsname()->release}, + {"hostname", mdsc->nodename}, + {"kernel_version", init_utsname()->release}, {"entity_id", opt->name ? : ""}, {"root", fsopt->server_path ? : "/"}, {NULL, NULL} @@ -3539,6 +3538,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; + strncpy(mdsc->nodename, utsname()->nodename, + sizeof(mdsc->nodename) - 1); return 0; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index db57ae98ed34..636d6b2ec49c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -368,6 +369,8 @@ struct ceph_mds_client { struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; + + char nodename[__NEW_UTS_LEN + 1]; }; extern const char *ceph_mds_op_name(int op); From 19a8d6b7604df85402deecae01d7861cb1d40c89 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 19 Sep 2017 15:50:42 +0100 Subject: [PATCH 0224/1322] MIPS: PCI: Move map_irq() hooks out of initdata 04c81c7293df ("MIPS: PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") moved the PCI IRQ fixup to the new host bridge map/swizzle_irq() hooks mechanism. Those hooks can also be called after boot, when all the __init/__initdata/__initconst sections have been freed. Therefore, functions called by them (and the data they refer to) must not be marked as __init/__initdata/__initconst lest compilation trigger section mismatch warnings. Fix all the board files map_irq() hooks by simply removing the respective __init/__initdata/__initconst section markers and by adding another persistent hook IRQ map for the txx9 board files. 
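As a reduced illustration of why the markers have to go, consider a hypothetical board file (not one of the files changed below), assuming the usual MIPS pcibios_map_irq() hook signature:

    #include <linux/init.h>
    #include <linux/pci.h>

    /*
     * Before the fix, both the table and the hook lived in init memory:
     *
     *     static char irq_tab[][5] __initdata = { ... };
     *     int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin);
     *
     * Because the host bridge map_irq() hook may now run after
     * free_initmem(), both must stay resident; modpost reports the
     * non-init reference to __initdata as a section mismatch.
     */
    static char irq_tab[][5] = {
            /*      unused INTA INTB INTC INTD */
            [3] = { -1,    1,   2,   3,   4 },
    };

    int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
    {
            return irq_tab[slot][pin];
    }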
Fixes: 04c81c7293df ("MIPS: PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Arnd Bergmann Cc: Ralf Baechle Cc: Steve French --- arch/mips/ath79/pci.c | 12 ++++++------ arch/mips/pci/fixup-capcella.c | 4 ++-- arch/mips/pci/fixup-cobalt.c | 8 ++++---- arch/mips/pci/fixup-emma2rh.c | 4 ++-- arch/mips/pci/fixup-fuloong2e.c | 2 +- arch/mips/pci/fixup-ip32.c | 4 ++-- arch/mips/pci/fixup-jmr3927.c | 2 +- arch/mips/pci/fixup-lantiq.c | 2 +- arch/mips/pci/fixup-lemote2f.c | 4 ++-- arch/mips/pci/fixup-loongson3.c | 2 +- arch/mips/pci/fixup-malta.c | 4 ++-- arch/mips/pci/fixup-mpc30x.c | 6 +++--- arch/mips/pci/fixup-pmcmsp.c | 8 ++++---- arch/mips/pci/fixup-rbtx4927.c | 2 +- arch/mips/pci/fixup-rbtx4938.c | 2 +- arch/mips/pci/fixup-sni.c | 12 ++++++------ arch/mips/pci/fixup-tb0219.c | 2 +- arch/mips/pci/fixup-tb0226.c | 2 +- arch/mips/pci/fixup-tb0287.c | 2 +- arch/mips/pci/pci-alchemy.c | 2 +- arch/mips/pci/pci-bcm47xx.c | 2 +- arch/mips/pci/pci-lasat.c | 2 +- arch/mips/pci/pci-mt7620.c | 2 +- arch/mips/pci/pci-octeon.c | 5 ++--- arch/mips/pci/pci-rt2880.c | 2 +- arch/mips/pci/pci-rt3883.c | 2 +- arch/mips/pci/pci-tx4938.c | 2 +- arch/mips/pci/pci-tx4939.c | 4 ++-- arch/mips/pci/pci-xlp.c | 2 +- arch/mips/pci/pci-xlr.c | 2 +- arch/mips/pci/pcie-octeon.c | 3 +-- arch/mips/txx9/generic/pci.c | 8 ++++++-- 32 files changed, 62 insertions(+), 60 deletions(-) diff --git a/arch/mips/ath79/pci.c b/arch/mips/ath79/pci.c index 730c0b03060d..b816cb4a25ff 100644 --- a/arch/mips/ath79/pci.c +++ b/arch/mips/ath79/pci.c @@ -22,10 +22,10 @@ #include "pci.h" static int (*ath79_pci_plat_dev_init)(struct pci_dev *dev); -static const struct ath79_pci_irq *ath79_pci_irq_map __initdata; -static unsigned ath79_pci_nr_irqs __initdata; +static const struct ath79_pci_irq *ath79_pci_irq_map; +static unsigned ath79_pci_nr_irqs; -static const struct ath79_pci_irq ar71xx_pci_irq_map[] __initconst = { +static const struct ath79_pci_irq ar71xx_pci_irq_map[] = { { .slot = 17, .pin = 1, @@ -41,7 +41,7 @@ static const struct ath79_pci_irq ar71xx_pci_irq_map[] __initconst = { } }; -static const struct ath79_pci_irq ar724x_pci_irq_map[] __initconst = { +static const struct ath79_pci_irq ar724x_pci_irq_map[] = { { .slot = 0, .pin = 1, @@ -49,7 +49,7 @@ static const struct ath79_pci_irq ar724x_pci_irq_map[] __initconst = { } }; -static const struct ath79_pci_irq qca955x_pci_irq_map[] __initconst = { +static const struct ath79_pci_irq qca955x_pci_irq_map[] = { { .bus = 0, .slot = 0, @@ -64,7 +64,7 @@ static const struct ath79_pci_irq qca955x_pci_irq_map[] __initconst = { }, }; -int __init pcibios_map_irq(const struct pci_dev *dev, uint8_t slot, uint8_t pin) +int pcibios_map_irq(const struct pci_dev *dev, uint8_t slot, uint8_t pin) { int irq = -1; int i; diff --git a/arch/mips/pci/fixup-capcella.c b/arch/mips/pci/fixup-capcella.c index 1c02f5737367..b4c263f16b15 100644 --- a/arch/mips/pci/fixup-capcella.c +++ b/arch/mips/pci/fixup-capcella.c @@ -32,13 +32,13 @@ #define INTC PC104PLUS_INTC_IRQ #define INTD PC104PLUS_INTD_IRQ -static char irq_tab_capcella[][5] __initdata = { +static char irq_tab_capcella[][5] = { [11] = { -1, INT1, INT1, INT1, INT1 }, [12] = { -1, INT2, INT2, INT2, INT2 }, [14] = { -1, INTA, INTB, INTC, INTD } }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_capcella[slot][pin]; } diff --git 
a/arch/mips/pci/fixup-cobalt.c b/arch/mips/pci/fixup-cobalt.c index b3ab59318d91..44be65c3e6bb 100644 --- a/arch/mips/pci/fixup-cobalt.c +++ b/arch/mips/pci/fixup-cobalt.c @@ -147,7 +147,7 @@ static void qube_raq_via_board_id_fixup(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_0, qube_raq_via_board_id_fixup); -static char irq_tab_qube1[] __initdata = { +static char irq_tab_qube1[] = { [COBALT_PCICONF_CPU] = 0, [COBALT_PCICONF_ETH0] = QUBE1_ETH0_IRQ, [COBALT_PCICONF_RAQSCSI] = SCSI_IRQ, @@ -156,7 +156,7 @@ static char irq_tab_qube1[] __initdata = { [COBALT_PCICONF_ETH1] = 0 }; -static char irq_tab_cobalt[] __initdata = { +static char irq_tab_cobalt[] = { [COBALT_PCICONF_CPU] = 0, [COBALT_PCICONF_ETH0] = ETH0_IRQ, [COBALT_PCICONF_RAQSCSI] = SCSI_IRQ, @@ -165,7 +165,7 @@ static char irq_tab_cobalt[] __initdata = { [COBALT_PCICONF_ETH1] = ETH1_IRQ }; -static char irq_tab_raq2[] __initdata = { +static char irq_tab_raq2[] = { [COBALT_PCICONF_CPU] = 0, [COBALT_PCICONF_ETH0] = ETH0_IRQ, [COBALT_PCICONF_RAQSCSI] = RAQ2_SCSI_IRQ, @@ -174,7 +174,7 @@ static char irq_tab_raq2[] __initdata = { [COBALT_PCICONF_ETH1] = ETH1_IRQ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (cobalt_board_id <= COBALT_BRD_ID_QUBE1) return irq_tab_qube1[slot]; diff --git a/arch/mips/pci/fixup-emma2rh.c b/arch/mips/pci/fixup-emma2rh.c index 19caf775c206..c31cb6af1cd0 100644 --- a/arch/mips/pci/fixup-emma2rh.c +++ b/arch/mips/pci/fixup-emma2rh.c @@ -43,7 +43,7 @@ */ #define MAX_SLOT_NUM 10 -static unsigned char irq_map[][5] __initdata = { +static unsigned char irq_map[][5] = { [3] = {0, MARKEINS_PCI_IRQ_INTB, MARKEINS_PCI_IRQ_INTC, MARKEINS_PCI_IRQ_INTD, 0,}, [4] = {0, MARKEINS_PCI_IRQ_INTA, 0, 0, 0,}, @@ -85,7 +85,7 @@ static void emma2rh_pci_host_fixup(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_EMMA2RH, emma2rh_pci_host_fixup); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_map[slot][pin]; } diff --git a/arch/mips/pci/fixup-fuloong2e.c b/arch/mips/pci/fixup-fuloong2e.c index 50da773faede..b47c2771dc99 100644 --- a/arch/mips/pci/fixup-fuloong2e.c +++ b/arch/mips/pci/fixup-fuloong2e.c @@ -19,7 +19,7 @@ /* South bridge slot number is set by the pci probe process */ static u8 sb_slot = 5; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = 0; diff --git a/arch/mips/pci/fixup-ip32.c b/arch/mips/pci/fixup-ip32.c index 133685e215ee..c6ec18a07e63 100644 --- a/arch/mips/pci/fixup-ip32.c +++ b/arch/mips/pci/fixup-ip32.c @@ -21,7 +21,7 @@ #define INTB MACEPCI_SHARED0_IRQ #define INTC MACEPCI_SHARED1_IRQ #define INTD MACEPCI_SHARED2_IRQ -static char irq_tab_mace[][5] __initdata = { +static char irq_tab_mace[][5] = { /* Dummy INT#A INT#B INT#C INT#D */ {0, 0, 0, 0, 0}, /* This is placeholder row - never used */ {0, SCSI0, SCSI0, SCSI0, SCSI0}, @@ -39,7 +39,7 @@ static char irq_tab_mace[][5] __initdata = { * irqs. I suppose a device without a pin A will thank us for doing it * right if there exists such a broken piece of crap. 
*/ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_mace[slot][pin]; } diff --git a/arch/mips/pci/fixup-jmr3927.c b/arch/mips/pci/fixup-jmr3927.c index 0f1069527cba..d3102eeea898 100644 --- a/arch/mips/pci/fixup-jmr3927.c +++ b/arch/mips/pci/fixup-jmr3927.c @@ -31,7 +31,7 @@ #include #include -int __init jmr3927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int jmr3927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char irq = pin; diff --git a/arch/mips/pci/fixup-lantiq.c b/arch/mips/pci/fixup-lantiq.c index 2b5427d3f35c..81530a13b349 100644 --- a/arch/mips/pci/fixup-lantiq.c +++ b/arch/mips/pci/fixup-lantiq.c @@ -23,7 +23,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/fixup-lemote2f.c b/arch/mips/pci/fixup-lemote2f.c index 95ab9a1bd010..20cdfdc08938 100644 --- a/arch/mips/pci/fixup-lemote2f.c +++ b/arch/mips/pci/fixup-lemote2f.c @@ -30,7 +30,7 @@ #define PCID 7 /* all the pci device has the PCIA pin, check the datasheet. */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0}, /* 11: Unused */ {0, 0, 0, 0, 0}, /* 12: Unused */ @@ -51,7 +51,7 @@ static char irq_tab[][5] __initdata = { {0, 0, 0, 0, 0}, /* 27: Unused */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; diff --git a/arch/mips/pci/fixup-loongson3.c b/arch/mips/pci/fixup-loongson3.c index 2b6d5e196f99..8a741c2c6685 100644 --- a/arch/mips/pci/fixup-loongson3.c +++ b/arch/mips/pci/fixup-loongson3.c @@ -32,7 +32,7 @@ static void print_fixup_info(const struct pci_dev *pdev) pdev->vendor, pdev->device, pdev->irq); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { print_fixup_info(dev); return dev->irq; diff --git a/arch/mips/pci/fixup-malta.c b/arch/mips/pci/fixup-malta.c index 40e920c653cc..3ec85331795e 100644 --- a/arch/mips/pci/fixup-malta.c +++ b/arch/mips/pci/fixup-malta.c @@ -12,7 +12,7 @@ static char pci_irq[5] = { }; -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* 0: GT64120 PCI bridge */ {0, 0, 0, 0, 0 }, /* 1: Unused */ @@ -38,7 +38,7 @@ static char irq_tab[][5] __initdata = { {0, PCID, PCIA, PCIB, PCIC } /* 21: PCI Slot 4 */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; virq = irq_tab[slot][pin]; diff --git a/arch/mips/pci/fixup-mpc30x.c b/arch/mips/pci/fixup-mpc30x.c index 8e4f8288eca2..66eaf456bc89 100644 --- a/arch/mips/pci/fixup-mpc30x.c +++ b/arch/mips/pci/fixup-mpc30x.c @@ -22,19 +22,19 @@ #include -static const int internal_func_irqs[] __initconst = { +static const int internal_func_irqs[] = { VRC4173_CASCADE_IRQ, VRC4173_AC97_IRQ, VRC4173_USB_IRQ, }; -static const int irq_tab_mpc30x[] __initconst = { +static const int irq_tab_mpc30x[] = { [12] = VRC4173_PCMCIA1_IRQ, [13] = VRC4173_PCMCIA2_IRQ, [29] = MQ200_IRQ, }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev 
*dev, u8 slot, u8 pin) { if (slot == 30) return internal_func_irqs[PCI_FUNC(dev->devfn)]; diff --git a/arch/mips/pci/fixup-pmcmsp.c b/arch/mips/pci/fixup-pmcmsp.c index fab405c21c2f..4ad2ef02087b 100644 --- a/arch/mips/pci/fixup-pmcmsp.c +++ b/arch/mips/pci/fixup-pmcmsp.c @@ -47,7 +47,7 @@ #if defined(CONFIG_PMC_MSP7120_GW) /* Garibaldi Board IRQ wiring to PCI slots */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* (AD[0]): Unused */ {0, 0, 0, 0, 0 }, /* (AD[1]): Unused */ @@ -86,7 +86,7 @@ static char irq_tab[][5] __initdata = { #elif defined(CONFIG_PMC_MSP7120_EVAL) /* MSP7120 Eval Board IRQ wiring to PCI slots */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* (AD[0]): Unused */ {0, 0, 0, 0, 0 }, /* (AD[1]): Unused */ @@ -125,7 +125,7 @@ static char irq_tab[][5] __initdata = { #else /* Unknown board -- don't assign any IRQs */ -static char irq_tab[][5] __initdata = { +static char irq_tab[][5] = { /* INTA INTB INTC INTD */ {0, 0, 0, 0, 0 }, /* (AD[0]): Unused */ {0, 0, 0, 0, 0 }, /* (AD[1]): Unused */ @@ -202,7 +202,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) * RETURNS: IRQ number * ****************************************************************************/ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { #if !defined(CONFIG_PMC_MSP7120_GW) && !defined(CONFIG_PMC_MSP7120_EVAL) printk(KERN_WARNING "PCI: unknown board, no PCI IRQs assigned.\n"); diff --git a/arch/mips/pci/fixup-rbtx4927.c b/arch/mips/pci/fixup-rbtx4927.c index 321db265829c..d6aaed1d6be9 100644 --- a/arch/mips/pci/fixup-rbtx4927.c +++ b/arch/mips/pci/fixup-rbtx4927.c @@ -36,7 +36,7 @@ #include #include -int __init rbtx4927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int rbtx4927_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char irq = pin; diff --git a/arch/mips/pci/fixup-rbtx4938.c b/arch/mips/pci/fixup-rbtx4938.c index a80579af609b..ff22a22db73e 100644 --- a/arch/mips/pci/fixup-rbtx4938.c +++ b/arch/mips/pci/fixup-rbtx4938.c @@ -13,7 +13,7 @@ #include #include -int __init rbtx4938_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int rbtx4938_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = tx4938_pcic1_map_irq(dev, slot); diff --git a/arch/mips/pci/fixup-sni.c b/arch/mips/pci/fixup-sni.c index f67ebeeb4200..adb9a58641e8 100644 --- a/arch/mips/pci/fixup-sni.c +++ b/arch/mips/pci/fixup-sni.c @@ -40,7 +40,7 @@ * seem to be a documentation error. At least on my RM200C the Cirrus * Logic CL-GD5434 VGA is device 3. */ -static char irq_tab_rm200[8][5] __initdata = { +static char irq_tab_rm200[8][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* EISA bridge */ { SCSI, SCSI, SCSI, SCSI, SCSI }, /* SCSI */ @@ -57,7 +57,7 @@ static char irq_tab_rm200[8][5] __initdata = { * * The VGA card is optional for RM300 systems. 
*/ -static char irq_tab_rm300d[8][5] __initdata = { +static char irq_tab_rm300d[8][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* EISA bridge */ { SCSI, SCSI, SCSI, SCSI, SCSI }, /* SCSI */ @@ -69,7 +69,7 @@ static char irq_tab_rm300d[8][5] __initdata = { { 0, INTD, INTA, INTB, INTC }, /* Slot 4 */ }; -static char irq_tab_rm300e[5][5] __initdata = { +static char irq_tab_rm300e[5][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* HOST bridge */ { SCSI, SCSI, SCSI, SCSI, SCSI }, /* SCSI */ @@ -96,7 +96,7 @@ static char irq_tab_rm300e[5][5] __initdata = { #define INTC PCIT_IRQ_INTC #define INTD PCIT_IRQ_INTD -static char irq_tab_pcit[13][5] __initdata = { +static char irq_tab_pcit[13][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* HOST bridge */ { SCSI0, SCSI0, SCSI0, SCSI0, SCSI0 }, /* SCSI */ @@ -113,7 +113,7 @@ static char irq_tab_pcit[13][5] __initdata = { { 0, INTA, INTB, INTC, INTD }, /* Slot 5 */ }; -static char irq_tab_pcit_cplus[13][5] __initdata = { +static char irq_tab_pcit_cplus[13][5] = { /* INTA INTB INTC INTD */ { 0, 0, 0, 0, 0 }, /* HOST bridge */ { 0, INTB, INTC, INTD, INTA }, /* PCI Slot 9 */ @@ -130,7 +130,7 @@ static inline int is_rm300_revd(void) return (csmsr & 0xa0) == 0x20; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (sni_brd_type) { case SNI_BRD_PCI_TOWER_CPLUS: diff --git a/arch/mips/pci/fixup-tb0219.c b/arch/mips/pci/fixup-tb0219.c index d0b0083fbd27..cc581535f257 100644 --- a/arch/mips/pci/fixup-tb0219.c +++ b/arch/mips/pci/fixup-tb0219.c @@ -23,7 +23,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0226.c b/arch/mips/pci/fixup-tb0226.c index 4196ccf3ea3d..b827b5cad5fd 100644 --- a/arch/mips/pci/fixup-tb0226.c +++ b/arch/mips/pci/fixup-tb0226.c @@ -23,7 +23,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0287.c b/arch/mips/pci/fixup-tb0287.c index 8c5039ed75d7..98f26285f2e3 100644 --- a/arch/mips/pci/fixup-tb0287.c +++ b/arch/mips/pci/fixup-tb0287.c @@ -22,7 +22,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char bus; int irq = -1; diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c index e99ca7702d8a..f15ec98de2de 100644 --- a/arch/mips/pci/pci-alchemy.c +++ b/arch/mips/pci/pci-alchemy.c @@ -522,7 +522,7 @@ static int __init alchemy_pci_init(void) arch_initcall(alchemy_pci_init); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct alchemy_pci_context *ctx = dev->sysdata; if (ctx && ctx->board_map_irq) diff --git a/arch/mips/pci/pci-bcm47xx.c b/arch/mips/pci/pci-bcm47xx.c index 76f16eaed0ad..230d7dd273e2 100644 --- a/arch/mips/pci/pci-bcm47xx.c +++ b/arch/mips/pci/pci-bcm47xx.c @@ -28,7 +28,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return 0; } diff --git a/arch/mips/pci/pci-lasat.c b/arch/mips/pci/pci-lasat.c index 40d2797d2bc4..47f4ee6bbb3b 100644 --- a/arch/mips/pci/pci-lasat.c +++ 
b/arch/mips/pci/pci-lasat.c @@ -61,7 +61,7 @@ arch_initcall(lasat_pci_setup); #define LASAT_IRQ_PCIC (LASAT_IRQ_BASE + 7) #define LASAT_IRQ_PCID (LASAT_IRQ_BASE + 8) -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (slot) { case 1: diff --git a/arch/mips/pci/pci-mt7620.c b/arch/mips/pci/pci-mt7620.c index 4e633c1e7ff3..90fba9bf98da 100644 --- a/arch/mips/pci/pci-mt7620.c +++ b/arch/mips/pci/pci-mt7620.c @@ -361,7 +361,7 @@ static int mt7620_pci_probe(struct platform_device *pdev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; u32 val; diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c index 9ee01936862e..3e92a06fa772 100644 --- a/arch/mips/pci/pci-octeon.c +++ b/arch/mips/pci/pci-octeon.c @@ -59,8 +59,7 @@ union octeon_pci_address { } s; }; -int __initconst (*octeon_pcibios_map_irq)(const struct pci_dev *dev, - u8 slot, u8 pin); +int (*octeon_pcibios_map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; /** @@ -74,7 +73,7 @@ enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; * as it goes through each bridge. * Returns Interrupt number for the device */ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (octeon_pcibios_map_irq) return octeon_pcibios_map_irq(dev, slot, pin); diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c index d6360fe73d05..711cdccdf65b 100644 --- a/arch/mips/pci/pci-rt2880.c +++ b/arch/mips/pci/pci-rt2880.c @@ -181,7 +181,7 @@ static inline void rt2880_pci_write_u32(unsigned long reg, u32 val) spin_unlock_irqrestore(&rt2880_pci_lock, flags); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; int irq = -1; diff --git a/arch/mips/pci/pci-rt3883.c b/arch/mips/pci/pci-rt3883.c index 04f8ea953297..958899ffe99c 100644 --- a/arch/mips/pci/pci-rt3883.c +++ b/arch/mips/pci/pci-rt3883.c @@ -564,7 +564,7 @@ err_put_intc_node: return err; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/pci-tx4938.c b/arch/mips/pci/pci-tx4938.c index 000c0e1f9ef8..a6418460e3c4 100644 --- a/arch/mips/pci/pci-tx4938.c +++ b/arch/mips/pci/pci-tx4938.c @@ -112,7 +112,7 @@ int __init tx4938_pciclk66_setup(void) return pciclk; } -int __init tx4938_pcic1_map_irq(const struct pci_dev *dev, u8 slot) +int tx4938_pcic1_map_irq(const struct pci_dev *dev, u8 slot) { if (get_tx4927_pcicptr(dev->bus->sysdata) == tx4938_pcic1ptr) { switch (slot) { diff --git a/arch/mips/pci/pci-tx4939.c b/arch/mips/pci/pci-tx4939.c index 9d6acc00f348..09a65f7dbe7c 100644 --- a/arch/mips/pci/pci-tx4939.c +++ b/arch/mips/pci/pci-tx4939.c @@ -48,7 +48,7 @@ void __init tx4939_report_pci1clk(void) ((pciclk + 50000) / 100000) % 10); } -int __init tx4939_pcic1_map_irq(const struct pci_dev *dev, u8 slot) +int tx4939_pcic1_map_irq(const struct pci_dev *dev, u8 slot) { if (get_tx4927_pcicptr(dev->bus->sysdata) == tx4939_pcic1ptr) { switch (slot) { @@ -68,7 +68,7 @@ int __init tx4939_pcic1_map_irq(const struct pci_dev *dev, u8 slot) return -1; } 
-int __init tx4939_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int tx4939_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = tx4939_pcic1_map_irq(dev, slot); diff --git a/arch/mips/pci/pci-xlp.c b/arch/mips/pci/pci-xlp.c index 7babf01600cb..9eff9137f78e 100644 --- a/arch/mips/pci/pci-xlp.c +++ b/arch/mips/pci/pci-xlp.c @@ -205,7 +205,7 @@ int xlp_socdev_to_node(const struct pci_dev *lnkdev) return PCI_SLOT(lnkdev->devfn) / 8; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct pci_dev *lnkdev; int lnkfunc, node; diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c index 26d2dabef281..2a1c81a129ba 100644 --- a/arch/mips/pci/pci-xlr.c +++ b/arch/mips/pci/pci-xlr.c @@ -315,7 +315,7 @@ static void xls_pcie_ack_b(struct irq_data *d) } } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return get_irq_vector(dev); } diff --git a/arch/mips/pci/pcie-octeon.c b/arch/mips/pci/pcie-octeon.c index ad3584dbc9d7..fd2887415bc8 100644 --- a/arch/mips/pci/pcie-octeon.c +++ b/arch/mips/pci/pcie-octeon.c @@ -1464,8 +1464,7 @@ static int cvmx_pcie_rc_initialize(int pcie_port) * as it goes through each bridge. * Returns Interrupt number for the device */ -int __init octeon_pcie_pcibios_map_irq(const struct pci_dev *dev, - u8 slot, u8 pin) +int octeon_pcie_pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { /* * The EBH5600 board with the PCI to PCIe bridge mistakenly diff --git a/arch/mips/txx9/generic/pci.c b/arch/mips/txx9/generic/pci.c index 0bd2a1e1ff9a..fb998726bd5d 100644 --- a/arch/mips/txx9/generic/pci.c +++ b/arch/mips/txx9/generic/pci.c @@ -386,9 +386,10 @@ int pcibios_plat_dev_init(struct pci_dev *dev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +static int (*txx9_pci_map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { - return txx9_board_vec->pci_map_irq(dev, slot, pin); + return txx9_pci_map_irq(dev, slot, pin); } char * (*txx9_board_pcibios_setup)(char *str) __initdata; @@ -424,5 +425,8 @@ char *__init txx9_pcibios_setup(char *str) txx9_pci_err_action = TXX9_PCI_ERR_IGNORE; return NULL; } + + txx9_pci_map_irq = txx9_board_vec->pci_map_irq; + return str; } From e1af344df4e5c8fe90f4a63235a68d5405afc41b Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 01:21:56 -0400 Subject: [PATCH 0225/1322] ALSA: asihpi: fix a potential double-fetch bug when copying puhm The hm->h.size is intended to hold the actual size of the hm struct that is copied from userspace and should always be <= sizeof(*hm). However, after copy_from_user(hm, puhm, hm->h.size), since the userspace process has full control over the memory region pointed to by puhm, it is possible that the value of hm->h.size is different from what was fetched in previously (get_user(hm->h.size, (u16 __user *)puhm)). In other words, hm->h.size is overridden and the relation between hm->h.size and the hm struct is broken. This patch proposes to use a separate variable, msg_size, to hold the value of the first fetch and override hm->h.size to msg_size after the second fetch to maintain the relation. 
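In generic terms the hazard and the fix look like the sketch below, a hypothetical ioctl helper with made-up names (struct msg_buf, hdr.size) rather than the asihpi code itself: validate the size from the first fetch, copy using only that validated value, then force the header field in the kernel copy back to it so later code cannot see a value userspace changed in between.

    #include <linux/uaccess.h>
    #include <linux/types.h>
    #include <linux/errno.h>

    struct msg_buf {
            struct { u16 size; } hdr;
            u8 payload[256];
    };

    static long copy_msg_from_user(struct msg_buf *msg, void __user *uptr)
    {
            u16 size;

            if (get_user(size, (u16 __user *)uptr))   /* fetch #1: size field only */
                    return -EFAULT;
            if (size > sizeof(*msg))
                    size = sizeof(*msg);

            if (copy_from_user(msg, uptr, size))      /* fetch #2: re-reads the header */
                    return -EFAULT;

            /* The user-visible size field may have changed between the two
             * fetches; re-impose the value that was actually bounds-checked.
             */
            msg->hdr.size = size;
            return 0;
    }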
Signed-off-by: Meng Xu Signed-off-by: Takashi Iwai --- sound/pci/asihpi/hpioctl.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c index 7e3aa50b21f9..5badd08e1d69 100644 --- a/sound/pci/asihpi/hpioctl.c +++ b/sound/pci/asihpi/hpioctl.c @@ -103,6 +103,7 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg) void __user *puhr; union hpi_message_buffer_v1 *hm; union hpi_response_buffer_v1 *hr; + u16 msg_size; u16 res_max_size; u32 uncopied_bytes; int err = 0; @@ -127,22 +128,25 @@ long asihpi_hpi_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } /* Now read the message size and data from user space. */ - if (get_user(hm->h.size, (u16 __user *)puhm)) { + if (get_user(msg_size, (u16 __user *)puhm)) { err = -EFAULT; goto out; } - if (hm->h.size > sizeof(*hm)) - hm->h.size = sizeof(*hm); + if (msg_size > sizeof(*hm)) + msg_size = sizeof(*hm); /* printk(KERN_INFO "message size %d\n", hm->h.wSize); */ - uncopied_bytes = copy_from_user(hm, puhm, hm->h.size); + uncopied_bytes = copy_from_user(hm, puhm, msg_size); if (uncopied_bytes) { HPI_DEBUG_LOG(ERROR, "uncopied bytes %d\n", uncopied_bytes); err = -EFAULT; goto out; } + /* Override h.size in case it is changed between two userspace fetches */ + hm->h.size = msg_size; + if (get_user(res_max_size, (u16 __user *)puhr)) { err = -EFAULT; goto out; From fbcab13d2e2511a858590846ac2e2d7cbd830591 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:28 -0400 Subject: [PATCH 0226/1322] selftests: silence test output by default Some of the networking tests are very noisy and make it impossible to see if we actually passed the tests as they run. Default to suppressing the output from any tests run in order to make it easier to track what failed. 
Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 693616651da5..4665463779f5 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -24,7 +24,7 @@ define RUN_TESTS echo "selftests: Warning: file $$BASENAME_TEST is not executable, correct this.";\ echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; \ else \ - cd `dirname $$TEST` > /dev/null; (./$$BASENAME_TEST && echo "ok 1..$$test_num selftests: $$BASENAME_TEST [PASS]") || echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; cd - > /dev/null;\ + cd `dirname $$TEST` > /dev/null; (./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo "ok 1..$$test_num selftests: $$BASENAME_TEST [PASS]") || echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; cd - > /dev/null;\ fi; \ done; endef @@ -55,7 +55,7 @@ endif define EMIT_TESTS @for TEST in $(TEST_GEN_PROGS) $(TEST_PROGS); do \ BASENAME_TEST=`basename $$TEST`; \ - echo "(./$$BASENAME_TEST && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ + echo "(./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ done; endef From 422d8dc6fd3afecd66bd6acfcd73a2d53e338ff3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:26 -0400 Subject: [PATCH 0227/1322] selftest: add a reuseaddr test This is to test for a regression introduced by b9470c27607b ("inet: kill smallest_size and smallest_port") which introduced a problem with reuseaddr and bind conflicts. Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 +- .../selftests/net/reuseaddr_conflict.c | 114 ++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/reuseaddr_conflict.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 9801253e4802..c612d6e38c62 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -6,3 +6,4 @@ reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa reuseport_dualstack +reuseaddr_conflict diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index de1f5772b878..3df542c84610 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy +TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict include ../lib.mk diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c new file mode 100644 index 000000000000..7c5b12664b03 --- /dev/null +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -0,0 +1,114 @@ +/* + * Test for the regression introduced by + * + * b9470c27607b ("inet: kill smallest_size and smallest_port") + * + * If we open an ipv4 socket on a port with reuseaddr we shouldn't reset the tb + * when we open the ipv6 conterpart, which is what was happening previously. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PORT 9999 + +int open_port(int ipv6, int any) +{ + int fd = -1; + int reuseaddr = 1; + int v6only = 1; + int addrlen; + int ret = -1; + struct sockaddr *addr; + int family = ipv6 ? AF_INET6 : AF_INET; + + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = htons(PORT), + .sin6_addr = in6addr_any + }; + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = htons(PORT), + .sin_addr.s_addr = any ? htonl(INADDR_ANY) : inet_addr("127.0.0.1"), + }; + + + if (ipv6) { + addr = (struct sockaddr*)&addr6; + addrlen = sizeof(addr6); + } else { + addr = (struct sockaddr*)&addr4; + addrlen = sizeof(addr4); + } + + if ((fd = socket(family, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + goto out; + } + + if (ipv6 && setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (void*)&v6only, + sizeof(v6only)) < 0) { + perror("setsockopt IPV6_V6ONLY"); + goto out; + } + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, + sizeof(reuseaddr)) < 0) { + perror("setsockopt SO_REUSEADDR"); + goto out; + } + + if (bind(fd, addr, addrlen) < 0) { + perror("bind"); + goto out; + } + + if (any) + return fd; + + if (listen(fd, 1) < 0) { + perror("listen"); + goto out; + } + return fd; +out: + close(fd); + return ret; +} + +int main(void) +{ + int listenfd; + int fd1, fd2; + + fprintf(stderr, "Opening 127.0.0.1:%d\n", PORT); + listenfd = open_port(0, 0); + if (listenfd < 0) + error(1, errno, "Couldn't open listen socket"); + fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT); + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket"); + fprintf(stderr, "Opening in6addr_any:%d\n", PORT); + fd1 = open_port(1, 1); + if (fd1 < 0) + error(1, errno, "Couldn't open ipv6 reuseport"); + fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT); + fd2 = open_port(0, 1); + if (fd2 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket"); + close(fd1); + fprintf(stderr, "Opening INADDR_ANY:%d after closing ipv6 socket\n", PORT); + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); + fprintf(stderr, "Success"); + return 0; +} From e06d79fbc338935ef27befc84f8b8b2e5f878a10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Sep 2017 09:51:27 -0400 Subject: [PATCH 0228/1322] selftests: actually run the various net selftests These self tests are just self contained binaries, they are not run by any of the scripts in the directory. This means they need to be marked with TEST_GEN_PROGS to actually be run, not TEST_GEN_FILES. 
Signed-off-by: Josef Bacik Signed-off-by: Shuah Khan --- tools/testing/selftests/net/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 3df542c84610..d86bca991f45 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -5,9 +5,9 @@ CFLAGS += -I../../../../usr/include/ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh TEST_GEN_FILES = socket -TEST_GEN_FILES += psock_fanout psock_tpacket -TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack msg_zerocopy reuseaddr_conflict +TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy +TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict include ../lib.mk From 06e8852ceecccdeff6841a6c5cd78a947a75d5bc Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Fri, 8 Sep 2017 14:01:19 +0200 Subject: [PATCH 0229/1322] selftests/net: msg_zerocopy enable build with older kernel headers Explicitly define SO_EE_ORIGIN_ZEROCOPY. This makes the test program build with older kernel headers, e.g. from Debian 9. Signed-off-by: Thomas Meyer Signed-off-by: Shuah Khan --- tools/testing/selftests/net/msg_zerocopy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index 40232af5b023..3ab6ec403905 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -55,7 +55,7 @@ #include #ifndef SO_EE_ORIGIN_ZEROCOPY -#define SO_EE_ORIGIN_ZEROCOPY SO_EE_ORIGIN_UPAGE +#define SO_EE_ORIGIN_ZEROCOPY 5 #endif #ifndef SO_ZEROCOPY From 9e987b70ada27554c5d176421de1d167218c49b5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 15 Sep 2017 17:35:27 -0700 Subject: [PATCH 0230/1322] ACPI / bus: Make ACPI_HANDLE() work for non-GPL code again Due to commit db3e50f3234b (device property: Get rid of struct fwnode_handle type field), ACPI_HANDLE() inadvertently became a GPL-only call. The call path that led to that was: ACPI_HANDLE() ACPI_COMPANION() to_acpi_device_node() is_acpi_device_node() acpi_device_fwnode_ops DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); ...and the new DECLARE_ACPI_FWNODE_OPS() includes EXPORT_SYMBOL_GPL, whereas previously it was a static struct. In order to avoid changing any of that, let's instead provide ever so slightly better encapsulation of those struct fwnode_operations instances. Those do not really need to be directly used in inline function calls in header files. Simply moving two small functions (is_acpi_device_node and is_acpi_data_node) out of acpi_bus.h, and into a .c file, does that. That leaves the internals of struct fwnode_operations as GPL-only (which I think was the intent all along), but un-breaks any driver code out there that relies on the ACPI subsystem's being (historically) an EXPORT_SYMBOL-usable system. By that, I mean, ACPI_HANDLE() and other basic ACPI calls were non-GPL-protected. Also, while I'm there, remove a tiny bit of redundancy that was missed in the earlier commit, by having is_acpi_node() use the other two routines, instead of checking fwnode directly. Fixes: db3e50f3234b (device property: Get rid of struct fwnode_handle type field) Signed-off-by: John Hubbard Acked-by: Sakari Ailus Acked-by: Mika Westerberg Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/property.c | 13 +++++++++++++ include/acpi/acpi_bus.h | 18 ++++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index c1c216163de3..1e3c2517a1ac 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1293,3 +1293,16 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; + +bool is_acpi_device_node(const struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && + fwnode->ops == &acpi_device_fwnode_ops; +} +EXPORT_SYMBOL(is_acpi_device_node); + +bool is_acpi_data_node(const struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; +} +EXPORT_SYMBOL(is_acpi_data_node); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index dedf9d789166..fa1505292f6c 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -399,17 +399,12 @@ extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; +bool is_acpi_device_node(const struct fwnode_handle *fwnode); +bool is_acpi_data_node(const struct fwnode_handle *fwnode); + static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && - (fwnode->ops == &acpi_device_fwnode_ops - || fwnode->ops == &acpi_data_fwnode_ops); -} - -static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) -{ - return !IS_ERR_OR_NULL(fwnode) && - fwnode->ops == &acpi_device_fwnode_ops; + return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ @@ -422,11 +417,6 @@ static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) NULL; \ }) -static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) -{ - return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; -} - #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ From 6073512cc8e2c48bed5c6625c02c5e4ae50cec34 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 18 Sep 2017 14:59:20 +0200 Subject: [PATCH 0231/1322] net: phy: Kconfig: Fix PHY infrastructure menu in menuconfig Since the integration of PHYLINK, the configuration option which used to be under the PHY infrastructure menu in menuconfig ended up one level up (the network device driver section) By placing PHYLINK option right after PHYLIB entry, it broke the way Kconfig used to build the menu. See kconfig-language.txt, section "Menu structure", 2nd method. This is fixed by placing the PHYLINK option just before PHYLIB. Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: Jerome Brunet Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index a9d16a3af514..cd931cf9dcc2 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -160,15 +160,6 @@ config MDIO_XGENE endif -menuconfig PHYLIB - tristate "PHY Device support and infrastructure" - depends on NETDEVICES - select MDIO_DEVICE - help - Ethernet controllers are usually attached to PHY - devices. 
This option provides infrastructure for - managing PHY devices. - config PHYLINK tristate depends on NETDEVICES @@ -179,6 +170,15 @@ config PHYLINK configuration links, PHYs, and Serdes links with MAC level autonegotiation modes. +menuconfig PHYLIB + tristate "PHY Device support and infrastructure" + depends on NETDEVICES + select MDIO_DEVICE + help + Ethernet controllers are usually attached to PHY + devices. This option provides infrastructure for + managing PHY devices. + if PHYLIB config SWPHY From 38c5eb93aca9dc1b21a2c96d583ce7f9886a44e6 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 18 Sep 2017 15:36:51 +0200 Subject: [PATCH 0232/1322] net: mvpp2: remove useless goto Remove a goto in the PPv2 tx function which jumps to the next line anyway. This is a cosmetic commit. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index dd0ee2691c86..8041d692db3c 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -6452,7 +6452,6 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev) if (mvpp2_tx_frag_process(port, skb, aggr_txq, txq)) { tx_desc_unmap_put(port, txq, tx_desc); frags = 0; - goto out; } } From 173f4c5ebbd803023e42799d956cf174dea92db5 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 18 Sep 2017 20:18:55 +0200 Subject: [PATCH 0233/1322] vsock: vmci: Remove unneeded linux/miscdevice.h include net/vmw_vsock/vmci_transport.c does not use any miscdevice so this patch remove this unnecessary inclusion. Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 10ae7823a19d..0206155bff53 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include From e454cf5958538666635488030046b6a84a22d447 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:55 -0400 Subject: [PATCH 0234/1322] bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE This is a simple non-recursive delete operation. It prunes paths of empty nodes in the tree, but it does not try to further compress the tree as nodes are removed. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b767844a76f..9d58a576b2ae 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -389,10 +389,84 @@ out: return ret; } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ +static int trie_delete_elem(struct bpf_map *map, void *_key) { - /* TODO */ - return -ENOSYS; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key; + struct lpm_trie_node __rcu **trim; + struct lpm_trie_node *node; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Walk the tree looking for an exact key/length match and keeping + * track of where we could begin trimming the tree. The trim-point + * is the sub-tree along the walk consisting of only single-child + * intermediate nodes and ending at a leaf node that we want to + * remove. + */ + trim = &trie->root; + node = rcu_dereference_protected( + trie->root, lockdep_is_held(&trie->lock)); + while (node) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + /* If we hit a node that has more than one child or is a valid + * prefix itself, do not remove it. Reset the root of the trim + * path to its descendant on our path. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || + (node->child[0] && node->child[1])) + trim = &node->child[next_bit]; + node = rcu_dereference_protected( + node->child[next_bit], lockdep_is_held(&trie->lock)); + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing is not a leaf node, simply mark it + * as intermediate and we are done. + */ + if (rcu_access_pointer(node->child[0]) || + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* trim should now point to the slot holding the start of a path from + * zero or more intermediate nodes to our leaf node for deletion. + */ + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { + RCU_INIT_POINTER(*trim, NULL); + trim = rcu_access_pointer(node->child[0]) ? + &node->child[0] : + &node->child[1]; + kfree_rcu(node, rcu); + } + +out: + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; } #define LPM_DATA_SIZE_MAX 256 From bae30468dfc1522464f16051addd7bc7da6da75a Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:56 -0400 Subject: [PATCH 0235/1322] bpf: Add uniqueness invariant to trivial lpm test implementation The 'trivial' lpm implementation in this test allows equivalent nodes to be added (that is, nodes consisting of the same prefix and prefix length). For lookup operations, this is fine because insertion happens at the head of the (singly linked) list and the first, best match is returned. In order to support deletion, the tlpm data structue must first enforce uniqueness. This change modifies the insertion algorithm to search for equivalent nodes and remove them. 
Note: the BPF_MAP_TYPE_LPM_TRIE already has a uniqueness invariant that is implemented as node replacement. Signed-off-by: Craig Gallek Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_lpm_map.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index e97565243d59..a5ed7c4f1819 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -31,6 +31,10 @@ struct tlpm_node { uint8_t key[]; }; +static struct tlpm_node *tlpm_match(struct tlpm_node *list, + const uint8_t *key, + size_t n_bits); + static struct tlpm_node *tlpm_add(struct tlpm_node *list, const uint8_t *key, size_t n_bits) @@ -38,9 +42,17 @@ static struct tlpm_node *tlpm_add(struct tlpm_node *list, struct tlpm_node *node; size_t n; + n = (n_bits + 7) / 8; + + /* 'overwrite' an equivalent entry if one already exists */ + node = tlpm_match(list, key, n_bits); + if (node && node->n_bits == n_bits) { + memcpy(node->key, key, n); + return list; + } + /* add new entry with @key/@n_bits to @list and return new head */ - n = (n_bits + 7) / 8; node = malloc(sizeof(*node) + n); assert(node); From e8d1749910b56a7248cb9a5392274348d10108bb Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:57 -0400 Subject: [PATCH 0236/1322] bpf: Test deletion in BPF_MAP_TYPE_LPM_TRIE Extend the 'random' operation tests to include a delete operation (delete half of the nodes from both lpm implementations and ensure that lookups are still equivalent). Also, add a simple IPv4 test which verifies lookup behavior as nodes are deleted from the tree. Signed-off-by: Craig Gallek Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- tools/testing/selftests/bpf/test_lpm_map.c | 187 ++++++++++++++++++++- 1 file changed, 183 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index a5ed7c4f1819..6fedb9fec36b 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -104,6 +104,34 @@ static struct tlpm_node *tlpm_match(struct tlpm_node *list, return best; } +static struct tlpm_node *tlpm_delete(struct tlpm_node *list, + const uint8_t *key, + size_t n_bits) +{ + struct tlpm_node *best = tlpm_match(list, key, n_bits); + struct tlpm_node *node; + + if (!best || best->n_bits != n_bits) + return list; + + if (best == list) { + node = best->next; + free(best); + return node; + } + + for (node = list; node; node = node->next) { + if (node->next == best) { + node->next = best->next; + free(best); + return list; + } + } + /* should never get here */ + assert(0); + return list; +} + static void test_lpm_basic(void) { struct tlpm_node *list = NULL, *t1, *t2; @@ -126,6 +154,13 @@ static void test_lpm_basic(void) assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15)); assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16)); + list = tlpm_delete(list, (uint8_t[]){ 0xff, 0xff }, 16); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8)); + assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16)); + + list = tlpm_delete(list, (uint8_t[]){ 0xff }, 8); + assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8)); + tlpm_clear(list); } @@ -170,7 +205,7 @@ static void test_lpm_order(void) static void test_lpm_map(int keysize) { - size_t i, j, n_matches, n_nodes, n_lookups; + size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; struct tlpm_node *t, *list = NULL; struct bpf_lpm_trie_key *key; uint8_t *data, *value; @@ -182,6 +217,7 @@ static void test_lpm_map(int keysize) */ n_matches = 0; + n_matches_after_delete = 0; n_nodes = 1 << 8; n_lookups = 1 << 16; @@ -235,15 +271,54 @@ static void test_lpm_map(int keysize) } } + /* Remove the first half of the elements in the tlpm and the + * corresponding nodes from the bpf-lpm. Then run the same + * large number of random lookups in both and make sure they match. + * Note: we need to count the number of nodes actually inserted + * since there may have been duplicates. + */ + for (i = 0, t = list; t; i++, t = t->next) + ; + for (j = 0; j < i / 2; ++j) { + key->prefixlen = list->n_bits; + memcpy(key->data, list->key, keysize); + r = bpf_map_delete_elem(map, key); + assert(!r); + list = tlpm_delete(list, list->key, list->n_bits); + assert(list); + } + for (i = 0; i < n_lookups; ++i) { + for (j = 0; j < keysize; ++j) + data[j] = rand() & 0xff; + + t = tlpm_match(list, data, 8 * keysize); + + key->prefixlen = 8 * keysize; + memcpy(key->data, data, keysize); + r = bpf_map_lookup_elem(map, key, value); + assert(!r || errno == ENOENT); + assert(!t == !!r); + + if (t) { + ++n_matches_after_delete; + assert(t->n_bits == value[keysize]); + for (j = 0; j < t->n_bits; ++j) + assert((t->key[j / 8] & (1 << (7 - j % 8))) == + (value[j / 8] & (1 << (7 - j % 8)))); + } + } + close(map); tlpm_clear(list); /* With 255 random nodes in the map, we are pretty likely to match * something on every lookup. 
For statistics, use this: * - * printf(" nodes: %zu\n" - * "lookups: %zu\n" - * "matches: %zu\n", n_nodes, n_lookups, n_matches); + * printf(" nodes: %zu\n" + * " lookups: %zu\n" + * " matches: %zu\n" + * "matches(delete): %zu\n", + * n_nodes, n_lookups, n_matches, n_matches_after_delete); */ } @@ -343,6 +418,108 @@ static void test_lpm_ipaddr(void) close(map_fd_ipv6); } +static void test_lpm_delete(void) +{ + struct bpf_lpm_trie_key *key; + size_t key_size; + int map_fd; + __u64 value; + + key_size = sizeof(*key) + sizeof(__u32); + key = alloca(key_size); + + map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, + key_size, sizeof(value), + 100, BPF_F_NO_PREALLOC); + assert(map_fd >= 0); + + /* Add nodes: + * 192.168.0.0/16 (1) + * 192.168.0.0/24 (2) + * 192.168.128.0/24 (3) + * 192.168.1.0/24 (4) + * + * (1) + * / \ + * (IM) (3) + * / \ + * (2) (4) + */ + value = 1; + key->prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", key->data); + assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0); + + value = 2; + key->prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", key->data); + assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0); + + value = 3; + key->prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", key->data); + assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0); + + value = 4; + key->prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", key->data); + assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0); + + /* remove non-existent node */ + key->prefixlen = 32; + inet_pton(AF_INET, "10.0.0.1", key->data); + assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 && + errno == ENOENT); + + /* assert initial lookup */ + key->prefixlen = 32; + inet_pton(AF_INET, "192.168.0.1", key->data); + assert(bpf_map_lookup_elem(map_fd, key, &value) == 0); + assert(value == 2); + + /* remove leaf node */ + key->prefixlen = 24; + inet_pton(AF_INET, "192.168.0.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == 0); + + key->prefixlen = 32; + inet_pton(AF_INET, "192.168.0.1", key->data); + assert(bpf_map_lookup_elem(map_fd, key, &value) == 0); + assert(value == 1); + + /* remove leaf (and intermediary) node */ + key->prefixlen = 24; + inet_pton(AF_INET, "192.168.1.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == 0); + + key->prefixlen = 32; + inet_pton(AF_INET, "192.168.1.1", key->data); + assert(bpf_map_lookup_elem(map_fd, key, &value) == 0); + assert(value == 1); + + /* remove root node */ + key->prefixlen = 16; + inet_pton(AF_INET, "192.168.0.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == 0); + + key->prefixlen = 32; + inet_pton(AF_INET, "192.168.128.1", key->data); + assert(bpf_map_lookup_elem(map_fd, key, &value) == 0); + assert(value == 3); + + /* remove last node */ + key->prefixlen = 24; + inet_pton(AF_INET, "192.168.128.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == 0); + + key->prefixlen = 32; + inet_pton(AF_INET, "192.168.128.1", key->data); + assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 && + errno == ENOENT); + + close(map_fd); +} + int main(void) { struct rlimit limit = { RLIM_INFINITY, RLIM_INFINITY }; @@ -365,6 +542,8 @@ int main(void) test_lpm_ipaddr(); + test_lpm_delete(); + printf("test_lpm: OK\n"); return 0; } From b247c211ee367cfe975dd54e381094083adfd8d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 19 Sep 2017 02:43:13 +0200 Subject: [PATCH 0237/1322] PM: docs: Drop an excess character from devices.rst Drop an excess "`" from Documentation/driver-api/pm/devices.rst. 
Fixes: 2728b2d2e5be (PM / core / docs: Convert sleep states API document to reST) Signed-off-by: Rafael J. Wysocki --- Documentation/driver-api/pm/devices.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index bedd32388dac..a0dc2879a152 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -675,7 +675,7 @@ sub-domain of the parent domain. Support for power domains is provided through the :c:member:`pm_domain` field of |struct device|. This field is a pointer to an object of type -|struct dev_pm_domain|, defined in :file:`include/linux/pm.h``, providing a set +|struct dev_pm_domain|, defined in :file:`include/linux/pm.h`, providing a set of power management callbacks analogous to the subsystem-level and device driver callbacks that are executed for the given device during all power transitions, instead of the respective subsystem-level callbacks. Specifically, if a From 157c460e10cb6eca29ccbd0f023db159d0c55ec7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 19 Sep 2017 02:22:39 +0200 Subject: [PATCH 0238/1322] PM: core: Fix device_pm_check_callbacks() The device_pm_check_callbacks() function doesn't check legacy ->suspend and ->resume callback pointers under the device's bus type, class and driver, so in some cases it may set the no_pm_callbacks flag for the device incorrectly and then the callbacks may be skipped during system suspend/resume, which shouldn't happen. Fixes: aa8e54b55947 (PM / sleep: Go direct_complete if driver has no callbacks) Signed-off-by: Rafael J. Wysocki Cc: 4.5+ # 4.5+ --- drivers/base/power/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index ea1732ed7a9d..770b1539a083 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1860,10 +1860,13 @@ void device_pm_check_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_pm_callbacks = - (!dev->bus || pm_ops_is_empty(dev->bus->pm)) && - (!dev->class || pm_ops_is_empty(dev->class->pm)) && + (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && + !dev->bus->suspend && !dev->bus->resume)) && + (!dev->class || (pm_ops_is_empty(dev->class->pm) && + !dev->class->suspend && !dev->class->resume)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && - (!dev->driver || pm_ops_is_empty(dev->driver->pm)); + (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && + !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irq(&dev->power.lock); } From ff76898c0a14a43f71bfe63dd1903087e96ce238 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 19 Sep 2017 08:23:22 -0700 Subject: [PATCH 0239/1322] cpufreq: dt-platdev: Add some missing platforms to the blacklist Commit edeec420de24 (cpufreq: dt-platdev: Automatically create cpufreq device with OPP v2) missed adding few platforms to the blacklist which create the cpufreq-dt device from their own drivers, after some dependencies are sorted out. And for those platforms, both the platform specific driver and the cpufreq-dt-platdev driver try to create the cpufreq-dt device now. Fix that by including those platforms in the blacklist. This doesn't include the TI platforms, for which there is a separate patch. 
Fixes: edeec420de24 (cpufreq: dt-platdev: Automatically create cpufreq device with OPP v2) Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index a020da7940d6..430edadca527 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -106,6 +106,18 @@ static const struct of_device_id whitelist[] __initconst = { * platforms using "operating-points-v2" property. */ static const struct of_device_id blacklist[] __initconst = { + { .compatible = "calxeda,highbank", }, + { .compatible = "calxeda,ecx-2000", }, + + { .compatible = "marvell,armadaxp", }, + + { .compatible = "nvidia,tegra124", }, + + { .compatible = "st,stih407", }, + { .compatible = "st,stih410", }, + + { .compatible = "sigma,tango4", }, + { } }; From ed40fad9a568efe83f5e80897ded71117b6911b3 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 31 Aug 2017 22:24:36 +0200 Subject: [PATCH 0240/1322] ARM: cpuidle: Avoid memleak if init fail In case there are no DT idle states defined or cpuidle_register_driver() fails, the copy of the idle driver is leaked: unreferenced object 0xede0dc00 (size 1024): comm "swapper/0", pid 1, jiffies 4294937431 (age 744.510s) hex dump (first 32 bytes): 94 9e 0b c1 00 00 00 00 00 00 00 00 00 00 00 00 ................ 57 46 49 00 00 00 00 00 00 00 00 00 00 00 00 00 WFI............. backtrace: [] arm_idle_init+0x44/0x1ac [] do_one_initcall+0x3c/0x16c [] kernel_init_freeable+0x110/0x1d0 [] kernel_init+0x8/0x114 [] ret_from_fork+0x14/0x3c So fix this by freeing the unregistered copy in the error case. Signed-off-by: Stefan Wahren Fixes: d50a7d8acd78 (ARM: cpuidle: Support asymmetric idle definition) Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-arm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c index 7080c384ad5d..52a75053ee03 100644 --- a/drivers/cpuidle/cpuidle-arm.c +++ b/drivers/cpuidle/cpuidle-arm.c @@ -104,13 +104,13 @@ static int __init arm_idle_init(void) ret = dt_init_idle_driver(drv, arm_idle_state_match, 1); if (ret <= 0) { ret = ret ? : -ENODEV; - goto out_fail; + goto init_fail; } ret = cpuidle_register_driver(drv); if (ret) { pr_err("Failed to register cpuidle driver\n"); - goto out_fail; + goto init_fail; } /* @@ -149,6 +149,8 @@ static int __init arm_idle_init(void) } return 0; +init_fail: + kfree(drv); out_fail: while (--cpu >= 0) { dev = per_cpu(cpuidle_devices, cpu); From 0c0bceb796878a5baea1f47033215fac0774e498 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 8 Sep 2017 12:24:41 +0300 Subject: [PATCH 0241/1322] ACPI: properties: Return _DSD hierarchical extension (data) sub-nodes correctly The recently merged patch "ACPI: Prepare for constifying acpi_get_next_subnode() fwnode argument" was part of a patchset constifying the fwnode arguments across the fwnode property API. The purpose of the patch was to allow returning non-const fwnodes from a data structure the root of which is const. Unfortunately the patch broke the functionality: in particular, when starting parsing from an ACPI device node, the hierarchical data extension nodes would not be enumerated. Restore the old behaviour while still retaining the constness properties of the patch.
Fixes: 01c1da289791 "ACPI: Prepare for constifying acpi_get_next_subnode() fwnode argument" Signed-off-by: Sakari Ailus Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index c1c216163de3..265b74f440b6 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -908,11 +908,12 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { const struct acpi_device *adev = to_acpi_device_node(fwnode); - struct acpi_device *child_adev = NULL; const struct list_head *head; struct list_head *next; if (!child || is_acpi_device_node(child)) { + struct acpi_device *child_adev; + if (adev) head = &adev->children; else @@ -922,8 +923,8 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, goto nondev; if (child) { - child_adev = to_acpi_device_node(child); - next = child_adev->node.next; + adev = to_acpi_device_node(child); + next = adev->node.next; if (next == head) { child = NULL; goto nondev; @@ -941,8 +942,8 @@ struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, const struct acpi_data_node *data = to_acpi_data_node(fwnode); struct acpi_data_node *dn; - if (child_adev) - head = &child_adev->data.subnodes; + if (adev) + head = &adev->data.subnodes; else if (data) head = &data->data.subnodes; else From e2b2d35a052d9264a774715bc6aa3395a45dcfa2 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:08 +0200 Subject: [PATCH 0242/1322] mlxsw: spectrum: Change init order The multicast router offloading code is going to require the counter_pools initialization to occur before the router initialization, thus, change the spectrum initialization order to fix it. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 696b99e65a5a..97284161ea35 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3693,6 +3693,12 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core, goto err_switchdev_init; } + err = mlxsw_sp_counter_pool_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to init counter pool\n"); + goto err_counter_pool_init; + } + err = mlxsw_sp_router_init(mlxsw_sp); if (err) { dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n"); @@ -3711,12 +3717,6 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core, goto err_acl_init; } - err = mlxsw_sp_counter_pool_init(mlxsw_sp); - if (err) { - dev_err(mlxsw_sp->bus_info->dev, "Failed to init counter pool\n"); - goto err_counter_pool_init; - } - err = mlxsw_sp_dpipe_init(mlxsw_sp); if (err) { dev_err(mlxsw_sp->bus_info->dev, "Failed to init pipeline debug\n"); @@ -3734,14 +3734,14 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core, err_ports_create: mlxsw_sp_dpipe_fini(mlxsw_sp); err_dpipe_init: - mlxsw_sp_counter_pool_fini(mlxsw_sp); -err_counter_pool_init: mlxsw_sp_acl_fini(mlxsw_sp); err_acl_init: mlxsw_sp_span_fini(mlxsw_sp); err_span_init: mlxsw_sp_router_fini(mlxsw_sp); err_router_init: + mlxsw_sp_counter_pool_fini(mlxsw_sp); +err_counter_pool_init: mlxsw_sp_switchdev_fini(mlxsw_sp); err_switchdev_init: mlxsw_sp_lag_fini(mlxsw_sp); @@ -3760,10 +3760,10 @@ static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core) mlxsw_sp_ports_remove(mlxsw_sp); mlxsw_sp_dpipe_fini(mlxsw_sp); - mlxsw_sp_counter_pool_fini(mlxsw_sp); mlxsw_sp_acl_fini(mlxsw_sp); mlxsw_sp_span_fini(mlxsw_sp); mlxsw_sp_router_fini(mlxsw_sp); + mlxsw_sp_counter_pool_fini(mlxsw_sp); mlxsw_sp_switchdev_fini(mlxsw_sp); mlxsw_sp_lag_fini(mlxsw_sp); mlxsw_sp_buffers_fini(mlxsw_sp); From d3b939b8f9a571da82359b6baa5506c9179770d1 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:09 +0200 Subject: [PATCH 0243/1322] mlxsw: spectrum: Move ACL flexible actions instance to spectrum A flexible action instance allows, given a set of ops, creating, committing and sharing a set of ACL action blocks. The flexible action instance in question is using the spectrum KVD linear space to store the flexible action sets. Move this flexible action instance to the common spectrum struct to allow other users (such as multicast router) to get that functionality. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/Makefile | 2 +- .../net/ethernet/mellanox/mlxsw/spectrum.c | 10 ++ .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../ethernet/mellanox/mlxsw/spectrum_acl.c | 93 +------------ .../mlxsw/spectrum_acl_flex_actions.c | 129 ++++++++++++++++++ .../mlxsw/spectrum_acl_flex_actions.h | 44 ++++++ 6 files changed, 186 insertions(+), 93 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 891ff418bb5e..4b88158173f3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -17,7 +17,7 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_kvdl.o spectrum_acl_tcam.o \ spectrum_acl.o spectrum_flower.o \ spectrum_cnt.o spectrum_fid.o \ - spectrum_ipip.o + spectrum_ipip.o spectrum_acl_flex_actions.o mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB) += spectrum_dcb.o mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o obj-$(CONFIG_MLXSW_MINIMAL) += mlxsw_minimal.o diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 97284161ea35..6ba6ff276b17 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -69,6 +69,7 @@ #include "txheader.h" #include "spectrum_cnt.h" #include "spectrum_dpipe.h" +#include "spectrum_acl_flex_actions.h" #include "../mlxfw/mlxfw.h" #define MLXSW_FWREV_MAJOR 13 @@ -3699,6 +3700,12 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core, goto err_counter_pool_init; } + err = mlxsw_sp_afa_init(mlxsw_sp); + if (err) { + dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize ACL actions\n"); + goto err_afa_init; + } + err = mlxsw_sp_router_init(mlxsw_sp); if (err) { dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n"); @@ -3740,6 +3747,8 @@ err_acl_init: err_span_init: mlxsw_sp_router_fini(mlxsw_sp); err_router_init: + mlxsw_sp_afa_fini(mlxsw_sp); +err_afa_init: mlxsw_sp_counter_pool_fini(mlxsw_sp); err_counter_pool_init: mlxsw_sp_switchdev_fini(mlxsw_sp); @@ -3763,6 +3772,7 @@ static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core) mlxsw_sp_acl_fini(mlxsw_sp); mlxsw_sp_span_fini(mlxsw_sp); mlxsw_sp_router_fini(mlxsw_sp); + mlxsw_sp_afa_fini(mlxsw_sp); mlxsw_sp_counter_pool_fini(mlxsw_sp); mlxsw_sp_switchdev_fini(mlxsw_sp); mlxsw_sp_lag_fini(mlxsw_sp); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 84ce83acdc19..7180d8f3de75 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -152,6 +152,7 @@ struct mlxsw_sp { struct mlxsw_sp_sb *sb; struct mlxsw_sp_bridge *bridge; struct mlxsw_sp_router *router; + struct mlxsw_afa *afa; struct mlxsw_sp_acl *acl; struct mlxsw_sp_fid_core *fid_core; struct { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 4b2455e3e079..2523785f1904 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -52,7 +52,6 @@ struct mlxsw_sp_acl { struct mlxsw_sp *mlxsw_sp; struct mlxsw_afk *afk; - struct mlxsw_afa *afa; struct mlxsw_sp_fid *dummy_fid; const struct mlxsw_sp_acl_ops *ops; struct rhashtable ruleset_ht; @@ -333,7 
+332,7 @@ mlxsw_sp_acl_rulei_create(struct mlxsw_sp_acl *acl) rulei = kzalloc(sizeof(*rulei), GFP_KERNEL); if (!rulei) return NULL; - rulei->act_block = mlxsw_afa_block_create(acl->afa); + rulei->act_block = mlxsw_afa_block_create(acl->mlxsw_sp->afa); if (IS_ERR(rulei->act_block)) { err = PTR_ERR(rulei->act_block); goto err_afa_block_create; @@ -653,85 +652,6 @@ int mlxsw_sp_acl_rule_get_stats(struct mlxsw_sp *mlxsw_sp, return 0; } -#define MLXSW_SP_KDVL_ACT_EXT_SIZE 1 - -static int mlxsw_sp_act_kvdl_set_add(void *priv, u32 *p_kvdl_index, - char *enc_actions, bool is_first) -{ - struct mlxsw_sp *mlxsw_sp = priv; - char pefa_pl[MLXSW_REG_PEFA_LEN]; - u32 kvdl_index; - int err; - - /* The first action set of a TCAM entry is stored directly in TCAM, - * not KVD linear area. - */ - if (is_first) - return 0; - - err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KDVL_ACT_EXT_SIZE, - &kvdl_index); - if (err) - return err; - mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, enc_actions); - err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl); - if (err) - goto err_pefa_write; - *p_kvdl_index = kvdl_index; - return 0; - -err_pefa_write: - mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); - return err; -} - -static void mlxsw_sp_act_kvdl_set_del(void *priv, u32 kvdl_index, - bool is_first) -{ - struct mlxsw_sp *mlxsw_sp = priv; - - if (is_first) - return; - mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); -} - -static int mlxsw_sp_act_kvdl_fwd_entry_add(void *priv, u32 *p_kvdl_index, - u8 local_port) -{ - struct mlxsw_sp *mlxsw_sp = priv; - char ppbs_pl[MLXSW_REG_PPBS_LEN]; - u32 kvdl_index; - int err; - - err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &kvdl_index); - if (err) - return err; - mlxsw_reg_ppbs_pack(ppbs_pl, kvdl_index, local_port); - err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbs), ppbs_pl); - if (err) - goto err_ppbs_write; - *p_kvdl_index = kvdl_index; - return 0; - -err_ppbs_write: - mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); - return err; -} - -static void mlxsw_sp_act_kvdl_fwd_entry_del(void *priv, u32 kvdl_index) -{ - struct mlxsw_sp *mlxsw_sp = priv; - - mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); -} - -static const struct mlxsw_afa_ops mlxsw_sp_act_afa_ops = { - .kvdl_set_add = mlxsw_sp_act_kvdl_set_add, - .kvdl_set_del = mlxsw_sp_act_kvdl_set_del, - .kvdl_fwd_entry_add = mlxsw_sp_act_kvdl_fwd_entry_add, - .kvdl_fwd_entry_del = mlxsw_sp_act_kvdl_fwd_entry_del, -}; - int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp) { const struct mlxsw_sp_acl_ops *acl_ops = &mlxsw_sp_acl_tcam_ops; @@ -753,14 +673,6 @@ int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp) goto err_afk_create; } - acl->afa = mlxsw_afa_create(MLXSW_CORE_RES_GET(mlxsw_sp->core, - ACL_ACTIONS_PER_SET), - &mlxsw_sp_act_afa_ops, mlxsw_sp); - if (IS_ERR(acl->afa)) { - err = PTR_ERR(acl->afa); - goto err_afa_create; - } - err = rhashtable_init(&acl->ruleset_ht, &mlxsw_sp_acl_ruleset_ht_params); if (err) @@ -792,8 +704,6 @@ err_acl_ops_init: err_fid_get: rhashtable_destroy(&acl->ruleset_ht); err_rhashtable_init: - mlxsw_afa_destroy(acl->afa); -err_afa_create: mlxsw_afk_destroy(acl->afk); err_afk_create: kfree(acl); @@ -810,7 +720,6 @@ void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp) WARN_ON(!list_empty(&acl->rules)); mlxsw_sp_fid_put(acl->dummy_fid); rhashtable_destroy(&acl->ruleset_ht); - mlxsw_afa_destroy(acl->afa); mlxsw_afk_destroy(acl->afk); kfree(acl); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c new file mode 100644 index 
000000000000..4d3340ed0291 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c @@ -0,0 +1,129 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spectrum_acl_flex_actions.h" +#include "core_acl_flex_actions.h" + +#define MLXSW_SP_KVDL_ACT_EXT_SIZE 1 + +static int mlxsw_sp_act_kvdl_set_add(void *priv, u32 *p_kvdl_index, + char *enc_actions, bool is_first) +{ + struct mlxsw_sp *mlxsw_sp = priv; + char pefa_pl[MLXSW_REG_PEFA_LEN]; + u32 kvdl_index; + int err; + + /* The first action set of a TCAM entry is stored directly in TCAM, + * not KVD linear area. 
+ */ + if (is_first) + return 0; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ACT_EXT_SIZE, + &kvdl_index); + if (err) + return err; + mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, enc_actions); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl); + if (err) + goto err_pefa_write; + *p_kvdl_index = kvdl_index; + return 0; + +err_pefa_write: + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); + return err; +} + +static void mlxsw_sp_act_kvdl_set_del(void *priv, u32 kvdl_index, + bool is_first) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + if (is_first) + return; + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); +} + +static int mlxsw_sp_act_kvdl_fwd_entry_add(void *priv, u32 *p_kvdl_index, + u8 local_port) +{ + struct mlxsw_sp *mlxsw_sp = priv; + char ppbs_pl[MLXSW_REG_PPBS_LEN]; + u32 kvdl_index; + int err; + + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, 1, &kvdl_index); + if (err) + return err; + mlxsw_reg_ppbs_pack(ppbs_pl, kvdl_index, local_port); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbs), ppbs_pl); + if (err) + goto err_ppbs_write; + *p_kvdl_index = kvdl_index; + return 0; + +err_ppbs_write: + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); + return err; +} + +static void mlxsw_sp_act_kvdl_fwd_entry_del(void *priv, u32 kvdl_index) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + mlxsw_sp_kvdl_free(mlxsw_sp, kvdl_index); +} + +static const struct mlxsw_afa_ops mlxsw_sp_act_afa_ops = { + .kvdl_set_add = mlxsw_sp_act_kvdl_set_add, + .kvdl_set_del = mlxsw_sp_act_kvdl_set_del, + .kvdl_fwd_entry_add = mlxsw_sp_act_kvdl_fwd_entry_add, + .kvdl_fwd_entry_del = mlxsw_sp_act_kvdl_fwd_entry_del, +}; + +int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_sp->afa = mlxsw_afa_create(MLXSW_CORE_RES_GET(mlxsw_sp->core, + ACL_ACTIONS_PER_SET), + &mlxsw_sp_act_afa_ops, mlxsw_sp); + return PTR_ERR_OR_ZERO(mlxsw_sp->afa); +} + +void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp) +{ + mlxsw_afa_destroy(mlxsw_sp->afa); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h new file mode 100644 index 000000000000..2726192836ad --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h @@ -0,0 +1,44 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H +#define _MLXSW_SPECTRUM_ACL_FLEX_KEYS_H + +#include "spectrum.h" + +int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp); +void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp); + +#endif From 4b8a79ff27645c1201287c3b17091add748d1fb9 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:10 +0200 Subject: [PATCH 0244/1322] mlxsw: acl: Introduce mcrouter ACL action The Spectrum multicast forwarding is done using an ACL action. Add the mcrouter ACL action that will be used to offload the multicast router logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/core_acl_flex_actions.c | 71 +++++++++++++++++++ .../mellanox/mlxsw/core_acl_flex_actions.h | 3 + 2 files changed, 74 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c index 5ae110172c22..65a32d7b4350 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c @@ -891,3 +891,74 @@ int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid) return 0; } EXPORT_SYMBOL(mlxsw_afa_block_append_fid_set); + +/* MC Routing Action + * ----------------- + * The Multicast router action. Can be used by RMFT_V2 - Router Multicast + * Forwarding Table Version 2 Register. + */ + +#define MLXSW_AFA_MCROUTER_CODE 0x10 +#define MLXSW_AFA_MCROUTER_SIZE 2 + +enum mlxsw_afa_mcrouter_rpf_action { + MLXSW_AFA_MCROUTER_RPF_ACTION_NOP, + MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP, + MLXSW_AFA_MCROUTER_RPF_ACTION_DISCARD_ERROR, +}; + +/* afa_mcrouter_rpf_action */ +MLXSW_ITEM32(afa, mcrouter, rpf_action, 0x00, 28, 3); + +/* afa_mcrouter_expected_irif */ +MLXSW_ITEM32(afa, mcrouter, expected_irif, 0x00, 0, 16); + +/* afa_mcrouter_min_mtu */ +MLXSW_ITEM32(afa, mcrouter, min_mtu, 0x08, 0, 16); + +enum mlxsw_afa_mrouter_vrmid { + MLXSW_AFA_MCROUTER_VRMID_INVALID, + MLXSW_AFA_MCROUTER_VRMID_VALID +}; + +/* afa_mcrouter_vrmid + * Valid RMID: rigr_rmid_index is used as RMID + */ +MLXSW_ITEM32(afa, mcrouter, vrmid, 0x0C, 31, 1); + +/* afa_mcrouter_rigr_rmid_index + * When the vrmid field is set to invalid, the field is used as pointer to + * Router Interface Group (RIGR) Table in the KVD linear. + * When the vrmid is set to valid, the field is used as RMID index, ranged + * from 0 to max_mid - 1. The index is to the Port Group Table. 
+ */ +MLXSW_ITEM32(afa, mcrouter, rigr_rmid_index, 0x0C, 0, 24); + +static inline void +mlxsw_afa_mcrouter_pack(char *payload, + enum mlxsw_afa_mcrouter_rpf_action rpf_action, + u16 expected_irif, u16 min_mtu, + enum mlxsw_afa_mrouter_vrmid vrmid, u32 rigr_rmid_index) + +{ + mlxsw_afa_mcrouter_rpf_action_set(payload, rpf_action); + mlxsw_afa_mcrouter_expected_irif_set(payload, expected_irif); + mlxsw_afa_mcrouter_min_mtu_set(payload, min_mtu); + mlxsw_afa_mcrouter_vrmid_set(payload, vrmid); + mlxsw_afa_mcrouter_rigr_rmid_index_set(payload, rigr_rmid_index); +} + +int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block, + u16 expected_irif, u16 min_mtu, + bool rmid_valid, u32 kvdl_index) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_MCROUTER_CODE, + MLXSW_AFA_MCROUTER_SIZE); + if (!act) + return -ENOBUFS; + mlxsw_afa_mcrouter_pack(act, MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP, + expected_irif, min_mtu, rmid_valid, kvdl_index); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_mcrouter); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h index f99c341b2497..5dbb31fa5a27 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h @@ -68,5 +68,8 @@ int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block, int mlxsw_afa_block_append_counter(struct mlxsw_afa_block *block, u32 counter_index); int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid); +int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block, + u16 expected_irif, u16 min_mtu, + bool rmid_valid, u32 kvdl_index); #endif From 9cb3fa940e2c1c62d35972ab8433531a4ba421a5 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:11 +0200 Subject: [PATCH 0245/1322] mlxsw: acl: Change trap ACL action to get the trap_id as a parameter Allow the trap ACL action to be configured with different traps. This allows the multicast router offloading code to use that same ACL action with the multicast router traps. By using different traps, the multicast router can have different trap policies and can handle the packet differently. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c | 4 ++-- drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c index 65a32d7b4350..ab3ffe7a8eda 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c @@ -712,7 +712,7 @@ int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block) } EXPORT_SYMBOL(mlxsw_afa_block_append_drop); -int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block) +int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id) { char *act = mlxsw_afa_block_append_action(block, MLXSW_AFA_TRAPDISC_CODE, @@ -722,7 +722,7 @@ int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block) return -ENOBUFS; mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP, MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD, - MLXSW_TRAP_ID_ACL0); + trap_id); return 0; } EXPORT_SYMBOL(mlxsw_afa_block_append_trap); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h index 5dbb31fa5a27..501819c790d6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h @@ -60,7 +60,7 @@ u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block); void mlxsw_afa_block_continue(struct mlxsw_afa_block *block); void mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id); int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block); -int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block); +int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id); int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block, u8 local_port, bool in_port); int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 2523785f1904..eede75fbd585 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -396,7 +396,8 @@ int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei) int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei) { - return mlxsw_afa_block_append_trap(rulei->act_block); + return mlxsw_afa_block_append_trap(rulei->act_block, + MLXSW_TRAP_ID_ACL0); } int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, From 587265655159d73247a56236092917131183496e Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:12 +0200 Subject: [PATCH 0246/1322] mlxsw: reg: Rename the flexible action set length field The MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN is relevant for the multicast router registers too, so rename it to have a general name which is not bound to a specific register. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index cc27c5de5a1d..fb8ab441b11e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -2142,15 +2142,14 @@ MLXSW_REG_DEFINE(pefa, MLXSW_REG_PEFA_ID, MLXSW_REG_PEFA_LEN); */ MLXSW_ITEM32(reg, pefa, index, 0x00, 0, 24); -#define MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN 0xA8 +#define MLXSW_REG_FLEX_ACTION_SET_LEN 0xA8 /* reg_pefa_flex_action_set * Action-set to perform when rule is matched. * Must be zero padded if action set is shorter. * Access: RW */ -MLXSW_ITEM_BUF(reg, pefa, flex_action_set, 0x08, - MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN); +MLXSW_ITEM_BUF(reg, pefa, flex_action_set, 0x08, MLXSW_REG_FLEX_ACTION_SET_LEN); static inline void mlxsw_reg_pefa_pack(char *payload, u32 index, const char *flex_action_set) @@ -2243,7 +2242,7 @@ MLXSW_ITEM_BUF(reg, ptce2, mask, 0x80, * Access: RW */ MLXSW_ITEM_BUF(reg, ptce2, flex_action_set, 0xE0, - MLXSW_REG_PXXX_FLEX_ACTION_SET_LEN); + MLXSW_REG_FLEX_ACTION_SET_LEN); static inline void mlxsw_reg_ptce2_pack(char *payload, bool valid, enum mlxsw_reg_ptce2_op op, From 46a7054ebace0fcd0d1826881aa5ab219faa6a77 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:13 +0200 Subject: [PATCH 0247/1322] mlxsw: reg: Add The Router TCAM Allocation register This register is used for allocation of regions in the TCAM table and it will be used by the multicast router offloading logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 52 +++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index fb8ab441b11e..e9f37eac8788 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4310,6 +4310,57 @@ mlxsw_reg_ritr_loopback_ipip4_pack(char *payload, mlxsw_reg_ritr_loopback_ipip_usip4_set(payload, usip); } +/* RTAR - Router TCAM Allocation Register + * -------------------------------------- + * This register is used for allocation of regions in the TCAM table. + */ +#define MLXSW_REG_RTAR_ID 0x8004 +#define MLXSW_REG_RTAR_LEN 0x20 + +MLXSW_REG_DEFINE(rtar, MLXSW_REG_RTAR_ID, MLXSW_REG_RTAR_LEN); + +enum mlxsw_reg_rtar_op { + MLXSW_REG_RTAR_OP_ALLOCATE, + MLXSW_REG_RTAR_OP_RESIZE, + MLXSW_REG_RTAR_OP_DEALLOCATE, +}; + +/* reg_rtar_op + * Access: WO + */ +MLXSW_ITEM32(reg, rtar, op, 0x00, 28, 4); + +enum mlxsw_reg_rtar_key_type { + MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST = 1, + MLXSW_REG_RTAR_KEY_TYPE_IPV6_MULTICAST = 3 +}; + +/* reg_rtar_key_type + * TCAM key type for the region. + * Access: WO + */ +MLXSW_ITEM32(reg, rtar, key_type, 0x00, 0, 8); + +/* reg_rtar_region_size + * TCAM region size. When allocating/resizing this is the requested + * size, the response is the actual size. + * Note: Actual size may be larger than requested. 
+ * Reserved for op = Deallocate + * Access: WO + */ +MLXSW_ITEM32(reg, rtar, region_size, 0x04, 0, 16); + +static inline void mlxsw_reg_rtar_pack(char *payload, + enum mlxsw_reg_rtar_op op, + enum mlxsw_reg_rtar_key_type key_type, + u16 region_size) +{ + MLXSW_REG_ZERO(rtar, payload); + mlxsw_reg_rtar_op_set(payload, op); + mlxsw_reg_rtar_key_type_set(payload, key_type); + mlxsw_reg_rtar_region_size_set(payload, region_size); +} + /* RATR - Router Adjacency Table Register * -------------------------------------- * The RATR register is used to configure the Router Adjacency (next-hop) @@ -6855,6 +6906,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(hpkt), MLXSW_REG(rgcr), MLXSW_REG(ritr), + MLXSW_REG(rtar), MLXSW_REG(ratr), MLXSW_REG(rtdp), MLXSW_REG(ricnt), From 5080c7e91701744ef1a5d7aab51f568f889bfddb Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:14 +0200 Subject: [PATCH 0248/1322] mlxsw: reg: Add the Router Interface Group Version 2 register The RIGR-V2 register is used to add, remove and query egress interface list of a multicast forwarding entry and it will be used by the multicast router offloading logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 83 +++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index e9f37eac8788..1778d7f5f843 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5646,6 +5646,88 @@ mlxsw_reg_rtdp_ipip4_pack(char *payload, u16 irif, mlxsw_reg_rtdp_ipip_expected_gre_key_set(payload, expected_gre_key); } +/* RIGR-V2 - Router Interface Group Register Version 2 + * --------------------------------------------------- + * The RIGR_V2 register is used to add, remove and query egress interface list + * of a multicast forwarding entry. + */ +#define MLXSW_REG_RIGR2_ID 0x8023 +#define MLXSW_REG_RIGR2_LEN 0xB0 + +#define MLXSW_REG_RIGR2_MAX_ERIFS 32 + +MLXSW_REG_DEFINE(rigr2, MLXSW_REG_RIGR2_ID, MLXSW_REG_RIGR2_LEN); + +/* reg_rigr2_rigr_index + * KVD Linear index. + * Access: Index + */ +MLXSW_ITEM32(reg, rigr2, rigr_index, 0x04, 0, 24); + +/* reg_rigr2_vnext + * Next RIGR Index is valid. + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, vnext, 0x08, 31, 1); + +/* reg_rigr2_next_rigr_index + * Next RIGR Index. The index is to the KVD linear. + * Reserved when vnxet = '0'. + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, next_rigr_index, 0x08, 0, 24); + +/* reg_rigr2_vrmid + * RMID Index is valid. + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, vrmid, 0x20, 31, 1); + +/* reg_rigr2_rmid_index + * RMID Index. + * Range 0 .. max_mid - 1 + * Reserved when vrmid = '0'. + * The index is to the Port Group Table (PGT) + * Access: RW + */ +MLXSW_ITEM32(reg, rigr2, rmid_index, 0x20, 0, 16); + +/* reg_rigr2_erif_entry_v + * Egress Router Interface is valid. + * Note that low-entries must be set if high-entries are set. For + * example: if erif_entry[2].v is set then erif_entry[1].v and + * erif_entry[0].v must be set. + * Index can be from 0 to cap_mc_erif_list_entries-1 + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_v, 0x24, 31, 1, 4, 0, false); + +/* reg_rigr2_erif_entry_erif + * Egress Router Interface. 
+ * Valid range is from 0 to cap_max_router_interfaces - 1 + * Index can be from 0 to MLXSW_REG_RIGR2_MAX_ERIFS - 1 + * Access: RW + */ +MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_erif, 0x24, 0, 16, 4, 0, false); + +static inline void mlxsw_reg_rigr2_pack(char *payload, u32 rigr_index, + bool vnext, u32 next_rigr_index) +{ + MLXSW_REG_ZERO(rigr2, payload); + mlxsw_reg_rigr2_rigr_index_set(payload, rigr_index); + mlxsw_reg_rigr2_vnext_set(payload, vnext); + mlxsw_reg_rigr2_next_rigr_index_set(payload, next_rigr_index); + mlxsw_reg_rigr2_vrmid_set(payload, 0); + mlxsw_reg_rigr2_rmid_index_set(payload, 0); +} + +static inline void mlxsw_reg_rigr2_erif_entry_pack(char *payload, int index, + bool v, u16 erif) +{ + mlxsw_reg_rigr2_erif_entry_v_set(payload, index, v); + mlxsw_reg_rigr2_erif_entry_erif_set(payload, index, erif); +} + /* MFCR - Management Fan Control Register * -------------------------------------- * This register controls the settings of the Fan Speed PWM mechanism. @@ -6917,6 +6999,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(rauht), MLXSW_REG(raleu), MLXSW_REG(rauhtd), + MLXSW_REG(rigr2), MLXSW_REG(mfcr), MLXSW_REG(mfsc), MLXSW_REG(mfsm), From 771ced742a4f02ac248ad679325bd434843d78d0 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:15 +0200 Subject: [PATCH 0249/1322] mlxsw: resources: Add multicast ERIF list entries resource The multicast ERIF list entries resource indicates the number of entries that can be put in one rigr2 register operation. While the register can hold up to MLXSW_REG_RIGR2_MAX_ERIFS ( = 32) ERIF entries, the actual number allowed by firmware is indicated with this resource. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/resources.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h index 9556d934714b..087aad52c195 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/resources.h +++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h @@ -63,6 +63,7 @@ enum mlxsw_res_id { MLXSW_RES_ID_MAX_CPU_POLICERS, MLXSW_RES_ID_MAX_VRS, MLXSW_RES_ID_MAX_RIFS, + MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES, MLXSW_RES_ID_MAX_LPM_TREES, /* Internal resources. @@ -100,6 +101,7 @@ static u16 mlxsw_res_ids[] = { [MLXSW_RES_ID_MAX_CPU_POLICERS] = 0x2A13, [MLXSW_RES_ID_MAX_VRS] = 0x2C01, [MLXSW_RES_ID_MAX_RIFS] = 0x2C02, + [MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES] = 0x2C10, [MLXSW_RES_ID_MAX_LPM_TREES] = 0x2C30, }; From 2e654e33c5791332d7abf759fd9d34a39082ffc7 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:16 +0200 Subject: [PATCH 0250/1322] mlxsw: reg: Add the Router Multicast Forwarding Table Version 2 register The RMFT-V2 register is used to configure and query the multicast table and will be used by the multicast router offloading logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 142 ++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 1778d7f5f843..046525ebe5ac 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -5728,6 +5728,147 @@ static inline void mlxsw_reg_rigr2_erif_entry_pack(char *payload, int index, mlxsw_reg_rigr2_erif_entry_erif_set(payload, index, erif); } +/* RMFT-V2 - Router Multicast Forwarding Table Version 2 Register + * -------------------------------------------------------------- + * The RMFT_V2 register is used to configure and query the multicast table. + */ +#define MLXSW_REG_RMFT2_ID 0x8027 +#define MLXSW_REG_RMFT2_LEN 0x174 + +MLXSW_REG_DEFINE(rmft2, MLXSW_REG_RMFT2_ID, MLXSW_REG_RMFT2_LEN); + +/* reg_rmft2_v + * Valid + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, v, 0x00, 31, 1); + +enum mlxsw_reg_rmft2_type { + MLXSW_REG_RMFT2_TYPE_IPV4, + MLXSW_REG_RMFT2_TYPE_IPV6 +}; + +/* reg_rmft2_type + * Access: Index + */ +MLXSW_ITEM32(reg, rmft2, type, 0x00, 28, 2); + +enum mlxsw_sp_reg_rmft2_op { + /* For Write: + * Write operation. Used to write a new entry to the table. All RW + * fields are relevant for new entry. Activity bit is set for new + * entries - Note write with v (Valid) 0 will delete the entry. + * For Query: + * Read operation + */ + MLXSW_REG_RMFT2_OP_READ_WRITE, +}; + +/* reg_rmft2_op + * Operation. + * Access: OP + */ +MLXSW_ITEM32(reg, rmft2, op, 0x00, 20, 2); + +/* reg_rmft2_a + * Activity. Set for new entries. Set if a packet lookup has hit on the specific + * entry. + * Access: RO + */ +MLXSW_ITEM32(reg, rmft2, a, 0x00, 16, 1); + +/* reg_rmft2_offset + * Offset within the multicast forwarding table to write to. + * Access: Index + */ +MLXSW_ITEM32(reg, rmft2, offset, 0x00, 0, 16); + +/* reg_rmft2_virtual_router + * Virtual Router ID. Range from 0..cap_max_virtual_routers-1 + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, virtual_router, 0x04, 0, 16); + +enum mlxsw_reg_rmft2_irif_mask { + MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, + MLXSW_REG_RMFT2_IRIF_MASK_COMPARE +}; + +/* reg_rmft2_irif_mask + * Ingress RIF mask. + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, irif_mask, 0x08, 24, 1); + +/* reg_rmft2_irif + * Ingress RIF index. + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, irif, 0x08, 0, 16); + +/* reg_rmft2_dip4 + * Destination IPv4 address + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, dip4, 0x1C, 0, 32); + +/* reg_rmft2_dip4_mask + * A bit that is set directs the TCAM to compare the corresponding bit in key. A + * bit that is clear directs the TCAM to ignore the corresponding bit in key. + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, dip4_mask, 0x2C, 0, 32); + +/* reg_rmft2_sip4 + * Source IPv4 address + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, sip4, 0x3C, 0, 32); + +/* reg_rmft2_sip4_mask + * A bit that is set directs the TCAM to compare the corresponding bit in key. A + * bit that is clear directs the TCAM to ignore the corresponding bit in key. + * Access: RW + */ +MLXSW_ITEM32(reg, rmft2, sip4_mask, 0x4C, 0, 32); + +/* reg_rmft2_flexible_action_set + * ACL action set. The only supported action types in this field and in any + * action-set pointed from here are as follows: + * 00h: ACTION_NULL + * 01h: ACTION_MAC_TTL, only TTL configuration is supported. 
+ * 03h: ACTION_TRAP + * 06h: ACTION_QOS + * 08h: ACTION_POLICING_MONITORING + * 10h: ACTION_ROUTER_MC + * Access: RW + */ +MLXSW_ITEM_BUF(reg, rmft2, flexible_action_set, 0x80, + MLXSW_REG_FLEX_ACTION_SET_LEN); + +static inline void +mlxsw_reg_rmft2_ipv4_pack(char *payload, bool v, u16 offset, u16 virtual_router, + enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif, + u32 dip4, u32 dip4_mask, u32 sip4, u32 sip4_mask, + const char *flexible_action_set) +{ + MLXSW_REG_ZERO(rmft2, payload); + mlxsw_reg_rmft2_v_set(payload, v); + mlxsw_reg_rmft2_type_set(payload, MLXSW_REG_RMFT2_TYPE_IPV4); + mlxsw_reg_rmft2_op_set(payload, MLXSW_REG_RMFT2_OP_READ_WRITE); + mlxsw_reg_rmft2_offset_set(payload, offset); + mlxsw_reg_rmft2_virtual_router_set(payload, virtual_router); + mlxsw_reg_rmft2_irif_mask_set(payload, irif_mask); + mlxsw_reg_rmft2_irif_set(payload, irif); + mlxsw_reg_rmft2_dip4_set(payload, dip4); + mlxsw_reg_rmft2_dip4_mask_set(payload, dip4_mask); + mlxsw_reg_rmft2_sip4_set(payload, sip4); + mlxsw_reg_rmft2_sip4_mask_set(payload, sip4_mask); + if (flexible_action_set) + mlxsw_reg_rmft2_flexible_action_set_memcpy_to(payload, + flexible_action_set); +} + /* MFCR - Management Fan Control Register * -------------------------------------- * This register controls the settings of the Fan Speed PWM mechanism. @@ -7000,6 +7141,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(raleu), MLXSW_REG(rauhtd), MLXSW_REG(rigr2), + MLXSW_REG(rmft2), MLXSW_REG(mfcr), MLXSW_REG(mfsc), MLXSW_REG(mfsm), From 4fc92846f65b0a3470b433c54251a40feae7b2d5 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:17 +0200 Subject: [PATCH 0251/1322] mlxsw: reg: Add Router Rules Copy Register The RRCR register is used for copying and moving TCAM multicast routes from different offsets. It will be used to allow routes relocation for parman ops as part of the multicast router offloading logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 60 +++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 046525ebe5ac..31d120ae8dc6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4680,6 +4680,65 @@ static inline void mlxsw_reg_ricnt_pack(char *payload, u32 index, MLXSW_REG_RICNT_COUNTER_SET_TYPE_BASIC); } +/* RRCR - Router Rules Copy Register Layout + * ---------------------------------------- + * This register is used for moving and copying route entry rules. + */ +#define MLXSW_REG_RRCR_ID 0x800F +#define MLXSW_REG_RRCR_LEN 0x24 + +MLXSW_REG_DEFINE(rrcr, MLXSW_REG_RRCR_ID, MLXSW_REG_RRCR_LEN); + +enum mlxsw_reg_rrcr_op { + /* Move rules */ + MLXSW_REG_RRCR_OP_MOVE, + /* Copy rules */ + MLXSW_REG_RRCR_OP_COPY, +}; + +/* reg_rrcr_op + * Access: WO + */ +MLXSW_ITEM32(reg, rrcr, op, 0x00, 28, 4); + +/* reg_rrcr_offset + * Offset within the region from which to copy/move. + * Access: Index + */ +MLXSW_ITEM32(reg, rrcr, offset, 0x00, 0, 16); + +/* reg_rrcr_size + * The number of rules to copy/move. + * Access: WO + */ +MLXSW_ITEM32(reg, rrcr, size, 0x04, 0, 16); + +/* reg_rrcr_table_id + * Identifier of the table on which to perform the operation. 
Encoding is the + * same as in RTAR.key_type + * Access: Index + */ +MLXSW_ITEM32(reg, rrcr, table_id, 0x10, 0, 4); + +/* reg_rrcr_dest_offset + * Offset within the region to which to copy/move + * Access: Index + */ +MLXSW_ITEM32(reg, rrcr, dest_offset, 0x20, 0, 16); + +static inline void mlxsw_reg_rrcr_pack(char *payload, enum mlxsw_reg_rrcr_op op, + u16 offset, u16 size, + enum mlxsw_reg_rtar_key_type table_id, + u16 dest_offset) +{ + MLXSW_REG_ZERO(rrcr, payload); + mlxsw_reg_rrcr_op_set(payload, op); + mlxsw_reg_rrcr_offset_set(payload, offset); + mlxsw_reg_rrcr_size_set(payload, size); + mlxsw_reg_rrcr_table_id_set(payload, table_id); + mlxsw_reg_rrcr_dest_offset_set(payload, dest_offset); +} + /* RALTA - Router Algorithmic LPM Tree Allocation Register * ------------------------------------------------------- * RALTA is used to allocate the LPM trees of the SHSPM method. @@ -7133,6 +7192,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = { MLXSW_REG(ratr), MLXSW_REG(rtdp), MLXSW_REG(ricnt), + MLXSW_REG(rrcr), MLXSW_REG(ralta), MLXSW_REG(ralst), MLXSW_REG(raltb), From 4af5964e58884855d28ae68ddf01279868e70853 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:18 +0200 Subject: [PATCH 0252/1322] mlxsw: reg: Configure RIF to forward IPv4 multicast packets by default Turn on two bits on the Spectrum RIF configuration: - IPv4 multicast: when a multicast packet arrives on a RIF, send it to go through multicast routes lookup. - IPv4 multicast forwarding enable: when multicast packet arrives on a RIF, allow it to be forwarded by multicast routes. If this bit is not set, multicast packets will go through multicast routing lookup but will be dropped at the egress of the ports. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 31d120ae8dc6..c203e0dfa827 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3991,6 +3991,12 @@ MLXSW_ITEM32(reg, ritr, ipv4, 0x00, 29, 1); */ MLXSW_ITEM32(reg, ritr, ipv6, 0x00, 28, 1); +/* reg_ritr_ipv4_mc + * IPv4 multicast routing enable. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv4_mc, 0x00, 27, 1); + enum mlxsw_reg_ritr_if_type { /* VLAN interface. */ MLXSW_REG_RITR_VLAN_IF, @@ -4048,6 +4054,14 @@ MLXSW_ITEM32(reg, ritr, ipv4_fe, 0x04, 29, 1); */ MLXSW_ITEM32(reg, ritr, ipv6_fe, 0x04, 28, 1); +/* reg_ritr_ipv4_mc_fe + * IPv4 Multicast Forwarding Enable. + * When disabled, forwarding is blocked but local traffic (traps and IP to me) + * will be enabled. + * Access: RW + */ +MLXSW_ITEM32(reg, ritr, ipv4_mc_fe, 0x04, 27, 1); + /* reg_ritr_lb_en * Loop-back filter enable for unicast packets. 
* If the flag is set then loop-back filter for unicast packets is @@ -4270,11 +4284,13 @@ static inline void mlxsw_reg_ritr_pack(char *payload, bool enable, mlxsw_reg_ritr_enable_set(payload, enable); mlxsw_reg_ritr_ipv4_set(payload, 1); mlxsw_reg_ritr_ipv6_set(payload, 1); + mlxsw_reg_ritr_ipv4_mc_set(payload, 1); mlxsw_reg_ritr_type_set(payload, type); mlxsw_reg_ritr_op_set(payload, op); mlxsw_reg_ritr_rif_set(payload, rif); mlxsw_reg_ritr_ipv4_fe_set(payload, 1); mlxsw_reg_ritr_ipv6_fe_set(payload, 1); + mlxsw_reg_ritr_ipv4_mc_fe_set(payload, 1); mlxsw_reg_ritr_lb_en_set(payload, 1); mlxsw_reg_ritr_virtual_router_set(payload, vr_id); mlxsw_reg_ritr_mtu_set(payload, mtu); From 91e4d59a4600afe64b44e013a7c1805bbfe61e59 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:19 +0200 Subject: [PATCH 0253/1322] mlxsw: spectrum_router: Export RIF dev access function The mlxsw_sp_rif struct, defined as private struct in spectrum_router.c will be used in the multicast router source file. Due to the fact that the dev field will be needed by the multicast router logic, add an access function to it. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 5 +++++ drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2cfb3f5d092d..0bd93dc88ffa 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5049,6 +5049,11 @@ int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif) return rif->dev->ifindex; } +const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif) +{ + return rif->dev; +} + static struct mlxsw_sp_rif * mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, const struct mlxsw_sp_rif_params *params) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index 345fcc4f38e9..ae4c99b3f2fc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -69,6 +69,7 @@ u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif); u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif); u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif); int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif); +const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif); int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_rif *rif, enum mlxsw_sp_rif_counter_dir dir, From b48cfc80ce9c27368e331d9aa742314487b0ee12 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 19 Sep 2017 10:00:20 +0200 Subject: [PATCH 0254/1322] mlxsw: spectrum: Add multicast router traps and trap groups Add three new traps needed for multicast routing: - PIM: Trap for PIM protocol control packets. - RPF: Trap for packets that fail the RPF check on a specific hardware route entry. - MULTICAST: Generic trap for multicast. It is used for routes that trap the packets to the CPU. The RPF and MULTICAST traps have rate limiters as these traps may have line-rate of packets trapped. The PIM trap has a rate limiter similarly to other L3 control protocols. The rate limiters are implemented by adding three new trap groups for the newly introduced traps. 
Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 3 +++ drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 ++++++++++ drivers/net/ethernet/mellanox/mlxsw/trap.h | 4 ++++ 3 files changed, 17 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index c203e0dfa827..17eba19100de 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -3681,12 +3681,15 @@ enum mlxsw_reg_htgt_trap_group { MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP, MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP, MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF, + MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM, + MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST, MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP, MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS, MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP, MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE, MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME, MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP, + MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF, MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT, MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD, MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 6ba6ff276b17..e9b94430afed 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3421,6 +3421,10 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { false, SP_IP2ME, DISCARD), /* ACL trap */ MLXSW_SP_RXL_NO_MARK(ACL0, TRAP_TO_CPU, IP2ME, false), + /* Multicast Router Traps */ + MLXSW_SP_RXL_MARK(IPV4_PIM, TRAP_TO_CPU, PIM, false), + MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false), + MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), }; static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) @@ -3446,6 +3450,8 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF: rate = 128; burst_size = 7; break; @@ -3461,6 +3467,7 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: rate = 1024; burst_size = 7; break; @@ -3506,6 +3513,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: priority = 5; tc = 5; break; @@ -3522,12 +3530,14 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF: priority = 2; tc = 2; break; case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS: case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: + case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: priority = 1; tc = 1; break; diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index f396a1fef633..a98103539f6b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -62,6 +62,8 @@ enum { MLXSW_TRAP_ID_TTLERROR = 0x53, 
MLXSW_TRAP_ID_LBERROR = 0x54, MLXSW_TRAP_ID_IPV4_OSPF = 0x55, + MLXSW_TRAP_ID_IPV4_PIM = 0x58, + MLXSW_TRAP_ID_RPF = 0x5C, MLXSW_TRAP_ID_IP2ME = 0x5F, MLXSW_TRAP_ID_IPV6_UNSPECIFIED_ADDRESS = 0x60, MLXSW_TRAP_ID_IPV6_LINK_LOCAL_DEST = 0x61, @@ -89,6 +91,8 @@ enum { MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6, MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7, MLXSW_TRAP_ID_ACL0 = 0x1C0, + /* Multicast trap used for routes with trap action */ + MLXSW_TRAP_ID_ACL1 = 0x1C1, MLXSW_TRAP_ID_MAX = 0x1FF }; From 0647169cf9aa441700eb8f23ea49be060626534b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 12:41:37 +0200 Subject: [PATCH 0255/1322] rhashtable: Documentation tweak Clarify that rhashtable_walk_{stop,start} will not reset the iterator to the beginning of the hash table. Confusion between rhashtable_walk_enter and rhashtable_walk_start has already lead to a bug. Signed-off-by: Andreas Gruenbacher Signed-off-by: David S. Miller --- lib/rhashtable.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 707ca5d677c6..ddd7dde87c3c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -735,9 +735,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * rhashtable_walk_start - Start a hash table walk * @iter: Hash table iterator * - * Start a hash table walk. Note that we take the RCU lock in all - * cases including when we return an error. So you must always call - * rhashtable_walk_stop to clean up. + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * @@ -846,7 +846,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * - * Finish a hash table walk. + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 05:14:24 -0700 Subject: [PATCH 0256/1322] net: sk_buff rbnode reorg skb->rbnode shares space with skb->next, skb->prev and skb->tstamp Current uses (TCP receive ofo queue and netem) need to save/restore tstamp, while skb->dev is either NULL (TCP) or a constant for a given queue (netem). Since we plan using an RB tree for TCP retransmit queue to speedup SACK processing with large BDP, this patch exchanges skb->dev and skb->tstamp. This saves some overhead in both TCP and netem. v2: removes the swtstamp field from struct tcp_skb_cb Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Wei Wang Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ++++++++-------- include/net/tcp.h | 6 ------ net/ipv4/tcp_input.c | 27 +++++---------------------- net/sched/sch_netem.c | 7 ++++--- 4 files changed, 17 insertions(+), 39 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 72299ef00061..492828801acb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -661,8 +661,12 @@ struct sk_buff { struct sk_buff *prev; union { - ktime_t tstamp; - u64 skb_mstamp; + struct net_device *dev; + /* Some protocols might use this space to store information, + * while device pointer would be NULL. + * UDP receive path is one user. 
+ */ + unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ @@ -670,12 +674,8 @@ struct sk_buff { struct sock *sk; union { - struct net_device *dev; - /* Some protocols might use this space to store information, - * while device pointer would be NULL. - * UDP receive path is one user. - */ - unsigned long dev_scratch; + ktime_t tstamp; + u64 skb_mstamp; }; /* * This is the control buffer. It is free to use for every diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..49a8a46466f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -797,12 +797,6 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; - - /* Used to stash the receive timestamp while this skb is in the - * out of order queue, as skb->tstamp is overwritten by the - * rbnode. - */ - ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bddf724f5c02..db9bb46b5776 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4266,11 +4266,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4286,7 +4281,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4311,10 +4305,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4351,9 +4342,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4365,8 +4353,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4420,10 +4407,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4451,7 +4434,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 
*/ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4502,7 +4485,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4554,7 +4537,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b1266e75ca43..063a4bdb9ee6 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -146,7 +146,6 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; - ktime_t tstamp_save; }; @@ -561,7 +560,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } cb->time_to_send = now + delay; - cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -629,7 +627,10 @@ deliver: qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; - skb->tstamp = netem_skb_cb(skb)->tstamp_save; + /* skb->dev shares skb->rbnode area, + * we need to restore its value. + */ + skb->dev = qdisc_dev(sch); #ifdef CONFIG_NET_CLS_ACT /* From 69e33b2754ea4fbe4fdd2d18bc8ca05d8670a5c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 19 Sep 2017 14:42:17 +0200 Subject: [PATCH 0257/1322] selftests: rtnetlink.sh: add test case for device ifalias Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- tools/testing/selftests/net/rtnetlink.sh | 57 ++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 57b5ff576240..4b48de565cae 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -15,6 +15,14 @@ check_err() fi } +# same but inverted -- used when command must fail for test to pass +check_fail() +{ + if [ $1 -eq 0 ]; then + ret=1 + fi +} + kci_add_dummy() { ip link add name "$devdummy" type dummy @@ -235,6 +243,54 @@ kci_test_addrlabel() echo "PASS: ipv6 addrlabel" } +kci_test_ifalias() +{ + ret=0 + namewant=$(uuidgen) + syspathname="/sys/class/net/$devdummy/ifalias" + + ip link set dev "$devdummy" alias "$namewant" + check_err $? + + if [ $ret -ne 0 ]; then + echo "FAIL: cannot set interface alias of $devdummy to $namewant" + return 1 + fi + + ip link show "$devdummy" | grep -q "alias $namewant" + check_err $? + + if [ -r "$syspathname" ] ; then + read namehave < "$syspathname" + if [ "$namewant" != "$namehave" ]; then + echo "FAIL: did set ifalias $namewant but got $namehave" + return 1 + fi + + namewant=$(uuidgen) + echo "$namewant" > "$syspathname" + ip link show "$devdummy" | grep -q "alias $namewant" + check_err $? + + # sysfs interface allows to delete alias again + echo "" > "$syspathname" + + ip link show "$devdummy" | grep -q "alias $namewant" + check_fail $? + + # re-add the alias -- kernel should free mem when dummy dev is removed + ip link set dev "$devdummy" alias "$namewant" + check_err $? 
+ fi + + if [ $ret -ne 0 ]; then + echo "FAIL: set interface alias $devdummy to $namewant" + return 1 + fi + + echo "PASS: set ifalias $namewant for $devdummy" +} + kci_test_rtnl() { kci_add_dummy @@ -249,6 +305,7 @@ kci_test_rtnl() kci_test_gre kci_test_bridge kci_test_addrlabel + kci_test_ifalias kci_del_dummy } From 75df6e688ccd517e339a7c422ef7ad73045b18a2 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 17 Sep 2017 03:23:48 -0700 Subject: [PATCH 0258/1322] tracing: Fix trace_pipe behavior for instance traces When reading data from trace_pipe, tracing_wait_pipe() performs a check to see if tracing has been turned off after some data was read. Currently, this check always looks at global trace state, but it should be checking the trace instance where trace_pipe is located at. Because of this bug, cat instances/i1/trace_pipe in the following script will immediately exit instead of waiting for data: cd /sys/kernel/debug/tracing echo 0 > tracing_on mkdir -p instances/i1 echo 1 > instances/i1/tracing_on echo 1 > instances/i1/events/sched/sched_process_exec/enable cat instances/i1/trace_pipe Link: http://lkml.kernel.org/r/20170917102348.1615-1-tahsin@google.com Cc: stable@vger.kernel.org Fixes: 10246fa35d4f ("tracing: give easy way to clear trace buffer") Signed-off-by: Tahsin Erdogan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3ca35f38803..752e5daf0896 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5680,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); From 930651a75bf1ba6893a8b8475270664ebdb6cf4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 09:15:59 -0700 Subject: [PATCH 0259/1322] bpf: do not disable/enable BH in bpf_map_free_id() syzkaller reported following splat [1] Since hard irq are disabled by the caller, bpf_map_free_id() should not try to enable/disable BH. Another solution would be to change htab_map_delete_elem() to defer the free_htab_elem() call after raw_spin_unlock_irqrestore(&b->lock, flags), but this might be not enough to cover other code paths. [1] WARNING: CPU: 1 PID: 8052 at kernel/softirq.c:161 __local_bh_enable_ip +0x1e/0x160 kernel/softirq.c:161 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 1 PID: 8052 Comm: syz-executor1 Not tainted 4.13.0-next-20170915+ #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:181 __warn+0x1c4/0x1d9 kernel/panic.c:542 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178 do_trap_no_signal arch/x86/kernel/traps.c:212 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:261 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:__local_bh_enable_ip+0x1e/0x160 kernel/softirq.c:161 RSP: 0018:ffff8801cdcd7748 EFLAGS: 00010046 RAX: 0000000000000082 RBX: 0000000000000201 RCX: 0000000000000000 RDX: 1ffffffff0b5933c RSI: 0000000000000201 RDI: ffffffff85ac99e0 RBP: ffff8801cdcd7758 R08: ffffffff85b87158 R09: 1ffff10039b9aec6 R10: ffff8801c99f24c0 R11: 0000000000000002 R12: ffffffff817b0b47 R13: dffffc0000000000 R14: ffff8801cdcd77e8 R15: 0000000000000001 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline] _raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:207 spin_unlock_bh include/linux/spinlock.h:361 [inline] bpf_map_free_id kernel/bpf/syscall.c:197 [inline] __bpf_map_put+0x267/0x320 kernel/bpf/syscall.c:227 bpf_map_put+0x1a/0x20 kernel/bpf/syscall.c:235 bpf_map_fd_put_ptr+0x15/0x20 kernel/bpf/map_in_map.c:96 free_htab_elem+0xc3/0x1b0 kernel/bpf/hashtab.c:658 htab_map_delete_elem+0x74d/0x970 kernel/bpf/hashtab.c:1063 map_delete_elem kernel/bpf/syscall.c:633 [inline] SYSC_bpf kernel/bpf/syscall.c:1479 [inline] SyS_bpf+0x2188/0x46a0 kernel/bpf/syscall.c:1451 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: f3f1c054c288 ("bpf: Introduce bpf_map ID") Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb17e1cd1d43..25d074920a00 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map) static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { + unsigned long flags; + if (do_idr_lock) - spin_lock_bh(&map_idr_lock); + spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); if (do_idr_lock) - spin_unlock_bh(&map_idr_lock); + spin_unlock_irqrestore(&map_idr_lock, flags); else __release(&map_idr_lock); } From 039cc1c1eb971ffe1973fddb8769a80fd4950a6f Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Tue, 19 Sep 2017 15:06:13 -0500 Subject: [PATCH 0260/1322] cpufreq: ti-cpufreq: Support additional am43xx platforms Rather than letting the ti-cpufreq driver match against 'ti,am4372' machine compatible during probe let's match against 'ti,am43' so that we can support both 'ti,am4372' and 'ti,am438x' platforms which both match to this compatible. Signed-off-by: Dave Gerlach Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/ti-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c index b29cd3398463..4bf47de6101f 100644 --- a/drivers/cpufreq/ti-cpufreq.c +++ b/drivers/cpufreq/ti-cpufreq.c @@ -190,7 +190,7 @@ static int ti_cpufreq_setup_syscon_register(struct ti_cpufreq_data *opp_data) static const struct of_device_id ti_cpufreq_of_match[] = { { .compatible = "ti,am33xx", .data = &am3x_soc_data, }, - { .compatible = "ti,am4372", .data = &am4x_soc_data, }, + { .compatible = "ti,am43", .data = &am4x_soc_data, }, { .compatible = "ti,dra7", .data = &dra7_soc_data }, {}, }; From f5619866592c65adc087364cc1a3ba709201ea26 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:57 -0400 Subject: [PATCH 0261/1322] net: dsa: remove copy of master ethtool_ops There is no need to store a copy of the master ethtool ops, storing the original pointer in DSA and the new one in the master netdev itself is enough. In the meantime, set orig_ethtool_ops to NULL when restoring the master ethtool ops and check the presence of the master original ethtool ops as well as its needed functions before calling them. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - net/dsa/dsa.c | 8 ++++---- net/dsa/slave.c | 19 +++++++++++-------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index dd44d6ce1097..8dee216a5a9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -188,7 +188,6 @@ struct dsa_port { /* * Original copy of the master netdev ethtool_ops */ - struct ethtool_ops ethtool_ops; const struct ethtool_ops *orig_ethtool_ops; }; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 03c58b0eb082..abadf7b49236 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -124,11 +124,10 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) if (!cpu_ops) return -ENOMEM; - memcpy(&cpu_dp->ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); cpu_dp->orig_ethtool_ops = master->ethtool_ops; - memcpy(cpu_ops, &cpu_dp->ethtool_ops, - sizeof(struct ethtool_ops)); + if (cpu_dp->orig_ethtool_ops) + memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); + dsa_cpu_port_ethtool_init(cpu_ops); master->ethtool_ops = cpu_ops; @@ -138,6 +137,7 @@ int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) { cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } void dsa_cpu_dsa_destroy(struct dsa_port *port) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..2ff4f907d137 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -574,12 +574,13 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) { - count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data); + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); } if (ds->ops->get_ethtool_stats) @@ -591,10 +592,11 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) struct 
dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; int count = 0; - if (cpu_dp->ethtool_ops.get_sset_count) - count += cpu_dp->ethtool_ops.get_sset_count(dev, sset); + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); if (sset == ETH_SS_STATS && ds->ops->get_sset_count) count += ds->ops->get_sset_count(ds); @@ -608,6 +610,7 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); struct dsa_switch *ds = cpu_dp->ds; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; s8 cpu_port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; @@ -619,9 +622,9 @@ static void dsa_cpu_port_get_strings(struct net_device *dev, /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; - if (cpu_dp->ethtool_ops.get_sset_count) { - mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_strings(dev, stringset, data); + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); } if (stringset == ETH_SS_STATS && ds->ops->get_strings) { From cd8d7dd41bfddaa02e34db35abfab14ce4584dee Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:58 -0400 Subject: [PATCH 0262/1322] net: dsa: setup master ethtool unconditionally When a DSA switch tree is meant to be applied, it already has a CPU port. Thus remove the condition of dst->cpu_dp. Moreover, the next lines access dst->cpu_dp unconditionally. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 873af0108e24..bd19304f862f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,11 +433,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - if (dst->cpu_dp) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - } + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + if (err) + return err; /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get @@ -474,10 +472,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_dp) { - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dst->cpu_dp = NULL; - } + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; From 1943563dfd4b3a1f9dc102f056813112d29bb60f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:59 -0400 Subject: [PATCH 0263/1322] net: dsa: setup master ethtool after dsa_ptr DSA overrides the master's ethtool ops so that we can inject its CPU port's statistics. Because of that, we need to setup the ethtool ops after the master's dsa_ptr pointer has been assigned, not before. This patch setups the ethtool ops after dsa_ptr is assigned, and restores them before it gets cleared. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa2.c | 12 +++++++----- net/dsa/legacy.c | 10 +++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index bd19304f862f..032f8bc3e788 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,16 +433,17 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. */ wmb(); dst->cpu_dp->netdev->dsa_ptr = dst; + + err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + if (err) + return err; + dst->applied = true; return 0; @@ -456,6 +457,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -472,7 +475,6 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 91e6f7981d39..163910699db7 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -206,10 +206,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, netdev_err(master, "[%d] : can't configure CPU and DSA ports\n", index); - ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp); - if (ret) - return ret; - return 0; } @@ -606,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst; - return 0; + return dsa_cpu_port_ethtool_setup(dst->cpu_dp); } static int dsa_probe(struct platform_device *pdev) @@ -671,6 +667,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; + dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -686,8 +684,6 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dev_put(dst->cpu_dp->netdev); } From f2f2356685bcaf1063859356fc65a5ac808b1382 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:57:00 -0400 Subject: [PATCH 0264/1322] net: dsa: move master ethtool code DSA overrides the master device ethtool ops, so that it can inject stats from its dedicated switch CPU port as well. The related code is currently split in dsa.c and slave.c, but it only scopes the master net device. Move it to a new master.c DSA core file. This file will be later extented with master net device specific code. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/Makefile | 2 +- net/dsa/dsa.c | 28 ----------- net/dsa/dsa2.c | 4 +- net/dsa/dsa_priv.h | 7 +-- net/dsa/legacy.c | 4 +- net/dsa/master.c | 120 +++++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 83 ------------------------------- 7 files changed, 129 insertions(+), 119 deletions(-) create mode 100644 net/dsa/master.c diff --git a/net/dsa/Makefile b/net/dsa/Makefile index fcce25da937c..2e7ac8bab19d 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index abadf7b49236..81c852e32821 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -112,34 +112,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) -{ - struct dsa_switch *ds = cpu_dp->ds; - struct net_device *master; - struct ethtool_ops *cpu_ops; - - master = cpu_dp->netdev; - - cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); - if (!cpu_ops) - return -ENOMEM; - - cpu_dp->orig_ethtool_ops = master->ethtool_ops; - if (cpu_dp->orig_ethtool_ops) - memcpy(cpu_ops, cpu_dp->orig_ethtool_ops, sizeof(*cpu_ops)); - - dsa_cpu_port_ethtool_init(cpu_ops); - master->ethtool_ops = cpu_ops; - - return 0; -} - -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) -{ - cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; - cpu_dp->orig_ethtool_ops = NULL; -} - void dsa_cpu_dsa_destroy(struct dsa_port *port) { struct device_node *port_dn = port->dn; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 032f8bc3e788..dcccaebde708 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -440,7 +440,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) wmb(); dst->cpu_dp->netdev->dsa_ptr = dst; - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); + err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) return err; @@ -457,7 +457,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dsa_master_ethtool_restore(dst->cpu_dp->netdev); dst->cpu_dp->netdev->dsa_ptr = NULL; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9c3eeb72462d..f616b3444418 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -97,8 +97,6 @@ struct dsa_slave_priv { int dsa_cpu_dsa_setup(struct dsa_port *port); void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp); -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp); bool dsa_schedule_work(struct work_struct *work); /* legacy.c */ @@ -112,6 +110,10 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); +/* master.c */ +int dsa_master_ethtool_setup(struct net_device *dev); +void dsa_master_ethtool_restore(struct net_device *dev); + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); @@ -139,7 +141,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops); int dsa_slave_create(struct dsa_port 
*port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 163910699db7..ae505d8e4417 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -602,7 +602,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, wmb(); dev->dsa_ptr = dst; - return dsa_cpu_port_ethtool_setup(dst->cpu_dp); + return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } static int dsa_probe(struct platform_device *pdev) @@ -667,7 +667,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_cpu_port_ethtool_restore(dst->cpu_dp); + dsa_master_ethtool_restore(dst->cpu_dp->netdev); dst->cpu_dp->netdev->dsa_ptr = NULL; diff --git a/net/dsa/master.c b/net/dsa/master.c new file mode 100644 index 000000000000..5e5147ec5a44 --- /dev/null +++ b/net/dsa/master.c @@ -0,0 +1,120 @@ +/* + * Handling of a master device, switching frames via its switch fabric CPU port + * + * Copyright (c) 2017 Savoir-faire Linux Inc. + * Vivien Didelot + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include "dsa_priv.h" + +static void dsa_master_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + const struct ethtool_ops *ops = port->orig_ethtool_ops; + int count = 0; + + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); + } + + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, port->index, data + count); +} + +static int dsa_master_get_sset_count(struct net_device *dev, int sset) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + const struct ethtool_ops *ops = port->orig_ethtool_ops; + int count = 0; + + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); + + if (sset == ETH_SS_STATS && ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); + + return count; +} + +static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, + uint8_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + const struct ethtool_ops *ops = port->orig_ethtool_ops; + int len = ETH_GSTRING_LEN; + int mcount = 0, count; + unsigned int i; + uint8_t pfx[4]; + uint8_t *ndata; + + snprintf(pfx, sizeof(pfx), "p%.2d", port->index); + /* We do not want to be NULL-terminated, since this is a prefix */ + pfx[sizeof(pfx) - 1] = '_'; + + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); + } + + if (stringset == ETH_SS_STATS && ds->ops->get_strings) { + ndata = data + mcount * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->ops->get_strings(ds, port->index, ndata); + count = ds->ops->get_sset_count(ds); + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - 
sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + } +} + +int dsa_master_ethtool_setup(struct net_device *dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + struct dsa_switch *ds = port->ds; + struct ethtool_ops *ops; + + ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + port->orig_ethtool_ops = dev->ethtool_ops; + if (port->orig_ethtool_ops) + memcpy(ops, port->orig_ethtool_ops, sizeof(*ops)); + + ops->get_sset_count = dsa_master_get_sset_count; + ops->get_ethtool_stats = dsa_master_get_ethtool_stats; + ops->get_strings = dsa_master_get_strings; + + dev->ethtool_ops = ops; + + return 0; +} + +void dsa_master_ethtool_restore(struct net_device *dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *port = dst->cpu_dp; + + dev->ethtool_ops = port->orig_ethtool_ops; + port->orig_ethtool_ops = NULL; +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2ff4f907d137..d51b10450e1b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -567,82 +567,6 @@ static void dsa_slave_get_strings(struct net_device *dev, } } -static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, - uint64_t *data) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - s8 cpu_port = cpu_dp->index; - int count = 0; - - if (ops && ops->get_sset_count && ops->get_ethtool_stats) { - count = ops->get_sset_count(dev, ETH_SS_STATS); - ops->get_ethtool_stats(dev, stats, data); - } - - if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, cpu_port, data + count); -} - -static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - int count = 0; - - if (ops && ops->get_sset_count) - count += ops->get_sset_count(dev, sset); - - if (sset == ETH_SS_STATS && ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); - - return count; -} - -static void dsa_cpu_port_get_strings(struct net_device *dev, - uint32_t stringset, uint8_t *data) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - s8 cpu_port = cpu_dp->index; - int len = ETH_GSTRING_LEN; - int mcount = 0, count; - unsigned int i; - uint8_t pfx[4]; - uint8_t *ndata; - - snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port); - /* We do not want to be NULL-terminated, since this is a prefix */ - pfx[sizeof(pfx) - 1] = '_'; - - if (ops && ops->get_sset_count && ops->get_strings) { - mcount = ops->get_sset_count(dev, ETH_SS_STATS); - ops->get_strings(dev, stringset, data); - } - - if (stringset == ETH_SS_STATS && ds->ops->get_strings) { - ndata = data + mcount * len; - /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle - * the output after to prepend our CPU port prefix we - * constructed earlier - */ - ds->ops->get_strings(ds, cpu_port, ndata); - count = ds->ops->get_sset_count(ds); - for (i = 0; i < count; i++) { - memmove(ndata + (i * len + sizeof(pfx)), - ndata + i * len, len - sizeof(pfx)); - memcpy(ndata + i * len, pfx, sizeof(pfx)); - } - } -} - static void dsa_slave_get_ethtool_stats(struct net_device *dev, 
struct ethtool_stats *stats, uint64_t *data) @@ -979,13 +903,6 @@ static void dsa_slave_get_stats64(struct net_device *dev, } } -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) -{ - ops->get_sset_count = dsa_cpu_port_get_sset_count; - ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats; - ops->get_strings = dsa_cpu_port_get_strings; -} - static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { From 2a4776e14f7da3d48fffb4edbb82355742f23478 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:10 +0100 Subject: [PATCH 0265/1322] net: hns3: Fixes initialization of phy address from firmware Default phy address of every port is 0. Therefore, phy address for each port need to be fetched from firmware and device initialized with fetched non-default phy address. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index bb45365fb817..db4e07dac29a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1066,6 +1066,7 @@ static int hclge_configure(struct hclge_dev *hdev) for (i = 0; i < ETH_ALEN; i++) hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; hdev->hw.mac.media_type = cfg.media_type; + hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; hdev->tm_info.num_pg = 1; hdev->tm_info.num_tc = cfg.tc_num; From c5b1b97522ef32d2170c9aa1a0c1eec179acbb3a Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:11 +0100 Subject: [PATCH 0266/1322] net: hns3: Fixes the command used to unmap ring from vector This patch fixes the IMP command being used to unmap the vector from the corresponding ring. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index db4e07dac29a..e324bc6e9f4f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2779,7 +2779,7 @@ static int hclge_unmap_ring_from_vector( } i = 0; hclge_cmd_setup_basic_desc(&desc, - HCLGE_OPC_ADD_RING_TO_VECTOR, + HCLGE_OPC_DEL_RING_TO_VECTOR, false); req->int_vector_id = vector_id; } From 0305b443a3ba5b84aa474786026df04a70460135 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:12 +0100 Subject: [PATCH 0267/1322] net: hns3: Fixes ring-to-vector map-and-unmap command This patch fixes the vector-to-ring map and unmap command and adds INT_GL(for, Gap Limiting Interrupts) and VF id to it as required by the hardware interface. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Lipeng Signed-off-by: Mingguang Qu Signed-off-by: Salil Mehta Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 8 ++++++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 91ae0135ee50..c2b613b40509 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -238,7 +238,7 @@ struct hclge_tqp_map { u8 rsv[18]; }; -#define HCLGE_VECTOR_ELEMENTS_PER_CMD 11 +#define HCLGE_VECTOR_ELEMENTS_PER_CMD 10 enum hclge_int_type { HCLGE_INT_TX, @@ -252,8 +252,12 @@ struct hclge_ctrl_vector_chain { #define HCLGE_INT_TYPE_S 0 #define HCLGE_INT_TYPE_M 0x3 #define HCLGE_TQP_ID_S 2 -#define HCLGE_TQP_ID_M (0x3fff << HCLGE_TQP_ID_S) +#define HCLGE_TQP_ID_M (0x7ff << HCLGE_TQP_ID_S) +#define HCLGE_INT_GL_IDX_S 13 +#define HCLGE_INT_GL_IDX_M (0x3 << HCLGE_INT_GL_IDX_S) __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD]; + u8 vfid; + u8 rsv; }; #define HCLGE_TC_NUM 8 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e324bc6e9f4f..eafd9c678162 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2680,7 +2680,11 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; @@ -2764,8 +2768,12 @@ static int hclge_unmap_ring_from_vector( hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, node->tqp_index); + hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + HCLGE_INT_GL_IDX_S, + hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { req->int_cause_num = HCLGE_VECTOR_ELEMENTS_PER_CMD; From 139e8792537b327252a9676591a78e6408d50a85 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:13 +0100 Subject: [PATCH 0268/1322] net: hns3: Fixes the initialization of MAC address in hardware This patch fixes the initialization of MAC address, fetched from HNS3 firmware i.e. when it is not randomly generated, to the HNS3 hardware. Fixes: ca60906d2795 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 1c3e29447891..4d68d6ea5143 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -2705,10 +2705,11 @@ static void hns3_init_mac_addr(struct net_device *netdev) eth_hw_addr_random(netdev); dev_warn(priv->dev, "using random MAC address %pM\n", netdev->dev_addr); - /* Also copy this new MAC address into hdev */ - if (h->ae_algo->ops->set_mac_addr) - h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); } + + if (h->ae_algo->ops->set_mac_addr) + h->ae_algo->ops->set_mac_addr(h, netdev->dev_addr); + } static void hns3_nic_set_priv_ops(struct net_device *netdev) From fbbb1536b220eb4f4f95cbceae6579489a8adad5 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 19 Sep 2017 17:17:14 +0100 Subject: [PATCH 0269/1322] net: hns3: Fixes the ether address copy with appropriate API This patch replaces the ethernet address copy instance with more appropriate ether_addr_copy() function. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eafd9c678162..8e172afd4876 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1063,8 +1063,7 @@ static int hclge_configure(struct hclge_dev *hdev) hdev->base_tqp_pid = 0; hdev->rss_size_max = 1; hdev->rx_buf_len = cfg.rx_buf_len; - for (i = 0; i < ETH_ALEN; i++) - hdev->hw.mac.mac_addr[i] = cfg.mac_addr[i]; + ether_addr_copy(hdev->hw.mac.mac_addr, cfg.mac_addr); hdev->hw.mac.media_type = cfg.media_type; hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; From 5e43aef8491ae3b5feb79cd15260faf39303ef33 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:15 +0100 Subject: [PATCH 0270/1322] net: hns3: Fixes the default VLAN-id of PF When there is no vlan id in the packets, hardware will treat the vlan id as 0 and look for the mac_vlan table. This patch set the default vlan id of PF as 0. Without this config, it will fail when look for mac_vlan table, and hardware will drop packets. Fixes: 6427264ef330 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Mingguang Qu Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 8e172afd4876..74008ef23169 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3673,6 +3673,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) { #define HCLGE_VLAN_TYPE_VF_TABLE 0 #define HCLGE_VLAN_TYPE_PORT_TABLE 1 + struct hnae3_handle *handle; int ret; ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_VF_TABLE, @@ -3682,8 +3683,11 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_ctrl(hdev, HCLGE_VLAN_TYPE_PORT_TABLE, true); + if (ret) + return ret; - return ret; + handle = &hdev->vport[0].nic; + return hclge_set_port_vlan_filter(handle, htons(ETH_P_8021Q), 0, false); } static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) From 90f7b11a5a0081feb7041fcc795c9a131a62a725 Mon Sep 17 00:00:00 2001 From: Lipeng Date: Tue, 19 Sep 2017 17:17:16 +0100 Subject: [PATCH 0271/1322] net: hns3: Fixes the premature exit of loop when matching clients When register/unregister ae_dev, ae_dev should match all client in the client_list. Enet and roce can co-exists together so we should continue checking for enet and roce presence together. So break should not be there. Above caused problems in loading and unloading of modules. Fixes: 38eddd126772 ("net: hns3: Add support of the HNAE3 framework") Signed-off-by: Lipeng Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 43 +++++---------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 59efbd605416..5bcb2238acb2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -37,20 +37,15 @@ static bool hnae3_client_match(enum hnae3_client_type client_type, } static int hnae3_match_n_instantiate(struct hnae3_client *client, - struct hnae3_ae_dev *ae_dev, - bool is_reg, bool *matched) + struct hnae3_ae_dev *ae_dev, bool is_reg) { int ret; - *matched = false; - /* check if this client matches the type of ae_dev */ if (!(hnae3_client_match(client->type, ae_dev->dev_type) && hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))) { return 0; } - /* there is a match of client and dev */ - *matched = true; /* now, (un-)instantiate client by calling lower layer */ if (is_reg) { @@ -69,7 +64,6 @@ int hnae3_register_client(struct hnae3_client *client) { struct hnae3_client *client_tmp; struct hnae3_ae_dev *ae_dev; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -86,7 +80,7 @@ int hnae3_register_client(struct hnae3_client *client) /* if the client could not be initialized on current port, for * any error reasons, move on to next available port */ - ret = hnae3_match_n_instantiate(client, ae_dev, true, &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed for port\n"); @@ -102,12 +96,11 @@ EXPORT_SYMBOL(hnae3_register_client); void hnae3_unregister_client(struct hnae3_client *client) { struct hnae3_ae_dev *ae_dev; - bool matched; mutex_lock(&hnae3_common_lock); /* un-initialize the client on every matched port */ list_for_each_entry(ae_dev, &hnae3_ae_dev_list, 
node) { - hnae3_match_n_instantiate(client, ae_dev, false, &matched); + hnae3_match_n_instantiate(client, ae_dev, false); } list_del(&client->node); @@ -124,7 +117,6 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -151,13 +143,10 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo) * initialize the figure out client instance */ list_for_each_entry(client, &hnae3_client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } } @@ -175,7 +164,6 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) const struct pci_device_id *id; struct hnae3_ae_dev *ae_dev; struct hnae3_client *client; - bool matched; mutex_lock(&hnae3_common_lock); /* Check if there are matched ae_dev */ @@ -187,12 +175,8 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo) /* check the client list for the match with this ae_dev type and * un-initialize the figure out client instance */ - list_for_each_entry(client, &hnae3_client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - &matched); - if (matched) - break; - } + list_for_each_entry(client, &hnae3_client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev); hnae_set_bit(ae_dev->flag, HNAE3_DEV_INITED_B, 0); @@ -212,7 +196,6 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev) const struct pci_device_id *id; struct hnae3_ae_algo *ae_algo; struct hnae3_client *client; - bool matched; int ret = 0; mutex_lock(&hnae3_common_lock); @@ -246,13 +229,10 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev) * initialize the figure out client instance */ list_for_each_entry(client, &hnae3_client_list, node) { - ret = hnae3_match_n_instantiate(client, ae_dev, true, - &matched); + ret = hnae3_match_n_instantiate(client, ae_dev, true); if (ret) dev_err(&ae_dev->pdev->dev, "match and instantiation failed\n"); - if (matched) - break; } out_err: @@ -270,7 +250,6 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev) const struct pci_device_id *id; struct hnae3_ae_algo *ae_algo; struct hnae3_client *client; - bool matched; mutex_lock(&hnae3_common_lock); /* Check if there are matched ae_algo */ @@ -279,12 +258,8 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev) if (!id) continue; - list_for_each_entry(client, &hnae3_client_list, node) { - hnae3_match_n_instantiate(client, ae_dev, false, - &matched); - if (matched) - break; - } + list_for_each_entry(client, &hnae3_client_list, node) + hnae3_match_n_instantiate(client, ae_dev, false); ae_algo->ops->uninit_ae_dev(ae_dev); hnae_set_bit(ae_dev->flag, HNAE3_DEV_INITED_B, 0); From 7131cc9fc9fd3291bff0e0d3326a1b6983fa0f34 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:43 -0700 Subject: [PATCH 0272/1322] net: dsa: b53: Remove is_cpu_port() This is not used anywhere, so remove it. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_priv.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 01bd8cbe9a3f..7528b22aeb03 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -186,11 +186,6 @@ static inline int is58xx(struct b53_device *dev) #define B53_CPU_PORT_25 5 #define B53_CPU_PORT 8 -static inline int is_cpu_port(struct b53_device *dev, int port) -{ - return dev->cpu_port; -} - struct b53_device *b53_switch_alloc(struct device *base, const struct b53_io_ops *ops, void *priv); From 299752a7d28654dd74304a76d3639e744df52084 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:44 -0700 Subject: [PATCH 0273/1322] net: dsa: b53: Make b53_enable_cpu_port() take a port argument In preparation for future changes allowing the configuring of multiple CPU ports, make b53_enable_cpu_port() take a port argument. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 274f3679f33d..d8bc54cfcfbe 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -538,19 +538,18 @@ static void b53_disable_port(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } -static void b53_enable_cpu_port(struct b53_device *dev) +static void b53_enable_cpu_port(struct b53_device *dev, int port) { - unsigned int cpu_port = dev->cpu_port; u8 port_ctrl; /* BCM5325 CPU port is at 8 */ - if ((is5325(dev) || is5365(dev)) && cpu_port == B53_CPU_PORT_25) - cpu_port = B53_CPU_PORT; + if ((is5325(dev) || is5365(dev)) && port == B53_CPU_PORT_25) + port = B53_CPU_PORT; port_ctrl = PORT_CTRL_RX_BCST_EN | PORT_CTRL_RX_MCST_EN | PORT_CTRL_RX_UCST_EN; - b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(cpu_port), port_ctrl); + b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl); } static void b53_enable_mib(struct b53_device *dev) @@ -820,7 +819,7 @@ static int b53_setup(struct dsa_switch *ds) if (BIT(port) & ds->enabled_port_mask) b53_enable_port(ds, port, NULL); else if (dsa_is_cpu_port(ds, port)) - b53_enable_cpu_port(dev); + b53_enable_cpu_port(dev, port); else b53_disable_port(ds, port, NULL); } From 34c8befd1365eb38f8dd7f3f81086fa766915610 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:45 -0700 Subject: [PATCH 0274/1322] net: dsa: b53: Defer port enabling to calling port_enable There is no need to configure the enabled ports once in b53_setup() and then a second time around when dsa_switch_ops::port_enable is called, just do it when port_enable is called which is better in terms of power consumption and correctness. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index d8bc54cfcfbe..3297af6aab8a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -815,12 +815,13 @@ static int b53_setup(struct dsa_switch *ds) if (ret) dev_err(ds->dev, "failed to apply configuration\n"); + /* Configure IMP/CPU port, disable unused ports. 
Enabled + * ports will be configured with .port_enable + */ for (port = 0; port < dev->num_ports; port++) { - if (BIT(port) & ds->enabled_port_mask) - b53_enable_port(ds, port, NULL); - else if (dsa_is_cpu_port(ds, port)) + if (dsa_is_cpu_port(ds, port)) b53_enable_cpu_port(dev, port); - else + else if (!(BIT(port) & ds->enabled_port_mask)) b53_disable_port(ds, port, NULL); } From e85ec74ace29acfda4afdfd56df122a0a62e6a1a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:46 -0700 Subject: [PATCH 0275/1322] net: dsa: bcm_sf2: Defer port enabling to calling port_enable There is no need to configure the enabled ports once in bcm_sf2_sw_setup() and then a second time around when dsa_switch_ops::port_enable is called, just do it when port_enable is called which is better in terms of power consumption and correctness. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index d7b53d53c116..8acbd17bc1fd 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -890,14 +890,11 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds) struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); unsigned int port; - /* Enable all valid ports and disable those unused */ + /* Disable unused ports and configure IMP port */ for (port = 0; port < priv->hw_params.num_ports; port++) { - /* IMP port receives special treatment */ - if ((1 << port) & ds->enabled_port_mask) - bcm_sf2_port_setup(ds, port, NULL); - else if (dsa_is_cpu_port(ds, port)) + if (dsa_is_cpu_port(ds, port)) bcm_sf2_imp_setup(ds, port); - else + else if (!((1 << port) & ds->enabled_port_mask)) bcm_sf2_port_disable(ds, port, NULL); } From 5345862e9af02ce3780639bfe350ed2da9f8af13 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:47 -0700 Subject: [PATCH 0276/1322] net: dsa: b53: Use a macro to define I/O operations Instead of repeating the same pattern: acquire mutex, read/write, release mutex, define a macro: b53_build_op() which takes the type (read|write), I/O size, and value (scalar or pointer). This helps with fixing bugs that could exist (e.g: missing barrier, lock etc.). Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
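The register-access refactor below collapses ten near-identical helpers into one macro. A generic, self-contained sketch of that pattern follows; struct foo_dev, struct foo_ops and the foo_ prefix are placeholders for illustration, not the b53 types.

#include <linux/mutex.h>
#include <linux/types.h>

struct foo_dev;

struct foo_ops {
	int (*read8)(struct foo_dev *dev, u8 page, u8 reg, u8 *val);
	int (*write8)(struct foo_dev *dev, u8 page, u8 reg, u8 val);
};

struct foo_dev {
	struct mutex reg_mutex;		/* serializes all register access */
	const struct foo_ops *ops;
};

/* Generate a lock-wrapped accessor for each underlying ops callback. */
#define foo_build_op(op, val_type)					\
static inline int foo_##op(struct foo_dev *dev, u8 page, u8 reg,	\
			   val_type val)				\
{									\
	int ret;							\
									\
	mutex_lock(&dev->reg_mutex);					\
	ret = dev->ops->op(dev, page, reg, val);			\
	mutex_unlock(&dev->reg_mutex);					\
									\
	return ret;							\
}

foo_build_op(read8, u8 *);	/* defines foo_read8() */
foo_build_op(write8, u8);	/* defines foo_write8() */
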
Miller --- drivers/net/dsa/b53/b53_priv.h | 131 ++++++--------------------------- 1 file changed, 21 insertions(+), 110 deletions(-) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 7528b22aeb03..5bebe97900e8 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -199,119 +199,30 @@ static inline void b53_switch_remove(struct b53_device *dev) dsa_unregister_switch(dev->ds); } -static inline int b53_read8(struct b53_device *dev, u8 page, u8 reg, u8 *val) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->read8(dev, page, reg, val); - mutex_unlock(&dev->reg_mutex); - - return ret; +#define b53_build_op(type_op_size, val_type) \ +static inline int b53_##type_op_size(struct b53_device *dev, u8 page, \ + u8 reg, val_type val) \ +{ \ + int ret; \ + \ + mutex_lock(&dev->reg_mutex); \ + ret = dev->ops->type_op_size(dev, page, reg, val); \ + mutex_unlock(&dev->reg_mutex); \ + \ + return ret; \ } -static inline int b53_read16(struct b53_device *dev, u8 page, u8 reg, u16 *val) -{ - int ret; +b53_build_op(read8, u8 *); +b53_build_op(read16, u16 *); +b53_build_op(read32, u32 *); +b53_build_op(read48, u64 *); +b53_build_op(read64, u64 *); - mutex_lock(&dev->reg_mutex); - ret = dev->ops->read16(dev, page, reg, val); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_read32(struct b53_device *dev, u8 page, u8 reg, u32 *val) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->read32(dev, page, reg, val); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_read48(struct b53_device *dev, u8 page, u8 reg, u64 *val) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->read48(dev, page, reg, val); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_read64(struct b53_device *dev, u8 page, u8 reg, u64 *val) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->read64(dev, page, reg, val); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_write8(struct b53_device *dev, u8 page, u8 reg, u8 value) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->write8(dev, page, reg, value); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_write16(struct b53_device *dev, u8 page, u8 reg, - u16 value) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->write16(dev, page, reg, value); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_write32(struct b53_device *dev, u8 page, u8 reg, - u32 value) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->write32(dev, page, reg, value); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_write48(struct b53_device *dev, u8 page, u8 reg, - u64 value) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->write48(dev, page, reg, value); - mutex_unlock(&dev->reg_mutex); - - return ret; -} - -static inline int b53_write64(struct b53_device *dev, u8 page, u8 reg, - u64 value) -{ - int ret; - - mutex_lock(&dev->reg_mutex); - ret = dev->ops->write64(dev, page, reg, value); - mutex_unlock(&dev->reg_mutex); - - return ret; -} +b53_build_op(write8, u8); +b53_build_op(write16, u16); +b53_build_op(write32, u32); +b53_build_op(write48, u64); +b53_build_op(write64, u64); struct b53_arl_entry { u8 port; From b409a9efa183d92d99bc314fb26ebc1c2a8b4379 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:48 -0700 Subject: [PATCH 0277/1322] net: 
dsa: b53: Move Broadcom header setup to b53 The code to enable Broadcom tags/headers is largely switch independent, and in preparation for enabling it for multiple devices with b53, move the code we have in bcm_sf2.c to b53_common.c Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 47 ++++++++++++++++++++++++++++++++ drivers/net/dsa/b53/b53_priv.h | 1 + drivers/net/dsa/b53/b53_regs.h | 7 +++++ drivers/net/dsa/bcm_sf2.c | 43 ++--------------------------- drivers/net/dsa/bcm_sf2_regs.h | 8 ------ 5 files changed, 57 insertions(+), 49 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 3297af6aab8a..aa2187c71ea5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -538,6 +538,53 @@ static void b53_disable_port(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } +void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) +{ + struct b53_device *dev = ds->priv; + u8 hdr_ctl, val; + u16 reg; + + /* Resolve which bit controls the Broadcom tag */ + switch (port) { + case 8: + val = BRCM_HDR_P8_EN; + break; + case 7: + val = BRCM_HDR_P7_EN; + break; + case 5: + val = BRCM_HDR_P5_EN; + break; + default: + val = 0; + break; + } + + /* Enable Broadcom tags for IMP port */ + b53_read8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, &hdr_ctl); + hdr_ctl |= val; + b53_write8(dev, B53_MGMT_PAGE, B53_BRCM_HDR, hdr_ctl); + + /* Registers below are only accessible on newer devices */ + if (!is58xx(dev)) + return; + + /* Enable reception Broadcom tag for CPU TX (switch RX) to + * allow us to tag outgoing frames + */ + b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, ®); + reg &= ~BIT(port); + b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_RX_DIS, reg); + + /* Enable transmission of Broadcom tags from the switch (CPU RX) to + * allow delivering frames to the per-port net_devices + */ + b53_read16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, ®); + reg &= ~BIT(port); + b53_write16(dev, B53_MGMT_PAGE, B53_BRCM_HDR_TX_DIS, reg); +} +EXPORT_SYMBOL(b53_brcm_hdr_setup); + static void b53_enable_cpu_port(struct b53_device *dev, int port) { u8 port_ctrl; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 5bebe97900e8..77102f685da0 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -309,5 +309,6 @@ int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); +void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); #endif diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index e5c86d44667a..5e8b8e31fee8 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -210,6 +210,7 @@ #define B53_BRCM_HDR 0x03 #define BRCM_HDR_P8_EN BIT(0) /* Enable tagging on port 8 */ #define BRCM_HDR_P5_EN BIT(1) /* Enable tagging on port 5 */ +#define BRCM_HDR_P7_EN BIT(2) /* Enable tagging on port 7 */ /* Mirror capture control register (16 bit) */ #define B53_MIR_CAP_CTL 0x10 @@ -249,6 +250,12 @@ /* Revision ID register (8 bit) */ #define B53_REV_ID 0x40 +/* Broadcom header RX control (16 bit) */ +#define B53_BRCM_HDR_RX_DIS 0x60 + +/* Broadcom header TX control (16 bit) */ +#define B53_BRCM_HDR_TX_DIS 0x62 + /************************************************************************* * ARL Access Page 
Registers *************************************************************************/ diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 8acbd17bc1fd..49cb51223f70 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -60,45 +60,6 @@ static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) } } -static void bcm_sf2_brcm_hdr_setup(struct bcm_sf2_priv *priv, int port) -{ - u32 reg, val; - - /* Resolve which bit controls the Broadcom tag */ - switch (port) { - case 8: - val = BRCM_HDR_EN_P8; - break; - case 7: - val = BRCM_HDR_EN_P7; - break; - case 5: - val = BRCM_HDR_EN_P5; - break; - default: - val = 0; - break; - } - - /* Enable Broadcom tags for IMP port */ - reg = core_readl(priv, CORE_BRCM_HDR_CTRL); - reg |= val; - core_writel(priv, reg, CORE_BRCM_HDR_CTRL); - - /* Enable reception Broadcom tag for CPU TX (switch RX) to - * allow us to tag outgoing frames - */ - reg = core_readl(priv, CORE_BRCM_HDR_RX_DIS); - reg &= ~(1 << port); - core_writel(priv, reg, CORE_BRCM_HDR_RX_DIS); - - /* Enable transmission of Broadcom tags from the switch (CPU RX) to - * allow delivering frames to the per-port net_devices - */ - reg = core_readl(priv, CORE_BRCM_HDR_TX_DIS); - reg &= ~(1 << port); - core_writel(priv, reg, CORE_BRCM_HDR_TX_DIS); -} static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) { @@ -138,7 +99,7 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) reg |= i << (PRT_TO_QID_SHIFT * i); core_writel(priv, reg, CORE_PORT_TC2_QOS_MAP_PORT(port)); - bcm_sf2_brcm_hdr_setup(priv, port); + b53_brcm_hdr_setup(ds, port); /* Force link status for IMP port */ reg = core_readl(priv, offset); @@ -247,7 +208,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, /* Enable Broadcom tags for that port if requested */ if (priv->brcm_tag_mask & BIT(port)) - bcm_sf2_brcm_hdr_setup(priv, port); + b53_brcm_hdr_setup(ds, port); /* Configure Traffic Class to QoS mapping, allow each priority to map * to a different queue number diff --git a/drivers/net/dsa/bcm_sf2_regs.h b/drivers/net/dsa/bcm_sf2_regs.h index 49695fcc2ea8..788361ad68a0 100644 --- a/drivers/net/dsa/bcm_sf2_regs.h +++ b/drivers/net/dsa/bcm_sf2_regs.h @@ -205,16 +205,8 @@ enum bcm_sf2_reg_offs { #define CORE_IMP0_PRT_ID 0x0804 -#define CORE_BRCM_HDR_CTRL 0x0080c -#define BRCM_HDR_EN_P8 (1 << 0) -#define BRCM_HDR_EN_P5 (1 << 1) -#define BRCM_HDR_EN_P7 (1 << 2) - #define CORE_RST_MIB_CNT_EN 0x0950 -#define CORE_BRCM_HDR_RX_DIS 0x0980 -#define CORE_BRCM_HDR_TX_DIS 0x0988 - #define CORE_ARLA_VTBL_RWCTRL 0x1600 #define ARLA_VTBL_CMD_WRITE 0 #define ARLA_VTBL_CMD_READ 1 From 909d812a668e8950ed8bb33ecdc9afbcdb0d371b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:49 -0700 Subject: [PATCH 0278/1322] net: dsa: b53: Define EEE register page In preparation for migrating the EEE code from bcm_sf2 to b53, define the full EEE register page and offsets within that page. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_regs.h | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index 5e8b8e31fee8..2a9f421680aa 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -50,6 +50,9 @@ /* Jumbo Frame Registers */ #define B53_JUMBO_PAGE 0x40 +/* EEE Control Registers Page */ +#define B53_EEE_PAGE 0x92 + /* CFP Configuration Registers Page */ #define B53_CFP_PAGE 0xa1 @@ -471,6 +474,44 @@ #define JMS_MIN_SIZE 1518 #define JMS_MAX_SIZE 9724 +/************************************************************************* + * EEE Configuration Page Registers + *************************************************************************/ + +/* EEE Enable control register (16 bit) */ +#define B53_EEE_EN_CTRL 0x00 + +/* EEE LPI assert status register (16 bit) */ +#define B53_EEE_LPI_ASSERT_STS 0x02 + +/* EEE LPI indicate status register (16 bit) */ +#define B53_EEE_LPI_INDICATE 0x4 + +/* EEE Receiving idle symbols status register (16 bit) */ +#define B53_EEE_RX_IDLE_SYM_STS 0x6 + +/* EEE Pipeline timer register (32 bit) */ +#define B53_EEE_PIP_TIMER 0xC + +/* EEE Sleep timer Gig register (32 bit) */ +#define B53_EEE_SLEEP_TIMER_GIG(i) (0x10 + 4 * (i)) + +/* EEE Sleep timer FE register (32 bit) */ +#define B53_EEE_SLEEP_TIMER_FE(i) (0x34 + 4 * (i)) + +/* EEE Minimum LP timer Gig register (32 bit) */ +#define B53_EEE_MIN_LP_TIMER_GIG(i) (0x58 + 4 * (i)) + +/* EEE Minimum LP timer FE register (32 bit) */ +#define B53_EEE_MIN_LP_TIMER_FE(i) (0x7c + 4 * (i)) + +/* EEE Wake timer Gig register (16 bit) */ +#define B53_EEE_WAKE_TIMER_GIG(i) (0xa0 + 2 * (i)) + +/* EEE Wake timer FE register (16 bit) */ +#define B53_EEE_WAKE_TIMER_FE(i) (0xb2 + 2 * (i)) + + /************************************************************************* * CFP Configuration Page Registers *************************************************************************/ From 22256b0afb12333571ad11799fa68fd27e4f4e80 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:50 -0700 Subject: [PATCH 0279/1322] net: dsa: b53: Move EEE functions to b53 Move the bcm_sf2 EEE-related functions to the b53 driver because this is shared code amongst Gigabit capable switch, only 5325 and 5365 are too old to support that. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_common.c | 63 ++++++++++++++++++++++++++++++ drivers/net/dsa/b53/b53_priv.h | 5 +++ drivers/net/dsa/bcm_sf2.c | 66 +++----------------------------- drivers/net/dsa/bcm_sf2.h | 2 - drivers/net/dsa/bcm_sf2_regs.h | 3 -- 5 files changed, 74 insertions(+), 65 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index aa2187c71ea5..491e4ffa8a0e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1531,6 +1531,69 @@ void b53_mirror_del(struct dsa_switch *ds, int port, } EXPORT_SYMBOL(b53_mirror_del); +void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable) +{ + struct b53_device *dev = ds->priv; + u16 reg; + + b53_read16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, ®); + if (enable) + reg |= BIT(port); + else + reg &= ~BIT(port); + b53_write16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, reg); +} +EXPORT_SYMBOL(b53_eee_enable_set); + + +/* Returns 0 if EEE was not enabled, or 1 otherwise + */ +int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy) +{ + int ret; + + ret = phy_init_eee(phy, 0); + if (ret) + return 0; + + b53_eee_enable_set(ds, port, true); + + return 1; +} +EXPORT_SYMBOL(b53_eee_init); + +int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e) +{ + struct b53_device *dev = ds->priv; + struct ethtool_eee *p = &dev->ports[port].eee; + u16 reg; + + if (is5325(dev) || is5365(dev)) + return -EOPNOTSUPP; + + b53_read16(dev, B53_EEE_PAGE, B53_EEE_LPI_INDICATE, ®); + e->eee_enabled = p->eee_enabled; + e->eee_active = !!(reg & BIT(port)); + + return 0; +} +EXPORT_SYMBOL(b53_get_mac_eee); + +int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e) +{ + struct b53_device *dev = ds->priv; + struct ethtool_eee *p = &dev->ports[port].eee; + + if (is5325(dev) || is5365(dev)) + return -EOPNOTSUPP; + + p->eee_enabled = e->eee_enabled; + b53_eee_enable_set(ds, port, e->eee_enabled); + + return 0; +} +EXPORT_SYMBOL(b53_set_mac_eee); + static const struct dsa_switch_ops b53_switch_ops = { .get_tag_protocol = b53_get_tag_protocol, .setup = b53_setup, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 77102f685da0..aabe80eab25d 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -70,6 +70,7 @@ enum { struct b53_port { u16 vlan_ctl_mask; + struct ethtool_eee eee; }; struct b53_vlan { @@ -310,5 +311,9 @@ int b53_mirror_add(struct dsa_switch *ds, int port, void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); +void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable); +int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); +int b53_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e); +int b53_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e); #endif diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 49cb51223f70..4e8ef4c07eab 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -107,19 +107,6 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) core_writel(priv, reg, offset); } -static void bcm_sf2_eee_enable_set(struct dsa_switch *ds, int port, bool enable) -{ - struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - u32 reg; - - reg = core_readl(priv, CORE_EEE_EN_CTRL); - if (enable) - reg |= 1 << port; - else - reg &= ~(1 << port); - core_writel(priv, reg, 
CORE_EEE_EN_CTRL); -} - static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); @@ -256,8 +243,8 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, bcm_sf2_imp_vlan_setup(ds, cpu_port); /* If EEE was enabled, restore it */ - if (priv->port_sts[port].eee.eee_enabled) - bcm_sf2_eee_enable_set(ds, port, true); + if (priv->dev->ports[port].eee.eee_enabled) + b53_eee_enable_set(ds, port, true); return 0; } @@ -292,47 +279,6 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int port, core_writel(priv, reg, CORE_MEM_PSM_VDD_CTRL); } -/* Returns 0 if EEE was not enabled, or 1 otherwise - */ -static int bcm_sf2_eee_init(struct dsa_switch *ds, int port, - struct phy_device *phy) -{ - int ret; - - ret = phy_init_eee(phy, 0); - if (ret) - return 0; - - bcm_sf2_eee_enable_set(ds, port, true); - - return 1; -} - -static int bcm_sf2_sw_get_mac_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) -{ - struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - struct ethtool_eee *p = &priv->port_sts[port].eee; - u32 reg; - - reg = core_readl(priv, CORE_EEE_LPI_INDICATE); - e->eee_enabled = p->eee_enabled; - e->eee_active = !!(reg & (1 << port)); - - return 0; -} - -static int bcm_sf2_sw_set_mac_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) -{ - struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - struct ethtool_eee *p = &priv->port_sts[port].eee; - - p->eee_enabled = e->eee_enabled; - bcm_sf2_eee_enable_set(ds, port, e->eee_enabled); - - return 0; -} static int bcm_sf2_sw_indir_rw(struct bcm_sf2_priv *priv, int op, int addr, int regnum, u16 val) @@ -567,7 +513,7 @@ static void bcm_sf2_sw_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phydev) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - struct ethtool_eee *p = &priv->port_sts[port].eee; + struct ethtool_eee *p = &priv->dev->ports[port].eee; u32 id_mode_dis = 0, port_mode; const char *str = NULL; u32 reg, offset; @@ -649,7 +595,7 @@ force_link: core_writel(priv, reg, offset); if (!phydev->is_pseudo_fixed_link) - p->eee_enabled = bcm_sf2_eee_init(ds, port, phydev); + p->eee_enabled = b53_eee_init(ds, port, phydev); } static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, @@ -978,8 +924,8 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_wol = bcm_sf2_sw_set_wol, .port_enable = bcm_sf2_port_setup, .port_disable = bcm_sf2_port_disable, - .get_mac_eee = bcm_sf2_sw_get_mac_eee, - .set_mac_eee = bcm_sf2_sw_set_mac_eee, + .get_mac_eee = b53_get_mac_eee, + .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, .port_stp_state_set = b53_br_set_stp_state, diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h index 02c499f9c56b..1922e027ff59 100644 --- a/drivers/net/dsa/bcm_sf2.h +++ b/drivers/net/dsa/bcm_sf2.h @@ -48,8 +48,6 @@ struct bcm_sf2_hw_params { struct bcm_sf2_port_status { unsigned int link; - - struct ethtool_eee eee; }; struct bcm_sf2_cfp_priv { diff --git a/drivers/net/dsa/bcm_sf2_regs.h b/drivers/net/dsa/bcm_sf2_regs.h index 788361ad68a0..d8b8074a47b9 100644 --- a/drivers/net/dsa/bcm_sf2_regs.h +++ b/drivers/net/dsa/bcm_sf2_regs.h @@ -244,9 +244,6 @@ enum bcm_sf2_reg_offs { #define CORE_JOIN_ALL_VLAN_EN 0xd140 -#define CORE_EEE_EN_CTRL 0x24800 -#define CORE_EEE_LPI_INDICATE 0x24810 - #define CORE_CFP_ACC 0x28000 #define OP_STR_DONE (1 << 0) #define OP_SEL_SHIFT 1 From f43a2dbe9597038578171e70cb8fe751a0383fe2 Mon Sep 17 00:00:00 2001 From: 
Florian Fainelli Date: Tue, 19 Sep 2017 10:46:51 -0700 Subject: [PATCH 0280/1322] net: dsa: b53: Wire-up EEE Add support for enabling and disabling EEE, as well as re-negotiating it in .adjust_link() and in .port_enable(). Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 491e4ffa8a0e..4e37ec27e496 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -523,6 +523,10 @@ static int b53_enable_port(struct dsa_switch *ds, int port, b53_imp_vlan_setup(ds, cpu_port); + /* If EEE was enabled, restore it */ + if (dev->ports[port].eee.eee_enabled) + b53_eee_enable_set(ds, port, true); + return 0; } @@ -879,6 +883,7 @@ static void b53_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phydev) { struct b53_device *dev = ds->priv; + struct ethtool_eee *p = &dev->ports[port].eee; u8 rgmii_ctrl = 0, reg = 0, off; if (!phy_is_pseudo_fixed_link(phydev)) @@ -1000,6 +1005,9 @@ static void b53_adjust_link(struct dsa_switch *ds, int port, b53_write8(dev, B53_CTRL_PAGE, po_reg, gmii_po); } } + + /* Re-negotiate EEE if it was enabled already */ + p->eee_enabled = b53_eee_init(ds, port, phydev); } int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) @@ -1605,6 +1613,8 @@ static const struct dsa_switch_ops b53_switch_ops = { .adjust_link = b53_adjust_link, .port_enable = b53_enable_port, .port_disable = b53_disable_port, + .get_mac_eee = b53_get_mac_eee, + .set_mac_eee = b53_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, .port_stp_state_set = b53_br_set_stp_state, From aac028672cbea038f23e378ddb3af08c1dbe668c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:52 -0700 Subject: [PATCH 0281/1322] net: dsa: b53: Export b53_imp_vlan_setup() bcm_sf2 and b53 do exactly the same thing, so share that piece. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_common.c | 3 ++- drivers/net/dsa/b53/b53_priv.h | 1 + drivers/net/dsa/bcm_sf2.c | 23 +---------------------- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 4e37ec27e496..c3f1cd2c33ea 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -484,7 +484,7 @@ static int b53_fast_age_vlan(struct b53_device *dev, u16 vid) return b53_flush_arl(dev, FAST_AGE_VLAN); } -static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) +void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) { struct b53_device *dev = ds->priv; unsigned int i; @@ -500,6 +500,7 @@ static void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(i), pvlan); } } +EXPORT_SYMBOL(b53_imp_vlan_setup); static int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index aabe80eab25d..8f4f83e2e4bd 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -284,6 +284,7 @@ static inline int b53_switch_get_reset_gpio(struct b53_device *dev) #endif /* Exported functions towards other drivers */ +void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port); void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data); void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data); int b53_get_sset_count(struct dsa_switch *ds); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 4e8ef4c07eab..08639674947a 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -40,27 +40,6 @@ static enum dsa_tag_protocol bcm_sf2_sw_get_tag_protocol(struct dsa_switch *ds) return DSA_TAG_PROTO_BRCM; } -static void bcm_sf2_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) -{ - struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - unsigned int i; - u32 reg; - - /* Enable the IMP Port to be in the same VLAN as the other ports - * on a per-port basis such that we only have Port i and IMP in - * the same VLAN. - */ - for (i = 0; i < priv->hw_params.num_ports; i++) { - if (!((1 << i) & ds->enabled_port_mask)) - continue; - - reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(i)); - reg |= (1 << cpu_port); - core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(i)); - } -} - - static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); @@ -240,7 +219,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg |= priv->dev->ports[port].vlan_ctl_mask; core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port)); - bcm_sf2_imp_vlan_setup(ds, cpu_port); + b53_imp_vlan_setup(ds, cpu_port); /* If EEE was enabled, restore it */ if (priv->dev->ports[port].eee.eee_enabled) From 152b6fd60ae075dbe41707ed9c7caddc18b03b35 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:53 -0700 Subject: [PATCH 0282/1322] net: dsa: bcm_sf2: Use SF2_NUM_EGRESS_QUEUES for CFP The magic number 8 in 3 locations in bcm_sf2_cfp.c actually designates the number of switch port egress queues, so use that define instead of open-coding it. Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
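The magic-number cleanup below is easier to follow with the ring_cookie encoding spelled out: in this driver's CFP code the ethtool cookie packs the destination port and the egress queue together. The snippet is a standalone illustration with a hypothetical constant and helper, not the bcm_sf2 source.

#include <linux/types.h>

#define EXAMPLE_NUM_EGRESS_QUEUES	8	/* egress queues per switch port */

/* ring_cookie convention assumed here: cookie = port * queues + queue */
static void example_decode_ring_cookie(u64 ring_cookie,
				       unsigned int *port_num,
				       unsigned int *queue_num)
{
	*port_num  = ring_cookie / EXAMPLE_NUM_EGRESS_QUEUES;
	*queue_num = ring_cookie % EXAMPLE_NUM_EGRESS_QUEUES;
}
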
Miller --- drivers/net/dsa/bcm_sf2_cfp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 8a1da7e67707..94649e1481ec 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -144,7 +144,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, * destination port is enabled and that we are within the * number of ports supported by the switch */ - port_num = fs->ring_cookie / 8; + port_num = fs->ring_cookie / SF2_NUM_EGRESS_QUEUES; if (fs->ring_cookie == RX_CLS_FLOW_DISC || !(BIT(port_num) & ds->enabled_port_mask) || @@ -280,7 +280,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, * We have a small oddity where Port 6 just does not have a * valid bit here (so we subtract by one). */ - queue_num = fs->ring_cookie % 8; + queue_num = fs->ring_cookie % SF2_NUM_EGRESS_QUEUES; if (port_num >= 7) port_num -= 1; @@ -401,7 +401,7 @@ static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port, /* There is no Port 6, so we compensate for that here */ if (nfc->fs.ring_cookie >= 6) nfc->fs.ring_cookie++; - nfc->fs.ring_cookie *= 8; + nfc->fs.ring_cookie *= SF2_NUM_EGRESS_QUEUES; /* Extract the destination queue */ queue_num = (reg >> NEW_TC_SHIFT) & NEW_TC_MASK; From f86ad77faf248d6101394f8c79d03f16a9ea461d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 10:46:54 -0700 Subject: [PATCH 0283/1322] net: dsa: bcm_sf2: Utilize b53_{enable, disable}_port Export b53_{enable,disable}_port and use these two functions in bcm_sf2_port_setup and bcm_sf2_port_disable. The generic functions cannot be used without wrapping because we need to manage additional switch integration details (PHY, Broadcom tag etc.). Reviewed-by: Vivien Didelot Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/b53/b53_common.c | 8 ++++---- drivers/net/dsa/b53/b53_priv.h | 2 ++ drivers/net/dsa/bcm_sf2.c | 26 ++------------------------ 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index c3f1cd2c33ea..a9f2a5b55a5e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -502,8 +502,7 @@ void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port) } EXPORT_SYMBOL(b53_imp_vlan_setup); -static int b53_enable_port(struct dsa_switch *ds, int port, - struct phy_device *phy) +int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; unsigned int cpu_port = dev->cpu_port; @@ -530,9 +529,9 @@ static int b53_enable_port(struct dsa_switch *ds, int port, return 0; } +EXPORT_SYMBOL(b53_enable_port); -static void b53_disable_port(struct dsa_switch *ds, int port, - struct phy_device *phy) +void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; u8 reg; @@ -542,6 +541,7 @@ static void b53_disable_port(struct dsa_switch *ds, int port, reg |= PORT_CTRL_RX_DISABLE | PORT_CTRL_TX_DISABLE; b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), reg); } +EXPORT_SYMBOL(b53_disable_port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port) { diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 8f4f83e2e4bd..603c66d240d8 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -311,6 +311,8 @@ int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); +int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); +void b53_disable_port(struct dsa_switch *ds, int port, struct phy_device *phy); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable); int b53_eee_init(struct dsa_switch *ds, int port, struct phy_device *phy); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 08639674947a..0072a959db5b 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -163,7 +163,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, struct phy_device *phy) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - s8 cpu_port = ds->dst->cpu_dp->index; unsigned int i; u32 reg; @@ -184,9 +183,6 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, reg |= i << (PRT_TO_QID_SHIFT * i); core_writel(priv, reg, CORE_PORT_TC2_QOS_MAP_PORT(port)); - /* Clear the Rx and Tx disable bits and set to no spanning tree */ - core_writel(priv, 0, CORE_G_PCTL_PORT(port)); - /* Re-enable the GPHY and re-apply workarounds */ if (priv->int_phy_mask & 1 << port && priv->hw_params.num_gphy == 1) { bcm_sf2_gphy_enable_set(ds, true); @@ -209,23 +205,7 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, if (port == priv->moca_port) bcm_sf2_port_intr_enable(priv, port); - /* Set this port, and only this one to be in the default VLAN, - * if member of a bridge, restore its membership prior to - * bringing down this port. 
- */ - reg = core_readl(priv, CORE_PORT_VLAN_CTL_PORT(port)); - reg &= ~PORT_VLAN_CTRL_MASK; - reg |= (1 << port); - reg |= priv->dev->ports[port].vlan_ctl_mask; - core_writel(priv, reg, CORE_PORT_VLAN_CTL_PORT(port)); - - b53_imp_vlan_setup(ds, cpu_port); - - /* If EEE was enabled, restore it */ - if (priv->dev->ports[port].eee.eee_enabled) - b53_eee_enable_set(ds, port, true); - - return 0; + return b53_enable_port(ds, port, phy); } static void bcm_sf2_port_disable(struct dsa_switch *ds, int port, @@ -248,9 +228,7 @@ static void bcm_sf2_port_disable(struct dsa_switch *ds, int port, else off = CORE_G_PCTL_PORT(port); - reg = core_readl(priv, off); - reg |= RX_DIS | TX_DIS; - core_writel(priv, reg, off); + b53_disable_port(ds, port, phy); /* Power down the port memory */ reg = core_readl(priv, CORE_MEM_PSM_VDD_CTRL); From 7e936bd73483eb1873b9ab6f375c9a1d781bc5cb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Sep 2017 01:12:11 +0200 Subject: [PATCH 0284/1322] test_rhashtable: don't allocate huge static array Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- lib/test_rhashtable.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 0ffca990a833..c40d6e636f33 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -72,8 +72,6 @@ struct thread_data { struct test_obj *objs; }; -static struct test_obj array[MAX_ENTRIES]; - static struct rhashtable_params test_rht_params = { .head_offset = offsetof(struct test_obj, node), .key_offset = offsetof(struct test_obj, value), @@ -85,7 +83,7 @@ static struct rhashtable_params test_rht_params = { static struct semaphore prestart_sem; static struct semaphore startup_sem = __SEMAPHORE_INITIALIZER(startup_sem, 0); -static int insert_retry(struct rhashtable *ht, struct rhash_head *obj, +static int insert_retry(struct rhashtable *ht, struct test_obj *obj, const struct rhashtable_params params) { int err, retries = -1, enomem_retries = 0; @@ -93,7 +91,7 @@ static int insert_retry(struct rhashtable *ht, struct rhash_head *obj, do { retries++; cond_resched(); - err = rhashtable_insert_fast(ht, obj, params); + err = rhashtable_insert_fast(ht, &obj->node, params); if (err == -ENOMEM && enomem_retry) { enomem_retries++; err = -EBUSY; @@ -107,7 +105,7 @@ static int insert_retry(struct rhashtable *ht, struct rhash_head *obj, return err ? 
: retries; } -static int __init test_rht_lookup(struct rhashtable *ht) +static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array) { unsigned int i; @@ -186,7 +184,7 @@ static void test_bucket_stats(struct rhashtable *ht) pr_warn("Test failed: Total count mismatch ^^^"); } -static s64 __init test_rhashtable(struct rhashtable *ht) +static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array) { struct test_obj *obj; int err; @@ -203,7 +201,7 @@ static s64 __init test_rhashtable(struct rhashtable *ht) struct test_obj *obj = &array[i]; obj->value.id = i * 2; - err = insert_retry(ht, &obj->node, test_rht_params); + err = insert_retry(ht, obj, test_rht_params); if (err > 0) insert_retries += err; else if (err) @@ -216,7 +214,7 @@ static s64 __init test_rhashtable(struct rhashtable *ht) test_bucket_stats(ht); rcu_read_lock(); - test_rht_lookup(ht); + test_rht_lookup(ht, array); rcu_read_unlock(); test_bucket_stats(ht); @@ -286,7 +284,7 @@ static int threadfunc(void *data) for (i = 0; i < entries; i++) { tdata->objs[i].value.id = i; tdata->objs[i].value.tid = tdata->id; - err = insert_retry(&ht, &tdata->objs[i].node, test_rht_params); + err = insert_retry(&ht, &tdata->objs[i], test_rht_params); if (err > 0) { insert_retries += err; } else if (err) { @@ -349,6 +347,10 @@ static int __init test_rht_init(void) test_rht_params.max_size = max_size ? : roundup_pow_of_two(entries); test_rht_params.nelem_hint = size; + objs = vzalloc((test_rht_params.max_size + 1) * sizeof(struct test_obj)); + if (!objs) + return -ENOMEM; + pr_info("Running rhashtable test nelem=%d, max_size=%d, shrinking=%d\n", size, max_size, shrinking); @@ -356,7 +358,8 @@ static int __init test_rht_init(void) s64 time; pr_info("Test %02d:\n", i); - memset(&array, 0, sizeof(array)); + memset(objs, 0, test_rht_params.max_size * sizeof(struct test_obj)); + err = rhashtable_init(&ht, &test_rht_params); if (err < 0) { pr_warn("Test failed: Unable to initialize hashtable: %d\n", @@ -364,9 +367,10 @@ static int __init test_rht_init(void) continue; } - time = test_rhashtable(&ht); + time = test_rhashtable(&ht, objs); rhashtable_destroy(&ht); if (time < 0) { + vfree(objs); pr_warn("Test failed: return code %lld\n", time); return -EINVAL; } @@ -374,6 +378,7 @@ static int __init test_rht_init(void) total_time += time; } + vfree(objs); do_div(total_time, runs); pr_info("Average test time: %llu\n", total_time); From f651616e799ff01d1e21abc053ee1e23ac1a2344 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Sep 2017 01:12:12 +0200 Subject: [PATCH 0285/1322] test_rhashtable: don't use global entries variable pass the entries to test as an argument instead. Followup patch will add an rhlist test case; rhlist delete opererations are slow so we need to use a smaller number to test it. Signed-off-by: Florian Westphal Signed-off-by: David S. 
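The test_rhashtable changes above and below both revolve around sizing the test objects at runtime instead of a fixed MAX_ENTRIES static array. A minimal sketch of that allocation pattern follows; example_obj and the helper names are hypothetical.

#include <linux/vmalloc.h>
#include <linux/errno.h>

struct example_obj {
	unsigned long key;
};

static struct example_obj *example_objs;

/* Replaces: static struct example_obj example_objs[MAX_ENTRIES]; */
static int example_alloc(unsigned int nelems)
{
	example_objs = vzalloc(nelems * sizeof(*example_objs));
	return example_objs ? 0 : -ENOMEM;
}

static void example_free(void)
{
	vfree(example_objs);
	example_objs = NULL;
}
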
Miller --- lib/test_rhashtable.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index c40d6e636f33..69f5b3849980 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -28,9 +28,9 @@ #define MAX_ENTRIES 1000000 #define TEST_INSERT_FAIL INT_MAX -static int entries = 50000; -module_param(entries, int, 0); -MODULE_PARM_DESC(entries, "Number of entries to add (default: 50000)"); +static int parm_entries = 50000; +module_param(parm_entries, int, 0); +MODULE_PARM_DESC(parm_entries, "Number of entries to add (default: 50000)"); static int runs = 4; module_param(runs, int, 0); @@ -67,6 +67,7 @@ struct test_obj { }; struct thread_data { + unsigned int entries; int id; struct task_struct *task; struct test_obj *objs; @@ -105,11 +106,12 @@ static int insert_retry(struct rhashtable *ht, struct test_obj *obj, return err ? : retries; } -static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array) +static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array, + unsigned int entries) { unsigned int i; - for (i = 0; i < entries * 2; i++) { + for (i = 0; i < entries; i++) { struct test_obj *obj; bool expected = !(i % 2); struct test_obj_val key = { @@ -142,7 +144,7 @@ static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array) return 0; } -static void test_bucket_stats(struct rhashtable *ht) +static void test_bucket_stats(struct rhashtable *ht, unsigned int entries) { unsigned int err, total = 0, chain_len = 0; struct rhashtable_iter hti; @@ -184,7 +186,8 @@ static void test_bucket_stats(struct rhashtable *ht) pr_warn("Test failed: Total count mismatch ^^^"); } -static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array) +static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array, + unsigned int entries) { struct test_obj *obj; int err; @@ -212,12 +215,12 @@ static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array) pr_info(" %u insertions retried due to memory pressure\n", insert_retries); - test_bucket_stats(ht); + test_bucket_stats(ht, entries); rcu_read_lock(); - test_rht_lookup(ht, array); + test_rht_lookup(ht, array, entries); rcu_read_unlock(); - test_bucket_stats(ht); + test_bucket_stats(ht, entries); pr_info(" Deleting %d keys\n", entries); for (i = 0; i < entries; i++) { @@ -245,6 +248,7 @@ static struct rhashtable ht; static int thread_lookup_test(struct thread_data *tdata) { + unsigned int entries = tdata->entries; int i, err = 0; for (i = 0; i < entries; i++) { @@ -281,7 +285,7 @@ static int threadfunc(void *data) if (down_interruptible(&startup_sem)) pr_err(" thread[%d]: down_interruptible failed\n", tdata->id); - for (i = 0; i < entries; i++) { + for (i = 0; i < tdata->entries; i++) { tdata->objs[i].value.id = i; tdata->objs[i].value.tid = tdata->id; err = insert_retry(&ht, &tdata->objs[i], test_rht_params); @@ -305,7 +309,7 @@ static int threadfunc(void *data) } for (step = 10; step > 0; step--) { - for (i = 0; i < entries; i += step) { + for (i = 0; i < tdata->entries; i += step) { if (tdata->objs[i].value.id == TEST_INSERT_FAIL) continue; err = rhashtable_remove_fast(&ht, &tdata->objs[i].node, @@ -336,12 +340,16 @@ out: static int __init test_rht_init(void) { + unsigned int entries; int i, err, started_threads = 0, failed_threads = 0; u64 total_time = 0; struct thread_data *tdata; struct test_obj *objs; - entries = min(entries, MAX_ENTRIES); + if 
(parm_entries < 0) + parm_entries = 1; + + entries = min(parm_entries, MAX_ENTRIES); test_rht_params.automatic_shrinking = shrinking; test_rht_params.max_size = max_size ? : roundup_pow_of_two(entries); @@ -367,7 +375,7 @@ static int __init test_rht_init(void) continue; } - time = test_rhashtable(&ht, objs); + time = test_rhashtable(&ht, objs, entries); rhashtable_destroy(&ht); if (time < 0) { vfree(objs); @@ -409,6 +417,7 @@ static int __init test_rht_init(void) } for (i = 0; i < tcount; i++) { tdata[i].id = i; + tdata[i].entries = entries; tdata[i].objs = objs + i * entries; tdata[i].task = kthread_run(threadfunc, &tdata[i], "rhashtable_thrad[%d]", i); From a6359bd8dd1c3a15c09e7dbb533bf89d13b42acd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Sep 2017 01:12:13 +0200 Subject: [PATCH 0286/1322] test_rhashtable: add a check for max_size add a test that tries to insert more than max_size elements. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- lib/test_rhashtable.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 69f5b3849980..1eee90e6e394 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -246,6 +246,43 @@ static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array, static struct rhashtable ht; +static int __init test_rhashtable_max(struct test_obj *array, + unsigned int entries) +{ + unsigned int i, insert_retries = 0; + int err; + + test_rht_params.max_size = roundup_pow_of_two(entries / 8); + err = rhashtable_init(&ht, &test_rht_params); + if (err) + return err; + + for (i = 0; i < ht.max_elems; i++) { + struct test_obj *obj = &array[i]; + + obj->value.id = i * 2; + err = insert_retry(&ht, obj, test_rht_params); + if (err > 0) + insert_retries += err; + else if (err) + return err; + } + + err = insert_retry(&ht, &array[ht.max_elems], test_rht_params); + if (err == -E2BIG) { + err = 0; + } else { + pr_info("insert element %u should have failed with %d, got %d\n", + ht.max_elems, -E2BIG, err); + if (err == 0) + err = -1; + } + + rhashtable_destroy(&ht); + + return err; +} + static int thread_lookup_test(struct thread_data *tdata) { unsigned int entries = tdata->entries; @@ -386,7 +423,11 @@ static int __init test_rht_init(void) total_time += time; } + pr_info("test if its possible to exceed max_size %d: %s\n", + test_rht_params.max_size, test_rhashtable_max(objs, entries) == 0 ? + "no, ok" : "YES, failed"); vfree(objs); + do_div(total_time, runs); pr_info("Average test time: %llu\n", total_time); From cdd4de372ea06a18e104100b9f2c442527d26db1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Sep 2017 01:12:14 +0200 Subject: [PATCH 0287/1322] test_rhashtable: add test case for rhl_table interface also test rhltable. rhltable remove operations are slow as deletions require a list walk, thus test with 1/16th of the given entry count number to get a run duration similar to rhashtable one. Signed-off-by: Florian Westphal Signed-off-by: David S. 
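Before the full test below, here is a minimal usage sketch of the rhltable interface it exercises; struct demo_obj and demo_params are illustrative definitions, not the test's own.

#include <linux/rhashtable.h>
#include <linux/kernel.h>

struct demo_obj {
	u32 key;
	struct rhlist_head list_node;
};

static const struct rhashtable_params demo_params = {
	.key_len     = sizeof(u32),
	.key_offset  = offsetof(struct demo_obj, key),
	.head_offset = offsetof(struct demo_obj, list_node),
};

static int demo_rhltable(struct demo_obj *obj)
{
	struct rhltable rhlt;
	struct rhlist_head *list, *pos;
	struct demo_obj *iter;
	int err;

	err = rhltable_init(&rhlt, &demo_params);
	if (err)
		return err;

	/* Unlike rhashtable, duplicate keys are allowed; entries with the
	 * same key are chained on one bucket list.
	 */
	err = rhltable_insert(&rhlt, &obj->list_node, demo_params);
	if (err)
		goto out;

	rcu_read_lock();
	list = rhltable_lookup(&rhlt, &obj->key, demo_params);
	rhl_for_each_entry_rcu(iter, pos, list, list_node)
		pr_info("found key %u\n", iter->key);
	rcu_read_unlock();

	err = rhltable_remove(&rhlt, &obj->list_node, demo_params);
out:
	rhltable_destroy(&rhlt);
	return err;
}
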
Miller --- lib/test_rhashtable.c | 196 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 194 insertions(+), 2 deletions(-) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 1eee90e6e394..de4d0584631a 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define MAX_ENTRIES 1000000 @@ -66,6 +67,11 @@ struct test_obj { struct rhash_head node; }; +struct test_obj_rhl { + struct test_obj_val value; + struct rhlist_head list_node; +}; + struct thread_data { unsigned int entries; int id; @@ -245,6 +251,186 @@ static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array, } static struct rhashtable ht; +static struct rhltable rhlt __initdata; + +static int __init test_rhltable(unsigned int entries) +{ + struct test_obj_rhl *rhl_test_objects; + unsigned long *obj_in_table; + unsigned int i, j, k; + int ret, err; + + if (entries == 0) + entries = 1; + + rhl_test_objects = vzalloc(sizeof(*rhl_test_objects) * entries); + if (!rhl_test_objects) + return -ENOMEM; + + ret = -ENOMEM; + obj_in_table = vzalloc(BITS_TO_LONGS(entries) * sizeof(unsigned long)); + if (!obj_in_table) + goto out_free; + + /* nulls_base not supported in rhlist interface */ + test_rht_params.nulls_base = 0; + err = rhltable_init(&rhlt, &test_rht_params); + if (WARN_ON(err)) + goto out_free; + + k = prandom_u32(); + ret = 0; + for (i = 0; i < entries; i++) { + rhl_test_objects[i].value.id = k; + err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node, + test_rht_params); + if (WARN(err, "error %d on element %d\n", err, i)) + break; + if (err == 0) + set_bit(i, obj_in_table); + } + + if (err) + ret = err; + + pr_info("test %d add/delete pairs into rhlist\n", entries); + for (i = 0; i < entries; i++) { + struct rhlist_head *h, *pos; + struct test_obj_rhl *obj; + struct test_obj_val key = { + .id = k, + }; + bool found; + + rcu_read_lock(); + h = rhltable_lookup(&rhlt, &key, test_rht_params); + if (WARN(!h, "key not found during iteration %d of %d", i, entries)) { + rcu_read_unlock(); + break; + } + + if (i) { + j = i - 1; + rhl_for_each_entry_rcu(obj, pos, h, list_node) { + if (WARN(pos == &rhl_test_objects[j].list_node, "old element found, should be gone")) + break; + } + } + + cond_resched_rcu(); + + found = false; + + rhl_for_each_entry_rcu(obj, pos, h, list_node) { + if (pos == &rhl_test_objects[i].list_node) { + found = true; + break; + } + } + + rcu_read_unlock(); + + if (WARN(!found, "element %d not found", i)) + break; + + err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); + WARN(err, "rhltable_remove: err %d for iteration %d\n", err, i); + if (err == 0) + clear_bit(i, obj_in_table); + } + + if (ret == 0 && err) + ret = err; + + for (i = 0; i < entries; i++) { + WARN(test_bit(i, obj_in_table), "elem %d allegedly still present", i); + + err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node, + test_rht_params); + if (WARN(err, "error %d on element %d\n", err, i)) + break; + if (err == 0) + set_bit(i, obj_in_table); + } + + pr_info("test %d random rhlist add/delete operations\n", entries); + for (j = 0; j < entries; j++) { + u32 i = prandom_u32_max(entries); + u32 prand = prandom_u32(); + + cond_resched(); + + if (prand == 0) + prand = prandom_u32(); + + if (prand & 1) { + prand >>= 1; + continue; + } + + err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); + if (test_bit(i, obj_in_table)) { + clear_bit(i, obj_in_table); + if (WARN(err, "cannot remove 
element at slot %d", i)) + continue; + } else { + if (WARN(err != -ENOENT, "removed non-existant element %d, error %d not %d", + i, err, -ENOENT)) + continue; + } + + if (prand & 1) { + prand >>= 1; + continue; + } + + err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); + if (err == 0) { + if (WARN(test_and_set_bit(i, obj_in_table), "succeeded to insert same object %d", i)) + continue; + } else { + if (WARN(!test_bit(i, obj_in_table), "failed to insert object %d", i)) + continue; + } + + if (prand & 1) { + prand >>= 1; + continue; + } + + i = prandom_u32_max(entries); + if (test_bit(i, obj_in_table)) { + err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); + WARN(err, "cannot remove element at slot %d", i); + if (err == 0) + clear_bit(i, obj_in_table); + } else { + err = rhltable_insert(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); + WARN(err, "failed to insert object %d", i); + if (err == 0) + set_bit(i, obj_in_table); + } + } + + for (i = 0; i < entries; i++) { + cond_resched(); + err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); + if (test_bit(i, obj_in_table)) { + if (WARN(err, "cannot remove element at slot %d", i)) + continue; + } else { + if (WARN(err != -ENOENT, "removed non-existant element, error %d not %d", + err, -ENOENT)) + continue; + } + } + + rhltable_destroy(&rhlt); +out_free: + vfree(rhl_test_objects); + vfree(obj_in_table); + return ret; +} static int __init test_rhashtable_max(struct test_obj *array, unsigned int entries) @@ -480,11 +666,17 @@ static int __init test_rht_init(void) failed_threads++; } } - pr_info("Started %d threads, %d failed\n", - started_threads, failed_threads); rhashtable_destroy(&ht); vfree(tdata); vfree(objs); + + /* + * rhltable_remove is very expensive, default values can cause test + * to run for 2 minutes or more, use a smaller number instead. + */ + err = test_rhltable(entries / 16); + pr_info("Started %d threads, %d failed, rhltable test returns %d\n", + started_threads, failed_threads, err); return 0; } From b5b7db8d680464b1d631fd016f5e093419f0bfd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 10:05:57 -0700 Subject: [PATCH 0288/1322] tcp: fastopen: fix on syn-data transmit failure Our recent change exposed a bug in TCP Fastopen Client that syzkaller found right away [1] When we prepare skb with SYN+DATA, we attempt to transmit it, and we update socket state as if the transmit was a success. In socket RTX queue we have two skbs, one with the SYN alone, and a second one containing the DATA. When (malicious) ACK comes in, we now complain that second one had no skb_mstamp. The proper fix is to make sure that if the transmit failed, we do not pretend we sent the DATA skb, and make it our send_head. When 3WHS completes, we can now send the DATA right away, without having to wait for a timeout. 
[1] WARNING: CPU: 0 PID: 100189 at net/ipv4/tcp_input.c:3117 tcp_clean_rtx_queue+0x2057/0x2ab0 net/ipv4/tcp_input.c:3117() WARN_ON_ONCE(last_ackt == 0); Modules linked in: CPU: 0 PID: 100189 Comm: syz-executor1 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff8800b35cb1d8 ffffffff81cad00d 0000000000000000 ffffffff828a4347 ffff88009f86c080 ffffffff8316eb20 0000000000000d7f ffff8800b35cb220 ffffffff812c33c2 ffff8800baad2440 00000009d46575c0 Call Trace: [] __dump_stack [] dump_stack+0xc1/0x124 [] warn_slowpath_common+0xe2/0x150 [] warn_slowpath_null+0x2e/0x40 [] tcp_clean_rtx_queue+0x2057/0x2ab0 n [] tcp_ack+0x151d/0x3930 [] tcp_rcv_state_process+0x1c69/0x4fd0 [] tcp_v4_do_rcv+0x54f/0x7c0 [] sk_backlog_rcv [] __release_sock+0x12b/0x3a0 [] release_sock+0x5e/0x1c0 [] inet_wait_for_connect [] __inet_stream_connect+0x545/0xc50 [] tcp_sendmsg_fastopen [] tcp_sendmsg+0x2298/0x35a0 [] inet_sendmsg+0xe5/0x520 [] sock_sendmsg_nosec [] sock_sendmsg+0xcf/0x110 Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Fixes: 783237e8daf1 ("net-tcp: Fast Open client - sending SYN-data") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 517d737059d1..0bc9e46a5369 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3389,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3441,6 +3445,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ From bd7d2106b63adfd0dfd08331344e356461c29d70 Mon Sep 17 00:00:00 2001 From: Jim Hanko Date: Tue, 19 Sep 2017 11:33:39 -0700 Subject: [PATCH 0289/1322] team: fall back to hash if table entry is empty If the hash to port mapping table does not have a valid port (i.e. when a port goes down), fall back to the simple hashing mechanism to avoid dropping packets. Signed-off-by: Jim Hanko Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/team/team_mode_loadbalance.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c index 1468ddf424cc..a5ef97010eb3 100644 --- a/drivers/net/team/team_mode_loadbalance.c +++ b/drivers/net/team/team_mode_loadbalance.c @@ -137,7 +137,13 @@ static struct team_port *lb_htpm_select_tx_port(struct team *team, struct sk_buff *skb, unsigned char hash) { - return rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash)); + struct team_port *port; + + port = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash)); + if (likely(port)) + return port; + /* If no valid port in the table, fall back to simple hash */ + return lb_hash_select_tx_port(team, lb_priv, skb, hash); } struct lb_select_tx_port { From f55956065ec94e3e9371463d693a1029c4cc3007 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 19 Sep 2017 19:35:18 +0200 Subject: [PATCH 0290/1322] net: emac: Fix napi poll list corruption This patch is pretty much a carbon copy of commit 3079c652141f ("caif: Fix napi poll list corruption") with "caif" replaced by "emac". The commit d75b1ade567f ("net: less interrupt masking in NAPI") breaks emac. It is now required that if the entire budget is consumed when poll returns, the napi poll_list must remain empty. However, like some other drivers emac tries to do a last-ditch check and if there is more work it will call napi_reschedule and then immediately process some of this new work. Should the entire budget be consumed while processing such new work then we will violate the new caller contract. This patch fixes this by not touching any work when we reschedule in emac. Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/mal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index 2c74baa2398a..fff09dcf9e34 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -402,7 +402,7 @@ static int mal_poll(struct napi_struct *napi, int budget) unsigned long flags; MAL_DBG2(mal, "poll(%d)" NL, budget); - again: + /* Process TX skbs */ list_for_each(l, &mal->poll_list) { struct mal_commac *mc = @@ -451,7 +451,6 @@ static int mal_poll(struct napi_struct *napi, int budget) spin_lock_irqsave(&mal->lock, flags); mal_disable_eob_irq(mal); spin_unlock_irqrestore(&mal->lock, flags); - goto again; } mc->ops->poll_tx(mc->dev); } From 752fbcc33405d6f8249465e4b2c4e420091bb825 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 19 Sep 2017 13:15:42 -0700 Subject: [PATCH 0291/1322] net_sched: no need to free qdisc in RCU callback gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), the caller no longer needs to wait for a grace period. So this patch gets rid of it. Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 1 - net/sched/sch_generic.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 135f5a2dd931..684d8ed27eaa 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -93,7 +93,6 @@ struct Qdisc { unsigned long state; struct Qdisc *next_sched; struct sk_buff *skb_bad_txq; - struct rcu_head rcu_head; int padded; refcount_t refcnt; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..1fb0c754b7fd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -688,10 +688,8 @@ void qdisc_reset(struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_rcu_free(struct rcu_head *head) +static void qdisc_free(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); @@ -724,11 +722,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(qdisc->gso_skb); kfree_skb(qdisc->skb_bad_txq); - /* - * gen_estimator est_timer() might access qdisc->q.lock, - * wait a RCU grace period before freeing qdisc. - */ - call_rcu(&qdisc->rcu_head, qdisc_rcu_free); + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); From 16dff336b33d87c15d9cbe933cfd275aae2a8251 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:03 -0700 Subject: [PATCH 0292/1322] kobject: add kobject_uevent_net_broadcast() This removes some #ifdef pollution and will ease follow up patches. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- lib/kobject_uevent.c | 96 ++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 43 deletions(-) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index e590523ea476..4f48cc3b11d5 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -294,6 +294,57 @@ static void cleanup_uevent_env(struct subprocess_info *info) } #endif +static int kobject_uevent_net_broadcast(struct kobject *kobj, + struct kobj_uevent_env *env, + const char *action_string, + const char *devpath) +{ + int retval = 0; +#if defined(CONFIG_NET) + struct uevent_sock *ue_sk; + + /* send netlink message */ + list_for_each_entry(ue_sk, &uevent_sock_list, list) { + struct sock *uevent_sock = ue_sk->sk; + struct sk_buff *skb; + size_t len; + + if (!netlink_has_listeners(uevent_sock, 1)) + continue; + + /* allocate message with the maximum possible size */ + len = strlen(action_string) + strlen(devpath) + 2; + skb = alloc_skb(len + env->buflen, GFP_KERNEL); + if (skb) { + char *scratch; + int i; + + /* add header */ + scratch = skb_put(skb, len); + sprintf(scratch, "%s@%s", action_string, devpath); + + /* copy keys to our continuous event payload buffer */ + for (i = 0; i < env->envp_idx; i++) { + len = strlen(env->envp[i]) + 1; + scratch = skb_put(skb, len); + strcpy(scratch, env->envp[i]); + } + + NETLINK_CB(skb).dst_group = 1; + retval = netlink_broadcast_filtered(uevent_sock, skb, + 0, 1, GFP_KERNEL, + kobj_bcast_filter, + kobj); + /* ENOBUFS should be handled in userspace */ + if (retval == -ENOBUFS || retval == -ESRCH) + retval = 0; + } else + retval = -ENOMEM; + } +#endif + return retval; +} + /** * kobject_uevent_env - send an uevent with environmental data * @@ -316,9 +367,6 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, const struct kset_uevent_ops *uevent_ops; int i = 0; int retval = 0; -#ifdef CONFIG_NET - struct 
uevent_sock *ue_sk; -#endif pr_debug("kobject: '%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); @@ -427,46 +475,8 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, mutex_unlock(&uevent_sock_mutex); goto exit; } - -#if defined(CONFIG_NET) - /* send netlink message */ - list_for_each_entry(ue_sk, &uevent_sock_list, list) { - struct sock *uevent_sock = ue_sk->sk; - struct sk_buff *skb; - size_t len; - - if (!netlink_has_listeners(uevent_sock, 1)) - continue; - - /* allocate message with the maximum possible size */ - len = strlen(action_string) + strlen(devpath) + 2; - skb = alloc_skb(len + env->buflen, GFP_KERNEL); - if (skb) { - char *scratch; - - /* add header */ - scratch = skb_put(skb, len); - sprintf(scratch, "%s@%s", action_string, devpath); - - /* copy keys to our continuous event payload buffer */ - for (i = 0; i < env->envp_idx; i++) { - len = strlen(env->envp[i]) + 1; - scratch = skb_put(skb, len); - strcpy(scratch, env->envp[i]); - } - - NETLINK_CB(skb).dst_group = 1; - retval = netlink_broadcast_filtered(uevent_sock, skb, - 0, 1, GFP_KERNEL, - kobj_bcast_filter, - kobj); - /* ENOBUFS should be handled in userspace */ - if (retval == -ENOBUFS || retval == -ESRCH) - retval = 0; - } else - retval = -ENOMEM; - } -#endif + retval = kobject_uevent_net_broadcast(kobj, env, action_string, + devpath); mutex_unlock(&uevent_sock_mutex); #ifdef CONFIG_UEVENT_HELPER From 4a336a23d619e96aef37d4d054cfadcdd1b581ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:04 -0700 Subject: [PATCH 0293/1322] kobject: copy env blob in one go No need to iterate over strings, just copy in one efficient memcpy() call. Tested: time perf record "(for f in `seq 1 3000` ; do ip netns add tast$f; done)" [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 8.224 MB perf.data (~359301 samples) ] real 0m52.554s # instead of 1m7.492s user 0m0.309s sys 0m51.375s # instead of 1m6.875s 9.88% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 8.86% ip [kernel.kallsyms] [k] string 7.37% ip [kernel.kallsyms] [k] __ip6addrlbl_add 5.68% ip [kernel.kallsyms] [k] netlink_has_listeners 5.52% ip [kernel.kallsyms] [k] memcpy_erms 4.76% ip [kernel.kallsyms] [k] __alloc_skb 4.54% ip [kernel.kallsyms] [k] vsnprintf 3.94% ip [kernel.kallsyms] [k] format_decode 3.80% ip [kernel.kallsyms] [k] kmem_cache_alloc_node_trace 3.71% ip [kernel.kallsyms] [k] kmem_cache_alloc_node 3.66% ip [kernel.kallsyms] [k] kobject_uevent_env 3.38% ip [kernel.kallsyms] [k] strlen 2.65% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 2.20% ip [kernel.kallsyms] [k] kfree 2.09% ip [kernel.kallsyms] [k] memset_erms 2.07% ip [kernel.kallsyms] [k] ___cache_free 1.95% ip [kernel.kallsyms] [k] kmem_cache_free 1.91% ip [kernel.kallsyms] [k] _raw_read_lock 1.45% ip [kernel.kallsyms] [k] ksize 1.25% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 1.00% ip [kernel.kallsyms] [k] widen_string Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- lib/kobject_uevent.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 4f48cc3b11d5..78b2a7e378c0 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -317,18 +317,12 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj, skb = alloc_skb(len + env->buflen, GFP_KERNEL); if (skb) { char *scratch; - int i; /* add header */ scratch = skb_put(skb, len); sprintf(scratch, "%s@%s", action_string, devpath); - /* copy keys to our continuous event payload buffer */ - for (i = 0; i < env->envp_idx; i++) { - len = strlen(env->envp[i]) + 1; - scratch = skb_put(skb, len); - strcpy(scratch, env->envp[i]); - } + skb_put_data(skb, env->buf, env->buflen); NETLINK_CB(skb).dst_group = 1; retval = netlink_broadcast_filtered(uevent_sock, skb, From d464e84eed02993d40ad55fdc19f4523e4deee5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:05 -0700 Subject: [PATCH 0294/1322] kobject: factorize skb setup in kobject_uevent_net_broadcast() We can build one skb and let it be cloned in netlink. This is much faster, and use less memory (all clones will share the same skb->head) Tested: time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 4.110 MB perf.data (~179584 samples) ] real 0m24.227s # instead of 0m52.554s user 0m0.329s sys 0m23.753s # instead of 0m51.375s 14.77% ip [kernel.kallsyms] [k] __ip6addrlbl_add 14.56% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 11.65% ip [kernel.kallsyms] [k] netlink_has_listeners 6.19% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.66% ip [kernel.kallsyms] [k] kobject_uevent_env 4.97% ip [kernel.kallsyms] [k] memset_erms 4.67% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.41% ip [kernel.kallsyms] [k] _raw_read_lock 3.59% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.13% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 1.55% ip [kernel.kallsyms] [k] __wake_up 1.20% ip [kernel.kallsyms] [k] strlen 1.03% ip [kernel.kallsyms] [k] __wake_up_common 0.93% ip [kernel.kallsyms] [k] consume_skb 0.92% ip [kernel.kallsyms] [k] netlink_trim 0.87% ip [kernel.kallsyms] [k] insert_header 0.63% ip [kernel.kallsyms] [k] unmap_page_range Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- lib/kobject_uevent.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 78b2a7e378c0..147db91c10d0 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -301,23 +301,26 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj, { int retval = 0; #if defined(CONFIG_NET) + struct sk_buff *skb = NULL; struct uevent_sock *ue_sk; /* send netlink message */ list_for_each_entry(ue_sk, &uevent_sock_list, list) { struct sock *uevent_sock = ue_sk->sk; - struct sk_buff *skb; - size_t len; if (!netlink_has_listeners(uevent_sock, 1)) continue; - /* allocate message with the maximum possible size */ - len = strlen(action_string) + strlen(devpath) + 2; - skb = alloc_skb(len + env->buflen, GFP_KERNEL); - if (skb) { + if (!skb) { + /* allocate message with the maximum possible size */ + size_t len = strlen(action_string) + strlen(devpath) + 2; char *scratch; + retval = -ENOMEM; + skb = alloc_skb(len + env->buflen, GFP_KERNEL); + if (!skb) + continue; + /* add header */ scratch = skb_put(skb, len); sprintf(scratch, "%s@%s", action_string, devpath); @@ -325,16 +328,17 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj, skb_put_data(skb, env->buf, env->buflen); NETLINK_CB(skb).dst_group = 1; - retval = netlink_broadcast_filtered(uevent_sock, skb, - 0, 1, GFP_KERNEL, - kobj_bcast_filter, - kobj); - /* ENOBUFS should be handled in userspace */ - if (retval == -ENOBUFS || retval == -ESRCH) - retval = 0; - } else - retval = -ENOMEM; + } + + retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb), + 0, 1, GFP_KERNEL, + kobj_bcast_filter, + kobj); + /* ENOBUFS should be handled in userspace */ + if (retval == -ENOBUFS || retval == -ESRCH) + retval = 0; } + consume_skb(skb); #endif return retval; } From a90c9347e90ed1e9323d71402ed18023bc910cd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:06 -0700 Subject: [PATCH 0295/1322] ipv6: addrlabel: per netns list Having a global list of labels do not scale to thousands of netns in the cloud era. This causes quadratic behavior on netns creation and deletion. This is time having a per netns list of ~10 labels. Tested: $ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ] real 0m20.837s # instead of 0m24.227s user 0m0.328s sys 0m20.338s # instead of 0m23.753s 16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 12.30% ip [kernel.kallsyms] [k] netlink_has_listeners 6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.78% ip [kernel.kallsyms] [k] memset_erms 5.77% ip [kernel.kallsyms] [k] kobject_uevent_env 5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.96% ip [kernel.kallsyms] [k] _raw_read_lock 3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 2.11% ip [kernel.kallsyms] [k] unmap_page_range 1.77% ip [kernel.kallsyms] [k] __wake_up 1.69% ip [kernel.kallsyms] [k] strlen 1.17% ip [kernel.kallsyms] [k] __wake_up_common 1.09% ip [kernel.kallsyms] [k] insert_header 1.04% ip [kernel.kallsyms] [k] page_remove_rmap 1.01% ip [kernel.kallsyms] [k] consume_skb 0.98% ip [kernel.kallsyms] [k] netlink_trim 0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling 0.51% ip [kernel.kallsyms] [k] filemap_map_pages 0.46% ip [kernel.kallsyms] [k] memcpy_erms Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv6.h | 5 +++ net/ipv6/addrlabel.c | 81 +++++++++++++++------------------------- 2 files changed, 35 insertions(+), 51 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2544f9760a42..2ea1ed341ef8 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -89,6 +89,11 @@ struct netns_ipv6 { atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; + struct { + struct hlist_head head; + spinlock_t lock; + u32 seq; + } ip6addrlbl_table; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b055bc79f56d..c6311d7108f6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -30,7 +30,6 @@ * Policy Table */ struct ip6addrlbl_entry { - possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -41,19 +40,6 @@ struct ip6addrlbl_entry { struct rcu_head rcu; }; -static struct ip6addrlbl_table -{ - struct hlist_head head; - spinlock_t lock; - u32 seq; -} ip6addrlbl_table; - -static inline -struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) -{ - return read_pnet(&lbl->lbl_net); -} - /* * Default policy table (RFC6724 + extensions) * @@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static bool __ip6addrlbl_match(struct net *net, - const struct ip6addrlbl_entry *p, +static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, int addrtype, int ifindex) { - if (!net_eq(ip6addrlbl_net(p), net)) - return false; if (p->ifindex && p->ifindex != ifindex) return false; if (p->addrtype && p->addrtype != addrtype) @@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, int type, int ifindex) { struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(p, addr, type, ifindex)) return p; } return NULL; @@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net, } /* allocate one entry */ -static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, - const struct in6_addr *prefix, +static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { @@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - write_pnet(&newp->lbl_net, net); refcount_set(&newp->refcnt, 1); return newp; } /* add a label */ -static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, + int replace) { - struct hlist_node *n; struct ip6addrlbl_entry *last = NULL, *p = NULL; + struct hlist_node *n; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, replace); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && p->ifindex == newp->ifindex && ipv6_addr_equal(&p->prefix, &newp->prefix)) { if (!replace) { @@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) if (last) hlist_add_behind_rcu(&newp->list, &last->list); else - 
hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - ip6addrlbl_table.seq++; + net->ipv6.ip6addrlbl_table.seq++; return ret; } @@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net, __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); - newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); - spin_lock(&ip6addrlbl_table.lock); - ret = __ip6addrlbl_add(newp, replace); - spin_unlock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(net, newp, replace); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) ip6addrlbl_free(newp); return ret; @@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net, ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && - net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); @@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net, __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); - spin_lock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); return ret; } @@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net) ADDRLABEL(KERN_DEBUG "%s\n", __func__); + spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); + INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { int ret = ip6addrlbl_add(net, ip6addrlbl_init_table[i].prefix, @@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) struct hlist_node *n; /* Remove all labels belonging to the exiting net */ - spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { - if (net_eq(ip6addrlbl_net(p), net)) { - hlist_del_rcu(&p->list); - ip6addrlbl_put(p); - } + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); } - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } static struct pernet_operations ipv6_addr_label_ops = { @@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = { int __init ipv6_addr_label_init(void) { - spin_lock_init(&ip6addrlbl_table.lock); - return register_pernet_subsys(&ipv6_addr_label_ops); } @@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (idx >= s_idx && - net_eq(ip6addrlbl_net(p), net)) { + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, p = __ipv6_addr_label(net, addr, 
ipv6_addr_type(addr), ifal->ifal_index); if (p && !ip6addrlbl_hold(p)) p = NULL; - lseq = ip6addrlbl_table.seq; + lseq = net->ipv6.ip6addrlbl_table.seq; rcu_read_unlock(); if (!p) { From 789e6ddb0b2fb5d5024b760b178a47876e4de7a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:07 -0700 Subject: [PATCH 0296/1322] tcp: batch tcp_net_metrics_exit When dealing with a list of dismantling netns, we can scan tcp_metrics once, saving cpu cycles. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 102b2c90bb80..0ab78abc811b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -892,10 +892,14 @@ static void tcp_metrics_flush_all(struct net *net) for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; + bool match; + spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { - if (net_eq(tm_net(tm), net)) { + match = net ? net_eq(tm_net(tm), net) : + !atomic_read(&tm_net(tm)->count); + if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); } else { @@ -1018,14 +1022,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) return 0; } -static void __net_exit tcp_net_metrics_exit(struct net *net) +static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { - tcp_metrics_flush_all(net); + tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, - .exit = tcp_net_metrics_exit, + .init = tcp_net_metrics_init, + .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) From bb401caefe9d2c65e0c0fa23b21deecfbfa473fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:08 -0700 Subject: [PATCH 0297/1322] ipv6: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 110 267 5504 1 2 : tunables 8 4 0 : slabdata 110 267 0 real 3m25.292s user 0m0.644s sys 0m40.153s After patch: $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_tunnel.c | 20 +++++++++++--------- net/ipv6/ip6_vti.c | 23 ++++++++++++++--------- net/ipv6/sit.c | 9 ++++++--- 4 files changed, 36 insertions(+), 24 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..c82d41ef25e2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1155,19 +1155,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_net(struct net *net) +static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) { + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip6gre_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + ip6gre_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit = ip6gre_exit_net, + .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..3d6df489b39f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2167,17 +2167,16 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; - LIST_HEAD(list); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) - unregister_netdevice_queue(dev, &list); + unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); @@ -2186,12 +2185,10 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } - - unregister_netdevice_many(&list); } static int __net_init ip6_tnl_init_net(struct net *net) @@ -2235,16 +2232,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_net(struct net *net) +static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) { + struct net *net; + LIST_HEAD(list); + rtnl_lock(); - ip6_tnl_destroy_tunnels(net); + list_for_each_entry(net, net_list, exit_list) + ip6_tnl_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit = ip6_tnl_exit_net, + .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..714914d1bb98 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1052,23 +1052,22 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, + struct list_head *list) { int h; struct ip6_tnl *t; - LIST_HEAD(list); for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); - unregister_netdevice_queue(t->dev, &list); - unregister_netdevice_many(&list); + unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) @@ -1108,18 +1107,24 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_net(struct net *net) +static void __net_exit vti6_exit_batch_net(struct list_head *net_list) { - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + struct net *net; + LIST_HEAD(list); rtnl_lock(); - vti6_destroy_tunnels(ip6n); + list_for_each_entry(net, net_list, exit_list) { + ip6n = net_generic(net, vti6_net_id); + vti6_destroy_tunnels(ip6n, &list); + } + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit = vti6_exit_net, + .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ac912bb21747..a799f5258614 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1848,19 +1848,22 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_net(struct net *net) +static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); + struct net *net; rtnl_lock(); - sit_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + sit_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit = sit_exit_net, + .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; From 64bc17811b72758753e2b64cd8f2a63812c61fe1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:09 -0700 Subject: [PATCH 0298/1322] ipv4: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... 
rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s After patch: $ time ./add_del_unshare.sh net_namespace 135 291 5504 1 2 : tunables 8 4 0 : slabdata 135 291 0 real 0m22.117s user 0m0.728s sys 0m35.328s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 22 +++++++++------------- net/ipv4/ip_tunnel.c | 12 +++++++++--- net/ipv4/ip_vti.c | 7 +++---- net/ipv4/ipip.c | 7 +++---- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 992652856fe8..b41a1e057fce 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -258,7 +258,8 @@ int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); +void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, + struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..9cee986ac6b8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1013,15 +1013,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1540,15 +1539,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1559,16 +1557,14 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_net(struct net *net) +static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct 
ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..02d70ca99db1 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -452,15 +452,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fb1ad22b5e29..1e47818e38c7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -634,15 +634,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; From 7c30013133964aaa2f45c17d6e9782ac6cfd7f5f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Sep 2017 00:44:21 +0200 Subject: [PATCH 0299/1322] bpf: fix ri->map_owner pointer on bpf_prog_realloc Commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") passed the pointer to the prog itself to be loaded into r4 prior on bpf_redirect_map() helper call, so that we can store the owner into ri->map_owner out of the helper. Issue with that is that the actual address of the prog is still subject to change when subsequent rewrites occur that require slow path in bpf_prog_realloc() to alloc more memory, e.g. from patching inlining helper functions or constant blinding. Thus, we really need to take prog->aux as the address we're holding, which also works with prog clones as they share the same aux object. Instead of then fetching aux->prog during runtime, which could potentially incur cache misses due to false sharing, we are going to just use aux for comparison on the map owner. 
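Concretely, the ownership test boils down to the small helper this patch adds (pulled out of the flattened diff below for readability); the verifier side now loads (unsigned long)prog->aux into r4 instead of the prog pointer:

	static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
					   unsigned long aux)
	{
		/* ri->map_owner stored (unsigned long)prog->aux at redirect time;
		 * aux is stable across bpf_prog_realloc() and shared by clones.
		 */
		return (unsigned long)xdp_prog->aux != aux;
	}
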
This will also keep the patchlet of the same size, and later check in xdp_map_invalid() only accesses read-only aux pointer from the prog, it's also in the same cacheline already from prior access when calling bpf_func. Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 7 ++++++- net/core/filter.c | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 799b2451ef2d..b914fbe1383e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4205,7 +4205,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } if (insn->imm == BPF_FUNC_redirect_map) { - u64 addr = (unsigned long)prog; + /* Note, we cannot use prog directly as imm as subsequent + * rewrites would still change the prog pointer. The only + * stable address we can use is aux, which also works with + * prog clones during blinding. + */ + u64 addr = (unsigned long)prog->aux; struct bpf_insn r4_ld[] = { BPF_LD_IMM64(BPF_REG_4, addr), *insn, diff --git a/net/core/filter.c b/net/core/filter.c index 24dd33dd9f04..82edad58d066 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,7 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - const struct bpf_prog *map_owner; + unsigned long map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2500,11 +2500,17 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, + unsigned long aux) +{ + return (unsigned long)xdp_prog->aux != aux; +} + static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2512,9 +2518,9 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2574,7 +2580,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2583,10 +2589,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; if (map) { - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2632,7 +2638,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; return XDP_REDIRECT; } @@ -2646,7 +2652,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { }; BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - const struct bpf_prog *, map_owner) + unsigned long, 
map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); From 6819a14ecbe2e089e5c5bb74edecafdde2028a00 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 4 Sep 2017 15:52:55 +0100 Subject: [PATCH 0300/1322] net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") incorrectly assumes that no RTM_NEWADDR are sent for addresses in tentative state, as this does happen for the standard IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in addconf_dad_stop(). So as a result of this change, no RTM_DELADDR is sent after DAD failure for a link-local when strict DAD (accept_dad=2) is configured, or on the next admin down in other cases. The absence of this notification breaks backwards compatibility and causes problems after DAD failure if this notification was being relied on. The solution is to allow RTM_DELADDR to still be sent after DAD failure. Fixes: f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") Signed-off-by: Mike Manning Cc: Mahesh Bandewar Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78787ec..d7dbcc8eda10 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4940,9 +4940,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); From 35e015e1f5773417952fe91ce8790baf9b4237a2 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 12 Sep 2017 17:46:37 +0200 Subject: [PATCH 0301/1322] ipv6: fix net.ipv6.conf.all interface DAD handlers Currently, writing into net.ipv6.conf.all.{accept_dad,use_optimistic,optimistic_dad} has no effect. Fix handling of these flags by: - using the maximum of global and per-interface values for the accept_dad flag. That is, if at least one of the two values is non-zero, enable DAD on the interface. If at least one value is set to 2, enable DAD and disable IPv6 operation on the interface if MAC-based link-local address was found - using the logical OR of global and per-interface values for the optimistic_dad flag. If at least one of them is set to one, optimistic duplicate address detection (RFC 4429) is enabled on the interface - using the logical OR of global and per-interface values for the use_optimistic flag. If at least one of them is set to one, optimistic addresses won't be marked as deprecated during source address selection on the interface. While at it, as we're modifying the prototype for ipv6_use_optimistic_addr(), drop inline, and let the compiler decide. Fixes: 7fd2561e4ebd ("net: ipv6: Add a sysctl to make optimistic addresses useful candidates") Signed-off-by: Matteo Croce Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 18 +++++++++++++---- net/ipv6/addrconf.c | 27 +++++++++++++++++++------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b3345d0fe0a6..77f4de59dc9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1680,6 +1680,9 @@ accept_dad - INTEGER 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate link-local address has been found. + DAD operation and mode on a given interface will be selected according + to the maximum value of conf/{all,interface}/accept_dad. + force_tllao - BOOLEAN Enable sending the target link-layer address option even when responding to a unicast neighbor solicitation. @@ -1727,16 +1730,23 @@ suppress_frag_ndisc - INTEGER optimistic_dad - BOOLEAN Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + Optimistic Duplicate Address Detection for the interface will be enabled + if at least one of conf/{all,interface}/optimistic_dad is set to 1, + it will be disabled otherwise. use_optimistic - BOOLEAN If enabled, do not classify optimistic addresses as deprecated during source address selection. Preferred addresses will still be chosen before optimistic addresses, subject to other ranking in the source address selection algorithm. - 0: disabled (default) - 1: enabled + 0: disabled (default) + 1: enabled + + This will be enabled if at least one of + conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. stable_secret - IPv6 address This IPv6 address will be used as a secret to generate IPv6 diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d7dbcc8eda10..96861c702c06 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3051,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && 
!dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3810,6 +3820,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3841,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3897,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; From 23586b66d84ba3184b8820277f3fc42761640f87 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 18 Sep 2017 18:18:45 -0500 Subject: [PATCH 0302/1322] Fix SMB3.1.1 guest authentication to Samba Samba rejects SMB3.1.1 dialect (vers=3.1.1) negotiate requests from the kernel client due to the two byte pad at the end of the negotiate contexts. CC: Stable Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5c16591a128e..b0eaebe627e9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -439,7 +439,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req) build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ } #else From 04fc52fb222d35e1f7a0d5d85b19a676ea1e10e8 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Tue, 5 Sep 2017 14:23:02 +0200 Subject: [PATCH 0303/1322] drm/exynos/hdmi: Fix unsafe list iteration Function hdmi_mode_fixup() used bare list_for_each entry, which was unsafe and caused memory corruption detected by kasan. It now uses drm_for_each_connector_iter macro, which is now recommended by the documentation and safe. 
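The safe pattern it switches to looks roughly like this (a generic sketch of the drm_connector_list_iter API, not the exact driver hunk):

	struct drm_connector_list_iter conn_iter;
	struct drm_connector *connector, *found = NULL;

	drm_connector_list_iter_begin(dev, &conn_iter);
	drm_for_each_connector_iter(connector, &conn_iter) {
		if (connector->encoder == encoder) {
			found = connector;
			drm_connector_get(found);	/* hold a ref past the iterator */
			break;
		}
	}
	drm_connector_list_iter_end(&conn_iter);

	if (found) {
		/* ... use the connector safely here ... */
		drm_connector_put(found);
	}
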
Signed-off-by: Maciej Purski Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_hdmi.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_hdmi.c b/drivers/gpu/drm/exynos/exynos_hdmi.c index 214fa5e51963..0109ff40b1db 100644 --- a/drivers/gpu/drm/exynos/exynos_hdmi.c +++ b/drivers/gpu/drm/exynos/exynos_hdmi.c @@ -944,22 +944,27 @@ static bool hdmi_mode_fixup(struct drm_encoder *encoder, struct drm_device *dev = encoder->dev; struct drm_connector *connector; struct drm_display_mode *m; + struct drm_connector_list_iter conn_iter; int mode_ok; drm_mode_set_crtcinfo(adjusted_mode, 0); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &conn_iter); + drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder == encoder) break; } + if (connector) + drm_connector_get(connector); + drm_connector_list_iter_end(&conn_iter); - if (connector->encoder != encoder) + if (!connector) return true; mode_ok = hdmi_mode_valid(connector, adjusted_mode); if (mode_ok == MODE_OK) - return true; + goto cleanup; /* * Find the most suitable mode and copy it to adjusted_mode. @@ -979,6 +984,9 @@ static bool hdmi_mode_fixup(struct drm_encoder *encoder, } } +cleanup: + drm_connector_put(connector); + return true; } From 8632ec8cdcaead67c019f1ba36760d1bdd0c5d23 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 23 Aug 2017 15:37:53 +1000 Subject: [PATCH 0304/1322] powerpc/configs: Update for CONFIG_SND changes Commit eb3b705aaed9 ("ALSA: Make CONFIG_SND_OSSEMUL user-selectable") means we need to set CONFIG_SND_OSSEMUL in our configs, otherwise we lose some of the SND symbols. And commit 0181307abc1d ("ALSA: seq: Reorganize kconfig and build") reorganised things, which causes the churn. 
Signed-off-by: Michael Ellerman --- arch/powerpc/configs/g5_defconfig | 5 +++-- arch/powerpc/configs/gamecube_defconfig | 5 +++-- arch/powerpc/configs/pasemi_defconfig | 3 ++- arch/powerpc/configs/pmac32_defconfig | 7 ++++--- arch/powerpc/configs/ppc64_defconfig | 7 ++++--- arch/powerpc/configs/ppc64e_defconfig | 7 ++++--- arch/powerpc/configs/ppc6xx_defconfig | 7 ++++--- arch/powerpc/configs/wii_defconfig | 5 +++-- 8 files changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index e084fa548d73..063817fee61c 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -138,10 +138,11 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_POWERMAC=m CONFIG_SND_AOA=m CONFIG_SND_AOA_FABRIC_LAYOUT=m diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig index 79bbc8238b32..805b0f87653c 100644 --- a/arch/powerpc/configs/gamecube_defconfig +++ b/arch/powerpc/configs/gamecube_defconfig @@ -64,11 +64,12 @@ CONFIG_LOGO=y # CONFIG_LOGO_LINUX_CLUT224 is not set CONFIG_SOUND=y CONFIG_SND=y -CONFIG_SND_SEQUENCER=y +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=y CONFIG_SND_PCM_OSS=y -CONFIG_SND_SEQUENCER_OSS=y # CONFIG_SND_VERBOSE_PROCFS is not set +CONFIG_SND_SEQUENCER=y +CONFIG_SND_SEQUENCER_OSS=y # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_GENERIC=y diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index 8cf4a46bef86..6daa56f8895c 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -115,9 +115,10 @@ CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y CONFIG_SOUND=y CONFIG_SND=y -CONFIG_SND_SEQUENCER=y +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=y CONFIG_SND_PCM_OSS=y +CONFIG_SND_SEQUENCER=y CONFIG_SND_SEQUENCER_OSS=y CONFIG_SND_USB_AUDIO=y CONFIG_SND_USB_USX2Y=y diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 8e798b1fbc99..1aab9a62a681 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -227,11 +227,12 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_DUMMY=m CONFIG_SND_POWERMAC=m CONFIG_SND_AOA=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 791db775a09c..6ddca80c52c3 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -222,11 +222,12 @@ CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_POWERMAC=m CONFIG_SND_AOA=m CONFIG_SND_AOA_FABRIC_LAYOUT=m diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig index d0fe0f8f77c2..41d85cb3c9a2 100644 --- a/arch/powerpc/configs/ppc64e_defconfig +++ b/arch/powerpc/configs/ppc64e_defconfig @@ -141,11 +141,12 @@ 
CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_HID_DRAGONRISE=y CONFIG_HID_GYRATION=y CONFIG_HID_TWINHAN=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ae6eba482d75..da0e8d535eb8 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -789,17 +789,18 @@ CONFIG_LOGO=y # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_SOUND=m CONFIG_SND=m -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=m CONFIG_SND_PCM_OSS=m -CONFIG_SND_SEQUENCER_OSS=y CONFIG_SND_DYNAMIC_MINORS=y # CONFIG_SND_SUPPORT_OLD_API is not set CONFIG_SND_VERBOSE_PRINTK=y CONFIG_SND_DEBUG=y CONFIG_SND_DEBUG_VERBOSE=y CONFIG_SND_PCM_XRUN_DEBUG=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m CONFIG_SND_DUMMY=m CONFIG_SND_VIRMIDI=m CONFIG_SND_MTPAV=m diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index aef41b17a8bc..9c7400a19e9d 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -79,11 +79,12 @@ CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_SOUND=y CONFIG_SND=y -CONFIG_SND_SEQUENCER=y +CONFIG_SND_OSSEMUL=y CONFIG_SND_MIXER_OSS=y CONFIG_SND_PCM_OSS=y -CONFIG_SND_SEQUENCER_OSS=y # CONFIG_SND_VERBOSE_PROCFS is not set +CONFIG_SND_SEQUENCER=y +CONFIG_SND_SEQUENCER_OSS=y CONFIG_HID_APPLE=m CONFIG_HID_WACOM=m CONFIG_MMC=y From 4917fcb58cc73f6b81455e3c5f960144809ddf1a Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 19 Sep 2017 11:47:06 +0530 Subject: [PATCH 0305/1322] powerpc/sysrq: Fix oops when ppmu is not registered Kernel crashes if power pmu is not registered and user tries to dump regs with 'echo p > /proc/sysrq-trigger'. Sample log: Unable to handle kernel paging request for data at address 0x00000008 Faulting instruction address: 0xc0000000000d52f0 NIP [c0000000000d52f0] perf_event_print_debug+0x10/0x230 LR [c00000000058a938] sysrq_handle_showregs+0x38/0x50 Call Trace: printk+0x38/0x4c (unreliable) __handle_sysrq+0xe4/0x270 write_sysrq_trigger+0x64/0x80 proc_reg_write+0x80/0xd0 __vfs_write+0x40/0x200 vfs_write+0xc8/0x240 SyS_write+0x60/0x110 system_call+0x58/0x6c Fixes: 5f6d0380c640 ("powerpc/perf: Define perf_event_print_debug() to print PMU register values") Signed-off-by: Ravi Bangoria Reviewed-by: Kamalesh Babulal Signed-off-by: Michael Ellerman --- arch/powerpc/perf/core-book3s.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 2e3eb7431571..9e3da168d54c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -793,6 +793,11 @@ void perf_event_print_debug(void) u32 pmcs[MAX_HWEVENTS]; int i; + if (!ppmu) { + pr_info("Performance monitor hardware not registered.\n"); + return; + } + if (!ppmu->n_counter) return; From c1fa0768a8713b135848f78fd43ffc208d8ded70 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 13 Sep 2017 22:13:48 -0400 Subject: [PATCH 0306/1322] powerpc/tm: Flush TM only if CPU has TM feature Commit cd63f3c ("powerpc/tm: Fix saving of TM SPRs in core dump") added code to access TM SPRs in flush_tmregs_to_thread().
However, flush_tmregs_to_thread() does not check if the TM feature is available on the CPU before trying to access TM SPRs in order to copy live state to thread structures. flush_tmregs_to_thread() is indeed guarded by CONFIG_PPC_TRANSACTIONAL_MEM, but the kernel might have been compiled with CONFIG_PPC_TRANSACTIONAL_MEM enabled and then run on a CPU without the TM feature, in which case the TM instructions it executes are treated by the CPU as illegal instructions. The fix is simply to check in flush_tmregs_to_thread() whether the CPU has the TM feature before accessing any TM-specific resource, returning immediately if TM is not available on the CPU. Adding that check in flush_tmregs_to_thread() instead of in the places where it is called, like vsr_get() and vsr_set(), is better because it avoids the same problem cropping up elsewhere. Cc: stable@vger.kernel.org # v4.13+ Fixes: cd63f3c ("powerpc/tm: Fix saving of TM SPRs in core dump") Signed-off-by: Gustavo Romero Reviewed-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 07cd22e35405..f52ad5bb7109 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -131,7 +131,7 @@ static void flush_tmregs_to_thread(struct task_struct *tsk) * in the appropriate thread structures from live. */ - if (tsk != current) + if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) return; if (MSR_TM_SUSPENDED(mfmsr())) { From ad47ff3e33503e0969db2d4f9a40942aa6414598 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 19 Sep 2017 20:45:52 +1000 Subject: [PATCH 0307/1322] powerpc/sstep: Fix issues with set_cr0() set_cr0() broke when we changed analyse_instr() to not modify the register state. Instead of looking at regs->gpr[x] which has not been updated yet, we need to look at op->val. Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index fb9f58b868e7..9d72e5900320 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -944,9 +944,9 @@ NOKPROBE_SYMBOL(emulate_dcbz); : "r" (addr), "i" (-EFAULT), "0" (err)) static nokprobe_inline void set_cr0(const struct pt_regs *regs, - struct instruction_op *op, int rd) + struct instruction_op *op) { - long val = regs->gpr[rd]; + long val = op->val; op->type |= SETCC; op->ccval = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); @@ -1326,7 +1326,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 13: /* addic. */ imm = (short) instr; add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0); - set_cr0(regs, op, rd); + set_cr0(regs, op); return 1; case 14: /* addi */ @@ -1397,13 +1397,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 28: /* andi. */ op->val = regs->gpr[rd] & (unsigned short) instr; - set_cr0(regs, op, ra); + set_cr0(regs, op); goto logical_done_nocc; case 29: /* andis.
*/ imm = (unsigned short) instr; op->val = regs->gpr[rd] & (imm << 16); - set_cr0(regs, op, ra); + set_cr0(regs, op); goto logical_done_nocc; #ifdef __powerpc64__ @@ -2526,7 +2526,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, logical_done: if (instr & 1) - set_cr0(regs, op, ra); + set_cr0(regs, op); logical_done_nocc: op->reg = ra; op->type |= SETREG; @@ -2534,7 +2534,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, arith_done: if (instr & 1) - set_cr0(regs, op, rd); + set_cr0(regs, op); compute_done: op->reg = rd; op->type |= SETREG; From 5bcaa4cc41923871777c3d13906267e812775094 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 19 Sep 2017 20:45:53 +1000 Subject: [PATCH 0308/1322] powerpc/sstep: Fix issues with mcrf mcrf broke when we changed analyse_instr() to not modify the register state. The instruction writes to the CR, so we need to store the result in op->ccval, not op->val. Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9d72e5900320..c4cda1afb49d 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1513,10 +1513,10 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->type = COMPUTE + SETCC; imm = 0xf0000000UL; val = regs->gpr[rd]; - op->val = regs->ccr; + op->ccval = regs->ccr; for (sh = 0; sh < 8; ++sh) { if (instr & (0x80000 >> sh)) - op->val = (op->val & ~imm) | + op->ccval = (op->ccval & ~imm) | (val & imm); imm >>= 4; } From 1575fe06f6b1d156ed31fb22c8631d49d79751d8 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 20 Sep 2017 09:32:19 +1000 Subject: [PATCH 0309/1322] powerpc/sstep: mullw should calculate a 64 bit signed result mullw should do a 32 bit signed multiply and create a 64 bit signed result. It currently truncates the result to 32 bits. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index c4cda1afb49d..5e8418c28bd8 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1651,8 +1651,9 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, goto arith_done; case 235: /* mullw */ - op->val = (unsigned int) regs->gpr[ra] * - (unsigned int) regs->gpr[rb]; + op->val = (long)(int) regs->gpr[ra] * + (int) regs->gpr[rb]; + goto arith_done; case 266: /* add */ From 5d298baa41883fc421acfd932799c0f4177249ae Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 31 Aug 2017 17:17:41 +0530 Subject: [PATCH 0310/1322] powerpc/powernv: Clear LPCR[PECE1] via stop-api only for deep state offline Commit 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via stop-api only on Hotplug") clears the PECE1 bit of the LPCR via stop-api during CPU-Hotplug to prevent wakeup due to a decrementer on an offlined CPU which is in a deep stop state. In the case where the stop-api support is found to be lacking, the commit 785a12afdb4a ("powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT states when stop-api fails") disables deep states that lose hypervisor context. Thus in this case, the offlined CPU will be put to some shallow idle state. 
However, we currently unconditionally clear the PECE1 in LPCR via stop-api during CPU-Hotplug even when deep states are disabled due to stop-api failure. Fix this by clearing PECE1 of LPCR via stop-api during CPU-Hotplug *only* when the offlined CPU will be put to a deep state that loses hypervisor context. Fixes: 24be85a23d1f ("powerpc/powernv: Clear PECE1 in LPCR via stop-api only on Hotplug") Reported-by: Pavithra Prakash Signed-off-by: Gautham R. Shenoy Reviewed-by: Nicholas Piggin Tested-by: Pavithra Prakash Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 9f59041a172b..443d5ca71995 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -393,7 +393,13 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) u64 pir = get_hard_smp_processor_id(cpu); mtspr(SPRN_LPCR, lpcr_val); - opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + + /* + * Program the LPCR via stop-api only if the deepest stop state + * can lose hypervisor context. + */ + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) + opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); } /* From 17df6453d4be17910456e99c5a85025aa1b7a246 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:53 +0200 Subject: [PATCH 0311/1322] brcmfmac: add length check in brcmf_cfg80211_escan_handler() Upon handling the firmware notification for scans the length was not checked properly and may result in corrupting kernel heap memory due to buffer overruns. This fix addresses CVE-2017-0786. Cc: stable@vger.kernel.org # v4.0.x Cc: Kevin Cernekee Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index aaed4ab503ad..26a0de371c26 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3162,6 +3162,7 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, struct brcmf_cfg80211_info *cfg = ifp->drvr->config; s32 status; struct brcmf_escan_result_le *escan_result_le; + u32 escan_buflen; struct brcmf_bss_info_le *bss_info_le; struct brcmf_bss_info_le *bss = NULL; u32 bi_length; @@ -3181,11 +3182,23 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, if (status == BRCMF_E_STATUS_PARTIAL) { brcmf_dbg(SCAN, "ESCAN Partial result\n"); + if (e->datalen < sizeof(*escan_result_le)) { + brcmf_err("invalid event data length\n"); + goto exit; + } escan_result_le = (struct brcmf_escan_result_le *) data; if (!escan_result_le) { brcmf_err("Invalid escan result (NULL pointer)\n"); goto exit; } + escan_buflen = le32_to_cpu(escan_result_le->buflen); + if (escan_buflen > BRCMF_ESCAN_BUF_SIZE || + escan_buflen > e->datalen || + escan_buflen < sizeof(*escan_result_le)) { + brcmf_err("Invalid escan buffer length: %d\n", + escan_buflen); + goto exit; + } if (le16_to_cpu(escan_result_le->bss_count) != 1) { brcmf_err("Invalid bss_count %d: ignoring\n", escan_result_le->bss_count); @@ -3202,9 +3215,8 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, } bi_length = le32_to_cpu(bss_info_le->length);
- if (bi_length != (le32_to_cpu(escan_result_le->buflen) - - WL_ESCAN_RESULTS_FIXED_SIZE)) { - brcmf_err("Invalid bss_info length %d: ignoring\n", + if (bi_length != escan_buflen - WL_ESCAN_RESULTS_FIXED_SIZE) { + brcmf_err("Ignoring invalid bss_info length: %d\n", bi_length); goto exit; } From 35f62727df0ed8e5e4857e162d94fd46d861f1cf Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:54 +0200 Subject: [PATCH 0312/1322] brcmfmac: setup passive scan if requested by user-space The driver was not properly configuring firmware with regard to the type of scan. It always performed an active scan even when user-space was requesting for passive scan, ie. the scan request was done without any SSIDs specified. Cc: stable@vger.kernel.org # v4.0.x Reported-by: Huang, Jiangyang Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 19 ++++--------------- .../broadcom/brcm80211/brcmfmac/fwil_types.h | 5 +++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 26a0de371c26..4157c90ad973 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -980,7 +980,7 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, eth_broadcast_addr(params_le->bssid); params_le->bss_type = DOT11_BSSTYPE_ANY; - params_le->scan_type = 0; + params_le->scan_type = BRCMF_SCANTYPE_ACTIVE; params_le->channel_num = 0; params_le->nprobes = cpu_to_le32(-1); params_le->active_time = cpu_to_le32(-1); @@ -988,12 +988,9 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, params_le->home_time = cpu_to_le32(-1); memset(¶ms_le->ssid_le, 0, sizeof(params_le->ssid_le)); - /* if request is null exit so it will be all channel broadcast scan */ - if (!request) - return; - n_ssids = request->n_ssids; n_channels = request->n_channels; + /* Copy channel array if applicable */ brcmf_dbg(SCAN, "### List of channelspecs to scan ### %d\n", n_channels); @@ -1030,16 +1027,8 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, ptr += sizeof(ssid_le); } } else { - brcmf_dbg(SCAN, "Broadcast scan %p\n", request->ssids); - if ((request->ssids) && request->ssids->ssid_len) { - brcmf_dbg(SCAN, "SSID %s len=%d\n", - params_le->ssid_le.SSID, - request->ssids->ssid_len); - params_le->ssid_le.SSID_len = - cpu_to_le32(request->ssids->ssid_len); - memcpy(¶ms_le->ssid_le.SSID, request->ssids->ssid, - request->ssids->ssid_len); - } + brcmf_dbg(SCAN, "Performing passive scan\n"); + params_le->scan_type = BRCMF_SCANTYPE_PASSIVE; } /* Adding mask to channel numbers */ params_le->channel_num = diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h index 8391989b1882..e0d22fedb2b4 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil_types.h @@ -45,6 +45,11 @@ #define BRCMF_SCAN_PARAMS_COUNT_MASK 0x0000ffff #define BRCMF_SCAN_PARAMS_NSSID_SHIFT 16 +/* scan type definitions */ +#define BRCMF_SCANTYPE_DEFAULT 0xFF +#define BRCMF_SCANTYPE_ACTIVE 0 +#define BRCMF_SCANTYPE_PASSIVE 1 + #define BRCMF_WSEC_MAX_PSK_LEN 32 #define BRCMF_WSEC_PASSPHRASE BIT(0) From 590d08d3da45e9fed423b08ab38d71886c07abc8 
Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Sep 2017 11:43:47 -0500 Subject: [PATCH 0313/1322] SMB3: Fix endian warning Multi-dialect negotiate patch had a minor endian error. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable # 4.13+ --- fs/cifs/smb2pdu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b0eaebe627e9..b4c58a1db1ae 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -570,10 +570,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* ops set to 3.0 by default for default so update */ ses->server->ops = &smb21_operations; } - } else if (rsp->DialectRevision != ses->server->vals->protocol_id) { + } else if (le16_to_cpu(rsp->DialectRevision) != + ses->server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ cifs_dbg(VFS, "Illegal 0x%x dialect returned: not requested\n", - cpu_to_le16(rsp->DialectRevision)); + le16_to_cpu(rsp->DialectRevision)); return -EIO; } From c721c38957fb19982416f6be71aae7b30630d83b Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Sep 2017 18:40:03 -0500 Subject: [PATCH 0314/1322] SMB3: Warn user if trying to sign connection that authenticated as guest It can be confusing if user ends up authenticated as guest but they requested signing (server will return error validating signed packets) so add log message for this. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable --- fs/cifs/smb2pdu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b4c58a1db1ae..d499ce265c3b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1176,6 +1176,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, while (sess_data->func) sess_data->func(sess_data); + if ((ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) && (ses->sign)) + cifs_dbg(VFS, "signing requested but authenticated as guest\n"); rc = sess_data->result; out: kfree(sess_data); From 6e82e929d98552ed5156919e0f532eeba2da704b Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 20 Sep 2017 13:09:23 +1000 Subject: [PATCH 0315/1322] cifs: show 'soft' in the mount options for hard mounts Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 180b3356ff86..a864e6575309 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -461,6 +461,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nocase"); if (tcon->retry) seq_puts(s, ",hard"); + else + seq_puts(s, ",soft"); if (tcon->use_persistent) seq_puts(s, ",persistenthandles"); else if (tcon->use_resilient) From ce21574ad1922b403198ee664c4dff276f514f1d Mon Sep 17 00:00:00 2001 From: Sekhar Nori Date: Tue, 29 Aug 2017 13:52:51 +0530 Subject: [PATCH 0316/1322] ARM: dts: da850-evm: add serial and ethernet aliases Add aliases for serial and ethernet nodes. Serial aliases help keep order of tty nodes fixed and ethernet alias is used by bootloader to setup mac address correctly. 
Reported-by: Adam Ford Acked-by: Tony Lindgren Fixes: dd7deaf218bf ("ARM: davinci: da850: add DT node for ethernet") Signed-off-by: Sekhar Nori --- arch/arm/boot/dts/da850-evm.dts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/boot/dts/da850-evm.dts b/arch/arm/boot/dts/da850-evm.dts index 67e72bc72e80..c75507922f7d 100644 --- a/arch/arm/boot/dts/da850-evm.dts +++ b/arch/arm/boot/dts/da850-evm.dts @@ -15,6 +15,13 @@ compatible = "ti,da850-evm", "ti,da850"; model = "DA850/AM1808/OMAP-L138 EVM"; + aliases { + serial0 = &serial0; + serial1 = &serial1; + serial2 = &serial2; + ethernet0 = ð0; + }; + soc@1c00000 { pmx_core: pinmux@14120 { status = "okay"; From fd0b19ed5389187829b854900511c9195875bb42 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 19 Sep 2017 22:07:18 -0700 Subject: [PATCH 0317/1322] MIPS: Fix perf event init Commit c311c797998c ("cpumask: make "nr_cpumask_bits" unsigned") modified mipspmu_event_init() to cast the struct perf_event cpu field to an unsigned integer before it is compared with nr_cpumask_bits (and *ahem* did so without copying the linux-mips mailing list or any MIPS developers...). This is broken because the cpu field may be -1 for events which follow a process rather than being affine to a particular CPU. When this is the case the cast to an unsigned int results in a value equal to ULONG_MAX, which is always greater than nr_cpumask_bits so we always fail mipspmu_event_init() and return -ENODEV. The check against nr_cpumask_bits seems nonsensical anyway, so this patch simply removes it. The cpu field is going to either be -1 or a valid CPU number. Comparing it with nr_cpumask_bits is effectively checking that it's a valid cpu number, but it seems safe to rely on the core perf events code to ensure that's the case. The end result is that this fixes use of perf on MIPS when not constraining events to a particular CPU, and fixes the "perf list hw" command which fails to list any events without this. Signed-off-by: Paul Burton Fixes: c311c797998c ("cpumask: make "nr_cpumask_bits" unsigned") Cc: Alexey Dobriyan Cc: Andrew Morton Cc: linux-mips@linux-mips.org Cc: stable # v4.12+ Patchwork: https://patchwork.linux-mips.org/patch/17323/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/perf_event_mipsxx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 9e6c74bf66c4..6668f67a61c3 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,8 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if ((unsigned int)event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) + if (event->cpu >= 0 && !cpu_online(event->cpu)) return -ENODEV; if (!atomic_inc_not_zero(&active_events)) { From bd6227a150fdb56e7bb734976ef6e53a2c1cb334 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 14 Sep 2017 17:10:28 +0200 Subject: [PATCH 0318/1322] crypto: drbg - fix freeing of resources During the change to use aligned buffers, the deallocation code path was not updated correctly. The current code tries to free the aligned buffer pointer and not the original buffer pointer as it is supposed to. Thus, the code is updated to free the original buffer pointer and set the aligned buffer pointer that is used throughout the code to NULL. 
Fixes: 3cfc3b9721123 ("crypto: drbg - use aligned buffers") CC: CC: Herbert Xu Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/drbg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/drbg.c b/crypto/drbg.c index 633a88e93ab0..70018397e59a 100644 --- a/crypto/drbg.c +++ b/crypto/drbg.c @@ -1133,10 +1133,10 @@ static inline void drbg_dealloc_state(struct drbg_state *drbg) { if (!drbg) return; - kzfree(drbg->V); - drbg->Vbuf = NULL; - kzfree(drbg->C); - drbg->Cbuf = NULL; + kzfree(drbg->Vbuf); + drbg->V = NULL; + kzfree(drbg->Cbuf); + drbg->C = NULL; kzfree(drbg->scratchpadbuf); drbg->scratchpadbuf = NULL; drbg->reseed_ctr = 0; From 569f11c9f788959b640116b5bbd6d8a1f07326da Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:00 -0500 Subject: [PATCH 0319/1322] crypto: x86/blowfish - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R12 instead of RBP. R12 can't be used as the RT0 register because of x86 instruction encoding limitations. So use R12 for CTX and RDI for CTX. This means that CTX is no longer an implicit function argument. Instead it needs to be explicitly copied from RDI. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/blowfish-x86_64-asm_64.S | 48 +++++++++++++----------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 246c67006ed0..8c1fcb6bad21 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -33,7 +33,7 @@ #define s3 ((16 + 2 + (3 * 256)) * 4) /* register macros */ -#define CTX %rdi +#define CTX %r12 #define RIO %rsi #define RX0 %rax @@ -56,12 +56,12 @@ #define RX2bh %ch #define RX3bh %dh -#define RT0 %rbp +#define RT0 %rdi #define RT1 %rsi #define RT2 %r8 #define RT3 %r9 -#define RT0d %ebp +#define RT0d %edi #define RT1d %esi #define RT2d %r8d #define RT3d %r9d @@ -120,13 +120,14 @@ ENTRY(__blowfish_enc_blk) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: bool, if true: xor output */ - movq %rbp, %r11; + movq %r12, %r11; + movq %rdi, CTX; movq %rsi, %r10; movq %rdx, RIO; @@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk) round_enc(14); add_roundkey_enc(16); - movq %r11, %rbp; + movq %r11, %r12; movq %r10, RIO; test %cl, %cl; @@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk) ENTRY(blowfish_dec_blk) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ - movq %rbp, %r11; + movq %r12, %r11; + movq %rdi, CTX; movq %rsi, %r10; movq %rdx, RIO; @@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk) movq %r10, RIO; write_block(); - movq %r11, %rbp; + movq %r11, %r12; ret; ENDPROC(blowfish_dec_blk) @@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk) ENTRY(__blowfish_enc_blk_4way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: bool, if true: xor output */ - pushq %rbp; + pushq %r12; pushq %rbx; pushq %rcx; - preload_roundkey_enc(0); - + movq %rdi, CTX movq %rsi, %r11; movq %rdx, RIO; + preload_roundkey_enc(0); + read_block4(); round_enc4(0); @@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way) round_enc4(14); add_preloaded_roundkey4(); - popq %rbp; + popq %r12; movq %r11, RIO; - test %bpl, %bpl; + test %r12b, %r12b; jnz .L__enc_xor4; write_block4(); popq %rbx; - popq 
%rbp; + popq %r12; ret; .L__enc_xor4: xor_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; ENDPROC(__blowfish_enc_blk_4way) ENTRY(blowfish_dec_blk_4way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ - pushq %rbp; + pushq %r12; pushq %rbx; - preload_roundkey_dec(17); - movq %rsi, %r11; + movq %rdi, CTX; + movq %rsi, %r11 movq %rdx, RIO; + preload_roundkey_dec(17); read_block4(); round_dec4(17); @@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way) write_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; ENDPROC(blowfish_dec_blk_4way) From b46c9d717645529417ca9045cfdbf59f84922573 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:01 -0500 Subject: [PATCH 0320/1322] crypto: x86/camellia - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R12 instead of RBP. Both are callee-saved registers, so the substitution is straightforward. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/camellia-x86_64-asm_64.S | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 310319c601ed..95ba6956a7f6 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -75,17 +75,17 @@ #define RCD1bh %dh #define RT0 %rsi -#define RT1 %rbp +#define RT1 %r12 #define RT2 %r8 #define RT0d %esi -#define RT1d %ebp +#define RT1d %r12d #define RT2d %r8d #define RT2bl %r8b #define RXOR %r9 -#define RRBP %r10 +#define RR12 %r10 #define RDST %r11 #define RXORd %r9d @@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk) * %rdx: src * %rcx: bool xor */ - movq %rbp, RRBP; + movq %r12, RR12; movq %rcx, RXOR; movq %rsi, RDST; @@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk) enc_outunpack(mov, RT1); - movq RRBP, %rbp; + movq RR12, %r12; ret; .L__enc_xor: enc_outunpack(xor, RT1); - movq RRBP, %rbp; + movq RR12, %r12; ret; ENDPROC(__camellia_enc_blk) @@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk) movl $24, RXORd; cmovel RXORd, RT2d; /* max */ - movq %rbp, RRBP; + movq %r12, RR12; movq %rsi, RDST; movq %rdx, RIO; @@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk) dec_outunpack(); - movq RRBP, %rbp; + movq RR12, %r12; ret; ENDPROC(camellia_dec_blk) @@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way) */ pushq %rbx; - movq %rbp, RRBP; + movq %r12, RR12; movq %rcx, RXOR; movq %rsi, RDST; movq %rdx, RIO; @@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way) enc_outunpack2(mov, RT2); - movq RRBP, %rbp; + movq RR12, %r12; popq %rbx; ret; .L__enc2_xor: enc_outunpack2(xor, RT2); - movq RRBP, %rbp; + movq RR12, %r12; popq %rbx; ret; ENDPROC(__camellia_enc_blk_2way) @@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way) cmovel RXORd, RT2d; /* max */ movq %rbx, RXOR; - movq %rbp, RRBP; + movq %r12, RR12; movq %rsi, RDST; movq %rdx, RIO; @@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way) dec_outunpack2(); - movq RRBP, %rbp; + movq RR12, %r12; movq RXOR, %rbx; ret; ENDPROC(camellia_dec_blk_2way) From 4b15606664a2f8d7c4f0092fb0305fe1c7c65b7b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:02 -0500 Subject: [PATCH 0321/1322] crypto: x86/cast5 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. 
Use R15 instead of RBP. R15 can't be used as the RID1 register because of x86 instruction encoding limitations. So use R15 for CTX and RDI for CTX. This means that CTX is no longer an implicit function argument. Instead it needs to be explicitly copied from RDI. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 47 +++++++++++++++-------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4a8806234ea..86107c961bb4 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@ /********************************************************************** 16-way AVX cast5 **********************************************************************/ -#define CTX %rdi +#define CTX %r15 #define RL1 %xmm0 #define RR1 %xmm1 @@ -70,8 +70,8 @@ #define RTMP %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %rdi +#define RID1d %edi #define RID2 %rsi #define RID2d %esi @@ -226,7 +226,7 @@ .align 16 __cast5_enc_blk16: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RL1: blocks 1 and 2 * RR1: blocks 3 and 4 * RL2: blocks 5 and 6 @@ -246,9 +246,11 @@ __cast5_enc_blk16: * RR4: encrypted blocks 15 and 16 */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -283,7 +285,7 @@ __cast5_enc_blk16: .L__skip_enc: popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; @@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16) .align 16 __cast5_dec_blk16: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RL1: encrypted blocks 1 and 2 * RR1: encrypted blocks 3 and 4 * RL2: encrypted blocks 5 and 6 @@ -318,9 +320,11 @@ __cast5_dec_blk16: * RR4: decrypted blocks 15 and 16 */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -356,7 +360,7 @@ __cast5_dec_blk16: vmovdqa .Lbswap_mask, RKM; popq %rbx; - popq %rbp; + popq %r15; outunpack_blocks(RR1, RL1, RTMP, RX, RKM); outunpack_blocks(RR2, RL2, RTMP, RX, RKM); @@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16) ENTRY(cast5_ecb_enc_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + popq %r15; FRAME_END ret; ENDPROC(cast5_ecb_enc_16way) ENTRY(cast5_ecb_dec_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + + movq %rdi, CTX; movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + popq %r15; FRAME_END ret; ENDPROC(cast5_ecb_dec_16way) ENTRY(cast5_cbc_dec_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way) vmovdqu RR4, (6*16)(%r11); vmovdqu RL4, (7*16)(%r11); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast5_cbc_dec_16way) ENTRY(cast5_ctr_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: iv (big endian, 64bit) */ FRAME_BEGIN - pushq %r12; + pushq 
%r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way) vmovdqu RR4, (6*16)(%r11); vmovdqu RL4, (7*16)(%r11); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast5_ctr_16way) From c66cc3be2951fad4d7d7f799baf57c8c5cc8d655 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:03 -0500 Subject: [PATCH 0322/1322] crypto: x86/cast6 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R15 instead of RBP. R15 can't be used as the RID1 register because of x86 instruction encoding limitations. So use R15 for CTX and RDI for CTX. This means that CTX is no longer an implicit function argument. Instead it needs to be explicitly copied from RDI. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 50 +++++++++++++++-------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 952d3156a933..7f30b6f0d72c 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@ /********************************************************************** 8-way AVX cast6 **********************************************************************/ -#define CTX %rdi +#define CTX %r15 #define RA1 %xmm0 #define RB1 %xmm1 @@ -70,8 +70,8 @@ #define RTMP %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %rdi +#define RID1d %edi #define RID2 %rsi #define RID2d %esi @@ -264,15 +264,17 @@ .align 8 __cast6_enc_blk8: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks * output: * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -297,7 +299,7 @@ __cast6_enc_blk8: QBAR(11); popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; @@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8) .align 8 __cast6_dec_blk8: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks * output: * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -343,7 +347,7 @@ __cast6_dec_blk8: QBAR(0); popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); @@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8) ENTRY(cast6_ecb_enc_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_ecb_enc_8way) ENTRY(cast6_ecb_dec_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_ecb_dec_8way) 
ENTRY(cast6_cbc_dec_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way) store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast6_cbc_dec_8way) @@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way) * %rcx: iv (little endian, 128bit) */ FRAME_BEGIN - pushq %r12; + pushq %r15 + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way) store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast6_ctr_8way) @@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX movq %rsi, %r11; /* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_xts_enc_8way) @@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX movq %rsi, %r11; /* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_xts_dec_8way) From 3ed7b4d67c6745300c9b5c6baa55da1161b57f60 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:04 -0500 Subject: [PATCH 0323/1322] crypto: x86/des3_ede - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use RSI instead of RBP for RT1. Since RSI is also used as a the 'dst' function argument, it needs to be saved on the stack until the argument is needed. 
Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/des3_ede-asm_64.S | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index f3e91647ca27..8e49ce117494 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -64,12 +64,12 @@ #define RW2bh %ch #define RT0 %r15 -#define RT1 %rbp +#define RT1 %rsi #define RT2 %r14 #define RT3 %rdx #define RT0d %r15d -#define RT1d %ebp +#define RT1d %esi #define RT2d %r14d #define RT3d %edx @@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk) * %rsi: dst * %rdx: src */ - pushq %rbp; pushq %rbx; pushq %r12; pushq %r13; pushq %r14; pushq %r15; + pushq %rsi; /* dst */ + read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); @@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk) round1(32+15, RL0, RR0, dummy2); final_permutation(RR0, RL0); + + popq %rsi /* dst */ write_block(%rsi, RR0, RL0); popq %r15; @@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk) popq %r13; popq %r12; popq %rbx; - popq %rbp; ret; ENDPROC(des3_ede_x86_64_crypt_blk) @@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) * %rdx: src (3 blocks) */ - pushq %rbp; pushq %rbx; pushq %r12; pushq %r13; pushq %r14; pushq %r15; + pushq %rsi /* dst */ + /* load input */ movl 0 * 4(%rdx), RL0d; movl 1 * 4(%rdx), RR0d; @@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) bswapl RR2d; bswapl RL2d; + popq %rsi /* dst */ movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); @@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) popq %r13; popq %r12; popq %rbx; - popq %rbp; ret; ENDPROC(des3_ede_x86_64_crypt_blk_3way) From d7b1722c72aa915283ada27709c6feeb392f6038 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:05 -0500 Subject: [PATCH 0324/1322] crypto: x86/sha1-avx2 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R11 instead of RBP. Since R11 isn't a callee-saved register, it doesn't need to be saved and restored on the stack. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_avx2_x86_64_asm.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1eab79c9ac48..9f712a7dfd79 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -89,7 +89,7 @@ #define REG_RE %rdx #define REG_RTA %r12 #define REG_RTB %rbx -#define REG_T1 %ebp +#define REG_T1 %r11d #define xmm_mov vmovups #define avx2_zeroupper vzeroupper #define RND_F1 1 @@ -637,7 +637,6 @@ _loop3: ENTRY(\name) push %rbx - push %rbp push %r12 push %r13 push %r14 @@ -673,7 +672,6 @@ _loop3: pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx ret From 6488bce756861b94810e54f83416d5e74c0f18bf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:06 -0500 Subject: [PATCH 0325/1322] crypto: x86/sha1-ssse3 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Swap the usages of R12 and RBP. 
Use R12 for the REG_D register, and use RBP to store the pre-aligned stack pointer. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_ssse3_asm.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index a4109506a5e8..6204bd53528c 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -37,7 +37,7 @@ #define REG_A %ecx #define REG_B %esi #define REG_C %edi -#define REG_D %ebp +#define REG_D %r12d #define REG_E %edx #define REG_T1 %eax @@ -74,10 +74,10 @@ ENTRY(\name) push %rbx - push %rbp push %r12 + push %rbp + mov %rsp, %rbp - mov %rsp, %r12 sub $64, %rsp # allocate workspace and $~15, %rsp # align stack @@ -99,10 +99,9 @@ xor %rax, %rax rep stosq - mov %r12, %rsp # deallocate workspace - - pop %r12 + mov %rbp, %rsp # deallocate workspace pop %rbp + pop %r12 pop %rbx ret From 673ac6fbc74f835e2125df9ee39e8a2a423832e2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:07 -0500 Subject: [PATCH 0326/1322] crypto: x86/sha256-avx - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Swap the usages of R12 and RBP. Use R12 for the TBL register, and use RBP to store the pre-aligned stack pointer. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx-asm.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index e08888a1a5f2..001bbcf93c79 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -103,7 +103,7 @@ SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx -TBL = %rbp +TBL = %r12 a = %eax b = %ebx @@ -350,13 +350,13 @@ a = TMP_ ENTRY(sha256_transform_avx) .align 32 pushq %rbx - pushq %rbp + pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %r12 + pushq %rbp + movq %rsp, %rbp - mov %rsp, %r12 subq $STACK_SIZE, %rsp # allocate stack space and $~15, %rsp # align stack pointer @@ -452,13 +452,12 @@ loop2: done_hash: - mov %r12, %rsp - - popq %r12 + mov %rbp, %rsp + popq %rbp popq %r15 popq %r14 popq %r13 - popq %rbp + popq %r12 popq %rbx ret ENDPROC(sha256_transform_avx) From d3dfbfe2e6e7ecd620531d5201314ad14c4ed5b3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:08 -0500 Subject: [PATCH 0327/1322] crypto: x86/sha256-avx2 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. There's no need to use RBP as a temporary register for the TBL value, because it always stores the same value: the address of the K256 table. Instead just reference the address of K256 directly. 
Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-avx2-asm.S | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 89c8f09787d2..1420db15dcdd 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -98,8 +98,6 @@ d = %r8d e = %edx # clobbers NUM_BLKS y3 = %esi # clobbers INP - -TBL = %rbp SRND = CTX # SRND is same register as CTX a = %eax @@ -531,7 +529,6 @@ STACK_SIZE = _RSP + _RSP_SIZE ENTRY(sha256_transform_rorx) .align 32 pushq %rbx - pushq %rbp pushq %r12 pushq %r13 pushq %r14 @@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx) mov CTX, _CTX(%rsp) loop0: - lea K256(%rip), TBL - ## Load first 16 dwords from two blocks VMOVDQ 0*32(INP),XTMP0 VMOVDQ 1*32(INP),XTMP1 @@ -597,19 +592,19 @@ last_block_enter: .align 16 loop1: - vpaddd 0*32(TBL, SRND), X0, XFER + vpaddd K256+0*32(SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 0*32 - vpaddd 1*32(TBL, SRND), X0, XFER + vpaddd K256+1*32(SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 1*32 - vpaddd 2*32(TBL, SRND), X0, XFER + vpaddd K256+2*32(SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 2*32 - vpaddd 3*32(TBL, SRND), X0, XFER + vpaddd K256+3*32(SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 3*32 @@ -619,10 +614,11 @@ loop1: loop2: ## Do last 16 rounds with no scheduling - vpaddd 0*32(TBL, SRND), X0, XFER + vpaddd K256+0*32(SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 0*32 - vpaddd 1*32(TBL, SRND), X1, XFER + + vpaddd K256+1*32(SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 1*32 add $2*32, SRND @@ -676,9 +672,6 @@ loop3: ja done_hash do_last_block: - #### do last block - lea K256(%rip), TBL - VMOVDQ 0*16(INP),XWORD0 VMOVDQ 1*16(INP),XWORD1 VMOVDQ 2*16(INP),XWORD2 @@ -718,7 +711,6 @@ done_hash: popq %r14 popq %r13 popq %r12 - popq %rbp popq %rbx ret ENDPROC(sha256_transform_rorx) From 539012dcbdd1ff028268764385ed1f6d600812a7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:09 -0500 Subject: [PATCH 0328/1322] crypto: x86/sha256-ssse3 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Swap the usages of R12 and RBP. Use R12 for the TBL register, and use RBP to store the pre-aligned stack pointer. 
Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha256-ssse3-asm.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 39b83c93e7fd..c6c05ed2c16a 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -95,7 +95,7 @@ SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx -TBL = %rbp +TBL = %r12 a = %eax b = %ebx @@ -356,13 +356,13 @@ a = TMP_ ENTRY(sha256_transform_ssse3) .align 32 pushq %rbx - pushq %rbp + pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %r12 + pushq %rbp + mov %rsp, %rbp - mov %rsp, %r12 subq $STACK_SIZE, %rsp and $~15, %rsp @@ -462,13 +462,12 @@ loop2: done_hash: - mov %r12, %rsp - - popq %r12 + mov %rbp, %rsp + popq %rbp popq %r15 popq %r14 popq %r13 - popq %rbp + popq %r12 popq %rbx ret From ca04c823763e5b82c237cabe0c17f547ecdc6271 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:10 -0500 Subject: [PATCH 0329/1322] crypto: sha512-avx2 - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Mix things up a little bit to get rid of the RBP usage, without hurting performance too much. Use RDI instead of RBP for the TBL pointer. That will clobber CTX, so spill CTX onto the stack and use R12 to read it in the outer loop. R12 is used as a non-persistent temporary variable elsewhere, so it's safe to use. Also remove the unused y4 variable. Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512-avx2-asm.S | 75 ++++++++++++++++--------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 7f5f6c6ec72e..b16d56005162 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -69,8 +69,9 @@ XFER = YTMP0 BYTE_FLIP_MASK = %ymm9 -# 1st arg -CTX = %rdi +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 # 2nd arg INP = %rsi # 3rd arg @@ -81,7 +82,7 @@ d = %r8 e = %rdx y3 = %rsi -TBL = %rbp +TBL = %rdi # clobbers CTX1 a = %rax b = %rbx @@ -91,26 +92,26 @@ g = %r10 h = %r11 old_h = %r11 -T1 = %r12 +T1 = %r12 # clobbers CTX2 y0 = %r13 y1 = %r14 y2 = %r15 -y4 = %r12 - # Local variables (stack frame) XFER_SIZE = 4*8 SRND_SIZE = 1*8 INP_SIZE = 1*8 INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 6*8 +GPRSAVE_SIZE = 5*8 frame_XFER = 0 frame_SRND = frame_XFER + XFER_SIZE frame_INP = frame_SRND + SRND_SIZE frame_INPEND = frame_INP + INP_SIZE -frame_RSPSAVE = frame_INPEND + INPEND_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_RSPSAVE = frame_CTX + CTX_SIZE frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE frame_size = frame_GPRSAVE + GPRSAVE_SIZE @@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx) mov %rax, frame_RSPSAVE(%rsp) # Save GPRs - mov %rbp, frame_GPRSAVE(%rsp) - mov %rbx, 8*1+frame_GPRSAVE(%rsp) - mov %r12, 8*2+frame_GPRSAVE(%rsp) - mov %r13, 8*3+frame_GPRSAVE(%rsp) - mov %r14, 8*4+frame_GPRSAVE(%rsp) - mov %r15, 8*5+frame_GPRSAVE(%rsp) + mov %rbx, 8*0+frame_GPRSAVE(%rsp) + mov %r12, 8*1+frame_GPRSAVE(%rsp) + mov %r13, 8*2+frame_GPRSAVE(%rsp) + mov %r14, 
8*3+frame_GPRSAVE(%rsp) + mov %r15, 8*4+frame_GPRSAVE(%rsp) shl $7, NUM_BLKS # convert to bytes jz done_hash @@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx) mov NUM_BLKS, frame_INPEND(%rsp) ## load initial digest - mov 8*0(CTX),a - mov 8*1(CTX),b - mov 8*2(CTX),c - mov 8*3(CTX),d - mov 8*4(CTX),e - mov 8*5(CTX),f - mov 8*6(CTX),g - mov 8*7(CTX),h + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK @@ -652,14 +655,15 @@ loop2: subq $1, frame_SRND(%rsp) jne loop2 - addm 8*0(CTX),a - addm 8*1(CTX),b - addm 8*2(CTX),c - addm 8*3(CTX),d - addm 8*4(CTX),e - addm 8*5(CTX),f - addm 8*6(CTX),g - addm 8*7(CTX),h + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h mov frame_INP(%rsp), INP add $128, INP @@ -669,12 +673,11 @@ loop2: done_hash: # Restore GPRs - mov frame_GPRSAVE(%rsp) ,%rbp - mov 8*1+frame_GPRSAVE(%rsp) ,%rbx - mov 8*2+frame_GPRSAVE(%rsp) ,%r12 - mov 8*3+frame_GPRSAVE(%rsp) ,%r13 - mov 8*4+frame_GPRSAVE(%rsp) ,%r14 - mov 8*5+frame_GPRSAVE(%rsp) ,%r15 + mov 8*0+frame_GPRSAVE(%rsp), %rbx + mov 8*1+frame_GPRSAVE(%rsp), %r12 + mov 8*2+frame_GPRSAVE(%rsp), %r13 + mov 8*3+frame_GPRSAVE(%rsp), %r14 + mov 8*4+frame_GPRSAVE(%rsp), %r15 # Restore Stack Pointer mov frame_RSPSAVE(%rsp), %rsp From 8f182f845d0fa26ecc18d3bdcc3d1077a0ea3a31 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Sep 2017 14:42:11 -0500 Subject: [PATCH 0330/1322] crypto: x86/twofish - Fix RBP usage Using RBP as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Use R13 instead of RBP. Both are callee-saved registers, so the substitution is straightforward. 
Reported-by: Eric Biggers Reported-by: Peter Zijlstra Tested-by: Eric Biggers Acked-by: Eric Biggers Signed-off-by: Josh Poimboeuf Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index b3f49d286348..73b471da3622 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -76,8 +76,8 @@ #define RT %xmm14 #define RR %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %r13 +#define RID1d %r13d #define RID2 %rsi #define RID2d %esi @@ -259,7 +259,7 @@ __twofish_enc_blk8: vmovdqu w(CTX), RK1; - pushq %rbp; + pushq %r13; pushq %rbx; pushq %rcx; @@ -282,7 +282,7 @@ __twofish_enc_blk8: popq %rcx; popq %rbx; - popq %rbp; + popq %r13; outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); @@ -301,7 +301,7 @@ __twofish_dec_blk8: vmovdqu (w+4*4)(CTX), RK1; - pushq %rbp; + pushq %r13; pushq %rbx; inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); @@ -322,7 +322,7 @@ __twofish_dec_blk8: vmovdqu (w)(CTX), RK1; popq %rbx; - popq %rbp; + popq %r13; outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); From afd62fa26343be6445479e75de9f07092a061459 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:51 +0200 Subject: [PATCH 0331/1322] crypto: talitos - fix sha224 Kernel crypto tests report the following error at startup [ 2.752626] alg: hash: Test 4 failed for sha224-talitos [ 2.757907] 00000000: 30 e2 86 e2 e7 8a dd 0d d7 eb 9f d5 83 fe f1 b0 00000010: 2d 5a 6c a5 f9 55 ea fd 0e 72 05 22 This patch fixes it Cc: Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 79791c690858..6596761bc0c8 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1756,9 +1756,9 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, req_ctx->swinit = 0; } else { desc->ptr[1] = zero_entry; - /* Indicate next op is not the first. */ - req_ctx->first = 0; } + /* Indicate next op is not the first. 
*/ + req_ctx->first = 0; /* HMAC key */ if (ctx->keylen) From 886a27c0fc8a34633aadb0986dba11d8c150ae2e Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:57 +0200 Subject: [PATCH 0332/1322] crypto: talitos - fix hashing md5sum on some files gives wrong result Exemple: With the md5sum from libkcapi: c15115c05bad51113f81bdaee735dd09 test With the original md5sum: bbdf41d80ba7e8b2b7be3a0772be76cb test This patch fixes this issue Cc: Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 6596761bc0c8..8a48fc4f3642 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1769,7 +1769,7 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, sg_count = edesc->src_nents ?: 1; if (is_sec1 && sg_count > 1) - sg_copy_to_buffer(areq->src, sg_count, edesc->buf, length); + sg_copy_to_buffer(req_ctx->psrc, sg_count, edesc->buf, length); else sg_count = dma_map_sg(dev, req_ctx->psrc, sg_count, DMA_TO_DEVICE); From 56136631573baa537a15e0012055ffe8cfec1a33 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 12 Sep 2017 11:03:39 +0200 Subject: [PATCH 0333/1322] crypto: talitos - Don't provide setkey for non hmac hashing algs. Today, md5sum fails with error -ENOKEY because a setkey function is set for non hmac hashing algs, see strace output below: mmap(NULL, 378880, PROT_READ, MAP_SHARED, 6, 0) = 0x77f50000 accept(3, 0, NULL) = 7 vmsplice(5, [{"bin/\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 378880}], 1, SPLICE_F_MORE|SPLICE_F_GIFT) = 262144 splice(4, NULL, 7, NULL, 262144, SPLICE_F_MORE) = -1 ENOKEY (Required key not available) write(2, "Generation of hash for file kcap"..., 50) = 50 munmap(0x77f50000, 378880) = 0 This patch ensures that setkey() function is set only for hmac hashing. 
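For context, the failing md5sum reaches the driver through the kernel's AF_ALG socket interface, which is what libkcapi uses in the strace above. The stand-alone program below is only an illustration of such an unkeyed digest request, not code from this patch; the message contents and the bare-bones error handling are placeholders. A request of this kind is what came back with -ENOKEY before the fix when the talitos md5 implementation was selected.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "md5",
	};
	const char msg[] = "test\n";
	unsigned char digest[16];
	int tfm, op, i;

	tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
	if (tfm < 0 || bind(tfm, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	/* No ALG_SET_KEY here: a plain, unkeyed md5 digest. */
	op = accept(tfm, NULL, 0);
	if (op < 0)
		return 1;
	if (send(op, msg, strlen(msg), 0) < 0)
		return 1;
	if (read(op, digest, sizeof(digest)) != (ssize_t)sizeof(digest))
		return 1;

	for (i = 0; i < 16; i++)
		printf("%02x", digest[i]);
	printf("\n");
	close(op);
	close(tfm);
	return 0;
}

With setkey exposed only for the hmac variants, the socket layer no longer treats plain md5/sha as key-requiring algorithms, and requests like the one above succeed without a key.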
Cc: Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 8a48fc4f3642..dff88838dce7 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -3057,7 +3057,8 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev, t_alg->algt.alg.hash.final = ahash_final; t_alg->algt.alg.hash.finup = ahash_finup; t_alg->algt.alg.hash.digest = ahash_digest; - t_alg->algt.alg.hash.setkey = ahash_setkey; + if (!strncmp(alg->cra_name, "hmac", 4)) + t_alg->algt.alg.hash.setkey = ahash_setkey; t_alg->algt.alg.hash.import = ahash_import; t_alg->algt.alg.hash.export = ahash_export; From 3e1166b94e661ed51af1fe1fe5f74bd83450b50f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 12:12:16 +0200 Subject: [PATCH 0334/1322] crypto: inside-secure - fix gcc-4.9 warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All older compiler versions up to gcc-4.9 produce these harmless warnings: drivers/crypto/inside-secure/safexcel_cipher.c:389:9: warning: missing braces around initializer [-Wmissing-braces] drivers/crypto/inside-secure/safexcel_cipher.c:389:9: warning: (near initialization for ‘result.completion’) [-Wmissing-braces] drivers/crypto/inside-secure/safexcel_hash.c:422:9: warning: missing braces around initializer [-Wmissing-braces] drivers/crypto/inside-secure/safexcel_hash.c:422:9: warning: (near initialization for ‘result.completion’) [-Wmissing-braces] This changes the syntax to something that works on all versions without warnings. Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver") Signed-off-by: Arnd Bergmann Acked-by: Antoine Tenart Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_cipher.c | 2 +- drivers/crypto/inside-secure/safexcel_hash.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index d2207ac5ba19..5438552bc6d7 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -386,7 +386,7 @@ static int safexcel_cipher_exit_inv(struct crypto_tfm *tfm) struct safexcel_cipher_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->priv; struct skcipher_request req; - struct safexcel_inv_result result = { 0 }; + struct safexcel_inv_result result = {}; int ring = ctx->base.ring; memset(&req, 0, sizeof(struct skcipher_request)); diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 3f819399cd95..3980f946874f 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -419,7 +419,7 @@ static int safexcel_ahash_exit_inv(struct crypto_tfm *tfm) struct safexcel_ahash_ctx *ctx = crypto_tfm_ctx(tfm); struct safexcel_crypto_priv *priv = ctx->priv; struct ahash_request req; - struct safexcel_inv_result result = { 0 }; + struct safexcel_inv_result result = {}; int ring = ctx->base.ring; memset(&req, 0, sizeof(struct ahash_request)); From c056d910f08029662080a01b4ce2110e2c9a27b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Horia=20Geant=C4=83?= Date: Fri, 1 Sep 2017 17:12:59 +0300 Subject: [PATCH 0335/1322] crypto: caam - fix LS1021A support on ARMv7 multiplatform kernel MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit When built using multi_v7_defconfig, driver does not work on LS1021A: [...] caam 1700000.crypto: can't identify CAAM ipg clk: -2 caam: probe of 1700000.crypto failed with error -2 [...] It turns out we have to detect at runtime whether driver is running on an i.MX platform or not. Cc: Fixes: 6c3af9559352 ("crypto: caam - add support for LS1021A") Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/Kconfig | 5 +--- drivers/crypto/caam/ctrl.c | 19 ++++++------ drivers/crypto/caam/regs.h | 59 ++++++++++++++++++------------------- 3 files changed, 39 insertions(+), 44 deletions(-) diff --git a/drivers/crypto/caam/Kconfig b/drivers/crypto/caam/Kconfig index e36aeacd7635..1eb852765469 100644 --- a/drivers/crypto/caam/Kconfig +++ b/drivers/crypto/caam/Kconfig @@ -1,6 +1,7 @@ config CRYPTO_DEV_FSL_CAAM tristate "Freescale CAAM-Multicore driver backend" depends on FSL_SOC || ARCH_MXC || ARCH_LAYERSCAPE + select SOC_BUS help Enables the driver module for Freescale's Cryptographic Accelerator and Assurance Module (CAAM), also known as the SEC version 4 (SEC4). @@ -141,10 +142,6 @@ config CRYPTO_DEV_FSL_CAAM_RNG_API To compile this as a module, choose M here: the module will be called caamrng. -config CRYPTO_DEV_FSL_CAAM_IMX - def_bool SOC_IMX6 || SOC_IMX7D - depends on CRYPTO_DEV_FSL_CAAM - config CRYPTO_DEV_FSL_CAAM_DEBUG bool "Enable debug output in CAAM driver" depends on CRYPTO_DEV_FSL_CAAM diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c index dacb53fb690e..027e121c6f70 100644 --- a/drivers/crypto/caam/ctrl.c +++ b/drivers/crypto/caam/ctrl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "compat.h" #include "regs.h" @@ -19,6 +20,8 @@ bool caam_little_end; EXPORT_SYMBOL(caam_little_end); bool caam_dpaa2; EXPORT_SYMBOL(caam_dpaa2); +bool caam_imx; +EXPORT_SYMBOL(caam_imx); #ifdef CONFIG_CAAM_QI #include "qi.h" @@ -28,19 +31,11 @@ EXPORT_SYMBOL(caam_dpaa2); * i.MX targets tend to have clock control subsystems that can * enable/disable clocking to our device. */ -#ifdef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX static inline struct clk *caam_drv_identify_clk(struct device *dev, char *clk_name) { - return devm_clk_get(dev, clk_name); + return caam_imx ? 
devm_clk_get(dev, clk_name) : NULL; } -#else -static inline struct clk *caam_drv_identify_clk(struct device *dev, - char *clk_name) -{ - return NULL; -} -#endif /* * Descriptor to instantiate RNG State Handle 0 in normal mode and @@ -430,6 +425,10 @@ static int caam_probe(struct platform_device *pdev) { int ret, ring, gen_sk, ent_delay = RTSDCTL_ENT_DLY_MIN; u64 caam_id; + static const struct soc_device_attribute imx_soc[] = { + {.family = "Freescale i.MX"}, + {}, + }; struct device *dev; struct device_node *nprop, *np; struct caam_ctrl __iomem *ctrl; @@ -451,6 +450,8 @@ static int caam_probe(struct platform_device *pdev) dev_set_drvdata(dev, ctrlpriv); nprop = pdev->dev.of_node; + caam_imx = (bool)soc_device_match(imx_soc); + /* Enable clocking */ clk = caam_drv_identify_clk(&pdev->dev, "ipg"); if (IS_ERR(clk)) { diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h index 2b5efff9ec3c..17cfd23a38fa 100644 --- a/drivers/crypto/caam/regs.h +++ b/drivers/crypto/caam/regs.h @@ -67,6 +67,7 @@ */ extern bool caam_little_end; +extern bool caam_imx; #define caam_to_cpu(len) \ static inline u##len caam##len ## _to_cpu(u##len val) \ @@ -154,13 +155,10 @@ static inline u64 rd_reg64(void __iomem *reg) #else /* CONFIG_64BIT */ static inline void wr_reg64(void __iomem *reg, u64 data) { -#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX - if (caam_little_end) { + if (!caam_imx && caam_little_end) { wr_reg32((u32 __iomem *)(reg) + 1, data >> 32); wr_reg32((u32 __iomem *)(reg), data); - } else -#endif - { + } else { wr_reg32((u32 __iomem *)(reg), data >> 32); wr_reg32((u32 __iomem *)(reg) + 1, data); } @@ -168,41 +166,40 @@ static inline void wr_reg64(void __iomem *reg, u64 data) static inline u64 rd_reg64(void __iomem *reg) { -#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX - if (caam_little_end) + if (!caam_imx && caam_little_end) return ((u64)rd_reg32((u32 __iomem *)(reg) + 1) << 32 | (u64)rd_reg32((u32 __iomem *)(reg))); - else -#endif - return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 | - (u64)rd_reg32((u32 __iomem *)(reg) + 1)); + + return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 | + (u64)rd_reg32((u32 __iomem *)(reg) + 1)); } #endif /* CONFIG_64BIT */ +static inline u64 cpu_to_caam_dma64(dma_addr_t value) +{ + if (caam_imx) + return (((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | + (u64)cpu_to_caam32(upper_32_bits(value))); + + return cpu_to_caam64(value); +} + +static inline u64 caam_dma64_to_cpu(u64 value) +{ + if (caam_imx) + return (((u64)caam32_to_cpu(lower_32_bits(value)) << 32) | + (u64)caam32_to_cpu(upper_32_bits(value))); + + return caam64_to_cpu(value); +} + #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT -#ifdef CONFIG_SOC_IMX7D -#define cpu_to_caam_dma(value) \ - (((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | \ - (u64)cpu_to_caam32(upper_32_bits(value))) -#define caam_dma_to_cpu(value) \ - (((u64)caam32_to_cpu(lower_32_bits(value)) << 32) | \ - (u64)caam32_to_cpu(upper_32_bits(value))) -#else -#define cpu_to_caam_dma(value) cpu_to_caam64(value) -#define caam_dma_to_cpu(value) caam64_to_cpu(value) -#endif /* CONFIG_SOC_IMX7D */ +#define cpu_to_caam_dma(value) cpu_to_caam_dma64(value) +#define caam_dma_to_cpu(value) caam_dma64_to_cpu(value) #else #define cpu_to_caam_dma(value) cpu_to_caam32(value) #define caam_dma_to_cpu(value) caam32_to_cpu(value) -#endif /* CONFIG_ARCH_DMA_ADDR_T_64BIT */ - -#ifdef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX -#define cpu_to_caam_dma64(value) \ - (((u64)cpu_to_caam32(lower_32_bits(value)) << 32) | \ - (u64)cpu_to_caam32(upper_32_bits(value))) -#else -#define 
cpu_to_caam_dma64(value) cpu_to_caam64(value) -#endif +#endif /* CONFIG_ARCH_DMA_ADDR_T_64BIT */ /* * jr_outentry From e117765a117da3ece15689cb8a759d16c415b08c Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Wed, 30 Aug 2017 09:17:39 +0200 Subject: [PATCH 0336/1322] crypto: af_alg - update correct dst SGL entry When two adjacent TX SGL are processed and parts of both TX SGLs are pulled into the per-request TX SGL, the wrong per-request TX SGL entries were updated. This fixes a NULL pointer dereference when a cipher implementation walks the TX SGL where some of the SGL entries were NULL. Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory...") Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/af_alg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index ffa9f4ccd9b4..337cf382718e 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -619,14 +619,14 @@ void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst, struct af_alg_ctx *ctx = ask->private; struct af_alg_tsgl *sgl; struct scatterlist *sg; - unsigned int i, j; + unsigned int i, j = 0; while (!list_empty(&ctx->tsgl_list)) { sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl, list); sg = sgl->sg; - for (i = 0, j = 0; i < sgl->cur; i++) { + for (i = 0; i < sgl->cur; i++) { size_t plen = min_t(size_t, used, sg[i].length); struct page *page = sg_page(sg + i); From 5a5d718f952b55e7fa96bfaf6f31f2c08babd77b Mon Sep 17 00:00:00 2001 From: Sriram Periyasamy Date: Tue, 19 Sep 2017 17:25:05 -0500 Subject: [PATCH 0337/1322] ALSA: hda - program ICT bits to support HBR audio On recent Intel platforms (Haswell, Broadwell, Skylake, ApolloLake, KabyLake, ...), the IEC Coding Type (ICT) bitfield in the Digital Converter Control #3 needs to be set explicitly for HDMI/DisplayPort High Bit Rate (HBR) audio playback to work. This was not required in earlier platforms when HBR was first introduced. The ICT bits are defined in Section 7.3.3.9 of the HDaudio 1.0a specification. Since the ICT bitfield was not specified for HDAudio 1.0 devices (before 2009), we only program it on machines more recent than Haswell. We tested that this fix is not needed on Baytrail-I (MinnowBoard Turbot) and believe by extension it also does not apply to Braswell. [ Moved AC_VERB_SET_DIGI_CONVERT_3 definition to the right place by tiwai ] Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98797 Signed-off-by: Sriram Periyasamy Signed-off-by: Pierre-Louis Bossart Signed-off-by: Subhransu S. 
Prusty Acked-by: Vinod Koul Signed-off-by: Takashi Iwai --- include/sound/hda_verbs.h | 1 + sound/pci/hda/patch_hdmi.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/sound/hda_verbs.h b/include/sound/hda_verbs.h index d0509db6d0ec..f89cd5ee1c7a 100644 --- a/include/sound/hda_verbs.h +++ b/include/sound/hda_verbs.h @@ -95,6 +95,7 @@ enum { #define AC_VERB_SET_EAPD_BTLENABLE 0x70c #define AC_VERB_SET_DIGI_CONVERT_1 0x70d #define AC_VERB_SET_DIGI_CONVERT_2 0x70e +#define AC_VERB_SET_DIGI_CONVERT_3 0x73e #define AC_VERB_SET_VOLUME_KNOB_CONTROL 0x70f #define AC_VERB_SET_GPIO_DATA 0x715 #define AC_VERB_SET_GPIO_MASK 0x716 diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 2b64fabd5faa..c19c81d230bd 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -906,6 +906,7 @@ static int hdmi_setup_stream(struct hda_codec *codec, hda_nid_t cvt_nid, hda_nid_t pin_nid, u32 stream_tag, int format) { struct hdmi_spec *spec = codec->spec; + unsigned int param; int err; err = spec->ops.pin_hbr_setup(codec, pin_nid, is_hbr_format(format)); @@ -915,6 +916,26 @@ static int hdmi_setup_stream(struct hda_codec *codec, hda_nid_t cvt_nid, return err; } + if (is_haswell_plus(codec)) { + + /* + * on recent platforms IEC Coding Type is required for HBR + * support, read current Digital Converter settings and set + * ICT bitfield if needed. + */ + param = snd_hda_codec_read(codec, cvt_nid, 0, + AC_VERB_GET_DIGI_CONVERT_1, 0); + + param = (param >> 16) & ~(AC_DIG3_ICT); + + /* on recent platforms ICT mode is required for HBR support */ + if (is_hbr_format(format)) + param |= 0x1; + + snd_hda_codec_write(codec, cvt_nid, 0, + AC_VERB_SET_DIGI_CONVERT_3, param); + } + snd_hda_codec_setup_stream(codec, cvt_nid, stream_tag, 0, format); return 0; } From 8afafa6fba7809c0785018b77c95b19e58b35b94 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 15 Sep 2017 15:38:21 +0530 Subject: [PATCH 0338/1322] powerpc/kprobes: Update optprobes to use emulate_update_regs() Optprobes depended on an updated regs->nip from analyse_instr() to identify the location to branch back from the optprobes trampoline. However, since commit 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify *regs"), analyse_instr() doesn't update the registers anymore. Due to this, we end up branching back from the optprobes trampoline to the same branch into the trampoline resulting in a loop. Fix this by calling out to emulate_update_regs() before using the nip. Additionally, explicitly compare the return value from analyse_instr() to 1, rather than just checking for !0 so as to guard against any future changes to analyse_instr() that may result in -1 being returned in more scenarios. Fixes: 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify *regs") Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/optprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 6f8273f5e988..91e037ab20a1 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -104,8 +104,10 @@ static unsigned long can_optimize(struct kprobe *p) * and that can be emulated. 
*/ if (!is_conditional_branch(*p->ainsn.insn) && - analyse_instr(&op, ®s, *p->ainsn.insn)) + analyse_instr(&op, ®s, *p->ainsn.insn) == 1) { + emulate_update_regs(®s, &op); nip = regs.nip; + } return nip; } From 91f4aa977947f046dc144fa9e3b06f0ffb53be79 Mon Sep 17 00:00:00 2001 From: Diogenes Pereira Date: Wed, 9 Aug 2017 13:19:24 -0300 Subject: [PATCH 0339/1322] mac802154: replace hardcoded value with macro Use IEEE802154_SCF_SECLEVEL_NONE macro defined at ieee802154.h file. Signed-off-by: Diogenes Pereira Signed-off-by: Stefan Schmidt --- net/mac802154/llsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 1e1c9b20bab7..edec2f9919d0 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -713,7 +713,8 @@ int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb) if (hlen < 0 || hdr.fc.type != IEEE802154_FC_TYPE_DATA) return -EINVAL; - if (!hdr.fc.security_enabled || hdr.sec.level == 0) { + if (!hdr.fc.security_enabled || + (hdr.sec.level == IEEE802154_SCF_SECLEVEL_NONE)) { skb_push(skb, hlen); return 0; } From 3e4962667efb0f6c09fa3111e6ee53838118b227 Mon Sep 17 00:00:00 2001 From: Diogenes Pereira Date: Tue, 5 Sep 2017 09:18:04 -0300 Subject: [PATCH 0340/1322] mac802154: Fix MAC header and payload encrypted According to 802.15.4-2003/2006/2015 specifications the MAC frame is composed of MHR, MAC payload and MFR and just the outgoing MAC payload must be encrypted. If communication is secure,sender build Auxiliary Security Header(ASH), insert it next to the standard MHR header with security enabled bit ON, and secure frames before transmitting them. According to the information carried within the ASH, recipient retrieves the right cryptographic key and correctly un-secure MAC frames. The error scenario occurs on Linux using IEEE802154_SCF_SECLEVEL_ENC(4) security level when llsec_do_encrypt_unauth() function builds theses MAC frames incorrectly. On recipients these MAC frames are discarded,logging "got invalid frame" messages. Signed-off-by: Diogenes Pereira Signed-off-by: Stefan Schmidt --- net/mac802154/llsec.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index edec2f9919d0..2fb703d70803 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -623,13 +623,18 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; struct scatterlist src; SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); - int err; + int err, datalen; + unsigned char *data; llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); - sg_init_one(&src, skb->data, skb->len); + /* Compute data payload offset and data length */ + data = skb_mac_header(skb) + skb->mac_len; + datalen = skb_tail_pointer(skb) - data; + sg_init_one(&src, data, datalen); + skcipher_request_set_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &src, &src, skb->len, iv); + skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return err; From d5dd29e4dafef4baad7bf529ad73cafeb13e1aa8 Mon Sep 17 00:00:00 2001 From: Josef Filzmaier Date: Thu, 14 Sep 2017 14:32:10 +0200 Subject: [PATCH 0341/1322] ieee802154: atusb: Driver for Busware HUL dongle Busware manufactured an USB dongle that is quite similar to the atben and rzusb USB dongles. that are already supported. 
This patch aims to support the Busware HUL dongle (called hulusb) alongside atusb and rzusb. hulusb is using the at86rf212 transceiver which is specifically designed to support the 700/800/900 MHz wave band. The source code is heavily inspired by the existing atusb and at86rf2xx drivers. Signed-off-by: Josef Filzmaier Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/atusb.c | 317 ++++++++++++++++++++++++++++----- drivers/net/ieee802154/atusb.h | 8 + 2 files changed, 285 insertions(+), 40 deletions(-) diff --git a/drivers/net/ieee802154/atusb.c b/drivers/net/ieee802154/atusb.c index ef688518ad77..115fa3f37a86 100644 --- a/drivers/net/ieee802154/atusb.c +++ b/drivers/net/ieee802154/atusb.c @@ -21,6 +21,9 @@ * * USB initialization is * Copyright (c) 2013 Alexander Aring + * + * Busware HUL support is + * Copyright (c) 2017 Josef Filzmaier */ #include @@ -42,9 +45,12 @@ #define ATUSB_ALLOC_DELAY_MS 100 /* delay after failed allocation */ #define ATUSB_TX_TIMEOUT_MS 200 /* on the air timeout */ +struct atusb_chip_data; + struct atusb { struct ieee802154_hw *hw; struct usb_device *usb_dev; + struct atusb_chip_data *data; int shutdown; /* non-zero if shutting down */ int err; /* set by first error */ @@ -65,6 +71,14 @@ struct atusb { unsigned char fw_hw_type; /* Firmware hardware type */ }; +struct atusb_chip_data { + u16 t_channel_switch; + int rssi_base_val; + + int (*set_channel)(struct ieee802154_hw*, u8, u8); + int (*set_txpower)(struct ieee802154_hw*, s32); +}; + /* ----- USB commands without data ----------------------------------------- */ /* To reduce the number of error checks in the code, we record the first error @@ -163,6 +177,18 @@ static int atusb_write_subreg(struct atusb *atusb, uint8_t reg, uint8_t mask, return ret; } +static int atusb_read_subreg(struct atusb *lp, + unsigned int addr, unsigned int mask, + unsigned int shift) +{ + int rc; + + rc = atusb_read_reg(lp, addr); + rc = (rc & mask) >> shift; + + return rc; +} + static int atusb_get_and_clear_error(struct atusb *atusb) { int err = atusb->err; @@ -379,18 +405,6 @@ static int atusb_xmit(struct ieee802154_hw *hw, struct sk_buff *skb) return ret; } -static int atusb_channel(struct ieee802154_hw *hw, u8 page, u8 channel) -{ - struct atusb *atusb = hw->priv; - int ret; - - ret = atusb_write_subreg(atusb, SR_CHANNEL, channel); - if (ret < 0) - return ret; - msleep(1); /* @@@ ugly synchronization */ - return 0; -} - static int atusb_ed(struct ieee802154_hw *hw, u8 *level) { BUG_ON(!level); @@ -474,6 +488,17 @@ static const s32 atusb_powers[ATUSB_MAX_TX_POWERS + 1] = { -900, -1200, -1700, }; +static int +atusb_txpower(struct ieee802154_hw *hw, s32 mbm) +{ + struct atusb *atusb = hw->priv; + + if (atusb->data) + return atusb->data->set_txpower(hw, mbm); + else + return -ENOTSUPP; +} + static int atusb_set_txpower(struct ieee802154_hw *hw, s32 mbm) { @@ -488,12 +513,43 @@ atusb_set_txpower(struct ieee802154_hw *hw, s32 mbm) return -EINVAL; } +static int +hulusb_set_txpower(struct ieee802154_hw *hw, s32 mbm) +{ + u32 i; + + for (i = 0; i < hw->phy->supported.tx_powers_size; i++) { + if (hw->phy->supported.tx_powers[i] == mbm) + return atusb_write_subreg(hw->priv, SR_TX_PWR_212, i); + } + + return -EINVAL; +} + #define ATUSB_MAX_ED_LEVELS 0xF static const s32 atusb_ed_levels[ATUSB_MAX_ED_LEVELS + 1] = { -9100, -8900, -8700, -8500, -8300, -8100, -7900, -7700, -7500, -7300, -7100, -6900, -6700, -6500, -6300, -6100, }; +#define AT86RF212_MAX_TX_POWERS 0x1F +static const s32 at86rf212_powers[AT86RF212_MAX_TX_POWERS + 1] = { + 
500, 400, 300, 200, 100, 0, -100, -200, -300, -400, -500, -600, -700, + -800, -900, -1000, -1100, -1200, -1300, -1400, -1500, -1600, -1700, + -1800, -1900, -2000, -2100, -2200, -2300, -2400, -2500, -2600, +}; + +#define AT86RF2XX_MAX_ED_LEVELS 0xF +static const s32 at86rf212_ed_levels_100[AT86RF2XX_MAX_ED_LEVELS + 1] = { + -10000, -9800, -9600, -9400, -9200, -9000, -8800, -8600, -8400, -8200, + -8000, -7800, -7600, -7400, -7200, -7000, +}; + +static const s32 at86rf212_ed_levels_98[AT86RF2XX_MAX_ED_LEVELS + 1] = { + -9800, -9600, -9400, -9200, -9000, -8800, -8600, -8400, -8200, -8000, + -7800, -7600, -7400, -7200, -7000, -6800, +}; + static int atusb_set_cca_mode(struct ieee802154_hw *hw, const struct wpan_phy_cca *cca) { @@ -527,6 +583,30 @@ atusb_set_cca_mode(struct ieee802154_hw *hw, const struct wpan_phy_cca *cca) return atusb_write_subreg(atusb, SR_CCA_MODE, val); } +static int hulusb_set_cca_ed_level(struct atusb *lp, int rssi_base_val) +{ + unsigned int cca_ed_thres; + + cca_ed_thres = atusb_read_subreg(lp, SR_CCA_ED_THRES); + + switch (rssi_base_val) { + case -98: + lp->hw->phy->supported.cca_ed_levels = at86rf212_ed_levels_98; + lp->hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(at86rf212_ed_levels_98); + lp->hw->phy->cca_ed_level = at86rf212_ed_levels_98[cca_ed_thres]; + break; + case -100: + lp->hw->phy->supported.cca_ed_levels = at86rf212_ed_levels_100; + lp->hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(at86rf212_ed_levels_100); + lp->hw->phy->cca_ed_level = at86rf212_ed_levels_100[cca_ed_thres]; + break; + default: + WARN_ON(1); + } + + return 0; +} + static int atusb_set_cca_ed_level(struct ieee802154_hw *hw, s32 mbm) { @@ -541,6 +621,92 @@ atusb_set_cca_ed_level(struct ieee802154_hw *hw, s32 mbm) return -EINVAL; } +static int atusb_channel(struct ieee802154_hw *hw, u8 page, u8 channel) +{ + struct atusb *atusb = hw->priv; + int ret = -ENOTSUPP; + + if (atusb->data) { + ret = atusb->data->set_channel(hw, page, channel); + /* @@@ ugly synchronization */ + msleep(atusb->data->t_channel_switch); + } + + return ret; +} + +static int atusb_set_channel(struct ieee802154_hw *hw, u8 page, u8 channel) +{ + struct atusb *atusb = hw->priv; + int ret; + + ret = atusb_write_subreg(atusb, SR_CHANNEL, channel); + if (ret < 0) + return ret; + return 0; +} + +static int hulusb_set_channel(struct ieee802154_hw *hw, u8 page, u8 channel) +{ + int rc; + int rssi_base_val; + + struct atusb *lp = hw->priv; + + if (channel == 0) + rc = atusb_write_subreg(lp, SR_SUB_MODE, 0); + else + rc = atusb_write_subreg(lp, SR_SUB_MODE, 1); + if (rc < 0) + return rc; + + if (page == 0) { + rc = atusb_write_subreg(lp, SR_BPSK_QPSK, 0); + rssi_base_val = -100; + } else { + rc = atusb_write_subreg(lp, SR_BPSK_QPSK, 1); + rssi_base_val = -98; + } + if (rc < 0) + return rc; + + rc = hulusb_set_cca_ed_level(lp, rssi_base_val); + if (rc < 0) + return rc; + + /* This sets the symbol_duration according frequency on the 212. + * TODO move this handling while set channel and page in cfg802154. + * We can do that, this timings are according 802.15.4 standard. + * If we do that in cfg802154, this is a more generic calculation. + * + * This should also protected from ifs_timer. Means cancel timer and + * init with a new value. For now, this is okay. 
+ */ + if (channel == 0) { + if (page == 0) { + /* SUB:0 and BPSK:0 -> BPSK-20 */ + lp->hw->phy->symbol_duration = 50; + } else { + /* SUB:1 and BPSK:0 -> BPSK-40 */ + lp->hw->phy->symbol_duration = 25; + } + } else { + if (page == 0) + /* SUB:0 and BPSK:1 -> OQPSK-100/200/400 */ + lp->hw->phy->symbol_duration = 40; + else + /* SUB:1 and BPSK:1 -> OQPSK-250/500/1000 */ + lp->hw->phy->symbol_duration = 16; + } + + lp->hw->phy->lifs_period = IEEE802154_LIFS_PERIOD * + lp->hw->phy->symbol_duration; + lp->hw->phy->sifs_period = IEEE802154_SIFS_PERIOD * + lp->hw->phy->symbol_duration; + + return atusb_write_subreg(lp, SR_CHANNEL, channel); +} + static int atusb_set_csma_params(struct ieee802154_hw *hw, u8 min_be, u8 max_be, u8 retries) { @@ -558,6 +724,14 @@ atusb_set_csma_params(struct ieee802154_hw *hw, u8 min_be, u8 max_be, u8 retries return atusb_write_subreg(atusb, SR_MAX_CSMA_RETRIES, retries); } +static int +hulusb_set_lbt(struct ieee802154_hw *hw, bool on) +{ + struct atusb *atusb = hw->priv; + + return atusb_write_subreg(atusb, SR_CSMA_LBT_MODE, on); +} + static int atusb_set_frame_retries(struct ieee802154_hw *hw, s8 retries) { @@ -593,6 +767,20 @@ atusb_set_promiscuous_mode(struct ieee802154_hw *hw, const bool on) return 0; } +struct atusb_chip_data atusb_chip_data = { + .t_channel_switch = 1, + .rssi_base_val = -91, + .set_txpower = atusb_set_txpower, + .set_channel = atusb_set_channel, +}; + +struct atusb_chip_data hulusb_chip_data = { + .t_channel_switch = 11, + .rssi_base_val = -100, + .set_txpower = hulusb_set_txpower, + .set_channel = hulusb_set_channel, +}; + static const struct ieee802154_ops atusb_ops = { .owner = THIS_MODULE, .xmit_async = atusb_xmit, @@ -601,7 +789,8 @@ static const struct ieee802154_ops atusb_ops = { .start = atusb_start, .stop = atusb_stop, .set_hw_addr_filt = atusb_set_hw_addr_filt, - .set_txpower = atusb_set_txpower, + .set_txpower = atusb_txpower, + .set_lbt = hulusb_set_lbt, .set_cca_mode = atusb_set_cca_mode, .set_cca_ed_level = atusb_set_cca_ed_level, .set_csma_params = atusb_set_csma_params, @@ -614,6 +803,7 @@ static const struct ieee802154_ops atusb_ops = { static int atusb_get_and_show_revision(struct atusb *atusb) { struct usb_device *usb_dev = atusb->usb_dev; + char *hw_name; unsigned char *buffer; int ret; @@ -630,9 +820,31 @@ static int atusb_get_and_show_revision(struct atusb *atusb) atusb->fw_ver_min = buffer[1]; atusb->fw_hw_type = buffer[2]; + switch (atusb->fw_hw_type) { + case ATUSB_HW_TYPE_100813: + case ATUSB_HW_TYPE_101216: + case ATUSB_HW_TYPE_110131: + hw_name = "ATUSB"; + atusb->data = &atusb_chip_data; + break; + case ATUSB_HW_TYPE_RZUSB: + hw_name = "RZUSB"; + atusb->data = &atusb_chip_data; + break; + case ATUSB_HW_TYPE_HULUSB: + hw_name = "HULUSB"; + atusb->data = &hulusb_chip_data; + break; + default: + hw_name = "UNKNOWN"; + atusb->err = -ENOTSUPP; + ret = -ENOTSUPP; + break; + } + dev_info(&usb_dev->dev, - "Firmware: major: %u, minor: %u, hardware type: %u\n", - atusb->fw_ver_maj, atusb->fw_ver_min, atusb->fw_hw_type); + "Firmware: major: %u, minor: %u, hardware type: %s (%d)\n", + atusb->fw_ver_maj, atusb->fw_ver_min, hw_name, atusb->fw_hw_type); } if (atusb->fw_ver_maj == 0 && atusb->fw_ver_min < 2) { dev_info(&usb_dev->dev, @@ -667,11 +879,12 @@ static int atusb_get_and_show_build(struct atusb *atusb) return ret; } -static int atusb_get_and_show_chip(struct atusb *atusb) +static int atusb_get_and_conf_chip(struct atusb *atusb) { struct usb_device *usb_dev = atusb->usb_dev; uint8_t man_id_0, man_id_1, part_num, 
version_num; const char *chip; + struct ieee802154_hw *hw = atusb->hw; man_id_0 = atusb_read_reg(atusb, RG_MAN_ID_0); man_id_1 = atusb_read_reg(atusb, RG_MAN_ID_1); @@ -681,6 +894,22 @@ static int atusb_get_and_show_chip(struct atusb *atusb) if (atusb->err) return atusb->err; + hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT | + IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_CSMA_PARAMS; + + hw->phy->flags = WPAN_PHY_FLAG_TXPOWER | WPAN_PHY_FLAG_CCA_ED_LEVEL | + WPAN_PHY_FLAG_CCA_MODE; + + hw->phy->supported.cca_modes = BIT(NL802154_CCA_ENERGY) | + BIT(NL802154_CCA_CARRIER) | + BIT(NL802154_CCA_ENERGY_CARRIER); + hw->phy->supported.cca_opts = BIT(NL802154_CCA_OPT_ENERGY_CARRIER_AND) | + BIT(NL802154_CCA_OPT_ENERGY_CARRIER_OR); + + hw->phy->cca.mode = NL802154_CCA_ENERGY; + + hw->phy->current_page = 0; + if ((man_id_1 << 8 | man_id_0) != ATUSB_JEDEC_ATMEL) { dev_err(&usb_dev->dev, "non-Atmel transceiver xxxx%02x%02x\n", @@ -691,9 +920,36 @@ static int atusb_get_and_show_chip(struct atusb *atusb) switch (part_num) { case 2: chip = "AT86RF230"; + atusb->hw->phy->supported.channels[0] = 0x7FFF800; + atusb->hw->phy->current_channel = 11; /* reset default */ + atusb->hw->phy->symbol_duration = 16; + atusb->hw->phy->supported.tx_powers = atusb_powers; + atusb->hw->phy->supported.tx_powers_size = ARRAY_SIZE(atusb_powers); + hw->phy->supported.cca_ed_levels = atusb_ed_levels; + hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(atusb_ed_levels); break; case 3: chip = "AT86RF231"; + atusb->hw->phy->supported.channels[0] = 0x7FFF800; + atusb->hw->phy->current_channel = 11; /* reset default */ + atusb->hw->phy->symbol_duration = 16; + atusb->hw->phy->supported.tx_powers = atusb_powers; + atusb->hw->phy->supported.tx_powers_size = ARRAY_SIZE(atusb_powers); + hw->phy->supported.cca_ed_levels = atusb_ed_levels; + hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(atusb_ed_levels); + break; + case 7: + chip = "AT86RF212"; + atusb->hw->flags |= IEEE802154_HW_LBT; + atusb->hw->phy->supported.channels[0] = 0x00007FF; + atusb->hw->phy->supported.channels[2] = 0x00007FF; + atusb->hw->phy->current_channel = 5; + atusb->hw->phy->symbol_duration = 25; + atusb->hw->phy->supported.lbt = NL802154_SUPPORTED_BOOL_BOTH; + atusb->hw->phy->supported.tx_powers = at86rf212_powers; + atusb->hw->phy->supported.tx_powers_size = ARRAY_SIZE(at86rf212_powers); + atusb->hw->phy->supported.cca_ed_levels = at86rf212_ed_levels_100; + atusb->hw->phy->supported.cca_ed_levels_size = ARRAY_SIZE(at86rf212_ed_levels_100); break; default: dev_err(&usb_dev->dev, @@ -702,6 +958,9 @@ static int atusb_get_and_show_chip(struct atusb *atusb) goto fail; } + hw->phy->transmit_power = hw->phy->supported.tx_powers[0]; + hw->phy->cca_ed_level = hw->phy->supported.cca_ed_levels[7]; + dev_info(&usb_dev->dev, "ATUSB: %s version %d\n", chip, version_num); return 0; @@ -794,32 +1053,9 @@ static int atusb_probe(struct usb_interface *interface, goto fail; hw->parent = &usb_dev->dev; - hw->flags = IEEE802154_HW_TX_OMIT_CKSUM | IEEE802154_HW_AFILT | - IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_CSMA_PARAMS; - - hw->phy->flags = WPAN_PHY_FLAG_TXPOWER | WPAN_PHY_FLAG_CCA_ED_LEVEL | - WPAN_PHY_FLAG_CCA_MODE; - - hw->phy->supported.cca_modes = BIT(NL802154_CCA_ENERGY) | - BIT(NL802154_CCA_CARRIER) | BIT(NL802154_CCA_ENERGY_CARRIER); - hw->phy->supported.cca_opts = BIT(NL802154_CCA_OPT_ENERGY_CARRIER_AND) | - BIT(NL802154_CCA_OPT_ENERGY_CARRIER_OR); - - hw->phy->supported.cca_ed_levels = atusb_ed_levels; - hw->phy->supported.cca_ed_levels_size = 
ARRAY_SIZE(atusb_ed_levels); - - hw->phy->cca.mode = NL802154_CCA_ENERGY; - - hw->phy->current_page = 0; - hw->phy->current_channel = 11; /* reset default */ - hw->phy->supported.channels[0] = 0x7FFF800; - hw->phy->supported.tx_powers = atusb_powers; - hw->phy->supported.tx_powers_size = ARRAY_SIZE(atusb_powers); - hw->phy->transmit_power = hw->phy->supported.tx_powers[0]; - hw->phy->cca_ed_level = hw->phy->supported.cca_ed_levels[7]; atusb_command(atusb, ATUSB_RF_RESET, 0); - atusb_get_and_show_chip(atusb); + atusb_get_and_conf_chip(atusb); atusb_get_and_show_revision(atusb); atusb_get_and_show_build(atusb); atusb_set_extended_addr(atusb); @@ -941,5 +1177,6 @@ MODULE_AUTHOR("Alexander Aring "); MODULE_AUTHOR("Richard Sharpe "); MODULE_AUTHOR("Stefan Schmidt "); MODULE_AUTHOR("Werner Almesberger "); +MODULE_AUTHOR("Josef Filzmaier "); MODULE_DESCRIPTION("ATUSB IEEE 802.15.4 Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ieee802154/atusb.h b/drivers/net/ieee802154/atusb.h index b22bbaa77590..555d14bf14a3 100644 --- a/drivers/net/ieee802154/atusb.h +++ b/drivers/net/ieee802154/atusb.h @@ -50,6 +50,14 @@ enum atusb_requests { ATUSB_EUI64_READ, }; +enum { + ATUSB_HW_TYPE_100813, /* 2010-08-13 */ + ATUSB_HW_TYPE_101216, /* 2010-12-16 */ + ATUSB_HW_TYPE_110131, /* 2011-01-31, ATmega32U2-based */ + ATUSB_HW_TYPE_RZUSB, /* Atmel Raven USB dongle with at86rf230 */ + ATUSB_HW_TYPE_HULUSB, /* Busware HUL USB dongle with at86rf212 */ +}; + /* * Direction bRequest wValue wIndex wLength * From 1b25fda0533462c9cee3a22e8a7bea68fa670af2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 19 Sep 2017 12:52:22 +0200 Subject: [PATCH 0342/1322] s390/topology: alternative topology for topology-less machines If running on machines that do not provide topology information we currently generate a "fake" topology which defines the maximum distance between each cpu: each cpu will be put into an own drawer. Historically this used to be the best option for (virtual) machines in overcommited hypervisors. For some workloads however it is better to generate a different topology where all cpus are siblings within a package (all cpus are core siblings). This shows performance improvements of up to 10%, depending on the workload. In order to keep the current behaviour, but also allow to switch to the different core sibling topology use the existing "topology=" kernel parameter: Specifying "topology=on" on machines without topology information will generate the core siblings (fake) topology information, instead of the default topology information where all cpus have the maximum distance. On machines which provide topology information specifying "topology=on" does not have any effect. 
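To make the two generated topologies concrete, here is a stand-alone illustration (not kernel code; the CPU count and the mask printing are invented for the example). With the historical behaviour, and with topology=off, every CPU has only itself as a core sibling; with topology=on on a machine that provides no topology information, all present CPUs become core siblings of each other, which is what lets the scheduler treat them as one package.

#include <stdio.h>

#define NR_CPUS	4

int main(void)
{
	unsigned int present = (1u << NR_CPUS) - 1;	/* cpus 0-3 present */
	int package_mode, cpu;

	for (package_mode = 0; package_mode <= 1; package_mode++) {
		printf("%s:\n", package_mode ?
		       "package mode (topology=on)" : "single mode (default/off)");
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			unsigned int core_siblings =
				package_mode ? present : (1u << cpu);
			printf("  cpu%d core sibling mask: 0x%x\n",
			       cpu, core_siblings);
		}
	}
	return 0;
}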
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/early.c | 12 ------- arch/s390/kernel/topology.c | 72 +++++++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 23 deletions(-) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index ca8cd80e8feb..60181caf8e8a 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -404,18 +404,6 @@ static inline void save_vector_registers(void) #endif } -static int __init topology_setup(char *str) -{ - bool enabled; - int rc; - - rc = kstrtobool(str, &enabled); - if (!rc && !enabled) - S390_lowcore.machine_flags &= ~MACHINE_FLAG_TOPOLOGY; - return rc; -} -early_param("topology", topology_setup); - static int __init disable_vector_extension(char *str) { S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index bb47c92476f0..a0ce9c83f589 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -29,12 +29,20 @@ #define PTF_VERTICAL (1UL) #define PTF_CHECK (2UL) +enum { + TOPOLOGY_MODE_HW, + TOPOLOGY_MODE_SINGLE, + TOPOLOGY_MODE_PACKAGE, + TOPOLOGY_MODE_UNINITIALIZED +}; + struct mask_info { struct mask_info *next; unsigned char id; cpumask_t mask; }; +static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; @@ -59,11 +67,26 @@ static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) cpumask_t mask; cpumask_copy(&mask, cpumask_of(cpu)); - if (!MACHINE_HAS_TOPOLOGY) - return mask; - for (; info; info = info->next) { - if (cpumask_test_cpu(cpu, &info->mask)) - return info->mask; + switch (topology_mode) { + case TOPOLOGY_MODE_HW: + while (info) { + if (cpumask_test_cpu(cpu, &info->mask)) { + mask = info->mask; + break; + } + info = info->next; + } + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpumask_of(cpu)); + break; + case TOPOLOGY_MODE_PACKAGE: + cpumask_copy(&mask, cpu_present_mask); + break; + default: + /* fallthrough */ + case TOPOLOGY_MODE_SINGLE: + cpumask_copy(&mask, cpumask_of(cpu)); + break; } return mask; } @@ -74,7 +97,7 @@ static cpumask_t cpu_thread_map(unsigned int cpu) int i; cpumask_copy(&mask, cpumask_of(cpu)); - if (!MACHINE_HAS_TOPOLOGY) + if (topology_mode != TOPOLOGY_MODE_HW) return mask; cpu -= cpu % (smp_cpu_mtid + 1); for (i = 0; i <= smp_cpu_mtid; i++) @@ -223,7 +246,7 @@ int topology_set_cpu_management(int fc) static void update_cpu_masks(void) { struct cpu_topology_s390 *topo; - int cpu; + int cpu, id; for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; @@ -231,12 +254,13 @@ static void update_cpu_masks(void) topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); topo->drawer_mask = cpu_group_map(&drawer_info, cpu); - if (!MACHINE_HAS_TOPOLOGY) { + if (topology_mode != TOPOLOGY_MODE_HW) { + id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 
0 : cpu; topo->thread_id = cpu; topo->core_id = cpu; - topo->socket_id = cpu; - topo->book_id = cpu; - topo->drawer_id = cpu; + topo->socket_id = id; + topo->book_id = id; + topo->drawer_id = id; if (cpu_present(cpu)) cpumask_set_cpu(cpu, &cpus_with_topology); } @@ -459,6 +483,12 @@ void __init topology_init_early(void) struct sysinfo_15_1_x *info; set_sched_topology(s390_topology); + if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { + if (MACHINE_HAS_TOPOLOGY) + topology_mode = TOPOLOGY_MODE_HW; + else + topology_mode = TOPOLOGY_MODE_SINGLE; + } if (!MACHINE_HAS_TOPOLOGY) goto out; tl_info = memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE); @@ -474,6 +504,26 @@ out: __arch_update_cpu_topology(); } +static inline int topology_get_mode(int enabled) +{ + if (!enabled) + return TOPOLOGY_MODE_SINGLE; + return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; +} + +static int __init topology_setup(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + return rc; + topology_mode = topology_get_mode(enabled); + return 0; +} +early_param("topology", topology_setup); + static int __init topology_init(void) { if (MACHINE_HAS_TOPOLOGY) From 51dce3867c6c63c7500332e5448c2ba76808d6b5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 14 Sep 2017 14:42:32 +0200 Subject: [PATCH 0343/1322] s390/topology: enable / disable topology dynamically Add a new sysctl file /proc/sys/s390/topology which displays if topology is on (1) or off (0) as specified by the "topology=" kernel parameter. This allows to change topology information during runtime and configuring it via /etc/sysctl.conf instead of using the kernel line parameter. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/topology.c | 76 ++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index a0ce9c83f589..ed0bdd220e1a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -207,10 +209,8 @@ static void topology_update_polarization_simple(void) { int cpu; - mutex_lock(&smp_cpu_state_mutex); for_each_possible_cpu(cpu) smp_cpu_set_polarization(cpu, POLARIZATION_HRZ); - mutex_unlock(&smp_cpu_state_mutex); } static int ptf(unsigned long fc) @@ -278,6 +278,7 @@ static int __arch_update_cpu_topology(void) struct sysinfo_15_1_x *info = tl_info; int rc = 0; + mutex_lock(&smp_cpu_state_mutex); cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; @@ -287,6 +288,7 @@ static int __arch_update_cpu_topology(void) update_cpu_masks(); if (!MACHINE_HAS_TOPOLOGY) topology_update_polarization_simple(); + mutex_unlock(&smp_cpu_state_mutex); return rc; } @@ -313,6 +315,11 @@ void topology_schedule_update(void) schedule_work(&topology_work); } +static void topology_flush_work(void) +{ + flush_work(&topology_work); +} + static void topology_timer_fn(unsigned long ignored) { if (ptf(PTF_CHECK)) @@ -511,6 +518,11 @@ static inline int topology_get_mode(int enabled) return MACHINE_HAS_TOPOLOGY ? 
TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; } +static inline int topology_is_enabled(void) +{ + return topology_mode != TOPOLOGY_MODE_SINGLE; +} + static int __init topology_setup(char *str) { bool enabled; @@ -524,12 +536,72 @@ static int __init topology_setup(char *str) } early_param("topology", topology_setup); +static int topology_ctl_handler(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int len; + int new_mode; + char buf[2]; + + if (!*lenp || *ppos) { + *lenp = 0; + return 0; + } + if (!write) { + strncpy(buf, topology_is_enabled() ? "1\n" : "0\n", + ARRAY_SIZE(buf)); + len = strnlen(buf, ARRAY_SIZE(buf)); + if (len > *lenp) + len = *lenp; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + goto out; + } + len = *lenp; + if (copy_from_user(buf, buffer, len > sizeof(buf) ? sizeof(buf) : len)) + return -EFAULT; + if (buf[0] != '0' && buf[0] != '1') + return -EINVAL; + mutex_lock(&smp_cpu_state_mutex); + new_mode = topology_get_mode(buf[0] == '1'); + if (topology_mode != new_mode) { + topology_mode = new_mode; + topology_schedule_update(); + } + mutex_unlock(&smp_cpu_state_mutex); + topology_flush_work(); +out: + *lenp = len; + *ppos += len; + return 0; +} + +static struct ctl_table topology_ctl_table[] = { + { + .procname = "topology", + .mode = 0644, + .proc_handler = topology_ctl_handler, + }, + { }, +}; + +static struct ctl_table topology_dir_table[] = { + { + .procname = "s390", + .maxlen = 0, + .mode = 0555, + .child = topology_ctl_table, + }, + { }, +}; + static int __init topology_init(void) { if (MACHINE_HAS_TOPOLOGY) set_topology_timer(); else topology_update_polarization_simple(); + register_sysctl_table(topology_dir_table); return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); } device_initcall(topology_init); From 97e133d54c1ca8948b191e5721a145a76c4db33d Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 19 Sep 2017 11:46:16 +0300 Subject: [PATCH 0344/1322] usb: gadget: core: fix ->udc_set_speed() logic Consider the following case: udc controller supports SuperSpeed. If we first load a HighSpeed gadget followed by a SuperSpeed gadget, the SuperSpeed gadget will be limited to HighSpeed as UDC core driver doesn't call ->udc_set_speed() in the second case. Call ->udc_set_speed() unconditionally to fix this issue. This will also fix the case for dwc3 controller driver when SuperSpeed gadget is loaded first and works in HighSpeed only as udc_set_speed() was never being called. 
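A stand-alone toy model makes the ordering problem easier to see (illustration only; the names and numeric values are invented and merely preserve the high speed < super speed ordering used by the real enum):

#include <stdio.h>

enum speed { HIGH_SPEED = 3, SUPER_SPEED = 5 };

static enum speed controller_max = SUPER_SPEED;	/* hardware capability, fixed */
static enum speed programmed = SUPER_SPEED;	/* last value written to the hardware */

static void udc_set_speed(enum speed s)
{
	programmed = s;
}

/* Old logic: only called when the gadget wants less than the hardware maximum. */
static void bind_gadget_old(enum speed driver_max)
{
	if (driver_max < controller_max)
		udc_set_speed(driver_max);
}

/* Fixed logic: always program the speed the newly bound gadget wants. */
static void bind_gadget_fixed(enum speed driver_max)
{
	udc_set_speed(driver_max);
}

int main(void)
{
	bind_gadget_old(HIGH_SPEED);	/* HS gadget loaded first */
	bind_gadget_old(SUPER_SPEED);	/* SS gadget loaded next: condition is false */
	printf("old logic: controller still programmed for speed %d\n", programmed);

	bind_gadget_fixed(HIGH_SPEED);
	bind_gadget_fixed(SUPER_SPEED);
	printf("fixed logic: controller programmed for speed %d\n", programmed);
	return 0;
}

The old comparison is against the controller's fixed maximum rather than the currently programmed speed, so a second gadget whose max_speed equals that maximum never triggers a reprogramming and inherits whatever the previous gadget left behind.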
Fixes: 6099eca796ae ("usb: gadget: core: introduce ->udc_set_speed() method") Cc: [v4.13+] Signed-off-by: Roger Quadros Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 75c51ca4ee0f..d41d07aae0ce 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1320,8 +1320,7 @@ static int udc_bind_to_driver(struct usb_udc *udc, struct usb_gadget_driver *dri udc->dev.driver = &driver->driver; udc->gadget->dev.driver = &driver->driver; - if (driver->max_speed < udc->gadget->max_speed) - usb_gadget_udc_set_speed(udc, driver->max_speed); + usb_gadget_udc_set_speed(udc, driver->max_speed); ret = driver->bind(udc->gadget, driver); if (ret) From 9ada8c582088d32bd5c071c17213bc6edf37443a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 13 Sep 2017 15:31:33 +0900 Subject: [PATCH 0345/1322] usb: gadget: function: printer: avoid spinlock recursion If usb_gadget_giveback_request() is called in usb_ep_queue(), this printer_write() is possible to cause spinlock recursion. So, this patch adds spin_unlock() before calls usb_ep_queue() to avoid it. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_printer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 8df244fc9d80..ea0da35a44e2 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -555,6 +555,7 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) size_t size; /* Amount of data in a TX request. */ size_t bytes_copied = 0; struct usb_request *req; + int value; DBG(dev, "printer_write trying to send %d bytes\n", (int)len); @@ -634,7 +635,11 @@ printer_write(struct file *fd, const char __user *buf, size_t len, loff_t *ptr) return -EAGAIN; } - if (usb_ep_queue(dev->in_ep, req, GFP_ATOMIC)) { + /* here, we unlock, and only unlock, to avoid deadlock. */ + spin_unlock(&dev->lock); + value = usb_ep_queue(dev->in_ep, req, GFP_ATOMIC); + spin_lock(&dev->lock); + if (value) { list_add(&req->list, &dev->tx_reqs); spin_unlock_irqrestore(&dev->lock, flags); mutex_unlock(&dev->lock_printer_io); From 641663a19f6002ecc5c5af517a5184d4164cc749 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Sep 2017 09:24:15 -0700 Subject: [PATCH 0346/1322] usb: gadget: udc: fix snps_udc_plat.c build errors Fix build errors that happen when CONFIG_EXTCON=m and CONFIG_USB_SNP_UDC_PLAT=y by preventing that combination in Kconfig. CONFIG_EXTCON can still be disabled or enabled for this driver since has stubs for the disabled case, but if CONFIG_EXTCON=m, USB_SNP_UDC_PLAT is restricted to m or n (cannot be builtin). 
drivers/built-in.o: In function `udc_plat_remove': snps_udc_plat.c:(.text+0x2c4060): undefined reference to `extcon_unregister_notifier' drivers/built-in.o: In function `udc_plat_probe': snps_udc_plat.c:(.text+0x2c438c): undefined reference to `extcon_get_edev_by_phandle' snps_udc_plat.c:(.text+0x2c43f2): undefined reference to `extcon_register_notifier' snps_udc_plat.c:(.text+0x2c4416): undefined reference to `extcon_get_state' snps_udc_plat.c:(.text+0x2c44f7): undefined reference to `extcon_unregister_notifier' Reported-by: kbuild test robot Signed-off-by: Randy Dunlap Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index 7cd5c969fcbe..1e9567091d86 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -273,6 +273,7 @@ config USB_SNP_CORE config USB_SNP_UDC_PLAT tristate "Synopsys USB 2.0 Device controller" depends on USB_GADGET && OF && HAS_DMA + depends on EXTCON || EXTCON=n select USB_GADGET_DUALSPEED select USB_SNP_CORE default ARCH_BCM_IPROC From 7661ca09b2ff98f48693f431bb01fed62830e433 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Sep 2017 16:14:31 +0200 Subject: [PATCH 0347/1322] usb: gadget: dummy: fix nonsensical comparisons gcc-8 points out two comparisons that are clearly bogus and almost certainly not what the author intended to write: drivers/usb/gadget/udc/dummy_hcd.c: In function 'set_link_state_by_speed': drivers/usb/gadget/udc/dummy_hcd.c:379:31: error: bitwise comparison always evaluates to false [-Werror=tautological-compare] USB_PORT_STAT_ENABLE) == 1 && ^~ drivers/usb/gadget/udc/dummy_hcd.c:381:25: error: bitwise comparison always evaluates to false [-Werror=tautological-compare] USB_SS_PORT_LS_U0) == 1 && ^~ I looked at the code for a bit and came up with a change that makes it look like what the author probably meant here. This makes it look reasonable to me and to gcc, shutting up the warning. It does of course change behavior as the two conditions are actually evaluated rather than being hardcoded to false, and I have made no attempt at verifying that the changed logic makes sense in the context of a USB HCD, so that part needs to be reviewed carefully. Fixes: 1cd8fd2887e1 ("usb: gadget: dummy_hcd: add SuperSpeed support") Cc: Tatyana Brokhman Cc: Felipe Balbi Acked-by: Alan Stern Signed-off-by: Arnd Bergmann Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index a030d7923d7d..b1e21b3be6e1 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -375,11 +375,10 @@ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); - if ((dum_hcd->port_status & - USB_PORT_STAT_ENABLE) == 1 && - (dum_hcd->port_status & - USB_SS_PORT_LS_U0) == 1 && - dum_hcd->rh_state != DUMMY_RH_SUSPENDED) + if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) && + (dum_hcd->port_status & + USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0 && + dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } else { From 20da2ec06bfad2d4dfd40d77d3831f5e56365d20 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 18 Sep 2017 15:29:49 +0300 Subject: [PATCH 0348/1322] qtnfmac: lock access to h/w in tx path Fix tx path regression. 
Lock should be held when queuing packets to h/w fifos in order to properly handle configurations with multiple enabled interfaces. Signed-off-by: Sergey Matyukevich Signed-off-by: Kalle Valo --- drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c | 9 ++++++++- .../net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c index 502e72b7cdcc..69131965a298 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c +++ b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c @@ -661,14 +661,18 @@ static int qtnf_pcie_data_tx(struct qtnf_bus *bus, struct sk_buff *skb) struct qtnf_pcie_bus_priv *priv = (void *)get_bus_priv(bus); dma_addr_t txbd_paddr, skb_paddr; struct qtnf_tx_bd *txbd; + unsigned long flags; int len, i; u32 info; int ret = 0; + spin_lock_irqsave(&priv->tx0_lock, flags); + if (!qtnf_tx_queue_ready(priv)) { if (skb->dev) netif_stop_queue(skb->dev); + spin_unlock_irqrestore(&priv->tx0_lock, flags); return NETDEV_TX_BUSY; } @@ -717,8 +721,10 @@ tx_done: dev_kfree_skb_any(skb); } - qtnf_pcie_data_tx_reclaim(priv); priv->tx_done_count++; + spin_unlock_irqrestore(&priv->tx0_lock, flags); + + qtnf_pcie_data_tx_reclaim(priv); return NETDEV_TX_OK; } @@ -1247,6 +1253,7 @@ static int qtnf_pcie_probe(struct pci_dev *pdev, const struct pci_device_id *id) strcpy(bus->fwname, QTN_PCI_PEARL_FW_NAME); init_completion(&bus->request_firmware_complete); mutex_init(&bus->bus_lock); + spin_lock_init(&pcie_priv->tx0_lock); spin_lock_init(&pcie_priv->irq_lock); spin_lock_init(&pcie_priv->tx_reclaim_lock); diff --git a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h index e76a23716ee0..86ac1ccedb52 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h +++ b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie_bus_priv.h @@ -34,6 +34,8 @@ struct qtnf_pcie_bus_priv { /* lock for tx reclaim operations */ spinlock_t tx_reclaim_lock; + /* lock for tx0 operations */ + spinlock_t tx0_lock; u8 msi_enabled; int mps; From a715b3a0efe76d36c3ef96a93894a13db9d3a72f Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 18 Sep 2017 15:29:50 +0300 Subject: [PATCH 0349/1322] qtnfmac: cancel scans on wireless interface changes Cancel active scans and deactivate firmware scan watchdog timer when wireless interface configuration is changed. The usecases include wireless interface mode change, interface down, AP stop, virtual interface removal. 
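The shape of the fix, stripped of driver details, is a single completion helper that every reconfiguration path funnels through, and which is also the only place that disarms the scan watchdog. The sketch below is a toy model, not code from the driver; the structure and function names are invented.

#include <stdbool.h>
#include <stdio.h>

struct mac_state {
	bool scan_in_progress;
	bool watchdog_armed;
};

static void scan_done(struct mac_state *mac, bool aborted)
{
	/* Stop the watchdog first so it cannot fire after completion. */
	if (mac->watchdog_armed)
		mac->watchdog_armed = false;

	if (mac->scan_in_progress) {
		mac->scan_in_progress = false;
		printf("scan finished, aborted=%d\n", aborted);
	}
}

/* Every interface reconfiguration aborts a pending scan the same way. */
static void stop_ap(struct mac_state *mac)           { scan_done(mac, true); }
static void del_virtual_intf(struct mac_state *mac)  { scan_done(mac, true); }

int main(void)
{
	struct mac_state mac = { .scan_in_progress = true, .watchdog_armed = true };

	stop_ap(&mac);		/* scan aborted, watchdog disarmed */
	del_virtual_intf(&mac);	/* later calls are harmless no-ops */
	return 0;
}

Centralising both the completion report and the timer teardown in one helper is what makes it safe to call from interface removal, AP stop and mode changes without double-completing the scan.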
Signed-off-by: Sergey Matyukevich Signed-off-by: Kalle Valo --- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 9 ++++++--- drivers/net/wireless/quantenna/qtnfmac/cfg80211.h | 3 +++ drivers/net/wireless/quantenna/qtnfmac/event.c | 2 -- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 856fa6e8327e..a450bc6bc774 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -115,6 +115,8 @@ int qtnf_del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev) vif = qtnf_netdev_get_priv(wdev->netdev); + qtnf_scan_done(vif->mac, true); + if (qtnf_cmd_send_del_intf(vif)) pr_err("VIF%u.%u: failed to delete VIF\n", vif->mac->macid, vif->vifid); @@ -335,6 +337,8 @@ static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev) struct qtnf_vif *vif = qtnf_netdev_get_priv(dev); int ret; + qtnf_scan_done(vif->mac, true); + ret = qtnf_cmd_send_stop_ap(vif); if (ret) { pr_err("VIF%u.%u: failed to stop AP operation in FW\n", @@ -570,8 +574,6 @@ qtnf_del_station(struct wiphy *wiphy, struct net_device *dev, !qtnf_sta_list_lookup(&vif->sta_list, params->mac)) return 0; - qtnf_scan_done(vif->mac, true); - ret = qtnf_cmd_send_del_sta(vif, params); if (ret) pr_err("VIF%u.%u: failed to delete STA %pM\n", @@ -1134,8 +1136,9 @@ void qtnf_virtual_intf_cleanup(struct net_device *ndev) } vif->sta_state = QTNF_STA_DISCONNECTED; - qtnf_scan_done(mac, true); } + + qtnf_scan_done(mac, true); } void qtnf_cfg80211_vif_reset(struct qtnf_vif *vif) diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h index 6a4af52522b8..66db26613b1f 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.h @@ -34,6 +34,9 @@ static inline void qtnf_scan_done(struct qtnf_wmac *mac, bool aborted) .aborted = aborted, }; + if (timer_pending(&mac->scan_timeout)) + del_timer_sync(&mac->scan_timeout); + mutex_lock(&mac->mac_lock); if (mac->scan_req) { diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c index 0fc2814eafad..43d2e7fd6e02 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/event.c +++ b/drivers/net/wireless/quantenna/qtnfmac/event.c @@ -345,8 +345,6 @@ qtnf_event_handle_scan_complete(struct qtnf_wmac *mac, return -EINVAL; } - if (timer_pending(&mac->scan_timeout)) - del_timer_sync(&mac->scan_timeout); qtnf_scan_done(mac, le32_to_cpu(status->flags) & QLINK_SCAN_ABORTED); return 0; From 9e09007486c883de4aa78a384e1d54c3fe6e42d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 18:12:36 +0900 Subject: [PATCH 0350/1322] kbuild: rpm-pkg: delete firmware_install to fix build error Commit 5620a0d1aacd ("firmware: delete in-kernel firmware") deleted in-kernel firmware support, including "make firmware_install". Since then, "make rpm-pkg" / "make binrpm-pkg" fails to build with the error: make[2]: *** No rule to make target `firmware_install'. Stop. Commit df85b2d767aa ("firmware: Restore support for built-in firmware") restored the build infrastructure for CONFIG_EXTRA_FIRMWARE, but this is out of the scope of "make firmware_install". So, the right thing to do is to kill the use of "make firmware_install". 
Fixes: 5620a0d1aacd ("firmware: delete in-kernel firmware") Signed-off-by: Masahiro Yamada Acked-by: Greg Kroah-Hartman --- scripts/package/mkspec | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index bb43f153fd8e..bf3e9abb2846 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -88,11 +88,8 @@ echo 'mkdir -p $RPM_BUILD_ROOT/boot/efi $RPM_BUILD_ROOT/lib/modules' echo "%else" echo 'mkdir -p $RPM_BUILD_ROOT/boot $RPM_BUILD_ROOT/lib/modules' echo "%endif" -echo 'mkdir -p $RPM_BUILD_ROOT'"/lib/firmware/$KERNELRELEASE" -echo 'INSTALL_MOD_PATH=$RPM_BUILD_ROOT make %{?_smp_mflags} KBUILD_SRC= mod-fw= modules_install' -echo 'INSTALL_FW_PATH=$RPM_BUILD_ROOT'"/lib/firmware/$KERNELRELEASE" -echo 'make INSTALL_FW_PATH=$INSTALL_FW_PATH' firmware_install +echo 'INSTALL_MOD_PATH=$RPM_BUILD_ROOT make %{?_smp_mflags} KBUILD_SRC= modules_install' echo "%ifarch ia64" echo 'cp $KBUILD_IMAGE $RPM_BUILD_ROOT'"/boot/efi/vmlinuz-$KERNELRELEASE" echo 'ln -s '"efi/vmlinuz-$KERNELRELEASE" '$RPM_BUILD_ROOT'"/boot/" @@ -119,7 +116,7 @@ if ! $PREBUILT; then echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/build" echo 'rm -f $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE/source" echo "mkdir -p "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE" -echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* --exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation --exclude=firmware --exclude .config.old --exclude .missing-syscalls.d\"" +echo "EXCLUDES=\"$RCS_TAR_IGNORE --exclude .tmp_versions --exclude=*vmlinux* --exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation --exclude .config.old --exclude .missing-syscalls.d\"" echo "tar "'$EXCLUDES'" -cf- . | (cd "'$RPM_BUILD_ROOT'"/usr/src/kernels/$KERNELRELEASE;tar xvf -)" echo 'cd $RPM_BUILD_ROOT'"/lib/modules/$KERNELRELEASE" echo "ln -sf /usr/src/kernels/$KERNELRELEASE build" @@ -154,7 +151,6 @@ echo '%defattr (-, root, root)' echo "/lib/modules/$KERNELRELEASE" echo "%exclude /lib/modules/$KERNELRELEASE/build" echo "%exclude /lib/modules/$KERNELRELEASE/source" -echo "/lib/firmware/$KERNELRELEASE" echo "/boot/*" echo "" echo "%files headers" From cc18abbe449aafc013831a8e0440afc336ae1cba Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Sep 2017 18:22:19 +0900 Subject: [PATCH 0351/1322] kbuild: deb-pkg: remove firmware package support Commit 5620a0d1aacd ("firmware: delete in-kernel firmware") deleted in-kernel firmware support, including the firmware install command. So, the firmware package does not make sense any more. Remove it. 
Signed-off-by: Masahiro Yamada Reviewed-by: Riku Voipio Acked-by: Greg Kroah-Hartman --- scripts/package/builddeb | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index aad67000e4dd..0bc87473f68f 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -92,12 +92,10 @@ else fi sourcename=$KDEB_SOURCENAME tmpdir="$objtree/debian/tmp" -fwdir="$objtree/debian/fwtmp" kernel_headers_dir="$objtree/debian/hdrtmp" libc_headers_dir="$objtree/debian/headertmp" dbg_dir="$objtree/debian/dbgtmp" packagename=linux-image-$version -fwpackagename=linux-firmware-image-$version kernel_headers_packagename=linux-headers-$version libc_headers_packagename=linux-libc-dev dbg_packagename=$packagename-dbg @@ -126,10 +124,9 @@ esac BUILD_DEBUG="$(grep -s '^CONFIG_DEBUG_INFO=y' $KCONFIG_CONFIG || true)" # Setup the directory structure -rm -rf "$tmpdir" "$fwdir" "$kernel_headers_dir" "$libc_headers_dir" "$dbg_dir" $objtree/debian/files +rm -rf "$tmpdir" "$kernel_headers_dir" "$libc_headers_dir" "$dbg_dir" $objtree/debian/files mkdir -m 755 -p "$tmpdir/DEBIAN" mkdir -p "$tmpdir/lib" "$tmpdir/boot" -mkdir -p "$fwdir/lib/firmware/$version/" mkdir -p "$kernel_headers_dir/lib/modules/$version/" # Build and install the kernel @@ -306,7 +303,6 @@ else cat <> debian/control Package: $packagename -Suggests: $fwpackagename Architecture: any Description: Linux kernel, version $version This package contains the Linux kernel, modules and corresponding other @@ -345,22 +341,6 @@ Description: Linux kernel headers for $KERNELRELEASE on \${kernel:debarch} This is useful for people who need to build external modules EOF -# Do we have firmware? Move it out of the way and build it into a package. -if [ -e "$tmpdir/lib/firmware" ]; then - mv "$tmpdir/lib/firmware"/* "$fwdir/lib/firmware/$version/" - rmdir "$tmpdir/lib/firmware" - - cat <> debian/control - -Package: $fwpackagename -Architecture: all -Description: Linux kernel firmware, version $version - This package contains firmware from the Linux kernel, version $version. -EOF - - create_package "$fwpackagename" "$fwdir" -fi - cat <> debian/control Package: $libc_headers_packagename From 25b080bd53f2873785fa049f570ac1b361c11d72 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 20 Sep 2017 22:01:26 +0900 Subject: [PATCH 0352/1322] kbuild: rpm-pkg: fix version number handling The "Release:" field of the spec file is determined based on the .version file. However, the .version file is not copied to the source tar file. So, when we build the kernel from the source package, the UTS_VERSION always indicates #1. This does not match with "rpm -q". The kernel UTS_VERSION and "rpm -q" do not agree for binrpm-pkg, either. Please note the kernel has already been built before the spec file is created. Currently, mkspec invokes mkversion. This script returns an incremented version. So, the "Release:" field of the spec file is greater than the version in the kernel by one. For the source package build (where .version file is missing), we can give KBUILD_BUILD_VERSION=%{release} to the build command. For the binary package build, we can simply read out the .version file because it contains the version number that was used for building the kernel image. We can remove scripts/mkversion because scripts/package/Makefile need not touch the .version file. 
Signed-off-by: Masahiro Yamada --- scripts/mkversion | 6 ------ scripts/package/Makefile | 5 ----- scripts/package/mkspec | 6 ++---- 3 files changed, 2 insertions(+), 15 deletions(-) delete mode 100644 scripts/mkversion diff --git a/scripts/mkversion b/scripts/mkversion deleted file mode 100644 index c12addc9c7ef..000000000000 --- a/scripts/mkversion +++ /dev/null @@ -1,6 +0,0 @@ -if [ ! -f .version ] -then - echo 1 -else - expr 0`cat .version` + 1 -fi diff --git a/scripts/package/Makefile b/scripts/package/Makefile index 71b4a8af9d4d..73f9f3192b9f 100644 --- a/scripts/package/Makefile +++ b/scripts/package/Makefile @@ -50,8 +50,6 @@ rpm-pkg rpm: FORCE $(MAKE) clean $(CONFIG_SHELL) $(MKSPEC) >$(objtree)/kernel.spec $(call cmd,src_tar,$(KERNELPATH),kernel.spec) - $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version - mv -f $(objtree)/.tmp_version $(objtree)/.version rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz rm $(KERNELPATH).tar.gz kernel.spec @@ -60,9 +58,6 @@ rpm-pkg rpm: FORCE binrpm-pkg: FORCE $(MAKE) KBUILD_SRC= $(CONFIG_SHELL) $(MKSPEC) prebuilt > $(objtree)/binkernel.spec - $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version - mv -f $(objtree)/.tmp_version $(objtree)/.version - rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \ $(UTS_MACHINE) -bb $(objtree)/binkernel.spec rm binkernel.spec diff --git a/scripts/package/mkspec b/scripts/package/mkspec index bf3e9abb2846..f47f17aae135 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -27,9 +27,7 @@ __KERNELRELEASE=`echo $KERNELRELEASE | sed -e "s/-/_/g"` echo "Name: kernel" echo "Summary: The Linux Kernel" echo "Version: $__KERNELRELEASE" -# we need to determine the NEXT version number so that uname and -# rpm -q will agree -echo "Release: `. $srctree/scripts/mkversion`" +echo "Release: $(cat .version 2>/dev/null || echo 1)" echo "License: GPL" echo "Group: System Environment/Kernel" echo "Vendor: The Linux Community" @@ -77,7 +75,7 @@ fi echo "%build" if ! $PREBUILT; then -echo "make clean && make %{?_smp_mflags}" +echo "make clean && make %{?_smp_mflags} KBUILD_BUILD_VERSION=%{release}" echo "" fi From 35f3c984548524be5eb4c3f5295cf58d909e8746 Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Mon, 18 Sep 2017 17:18:44 -0700 Subject: [PATCH 0353/1322] scripts/dtc: dtx_diff - 2nd update of include dts paths to match build Update dtx_diff include paths in the same manner as: commit b12869a8d519 ("of: remove drivers/of/testcase-data from include search path for CPP"), commit 5ffa2aed389c ("of: remove arch/$(SRCARCH)/boot/dts from include search path for CPP"), and commit 50f9ddaf64e1 ("of: search scripts/dtc/include-prefixes path for both CPP and DTC"). Remove proposed include path kernel/dts/, which was never implemented for the dtb build. For the diff case, each source file is compiled separately. For each of those compiles, provide the location of the source file as an include path, not the location of both source files. 
Signed-off-by: Frank Rowand Signed-off-by: Rob Herring --- scripts/dtc/dtx_diff | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/scripts/dtc/dtx_diff b/scripts/dtc/dtx_diff index f9a3d8d23c64..8c4fbad2055e 100755 --- a/scripts/dtc/dtx_diff +++ b/scripts/dtc/dtx_diff @@ -86,6 +86,7 @@ eod compile_to_dts() { dtx="$1" + dtc_include="$2" if [ -d "${dtx}" ] ; then @@ -113,7 +114,7 @@ compile_to_dts() { # ----- input is DTS (source) if ( cpp ${cpp_flags} -x assembler-with-cpp ${dtx} \ - | ${DTC} -I dts ) ; then + | ${DTC} ${dtc_include} -I dts ) ; then return fi @@ -320,18 +321,13 @@ fi cpp_flags="\ -nostdinc \ - -I${srctree}/arch/${ARCH}/boot/dts \ -I${srctree}/scripts/dtc/include-prefixes \ - -I${srctree}/drivers/of/testcase-data \ -undef -D__DTS__" -dtc_flags="\ - -i ${srctree}/arch/${ARCH}/boot/dts/ \ - -i ${srctree}/kernel/dts \ - ${dtx_path_1_dtc_include} \ - ${dtx_path_2_dtc_include}" - -DTC="${DTC} ${dtc_flags} -O dts -qq -f ${dtc_sort} -o -" +DTC="\ + ${DTC} \ + -i ${srctree}/scripts/dtc/include-prefixes \ + -O dts -qq -f ${dtc_sort} -o -" # ----- do the diff or decompile @@ -339,11 +335,11 @@ DTC="${DTC} ${dtc_flags} -O dts -qq -f ${dtc_sort} -o -" if (( ${cmd_diff} )) ; then diff ${diff_flags} --label "${dtx_file_1}" --label "${dtx_file_2}" \ - <(compile_to_dts "${dtx_file_1}") \ - <(compile_to_dts "${dtx_file_2}") + <(compile_to_dts "${dtx_file_1}" "${dtx_path_1_dtc_include}") \ + <(compile_to_dts "${dtx_file_2}" "${dtx_path_2_dtc_include}") else - compile_to_dts "${dtx_file_1}" + compile_to_dts "${dtx_file_1}" "${dtx_path_1_dtc_include}" fi From 749aaf3372b8b56b8743c3e27700d64c8bd06921 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Wed, 20 Sep 2017 13:56:06 -0500 Subject: [PATCH 0354/1322] PCI: endpoint: Use correct "end of test" interrupt pci_epf_test_raise_irq() reads the interrupt to use for the response from reg->command, but this has been cleared at the beginning of the command handler so the value is always zero at this point. Instead, extract the interrupt index before handling the command and then pass the requested interrupt into pci_epf_test_raise_irq(). This allows us to remove the specific code to extract the interrupt for COMMAND_RAISE_MSI_IRQ since it is now handled in common code. 
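The ordering problem is easy to reproduce outside the kernel. The standalone toy below shows why reading the doorbell register after it has been cleared always yields interrupt 0, and why latching it first preserves the requested interrupt; the register layout, mask and shift values here are illustrative, not the endpoint driver's:

    #include <stdio.h>
    #include <stdint.h>

    #define MSI_NUMBER_MASK  0x00ff
    #define MSI_NUMBER_SHIFT 0

    struct test_reg { uint32_t command; uint32_t status; };

    int main(void)
    {
            struct test_reg reg = { .command = 5 };  /* host requested MSI #5 */

            uint32_t latched = reg.command;          /* latch before clearing */
            reg.command = 0;                         /* handler clears the doorbell */
            reg.status = 0;

            uint32_t after_clear  = (reg.command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT;
            uint32_t before_clear = (latched & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT;

            printf("read after clear: %u, latched before clear: %u\n",
                   after_clear, before_clear);       /* prints 0 vs 5 */
            return 0;
    }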
Fixes: 3ecf3232c54c ("PCI: endpoint: Do not reset *command* inadvertently") Signed-off-by: John Keeping Signed-off-by: Bjorn Helgaas Acked-by: Kishon Vijay Abraham I --- drivers/pci/endpoint/functions/pci-epf-test.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index 4ddc6e8f9fe7..f9308c2f22e6 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -251,9 +251,8 @@ err: return ret; } -static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test) +static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test, u8 irq) { - u8 irq; u8 msi_count; struct pci_epf *epf = epf_test->epf; struct pci_epc *epc = epf->epc; @@ -262,7 +261,6 @@ static void pci_epf_test_raise_irq(struct pci_epf_test *epf_test) reg->status |= STATUS_IRQ_RAISED; msi_count = pci_epc_get_msi(epc); - irq = (reg->command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT; if (irq > msi_count || msi_count <= 0) pci_epc_raise_irq(epc, PCI_EPC_IRQ_LEGACY, 0); else @@ -289,6 +287,8 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->command = 0; reg->status = 0; + irq = (command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT; + if (command & COMMAND_RAISE_LEGACY_IRQ) { reg->status = STATUS_IRQ_RAISED; pci_epc_raise_irq(epc, PCI_EPC_IRQ_LEGACY, 0); @@ -301,7 +301,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->status |= STATUS_WRITE_FAIL; else reg->status |= STATUS_WRITE_SUCCESS; - pci_epf_test_raise_irq(epf_test); + pci_epf_test_raise_irq(epf_test, irq); goto reset_handler; } @@ -311,7 +311,7 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->status |= STATUS_READ_SUCCESS; else reg->status |= STATUS_READ_FAIL; - pci_epf_test_raise_irq(epf_test); + pci_epf_test_raise_irq(epf_test, irq); goto reset_handler; } @@ -321,13 +321,12 @@ static void pci_epf_test_cmd_handler(struct work_struct *work) reg->status |= STATUS_COPY_SUCCESS; else reg->status |= STATUS_COPY_FAIL; - pci_epf_test_raise_irq(epf_test); + pci_epf_test_raise_irq(epf_test, irq); goto reset_handler; } if (command & COMMAND_RAISE_MSI_IRQ) { msi_count = pci_epc_get_msi(epc); - irq = (command & MSI_NUMBER_MASK) >> MSI_NUMBER_SHIFT; if (irq > msi_count || msi_count <= 0) goto reset_handler; reg->status = STATUS_IRQ_RAISED; From 0dcd020b7abbff238f188d4db8f02389dc849553 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 20 Sep 2017 09:21:40 +0800 Subject: [PATCH 0355/1322] ALSA: usb-audio: Add sample rate quirk for Plantronics C310/C520-M Like other Plantronics devices, C310 and C520-M do not support sample rate reading. Add them to the sample rate quirk accordingly. 
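For context, the quirk works by packing the vendor and product IDs into a single 32-bit key and matching it in a switch, so adding a device is a one-line case entry. A small standalone sketch of that matching scheme follows; the packing mirrors the usual USB_ID(vendor, product) convention of vendor in the high 16 bits, but treat the macro and function here as illustrative rather than the sound driver's exact definitions:

    #include <stdio.h>
    #include <stdint.h>

    #define USB_ID(vendor, product) (((uint32_t)(vendor) << 16) | (product))

    static int needs_sample_rate_quirk(uint32_t id)
    {
            switch (id) {
            case USB_ID(0x047F, 0xC022): /* Plantronics C310 */
            case USB_ID(0x047F, 0xC036): /* Plantronics C520-M */
                    return 1;
            default:
                    return 0;
            }
    }

    int main(void)
    {
            printf("%d\n", needs_sample_rate_quirk(USB_ID(0x047F, 0xC022))); /* 1 */
            printf("%d\n", needs_sample_rate_quirk(USB_ID(0x047F, 0x0001))); /* 0 */
            return 0;
    }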
BugLink: https://bugs.launchpad.net/bugs/1708499 BugLink: https://bugs.launchpad.net/bugs/1709282 Signed-off-by: Kai-Heng Feng Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 913552078285..b8cb57aeec77 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1137,6 +1137,8 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0x02F7): /* Plantronics BT-600 */ case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ + case USB_ID(0x047F, 0xC022): /* Plantronics C310 */ + case USB_ID(0x047F, 0xC036): /* Plantronics C520-M */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ From d4e1b299ec2853dd3d90b71ae86fa2626e60dc25 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 20 Sep 2017 12:18:17 -0400 Subject: [PATCH 0356/1322] ipv6: Use ipv6_authlen for len in ipv6_skip_exthdr In ipv6_skip_exthdr, the lengh of AH header is computed manually as (hp->hdrlen+2)<<2. However, in include/linux/ipv6.h, a macro named ipv6_authlen is already defined for exactly the same job. This commit replaces the manual computation code with the macro. Signed-off-by: Xiang Gao Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 305e2ed730bf..115d60919f72 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -99,7 +99,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hp->hdrlen+2)<<2; + hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); From 07850a4f74ea0433fe26dccb1ab9556c738a6584 Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Wed, 20 Sep 2017 08:12:23 +0800 Subject: [PATCH 0357/1322] macvlan: code refine to check data before using This patch checks data first at one place, return if it's null. Signed-off-by: Zhang Shengju Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d2aea961e0f4..1ffe77e95d46 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1231,11 +1231,14 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], return -EADDRNOTAVAIL; } - if (data && data[IFLA_MACVLAN_FLAGS] && + if (!data) + return 0; + + if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC) return -EINVAL; - if (data && data[IFLA_MACVLAN_MODE]) { + if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: @@ -1248,7 +1251,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], } } - if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: @@ -1260,7 +1263,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], } } - if (data && data[IFLA_MACVLAN_MACADDR]) { + if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; @@ -1268,7 +1271,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], return -EADDRNOTAVAIL; } - if (data && data[IFLA_MACVLAN_MACADDR_COUNT]) + if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; From 008ba2a13f2d04c947adc536d19debb8fe66f110 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 14 Sep 2017 17:14:41 -0400 Subject: [PATCH 0358/1322] packet: hold bind lock when rebinding to fanout hook Packet socket bind operations must hold the po->bind_lock. This keeps po->running consistent with whether the socket is actually on a ptype list to receive packets. fanout_add unbinds a socket and its packet_rcv/tpacket_rcv call, then binds the fanout object to receive through packet_rcv_fanout. Make it hold the po->bind_lock when testing po->running and rebinding. Else, it can race with other rebind operations, such as that in packet_set_ring from packet_rcv to tpacket_rcv. Concurrent updates can result in a socket being added to a fanout group twice, causing use-after-free KASAN bug reports, among others. Reported independently by both trinity and syzkaller. Verified that the syzkaller reproducer passes after this patch. Fixes: dc99f600698d ("packet: Add fanout support.") Reported-by: nixioaming Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c26172995511..d288f52c53f7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1684,10 +1684,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1749,7 +1745,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); From 00ba4cb36da682c68dc87d1703a8aaffe2b4e9c5 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Sat, 16 Sep 2017 16:18:33 +0200 Subject: [PATCH 0359/1322] bridge: also trigger RTM_NEWLINK when interface is released from bridge Currently, when an interface is released from a bridge via ioctl(), we get a RTM_DELLINK event through netlink: Deleted 2: dummy0: mtu 1500 master bridge0 state UNKNOWN link/ether 6e:23:c2:54:3a:b3 Userspace has to interpret that as a removal from the bridge, not as a complete removal of the interface. When an bridged interface is completely removed, we get two events: Deleted 2: dummy0: mtu 1500 master bridge0 state DOWN link/ether 6e:23:c2:54:3a:b3 Deleted 2: dummy0: mtu 1500 qdisc noop state DOWN group default link/ether 6e:23:c2:54:3a:b3 brd ff:ff:ff:ff:ff:ff In constrast, when an interface is released from a bond, we get a RTM_NEWLINK with only the new characteristics (no master): 3: dummy1: mtu 1500 qdisc noqueue master bond0 state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 4: bond0: mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: mtu 1500 qdisc noqueue state DOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: mtu 1500 qdisc noqueue state DOWN group default link/ether ca:c8:7b:66:f8:25 brd ff:ff:ff:ff:ff:ff 4: bond0: mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff Userland may be confused by the fact we say a link is deleted while its characteristics are only modified. A first solution would have been to turn the RTM_DELLINK event in del_nbp() into a RTM_NEWLINK event. However, maybe some piece of userland is relying on this RTM_DELLINK to detect when a bridged interface is released. Instead, we also emit a RTM_NEWLINK event once the interface is released (without master info). Deleted 2: dummy0: mtu 1500 master bridge0 state UNKNOWN link/ether 8a:bb:e7:94:b1:f8 2: dummy0: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 8a:bb:e7:94:b1:f8 brd ff:ff:ff:ff:ff:ff This is done only when using ioctl(). When using Netlink, such an event is already automatically emitted in do_setlink(). Signed-off-by: Vincent Bernat Signed-off-by: David S. 
Miller --- net/bridge/br_ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..3148cb3a8e82 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -99,8 +99,10 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) if (isadd) ret = br_add_if(br, dev); - else + else { ret = br_del_if(br, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + } return ret; } From ec9dd352d591f0c90402ec67a317c1ed4fb2e638 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: [PATCH 0360/1322] bpf: one perf event close won't free bpf program attached by another perf event This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. user program with fd1 does not work properly as tracepoint no output any more. The issue happens at step 4. Multiple perf_event_open can be called successfully, but only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog. The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/trace_events.h | 1 + kernel/events/core.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 7f11050746ae..2e0f22298fe9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -272,6 +272,7 @@ struct trace_event_call { int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog *prog; + struct perf_event *bpf_prog_owner; int (*perf_perm)(struct trace_event_call *, struct perf_event *); diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } From 4d6a78b477dd9686e7034aa140057ce7a5da5fd3 Mon Sep 17 00:00:00 2001 From: Egil Hjelmeland Date: Tue, 19 Sep 2017 10:09:24 +0200 Subject: [PATCH 0361/1322] net: dsa: lan9303: Add adjust_link() method Make the driver react to device tree "fixed-link" declaration on CPU port. - turn off autonegotiation - force speed 10 or 100 mb/s - force duplex mode Signed-off-by: Egil Hjelmeland Signed-off-by: David S. 
Miller --- drivers/net/dsa/lan9303-core.c | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index b471413d3df9..07355db2ad81 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "lan9303.h" @@ -57,6 +58,7 @@ #define LAN9303_SWITCH_CSR_CMD_LANES (BIT(19) | BIT(18) | BIT(17) | BIT(16)) #define LAN9303_VIRT_PHY_BASE 0x70 #define LAN9303_VIRT_SPECIAL_CTRL 0x77 +#define LAN9303_VIRT_SPECIAL_TURBO BIT(10) /*Turbo MII Enable*/ /*13.4 Switch Fabric Control and Status Registers * Accessed indirectly via SWITCH_CSR_CMD, SWITCH_CSR_DATA. @@ -760,6 +762,43 @@ static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum, return chip->ops->phy_write(chip, phy, regnum, val); } +static void lan9303_adjust_link(struct dsa_switch *ds, int port, + struct phy_device *phydev) +{ + struct lan9303 *chip = ds->priv; + int ctl, res; + + if (!phy_is_pseudo_fixed_link(phydev)) + return; + + ctl = lan9303_phy_read(ds, port, MII_BMCR); + + ctl &= ~BMCR_ANENABLE; + + if (phydev->speed == SPEED_100) + ctl |= BMCR_SPEED100; + else if (phydev->speed == SPEED_10) + ctl &= ~BMCR_SPEED100; + else + dev_err(ds->dev, "unsupported speed: %d\n", phydev->speed); + + if (phydev->duplex == DUPLEX_FULL) + ctl |= BMCR_FULLDPLX; + else + ctl &= ~BMCR_FULLDPLX; + + res = lan9303_phy_write(ds, port, MII_BMCR, ctl); + + if (port == chip->phy_addr_sel_strap) { + /* Virtual Phy: Remove Turbo 200Mbit mode */ + lan9303_read(chip->regmap, LAN9303_VIRT_SPECIAL_CTRL, &ctl); + + ctl &= ~LAN9303_VIRT_SPECIAL_TURBO; + res = regmap_write(chip->regmap, + LAN9303_VIRT_SPECIAL_CTRL, ctl); + } +} + static int lan9303_port_enable(struct dsa_switch *ds, int port, struct phy_device *phy) { @@ -803,6 +842,7 @@ static const struct dsa_switch_ops lan9303_switch_ops = { .get_strings = lan9303_get_strings, .phy_read = lan9303_phy_read, .phy_write = lan9303_phy_write, + .adjust_link = lan9303_adjust_link, .get_ethtool_stats = lan9303_get_ethtool_stats, .get_sset_count = lan9303_get_sset_count, .port_enable = lan9303_port_enable, From 9457642a405fd68d5ce90f5c432e462277fb666e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Sep 2017 17:42:41 +0800 Subject: [PATCH 0362/1322] virtio-net: remove unnecessary parameter of virtnet_xdp_xmit() CC: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 511f8339fa96..a0ef4b08f0e9 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -373,7 +373,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, } static bool virtnet_xdp_xmit(struct virtnet_info *vi, - struct receive_queue *rq, struct xdp_buff *xdp) { struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -542,7 +541,7 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp))) + if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); rcu_read_unlock(); goto xdp_xmit; @@ -677,7 +676,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp))) + if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); if (unlikely(xdp_page != page)) From 312403453532b209879aa7faeff296bd3dea78af Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Sep 2017 17:42:42 +0800 Subject: [PATCH 0363/1322] virtio-net: add packet len average only when needed during XDP There's no need to add packet len average in the case of XDP_PASS since it will be done soon after skb is created. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a0ef4b08f0e9..db5924c085aa 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -656,6 +656,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdp.data_end = xdp.data + (len - vi->hdr_len); act = bpf_prog_run_xdp(xdp_prog, &xdp); + if (act != XDP_PASS) + ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); + switch (act) { case XDP_PASS: /* recalculate offset to account for any header @@ -671,14 +674,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, put_page(page); head_skb = page_to_skb(vi, rq, xdp_page, offset, len, PAGE_SIZE); - ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); return head_skb; } break; case XDP_TX: if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); - ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); if (unlikely(xdp_page != page)) goto err_xdp; rcu_read_unlock(); @@ -690,7 +691,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, case XDP_DROP: if (unlikely(xdp_page != page)) __free_pages(xdp_page, 0); - ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len); goto err_xdp; } } From 186b3c998c50fc241b51b905081c748455d16b4a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Sep 2017 17:42:43 +0800 Subject: [PATCH 0364/1322] virtio-net: support XDP_REDIRECT This patch tries to add XDP_REDIRECT for virtio-net. The changes are not complex as we could use exist XDP_TX helpers for most of the work. The rest is passing the XDP_TX to NAPI handler for implementing batching. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net. | 0 drivers/net/virtio_net.c | 77 ++++++++++++++++++++++++++++++++-------- 2 files changed, 62 insertions(+), 15 deletions(-) create mode 100644 drivers/net/virtio_net. diff --git a/drivers/net/virtio_net. b/drivers/net/virtio_net. 
new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index db5924c085aa..f6c1f135a024 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static int napi_weight = NAPI_POLL_WEIGHT; @@ -372,8 +373,20 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, return skb; } -static bool virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_buff *xdp) +static void virtnet_xdp_flush(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct send_queue *sq; + unsigned int qp; + + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); + sq = &vi->sq[qp]; + + virtqueue_kick(sq->vq); +} + +static bool __virtnet_xdp_xmit(struct virtnet_info *vi, + struct xdp_buff *xdp) { struct virtio_net_hdr_mrg_rxbuf *hdr; unsigned int len; @@ -407,10 +420,19 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi, return false; } - virtqueue_kick(sq->vq); return true; } +static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) +{ + struct virtnet_info *vi = netdev_priv(dev); + bool sent = __virtnet_xdp_xmit(vi, xdp); + + if (!sent) + return -ENOSPC; + return 0; +} + static unsigned int virtnet_get_headroom(struct virtnet_info *vi) { return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0; @@ -483,7 +505,8 @@ static struct sk_buff *receive_small(struct net_device *dev, struct virtnet_info *vi, struct receive_queue *rq, void *buf, void *ctx, - unsigned int len) + unsigned int len, + bool *xdp_xmit) { struct sk_buff *skb; struct bpf_prog *xdp_prog; @@ -493,7 +516,7 @@ static struct sk_buff *receive_small(struct net_device *dev, unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct page *page = virt_to_head_page(buf); - unsigned int delta = 0; + unsigned int delta = 0, err; struct page *xdp_page; len -= vi->hdr_len; @@ -541,8 +564,16 @@ static struct sk_buff *receive_small(struct net_device *dev, delta = orig_data - xdp.data; break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) + if (unlikely(!__virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); + else + *xdp_xmit = true; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + err = xdp_do_redirect(dev, &xdp, xdp_prog); + if (!err) + *xdp_xmit = true; rcu_read_unlock(); goto xdp_xmit; default: @@ -603,7 +634,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct receive_queue *rq, void *buf, void *ctx, - unsigned int len) + unsigned int len, + bool *xdp_xmit) { struct virtio_net_hdr_mrg_rxbuf *hdr = buf; u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); @@ -613,6 +645,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, struct bpf_prog *xdp_prog; unsigned int truesize; unsigned int headroom = mergeable_ctx_to_headroom(ctx); + int err; head_skb = NULL; @@ -678,12 +711,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, } break; case XDP_TX: - if (unlikely(!virtnet_xdp_xmit(vi, &xdp))) + if (unlikely(!__virtnet_xdp_xmit(vi, &xdp))) trace_xdp_exception(vi->dev, xdp_prog, act); + else + *xdp_xmit = true; if (unlikely(xdp_page != page)) goto err_xdp; rcu_read_unlock(); goto xdp_xmit; + case XDP_REDIRECT: + err = xdp_do_redirect(dev, &xdp, xdp_prog); + if (err) + *xdp_xmit = true; + rcu_read_unlock(); + goto xdp_xmit; default: bpf_warn_invalid_xdp_action(act); case XDP_ABORTED: @@ -788,7 +829,7 @@ 
xdp_xmit: } static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq, - void *buf, unsigned int len, void **ctx) + void *buf, unsigned int len, void **ctx, bool *xdp_xmit) { struct net_device *dev = vi->dev; struct sk_buff *skb; @@ -809,11 +850,11 @@ static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq, } if (vi->mergeable_rx_bufs) - skb = receive_mergeable(dev, vi, rq, buf, ctx, len); + skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit); else if (vi->big_packets) skb = receive_big(dev, vi, rq, buf, len); else - skb = receive_small(dev, vi, rq, buf, ctx, len); + skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit); if (unlikely(!skb)) return 0; @@ -1071,7 +1112,7 @@ static void refill_work(struct work_struct *work) } } -static int virtnet_receive(struct receive_queue *rq, int budget) +static int virtnet_receive(struct receive_queue *rq, int budget, bool *xdp_xmit) { struct virtnet_info *vi = rq->vq->vdev->priv; unsigned int len, received = 0, bytes = 0; @@ -1083,13 +1124,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget) while (received < budget && (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) { - bytes += receive_buf(vi, rq, buf, len, ctx); + bytes += receive_buf(vi, rq, buf, len, ctx, xdp_xmit); received++; } } else { while (received < budget && (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { - bytes += receive_buf(vi, rq, buf, len, NULL); + bytes += receive_buf(vi, rq, buf, len, NULL, xdp_xmit); received++; } } @@ -1161,15 +1202,19 @@ static int virtnet_poll(struct napi_struct *napi, int budget) struct receive_queue *rq = container_of(napi, struct receive_queue, napi); unsigned int received; + bool xdp_xmit = false; virtnet_poll_cleantx(rq); - received = virtnet_receive(rq, budget); + received = virtnet_receive(rq, budget, &xdp_xmit); /* Out of packets? */ if (received < budget) virtqueue_napi_complete(napi, rq->vq, received); + if (xdp_xmit) + xdp_do_flush_map(); + return received; } @@ -2069,6 +2114,8 @@ static const struct net_device_ops virtnet_netdev = { .ndo_poll_controller = virtnet_netpoll, #endif .ndo_xdp = virtnet_xdp, + .ndo_xdp_xmit = virtnet_xdp_xmit, + .ndo_xdp_flush = virtnet_xdp_flush, .ndo_features_check = passthru_features_check, }; From 0d4a6608f68c7532dcbfec2ea1150c9761767d03 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 19 Sep 2017 12:11:43 +0200 Subject: [PATCH 0365/1322] udp: do rmem bulk free even if the rx sk queue is empty The commit 6b229cf77d68 ("udp: add batching to udp_rmem_release()") reduced greatly the cacheline contention between the BH and the US reader batching the rmem updates in most scenarios. Such optimization is explicitly avoided if the US reader is faster then BH processing. My fault, I initially suggested this kind of behavior due to concerns of possible regressions with small sk_rcvbuf values. Tests showed such concerns are misplaced, so this commit relaxes the condition for rmem bulk updates, obtaining small but measurable performance gain in the scenario described above. Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..784ced0b9150 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1212,8 +1212,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&up->reader_queue)) + if (size < (sk->sk_rcvbuf >> 2)) return; } else { size += up->forward_deficit; From 1fa089ec6d68849bfe60abd141375dc9b21b0ee8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Sep 2017 16:46:49 -0500 Subject: [PATCH 0366/1322] [SMB3] Update session and share information displayed for debugging SMB2/SMB3 We were not displaying some key fields (session status and capabilities and whether guest authenticated) for SMB2/SMB3 session in /proc/fs/cifs/DebugData. This is needed for real world triage of problems with the (now much more common) SMB3 mounts. Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9727e1dcacd5..cbb9534b89b4 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -160,8 +160,13 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) entry for %s not fully " - "displayed\n\t", i, ses->serverName); + seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t", + i, ses->serverName, ses->ses_count, + ses->capabilities, ses->status); + if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) + seq_printf(m, "Guest\t"); + else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + seq_printf(m, "Anonymous\t"); } else { seq_printf(m, "\n%d) Name: %s Domain: %s Uses: %d OS:" From b22666febf6fc67776d49782057fe4dd06f41552 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 20 Sep 2017 18:10:17 -0400 Subject: [PATCH 0367/1322] drm/amdkfd: Fix incorrect destroy_mqd parameter When uninitializing a kernel queue. Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 0649dd43e780..a4ca1133482c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -184,7 +184,7 @@ static void uninitialize(struct kernel_queue *kq) if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) kq->mqd->destroy_mqd(kq->mqd, kq->queue->mqd, - false, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, kq->queue->pipe, kq->queue->queue); From cb1d9967461cdf3b6aac6317c8d954a14f842571 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Wed, 20 Sep 2017 18:10:21 -0400 Subject: [PATCH 0368/1322] drm/amdkfd: Fix kernel-queue wrapping bugs Avoid intermediate negative numbers when doing calculations with a mix of signed and unsigned variables where implicit conversions can lead to unexpected results. When kernel queue buffer wraps around to 0, we need to check that rptr won't be overwritten by the new packet. 
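The arithmetic is easier to follow in isolation. The standalone toy below models the two checks described above: the buffer is empty when rptr == wptr and full when rptr == wptr + 1, so at most queue_size - 1 dwords are usable, and a packet that has to wrap back to position 0 must still fit in front of rptr. Names, the queue size and the test values are illustrative only:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static bool can_acquire(uint32_t rptr, uint32_t wptr,
                            unsigned int queue_size, unsigned int pkt_size)
    {
            /* Keep every intermediate value non-negative. */
            unsigned int available = (rptr + queue_size - 1 - wptr) % queue_size;

            if (pkt_size > available)
                    return false;

            /* Wrapping to 0 fills the tail with NOPs, so the packet must
             * still leave room before rptr after the restart.
             */
            if (wptr + pkt_size >= queue_size && pkt_size >= rptr)
                    return false;

            return true;
    }

    int main(void)
    {
            printf("%d\n", can_acquire(0, 0, 64, 63));  /* empty queue: 1 */
            printf("%d\n", can_acquire(5, 4, 64, 1));   /* full (rptr == wptr + 1): 0 */
            printf("%d\n", can_acquire(3, 60, 64, 5));  /* must wrap, rptr too low: 0 */
            printf("%d\n", can_acquire(20, 60, 64, 5)); /* must wrap, fits before rptr: 1 */
            return 0;
    }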
Signed-off-by: Yong Zhao Signed-off-by: Felix Kuehling Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index a4ca1133482c..ed71ad40e8f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -210,6 +210,11 @@ static int acquire_packet_buffer(struct kernel_queue *kq, uint32_t wptr, rptr; unsigned int *queue_address; + /* When rptr == wptr, the buffer is empty. + * When rptr == wptr + 1, the buffer is full. + * It is always rptr that advances to the position of wptr, rather than + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. + */ rptr = *kq->rptr_kernel; wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; @@ -219,11 +224,10 @@ static int acquire_packet_buffer(struct kernel_queue *kq, pr_debug("wptr: %d\n", wptr); pr_debug("queue_address 0x%p\n", queue_address); - available_size = (rptr - 1 - wptr + queue_size_dwords) % + available_size = (rptr + queue_size_dwords - 1 - wptr) % queue_size_dwords; - if (packet_size_in_dwords >= queue_size_dwords || - packet_size_in_dwords >= available_size) { + if (packet_size_in_dwords > available_size) { /* * make sure calling functions know * acquire_packet_buffer() failed @@ -233,6 +237,14 @@ static int acquire_packet_buffer(struct kernel_queue *kq, } if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. + */ + if (packet_size_in_dwords >= rptr) { + *buffer_ptr = NULL; + return -ENOMEM; + } + /* fill nops, roll back and start at position 0 */ while (wptr > 0) { queue_address[wptr] = kq->nop_packet; wptr = (wptr + 1) % queue_size_dwords; From c986169fdec992c464c84d0a3abdbfc846ed3df9 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 20 Sep 2017 18:10:22 -0400 Subject: [PATCH 0369/1322] drm/amdkfd: Print event limit messages only once per process To avoid spamming the log. 
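The fix is the usual "latch a flag and warn only once" pattern, kept per process as the change title says, so each process triggers the warning at most once. A small standalone sketch of that pattern with illustrative names:

    #include <stdio.h>
    #include <stdbool.h>

    struct proc_state {
            int  event_count;
            int  event_limit;
            bool limit_warned;
    };

    static int create_event(struct proc_state *p)
    {
            if (p->event_count == p->event_limit) {
                    if (!p->limit_warned) {
                            fprintf(stderr, "event limit reached\n");
                            p->limit_warned = true;  /* later failures stay silent */
                    }
                    return -1;
            }
            p->event_count++;
            return 0;
    }

    int main(void)
    {
            struct proc_state p = { .event_limit = 2 };

            for (int i = 0; i < 5; i++)
                    create_event(&p);  /* the warning is printed exactly once */
            return 0;
    }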
Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 5 ++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 5979158c3f7b..944abfad39c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -292,7 +292,10 @@ static int create_signal_event(struct file *devkfd, struct kfd_event *ev) { if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { - pr_warn("Signal event wasn't created because limit was reached\n"); + if (!p->signal_event_limit_reached) { + pr_warn("Signal event wasn't created because limit was reached\n"); + p->signal_event_limit_reached = true; + } return -ENOMEM; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index b397ec726400..b87e96cee5fa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -521,6 +521,7 @@ struct kfd_process { struct list_head signal_event_pages; u32 next_nonsignal_event_id; size_t signal_event_count; + bool signal_event_limit_reached; }; /** From c2a64bb9fcd31c39feddf30748b4ee8d82e53c6a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 13:19:13 -0400 Subject: [PATCH 0370/1322] net: compat: assert the size of cmsg copied in is as expected The actual length of cmsg fetched in during the second loop (i.e., kcmsg - kcmsg_base) could be different from what we get from the first loop (i.e., kcmlen). The main reason is that the two get_user() calls in the two loops (i.e., get_user(ucmlen, &ucmsg->cmsg_len) and __get_user(ucmlen, &ucmsg->cmsg_len)) could cause ucmlen to have different values even they fetch from the same userspace address, as user can race to change the memory content in &ucmsg->cmsg_len across fetches. Although in the second loop, the sanity check if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) is inplace, it only ensures that the cmsg fetched in during the second loop does not exceed the length of kcmlen, but not necessarily equal to kcmlen. But indicated by the assignment kmsg->msg_controllen = kcmlen, we should enforce that. This patch adds this additional sanity check and ensures that what is recorded in kmsg->msg_controllen is the actual cmsg length. Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- net/compat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/compat.c b/net/compat.c index 6ded6c821d7a..22381719718c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* + * check the length of messages copied in is the same as the + * what we get from the first loop + */ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; From eccaa9e51b77e504fa8e8357e4979bde724e22a8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 20 Sep 2017 15:39:59 -0700 Subject: [PATCH 0371/1322] Revert "bridge: also trigger RTM_NEWLINK when interface is released from bridge" This reverts commit 00ba4cb36da682c68dc87d1703a8aaffe2b4e9c5. Discussion with David Ahern determined that this change is actually not needed. Signed-off-by: David S. 
Miller --- net/bridge/br_ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 3148cb3a8e82..7970f8540cbb 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -99,10 +99,8 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) if (isadd) ret = br_add_if(br, dev); - else { + else ret = br_del_if(br, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); - } return ret; } From 92dd5452c1be873a1193561f4f691763103d22ac Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 19 Sep 2017 18:45:56 +0100 Subject: [PATCH 0372/1322] net: change skb->mac_header when Generic XDP calls adjust_head Since XDP's view of the packet includes the MAC header, moving the start- of-packet with bpf_xdp_adjust_head needs to also update the offset of the MAC header (which is relative to skb->head, not to the skb->data that was changed). Without this, tcpdump sees packets starting from the old MAC header rather than the new one, at least in my tests on the loopback device. Fixes: b5cdae3291f7 ("net: Generic XDP") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..9a2254f9802f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: From 5e62d98c4bbc243f3dca18e73a754b629839fc5c Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:07 -0700 Subject: [PATCH 0373/1322] net: fec: only check queue 0 if RXF_0/TXF_0 interrupt is set Before queue 0 was always checked if any queue caused an interrupt. It is better to just mark queue 0 if queue 0 has caused an interrupt. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 56f56d6ada9c..464055fb33d5 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1559,14 +1559,14 @@ fec_enet_collect_events(struct fec_enet_private *fep, uint int_events) if (int_events == 0) return false; - if (int_events & FEC_ENET_RXF) + if (int_events & FEC_ENET_RXF_0) fep->work_rx |= (1 << 2); if (int_events & FEC_ENET_RXF_1) fep->work_rx |= (1 << 0); if (int_events & FEC_ENET_RXF_2) fep->work_rx |= (1 << 1); - if (int_events & FEC_ENET_TXF) + if (int_events & FEC_ENET_TXF_0) fep->work_tx |= (1 << 2); if (int_events & FEC_ENET_TXF_1) fep->work_tx |= (1 << 0); From 7063c163cd4a20184b3bbada503dab8f254a8c56 Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:08 -0700 Subject: [PATCH 0374/1322] net: fec: remove unused interrupt FEC_ENET_TS_TIMER FEC_ENET_TS_TIMER is not checked in the interrupt routine so there is no need to enable it. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/fec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 38c7b21e5d63..ede1876a9a19 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -374,8 +374,8 @@ struct bufdesc_ex { #define FEC_ENET_TS_AVAIL ((uint)0x00010000) #define FEC_ENET_TS_TIMER ((uint)0x00008000) -#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII | FEC_ENET_TS_TIMER) -#define FEC_NAPI_IMASK (FEC_ENET_MII | FEC_ENET_TS_TIMER) +#define FEC_DEFAULT_IMASK (FEC_ENET_TXF | FEC_ENET_RXF | FEC_ENET_MII) +#define FEC_NAPI_IMASK FEC_ENET_MII #define FEC_RX_DISABLED_IMASK (FEC_DEFAULT_IMASK & (~FEC_ENET_RXF)) /* ENET interrupt coalescing macro define */ From e24ee2780a1d6786b94139ebf95b44c9ae62bf13 Mon Sep 17 00:00:00 2001 From: Troy Kisky Date: Tue, 19 Sep 2017 17:33:09 -0700 Subject: [PATCH 0375/1322] net: fec: return IRQ_HANDLED if fec_ptp_check_pps_event handled it fec_ptp_check_pps_event will return 1 if FEC_T_TF_MASK caused an interrupt. Don't return IRQ_NONE in this case. Signed-off-by: Troy Kisky Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 464055fb33d5..3dc2d771a222 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1604,8 +1604,8 @@ fec_enet_interrupt(int irq, void *dev_id) } if (fep->ptp_clock) - fec_ptp_check_pps_event(fep); - + if (fec_ptp_check_pps_event(fep)) + ret = IRQ_HANDLED; return ret; } From 53bade8a3372eea0f1276bc96892735c517330fe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 18:00:37 -0700 Subject: [PATCH 0376/1322] net: dsa: Utilize dsa_slave_dev_check() Instead of open coding the check. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d51b10450e1b..6fc9eb094267 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1294,7 +1294,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (dev->netdev_ops != &dsa_slave_netdev_ops) + if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; if (event == NETDEV_CHANGEUPPER) From 5c1adf606d6ff519b5c1b9b22b8055f161dd98dd Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 18:03:45 -0700 Subject: [PATCH 0377/1322] blackfin: tcm-bf518: Remove dsa.h inclusion Nothing in that file uses definitions from that header, so just get rid of it. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- arch/blackfin/mach-bf518/boards/tcm-bf518.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/blackfin/mach-bf518/boards/tcm-bf518.c b/arch/blackfin/mach-bf518/boards/tcm-bf518.c index 240d5cb1f02c..37d868085f6a 100644 --- a/arch/blackfin/mach-bf518/boards/tcm-bf518.c +++ b/arch/blackfin/mach-bf518/boards/tcm-bf518.c @@ -25,7 +25,6 @@ #include #include #include -#include /* * Name the Board for the /proc/cpuinfo From 637ae0f44580040702b53e8f0593a5a5fb00473e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 19 Sep 2017 18:03:46 -0700 Subject: [PATCH 0378/1322] blackfin: ezbrd: Remove non-functional DSA/KSZ8893M code There is no in tree driver for the KSZ8893M switch driver, so just get rid of the code in that board file. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- arch/blackfin/mach-bf518/boards/ezbrd.c | 47 ------------------------- 1 file changed, 47 deletions(-) diff --git a/arch/blackfin/mach-bf518/boards/ezbrd.c b/arch/blackfin/mach-bf518/boards/ezbrd.c index d022112927c2..c51d1b810ac3 100644 --- a/arch/blackfin/mach-bf518/boards/ezbrd.c +++ b/arch/blackfin/mach-bf518/boards/ezbrd.c @@ -25,7 +25,6 @@ #include #include #include -#include /* * Name the Board for the /proc/cpuinfo @@ -105,11 +104,7 @@ static const unsigned short bfin_mac_peripherals[] = { static struct bfin_phydev_platform_data bfin_phydev_data[] = { { -#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M) - .addr = 3, -#else .addr = 1, -#endif .irq = IRQ_MAC_PHYINT, }, }; @@ -119,9 +114,6 @@ static struct bfin_mii_bus_platform_data bfin_mii_bus_data = { .phydev_data = bfin_phydev_data, .phy_mode = PHY_INTERFACE_MODE_MII, .mac_peripherals = bfin_mac_peripherals, -#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M) - .phy_mask = 0xfff7, /* Only probe the port phy connect to the on chip MAC */ -#endif .vlan1_mask = 1, .vlan2_mask = 2, }; @@ -140,29 +132,6 @@ static struct platform_device bfin_mac_device = { } }; -#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M) -static struct dsa_chip_data ksz8893m_switch_chip_data = { - .mii_bus = &bfin_mii_bus.dev, - .port_names = { - NULL, - "eth%d", - "eth%d", - "cpu", - }, -}; -static struct dsa_platform_data ksz8893m_switch_data = { - .nr_chips = 1, - .netdev = &bfin_mac_device.dev, - .chip = &ksz8893m_switch_chip_data, -}; - -static struct platform_device ksz8893m_switch_device = { - .name = "dsa", - .id = 0, - .num_resources = 0, - .dev.platform_data = &ksz8893m_switch_data, -}; -#endif #endif #if IS_ENABLED(CONFIG_MTD_M25P80) @@ -228,19 +197,6 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if IS_ENABLED(CONFIG_BFIN_MAC) -#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M) - { - .modalias = "ksz8893m", - .max_speed_hz = 5000000, - .bus_num = 0, - .chip_select = 1, - .platform_data = NULL, - .mode = SPI_MODE_3, - }, -#endif -#endif - #if IS_ENABLED(CONFIG_MMC_SPI) { .modalias = "mmc_spi", @@ -714,9 +670,6 @@ static struct platform_device *stamp_devices[] __initdata = { #if IS_ENABLED(CONFIG_BFIN_MAC) &bfin_mii_bus, &bfin_mac_device, -#if IS_ENABLED(CONFIG_NET_DSA_KSZ8893M) - &ksz8893m_switch_device, -#endif #endif #if IS_ENABLED(CONFIG_SPI_BFIN5XX) From 02388bf87f72e1d47174cd8f81c34443920eb5a0 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 19 Sep 2017 21:49:55 -0400 Subject: [PATCH 0379/1322] isdn/i4l: fetch the ppp_write buffer in one shot In isdn_ppp_write(), the header (i.e., protobuf) of the buffer is fetched twice from userspace. 
The first fetch is used to peek at the protocol of the message and reset the huptimer if necessary; while the second fetch copies in the whole buffer. However, given that buf resides in userspace memory, a user process can race to change its memory content across fetches. By doing so, we can either avoid resetting the huptimer for any type of packets (by first setting proto to PPP_LCP and later change to the actual type) or force resetting the huptimer for LCP packets. This patch changes this double-fetch behavior into two single fetches decided by condition (lp->isdn_device < 0 || lp->isdn_channel <0). A more detailed discussion can be found at https://marc.info/?l=linux-kernel&m=150586376926123&w=2 Signed-off-by: Meng Xu Signed-off-by: David S. Miller --- drivers/isdn/i4l/isdn_ppp.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 6c44609fd83a..cd2b3c69771a 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -825,7 +825,6 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) isdn_net_local *lp; struct ippp_struct *is; int proto; - unsigned char protobuf[4]; is = file->private_data; @@ -839,24 +838,28 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) if (!lp) printk(KERN_DEBUG "isdn_ppp_write: lp == NULL\n"); else { - /* - * Don't reset huptimer for - * LCP packets. (Echo requests). - */ - if (copy_from_user(protobuf, buf, 4)) - return -EFAULT; - proto = PPP_PROTOCOL(protobuf); - if (proto != PPP_LCP) - lp->huptimer = 0; + if (lp->isdn_device < 0 || lp->isdn_channel < 0) { + unsigned char protobuf[4]; + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + if (copy_from_user(protobuf, buf, 4)) + return -EFAULT; + + proto = PPP_PROTOCOL(protobuf); + if (proto != PPP_LCP) + lp->huptimer = 0; - if (lp->isdn_device < 0 || lp->isdn_channel < 0) return 0; + } if ((dev->drv[lp->isdn_device]->flags & DRV_FLAG_RUNNING) && lp->dialstate == 0 && (lp->flags & ISDN_NET_CONNECTED)) { unsigned short hl; struct sk_buff *skb; + unsigned char *cpy_buf; /* * we need to reserve enough space in front of * sk_buff. old call to dev_alloc_skb only reserved @@ -869,11 +872,21 @@ isdn_ppp_write(int min, struct file *file, const char __user *buf, int count) return count; } skb_reserve(skb, hl); - if (copy_from_user(skb_put(skb, count), buf, count)) + cpy_buf = skb_put(skb, count); + if (copy_from_user(cpy_buf, buf, count)) { kfree_skb(skb); return -EFAULT; } + + /* + * Don't reset huptimer for + * LCP packets. (Echo requests). + */ + proto = PPP_PROTOCOL(cpy_buf); + if (proto != PPP_LCP) + lp->huptimer = 0; + if (is->debug & 0x40) { printk(KERN_DEBUG "ppp xmit: len %d\n", (int) skb->len); isdn_ppp_frame_log("xmit", skb->data, skb->len, 32, is->unit, lp->ppp_slot); From 34929cb4d691f7f9e217ba0e3f536978cd56aa6c Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Wed, 20 Sep 2017 11:32:07 +0530 Subject: [PATCH 0380/1322] cxgb4: add new T5 pci device id's Add 0x50a5, 0x50a6, 0x50a7, 0x50a8 and 0x50a9 T5 device id's. Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index aa28299aef5f..37d90d63e4a3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -176,6 +176,11 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x50a2), /* Custom T540-KR4 */ CH_PCI_ID_TABLE_FENTRY(0x50a3), /* Custom T580-KR4 */ CH_PCI_ID_TABLE_FENTRY(0x50a4), /* Custom 2x T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a5), /* Custom T522-BT */ + CH_PCI_ID_TABLE_FENTRY(0x50a6), /* Custom T522-BT-SO */ + CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */ + CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */ /* T6 adapters: */ From e92a0843795779678397ac0790a76de20f79cc13 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:50 +0800 Subject: [PATCH 0381/1322] net: hns3: Cleanup for ROCE capability flag in ae_dev This patch add the ROCE supported flag in the driver_data field of pci_device_id, delete roce_pci_tbl and change HNAE_DEV_SUPPORT_ROCE_B to HNAE3_DEV_SUPPORT_ROCE_B. This cleanup is done in order to support adding capability in pci_device_id and to fix initialization failure when cmd is not supported. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 5 +++- .../hisilicon/hns3/hns3pf/hclge_main.c | 25 +++---------------- .../hisilicon/hns3/hns3pf/hns3_enet.c | 16 ++++++++---- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index b2f28ae81273..0f7b61a92f44 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -49,7 +49,10 @@ #define HNAE3_CLASS_NAME_SIZE 16 #define HNAE3_DEV_INITED_B 0x0 -#define HNAE_DEV_SUPPORT_ROCE_B 0x1 +#define HNAE3_DEV_SUPPORT_ROCE_B 0x1 + +#define hnae3_dev_roce_supported(hdev) \ + hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B) #define ring_ptr_move_fw(ring, p) \ ((ring)->p = ((ring)->p + 1) % (ring)->desc_num) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 74008ef23169..6953d19c6475 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -46,17 +46,7 @@ static const struct pci_device_id ae_algo_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, - /* Required last entry */ - {0, } -}; - -static const struct pci_device_id roce_pci_tbl[] = { - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, - /* Required last entry */ + /* required last entry */ {0, } }; @@ -894,7 +884,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) hdev->num_tqps = __le16_to_cpu(req->tqp_num); hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S; - if (hnae_get_bit(hdev->ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B)) { + if 
(hnae3_dev_roce_supported(hdev)) { hdev->num_roce_msix = hnae_get_field(__le16_to_cpu(req->pf_intr_vector_number), HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S); @@ -3932,8 +3922,7 @@ static int hclge_init_client_instance(struct hnae3_client *client, goto err; if (hdev->roce_client && - hnae_get_bit(hdev->ae_dev->flag, - HNAE_DEV_SUPPORT_ROCE_B)) { + hnae3_dev_roce_supported(hdev)) { struct hnae3_client *rc = hdev->roce_client; ret = hclge_init_roce_base_info(vport); @@ -3956,8 +3945,7 @@ static int hclge_init_client_instance(struct hnae3_client *client, break; case HNAE3_CLIENT_ROCE: - if (hnae_get_bit(hdev->ae_dev->flag, - HNAE_DEV_SUPPORT_ROCE_B)) { + if (hnae3_dev_roce_supported(hdev)) { hdev->roce_client = client; vport->roce.client = client; } @@ -4069,7 +4057,6 @@ static void hclge_pci_uninit(struct hclge_dev *hdev) static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) { struct pci_dev *pdev = ae_dev->pdev; - const struct pci_device_id *id; struct hclge_dev *hdev; int ret; @@ -4084,10 +4071,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hdev->ae_dev = ae_dev; ae_dev->priv = hdev; - id = pci_match_id(roce_pci_tbl, ae_dev->pdev); - if (id) - hnae_set_bit(ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B, 1); - ret = hclge_pci_init(hdev); if (ret) { dev_err(&pdev->dev, "PCI init failed\n"); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 4d68d6ea5143..94d8bb5b92f0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -41,11 +41,16 @@ static struct hnae3_client client; static const struct pci_device_id hns3_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, - {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), + BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, /* required last entry */ {0, } }; @@ -1348,6 +1353,7 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } ae_dev->pdev = pdev; + ae_dev->flag = ent->driver_data; ae_dev->dev_type = HNAE3_DEV_KNIC; pci_set_drvdata(pdev, ae_dev); From 2daf4a6536f3109ed0ed758cec14743e0e5c20ea Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:51 +0800 Subject: [PATCH 0382/1322] net: hns3: Fix initialization when cmd is not supported When ae_dev doesn't support DCB, rx_priv_wl_config, common_thrd_config and tm_qs_bp_cfg can't be called, otherwise cmd return fail, which causes the hclge module initialization process to fail. This patch fix it by adding a DCB capability flag to check if the ae_dev support DCB. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 7 +++++ .../hisilicon/hns3/hns3pf/hclge_main.c | 26 +++++++++++-------- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 4 +++ .../hisilicon/hns3/hns3pf/hns3_enet.c | 10 +++---- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 0f7b61a92f44..ad685f5aa6d1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -50,10 +50,17 @@ #define HNAE3_DEV_INITED_B 0x0 #define HNAE3_DEV_SUPPORT_ROCE_B 0x1 +#define HNAE3_DEV_SUPPORT_DCB_B 0x2 + +#define HNAE3_DEV_SUPPORT_ROCE_DCB_BITS (BIT(HNAE3_DEV_SUPPORT_DCB_B) |\ + BIT(HNAE3_DEV_SUPPORT_ROCE_B)) #define hnae3_dev_roce_supported(hdev) \ hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B) +#define hnae3_dev_dcb_supported(hdev) \ + hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_DCB_B) + #define ring_ptr_move_fw(ring, p) \ ((ring)->p = ((ring)->p + 1) % (ring)->desc_num) #define ring_ptr_move_bw(ring, p) \ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 6953d19c6475..903f43a8c2a1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1772,18 +1772,22 @@ int hclge_buffer_alloc(struct hclge_dev *hdev) return ret; } - ret = hclge_rx_priv_wl_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "could not configure rx private waterline %d\n", ret); - return ret; - } + if (hnae3_dev_dcb_supported(hdev)) { + ret = hclge_rx_priv_wl_config(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not configure rx private waterline %d\n", + ret); + return ret; + } - ret = hclge_common_thrd_config(hdev); - if (ret) { - dev_err(&hdev->pdev->dev, - "could not configure common threshold %d\n", ret); - return ret; + ret = hclge_common_thrd_config(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not configure common threshold %d\n", + ret); + return ret; + } } ret = hclge_common_wl_config(hdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 1c577d268f00..c91dbf19c4b1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -976,6 +976,10 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev) if (ret) return ret; + /* Only DCB-supported dev supports qset back pressure setting */ + if (!hnae3_dev_dcb_supported(hdev)) + return 0; + for (i = 0; i < hdev->tm_info.num_tc; i++) { ret = hclge_tm_qs_bp_cfg(hdev, i); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 94d8bb5b92f0..35369e1c8036 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -42,15 +42,15 @@ static const struct pci_device_id hns3_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, 
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), - BIT(HNAE3_DEV_SUPPORT_ROCE_B)}, + HNAE3_DEV_SUPPORT_ROCE_DCB_BITS}, /* required last entry */ {0, } }; From d221df4e0faae2b9cc8ad78f3e5e777461b6b542 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:52 +0800 Subject: [PATCH 0383/1322] net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB When ae_dev doesn't support DCB, DEFAULT_DV must be set to a lower value, otherwise the buffer allocation process will fail. This patch fix it by setting it to 30K bytes. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 1 + drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index c2b613b40509..30e2ad5ac0da 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -688,6 +688,7 @@ struct hclge_reset_tqp_queue { #define HCLGE_DEFAULT_TX_BUF 0x4000 /* 16k bytes */ #define HCLGE_TOTAL_PKT_BUF 0x108000 /* 1.03125M bytes */ #define HCLGE_DEFAULT_DV 0xA000 /* 40k byte */ +#define HCLGE_DEFAULT_NON_DCB_DV 0x7800 /* 30K byte */ #define HCLGE_TYPE_CRQ 0 #define HCLGE_TYPE_CSQ 1 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 903f43a8c2a1..796370adf99c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1444,7 +1444,11 @@ static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) tc_num = hclge_get_tc_num(hdev); pfc_enable_num = hclge_get_pfc_enalbe_num(hdev); - shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV; + if (hnae3_dev_dcb_supported(hdev)) + shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV; + else + shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_NON_DCB_DV; + shared_buf_tc = pfc_enable_num * hdev->mps + (tc_num - pfc_enable_num) * hdev->mps / 2 + hdev->mps; From bb1fe9ea6371e075d3d1448cd3ff6441d31307be Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:53 +0800 Subject: [PATCH 0384/1322] net: hns3: Fix for not setting rx private buffer size to zero When rx private buffer is disabled, there may be some case that the rx private buffer is not set to zero, which may cause buffer allocation process to fail. This patch fixes this problem by setting priv->enable to 0 and priv->buf_size to zero when rx private buffer is disabled. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 796370adf99c..a7d8fb1e15f6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1504,6 +1504,11 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) priv->wl.high = 2 * hdev->mps; priv->buf_size = priv->wl.high; } + } else { + priv->enable = 0; + priv->wl.low = 0; + priv->wl.high = 0; + priv->buf_size = 0; } } @@ -1516,8 +1521,15 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { priv = &hdev->priv_buf[i]; - if (hdev->hw_tc_map & BIT(i)) - priv->enable = 1; + priv->enable = 0; + priv->wl.low = 0; + priv->wl.high = 0; + priv->buf_size = 0; + + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + priv->enable = 1; if (hdev->tm_info.hw_pfc_map & BIT(i)) { priv->wl.low = 128; From b8c8bf47da5576657370798da6f18a8cb0245d5b Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:54 +0800 Subject: [PATCH 0385/1322] net: hns3: Fix for rx_priv_buf_alloc not setting rx shared buffer rx_priv_buf_alloc is used to tell hardware how much buffer is used for rx direction, right now only the private buffer is assigned. For ae_dev that doesn't support DCB, private rx buffer is assigned to zero, only shared rx buffer is used. So not setting the shared rx buffer cause dropping of packet in SSU. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 3 ++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 30e2ad5ac0da..758cf3948131 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -270,7 +270,8 @@ struct hclge_tx_buff_alloc { struct hclge_rx_priv_buff { __le16 buf_num[HCLGE_TC_NUM]; - u8 rsv[8]; + __le16 shared_buf; + u8 rsv[6]; }; struct hclge_query_version { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index a7d8fb1e15f6..e313552bb23d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1622,6 +1622,10 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) cpu_to_le16(true << HCLGE_TC0_PRI_BUF_EN_B); } + req->shared_buf = + cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) | + (1 << HCLGE_TC0_PRI_BUF_EN_B)); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { dev_err(&hdev->pdev->dev, From d602a52540c9b92e0dd152cfe1d0848c23f08894 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:55 +0800 Subject: [PATCH 0386/1322] net: hns3: Fix for rx priv buf allocation when DCB is not supported When hdev doesn't support DCB, rx private buffer is not allocated, otherwise there is not enough buffer for rx shared buffer, causing buffer allocation process to fail. This patch fixes by checking the dcb capability in hclge_rx_buffer_calc. 
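The hnae3_dev_dcb_supported() and hnae3_dev_roce_supported() tests used throughout these hns3 fixes all rely on the pattern introduced in patch 0381: capability bits travel in pci_device_id.driver_data, are copied into ae_dev->flag at probe time, and are then checked by a helper before any optional hardware block is touched. A condensed, hedged sketch of that pattern follows; names are simplified, the config helpers and device IDs are hypothetical, and it is not the exact driver code.

#include <linux/pci.h>
#include <linux/bitops.h>

#define MY_DEV_SUPPORT_ROCE_B	1	/* bit numbers, in the spirit of hnae3.h */
#define MY_DEV_SUPPORT_DCB_B	2

struct my_ae_dev {
	unsigned long flag;	/* capability bits, copied from driver_data */
};

#define my_dev_dcb_supported(d)	test_bit(MY_DEV_SUPPORT_DCB_B, &(d)->flag)

extern int common_buffer_config(struct my_ae_dev *dev);	/* hypothetical */
extern int dcb_waterline_config(struct my_ae_dev *dev);	/* hypothetical */

/* Device IDs below are placeholders, not the real HNAE3_DEV_ID_* values. */
static const struct pci_device_id my_pci_tbl[] = {
	{ PCI_VDEVICE(HUAWEI, 0xffff), 0 },	/* basic device: no extra caps */
	{ PCI_VDEVICE(HUAWEI, 0xfffe),
	  BIT(MY_DEV_SUPPORT_ROCE_B) | BIT(MY_DEV_SUPPORT_DCB_B) },
	{ 0, }					/* required last entry */
};

static void my_probe_flags(struct my_ae_dev *dev, const struct pci_device_id *ent)
{
	dev->flag = ent->driver_data;	/* as hns3_probe() does above */
}

static int my_buffer_alloc(struct my_ae_dev *dev)
{
	int ret = common_buffer_config(dev);	/* always required */

	if (ret)
		return ret;
	/* DCB-only commands fail on non-DCB firmware, so skip them entirely. */
	if (my_dev_dcb_supported(dev))
		ret = dcb_waterline_config(dev);
	return ret;
}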
Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e313552bb23d..c660f0caf709 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1489,6 +1489,16 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) struct hclge_priv_buf *priv; int i; + /* When DCB is not supported, rx private + * buffer is not allocated. + */ + if (!hnae3_dev_dcb_supported(hdev)) { + if (!hclge_is_rx_buf_ok(hdev, rx_all)) + return -ENOMEM; + + return 0; + } + /* step 1, try to alloc private buffer for all enabled tc */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { priv = &hdev->priv_buf[i]; From c4726338d928c824f56c27734d837b8244132705 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:56 +0800 Subject: [PATCH 0387/1322] net: hns3: Fix typo error for feild in hclge_tm This patch fixes a typo error for feild, which should be field. Fixes: 848440544b41f ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 20 +++++++++---------- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index c91dbf19c4b1..fe659f752237 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -280,11 +280,11 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pg_id = pg_id; - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); - hclge_tm_set_feild(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); + hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -307,11 +307,11 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pri_id = pri_id; - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); - hclge_tm_set_feild(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); + hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); return hclge_cmd_send(&hdev->hw, &desc, 1); } diff --git 
a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 7e67337dfaf2..85158b0d73fe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -94,10 +94,10 @@ struct hclge_bp_to_qs_map_cmd { u32 rsvd1; }; -#define hclge_tm_set_feild(dest, string, val) \ +#define hclge_tm_set_field(dest, string, val) \ hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH), val) -#define hclge_tm_get_feild(src, string) \ +#define hclge_tm_get_field(src, string) \ hnae_get_field((src), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH)) From 68ece54efd417d415462adbaa2700cba50de3ff6 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:57 +0800 Subject: [PATCH 0388/1322] net: hns3: Fix for setting rss_size incorrectly rss_size is 1, 2, 4, 8, 16, 32, 64, 128, but acutal tc queue size can be any u16 less than 128. If tc queue size is 5, we set the rss_size to 8, indirection table will be used to limit the size of actual queue size. It may cause dropping of receiving packet in hardware if rss_size is not set correctly. For now, each TC has the same rss size. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_main.c | 80 +++++++++---------- .../hisilicon/hns3/hns3pf/hclge_main.h | 1 + .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 1 + 3 files changed, 40 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c660f0caf709..e0685e630afe 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2606,6 +2606,7 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) u16 tc_valid[HCLGE_MAX_TC_NUM]; u16 tc_size[HCLGE_MAX_TC_NUM]; u32 *rss_indir = NULL; + u16 rss_size = 0, roundup_size; const u8 *key; int i, ret, j; @@ -2620,7 +2621,13 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) for (j = 0; j < hdev->num_vmdq_vport + 1; j++) { for (i = 0; i < HCLGE_RSS_IND_TBL_SIZE; i++) { vport[j].rss_indirection_tbl[i] = - i % hdev->rss_size_max; + i % vport[j].alloc_rss_size; + + /* vport 0 is for PF */ + if (j != 0) + continue; + + rss_size = vport[j].alloc_rss_size; rss_indir[i] = vport[j].rss_indirection_tbl[i]; } } @@ -2637,42 +2644,31 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) if (ret) goto err; - for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - if (hdev->hw_tc_map & BIT(i)) - tc_valid[i] = 1; - else - tc_valid[i] = 0; - - switch (hdev->rss_size_max) { - case HCLGE_RSS_TC_SIZE_0: - tc_size[i] = 0; - break; - case HCLGE_RSS_TC_SIZE_1: - tc_size[i] = 1; - break; - case HCLGE_RSS_TC_SIZE_2: - tc_size[i] = 2; - break; - case HCLGE_RSS_TC_SIZE_3: - tc_size[i] = 3; - break; - case HCLGE_RSS_TC_SIZE_4: - tc_size[i] = 4; - break; - case HCLGE_RSS_TC_SIZE_5: - tc_size[i] = 5; - break; - case HCLGE_RSS_TC_SIZE_6: - tc_size[i] = 6; - break; - case HCLGE_RSS_TC_SIZE_7: - tc_size[i] = 7; - break; - default: - break; - } - tc_offset[i] = hdev->rss_size_max * i; + /* Each TC have the same queue size, and tc_size set to hardware is + * the log2 of roundup power of two of rss_size, the acutal queue + * size is limited by indirection table. 
+ */ + if (rss_size > HCLGE_RSS_TC_SIZE_7 || rss_size == 0) { + dev_err(&hdev->pdev->dev, + "Configure rss tc size failed, invalid TC_SIZE = %d\n", + rss_size); + return -EINVAL; } + + roundup_size = roundup_pow_of_two(rss_size); + roundup_size = ilog2(roundup_size); + + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { + tc_valid[i] = 0; + + if (!(hdev->hw_tc_map & BIT(i))) + continue; + + tc_valid[i] = 1; + tc_size[i] = roundup_size; + tc_offset[i] = rss_size * i; + } + ret = hclge_set_rss_tc_mode(hdev, tc_valid, tc_size, tc_offset); err: @@ -4167,12 +4163,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } - ret = hclge_rss_init_hw(hdev); - if (ret) { - dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret); - return ret; - } - ret = hclge_init_vlan_config(hdev); if (ret) { dev_err(&pdev->dev, "VLAN init fail, ret =%d\n", ret); @@ -4185,6 +4175,12 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } + ret = hclge_rss_init_hw(hdev); + if (ret) { + dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret); + return ret; + } + setup_timer(&hdev->service_timer, hclge_service_timer, (unsigned long)hdev); INIT_WORK(&hdev->service_task, hclge_service_task); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index edb10ad075eb..7f8dd129c10d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -477,6 +477,7 @@ struct hclge_vport { u8 rss_hash_key[HCLGE_RSS_KEY_SIZE]; /* User configured hash keys */ /* User configured lookup table entries */ u8 rss_indirection_tbl[HCLGE_RSS_IND_TBL_SIZE]; + u16 alloc_rss_size; u16 qs_offset; u16 bw_limit; /* VSI BW Limit (0 = disabled) */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index fe659f752237..b7ba7aa66620 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -397,6 +397,7 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) kinfo->num_tqps / kinfo->num_tc); vport->qs_offset = hdev->tm_info.num_tc * vport->vport_id; vport->dwrr = 100; /* 100 percent as init */ + vport->alloc_rss_size = kinfo->rss_size; for (i = 0; i < kinfo->num_tc; i++) { if (hdev->hw_tc_map & BIT(i)) { From c5795c5308af81568d1573598716091120c85a38 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 20 Sep 2017 18:52:58 +0800 Subject: [PATCH 0389/1322] net: hns3: Fix for pri to tc mapping in TM Current mapping between pri and tc is one to one, so user can't map multi priorities to the same tc. This patch changes the mapping to many to one. Fixes: 848440544b41f ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver") Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 3 ++- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 16 +++++++++------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index ad685f5aa6d1..1a01cadfe5f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -376,12 +376,12 @@ struct hnae3_ae_algo { struct hnae3_tc_info { u16 tqp_offset; /* TQP offset from base TQP */ u16 tqp_count; /* Total TQPs */ - u8 up; /* user priority */ u8 tc; /* TC index */ bool enable; /* If this TC is enable or not */ }; #define HNAE3_MAX_TC 8 +#define HNAE3_MAX_USER_PRIO 8 struct hnae3_knic_private_info { struct net_device *netdev; /* Set by KNIC client when init instance */ u16 rss_size; /* Allocated RSS queues */ @@ -389,6 +389,7 @@ struct hnae3_knic_private_info { u16 num_desc; u8 num_tc; /* Total number of enabled TCs */ + u8 prio_tc[HNAE3_MAX_USER_PRIO]; /* TC indexed by prio */ struct hnae3_tc_info tc_info[HNAE3_MAX_TC]; /* Idx of array is HW TC */ u16 num_tqps; /* total number of TQPs in this handle */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 7f8dd129c10d..9fcfd9395424 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -176,7 +176,6 @@ struct hclge_pg_info { struct hclge_tc_info { u8 tc_id; u8 tc_sch_mode; /* 0: sp; 1: dwrr */ - u8 up; u8 pgid; u32 bw_limit; }; @@ -197,6 +196,7 @@ struct hclge_tm_info { u8 num_tc; u8 num_pg; /* It must be 1 if vNET-Base schd */ u8 pg_dwrr[HCLGE_PG_NUM]; + u8 prio_tc[HNAE3_MAX_USER_PRIO]; struct hclge_pg_info pg_info[HCLGE_PG_NUM]; struct hclge_tc_info tc_info[HNAE3_MAX_TC]; enum hclge_fc_mode fc_mode; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index b7ba7aa66620..73a75d7cc551 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -128,9 +128,7 @@ static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id) { u8 tc; - for (tc = 0; tc < hdev->tm_info.num_tc; tc++) - if (hdev->tm_info.tc_info[tc].up == pri_id) - break; + tc = hdev->tm_info.prio_tc[pri_id]; if (tc >= hdev->tm_info.num_tc) return -EINVAL; @@ -158,7 +156,7 @@ static int hclge_up_to_tc_map(struct hclge_dev *hdev) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_PRI_TO_TC_MAPPING, false); - for (pri_id = 0; pri_id < hdev->tm_info.num_tc; pri_id++) { + for (pri_id = 0; pri_id < HNAE3_MAX_USER_PRIO; pri_id++) { ret = hclge_fill_pri_array(hdev, pri, pri_id); if (ret) return ret; @@ -405,16 +403,17 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) kinfo->tc_info[i].tqp_offset = i * kinfo->rss_size; kinfo->tc_info[i].tqp_count = kinfo->rss_size; kinfo->tc_info[i].tc = i; - kinfo->tc_info[i].up = hdev->tm_info.tc_info[i].up; } else { /* Set to default queue if TC is disable */ kinfo->tc_info[i].enable = false; kinfo->tc_info[i].tqp_offset = 0; kinfo->tc_info[i].tqp_count = 1; kinfo->tc_info[i].tc = 0; - kinfo->tc_info[i].up = 0; } } + + memcpy(kinfo->prio_tc, hdev->tm_info.prio_tc, + FIELD_SIZEOF(struct hnae3_knic_private_info, prio_tc)); } static void hclge_tm_vport_info_update(struct hclge_dev *hdev) @@ -436,12 +435,15 @@ static void 
hclge_tm_tc_info_init(struct hclge_dev *hdev) for (i = 0; i < hdev->tm_info.num_tc; i++) { hdev->tm_info.tc_info[i].tc_id = i; hdev->tm_info.tc_info[i].tc_sch_mode = HCLGE_SCH_MODE_DWRR; - hdev->tm_info.tc_info[i].up = i; hdev->tm_info.tc_info[i].pgid = 0; hdev->tm_info.tc_info[i].bw_limit = hdev->tm_info.pg_info[0].bw_limit; } + for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) + hdev->tm_info.prio_tc[i] = + (i >= hdev->tm_info.num_tc) ? 0 : i; + hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; } From 4d61eda812041ef9c820d7f147884133fd3307bc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Sep 2017 16:27:39 +0100 Subject: [PATCH 0390/1322] CIFS: make arrays static const, reduces object code size Don't populate the read-only arrays types[] on the stack, instead make them both static const. Makes the object code smaller by over 200 bytes: Before: text data bss dec hex filename 111503 37696 448 149647 2488f fs/cifs/file.o After: text data bss dec hex filename 111140 37856 448 149444 247c4 fs/cifs/file.o Signed-off-by: Colin Ian King Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/file.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0786f19d288f..822311919163 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1102,8 +1102,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifs_tcon *tcon; unsigned int num, max_num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; - int types[] = {LOCKING_ANDX_LARGE_FILES, - LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + static const int types[] = { + LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES + }; int i; xid = get_xid(); @@ -1434,8 +1436,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int xid) { int rc = 0, stored_rc; - int types[] = {LOCKING_ANDX_LARGE_FILES, - LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; + static const int types[] = { + LOCKING_ANDX_LARGE_FILES, + LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES + }; unsigned int i; unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; From 94183331e815617246b1baa97e0916f358c794bb Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 7 Sep 2017 16:03:27 +0800 Subject: [PATCH 0391/1322] cifs: release cifs root_cred after exit_cifs memory leak was found by kmemleak. exit_cifs_spnego should be called before cifs module removed, or cifs root_cred will not be released. kmemleak report: unreferenced object 0xffff880070a3ce40 (size 192): backtrace: kmemleak_alloc+0x4a/0xa0 kmem_cache_alloc+0xc7/0x1d0 prepare_kernel_cred+0x20/0x120 init_cifs_spnego+0x2d/0x170 [cifs] 0xffffffffc07801f3 do_one_initcall+0x51/0x1b0 do_init_module+0x60/0x1fd load_module+0x161e/0x1b60 SYSC_finit_module+0xa9/0x100 SyS_finit_module+0xe/0x10 Signed-off-by: Shu Wang Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a864e6575309..8c8b75d33f31 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1449,7 +1449,7 @@ exit_cifs(void) exit_cifs_idmap(); #endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); + exit_cifs_spnego(); #endif cifs_destroy_request_bufs(); cifs_destroy_mids(); From f5c4ba816315d3b813af16f5571f86c8d4e897bd Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Fri, 8 Sep 2017 18:48:33 +0800 Subject: [PATCH 0392/1322] cifs: release auth_key.response for reconnect. 
There is a race that cause cifs reconnect in cifs_mount, - cifs_mount - cifs_get_tcp_session - [ start thread cifs_demultiplex_thread - cifs_read_from_socket: -ECONNABORTED - DELAY_WORK smb2_reconnect_server ] - cifs_setup_session - [ smb2_reconnect_server ] auth_key.response was allocated in cifs_setup_session, and will release when the session destoried. So when session re- connect, auth_key.response should be check and released. Tested with my system: CIFS VFS: Free previous auth_key.response = ffff8800320bbf80 A simple auth_key.response allocation call trace: - cifs_setup_session - SMB2_sess_setup - SMB2_sess_auth_rawntlmssp_authenticate - build_ntlmssp_auth_blob - setup_ntlmv2_rsp Signed-off-by: Shu Wang Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg --- fs/cifs/connect.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8d38b22afb2b..0bfc2280436d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4154,6 +4154,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); + if (ses->auth_key.response) { + cifs_dbg(VFS, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); From 0603c96f3af50e2f9299fa410c224ab1d465e0f9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Sep 2017 19:57:18 -0500 Subject: [PATCH 0393/1322] SMB: Validate negotiate (to protect against downgrade) even if signing off As long as signing is supported (ie not a guest user connection) and connection is SMB3 or SMB3.02, then validate negotiate (protect against man in the middle downgrade attacks). We had been doing this only when signing was required, not when signing was just enabled, but this more closely matches recommended SMB3 behavior and is better security. Suggested by Metze. Signed-off-by: Steve French Reviewed-by: Jeremy Allison Acked-by: Stefan Metzmacher Reviewed-by: Ronnie Sahlberg CC: Stable --- fs/cifs/smb2pdu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index d499ce265c3b..6f0e6343c15e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -656,15 +656,22 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* * validation ioctl must be signed, so no point sending this if we - * can not sign it. We could eventually change this to selectively + * can not sign it (ie are not known user). Even if signing is not + * required (enabled but not negotiated), in those cases we selectively * sign just this, the first and only signed request on a connection. - * This is good enough for now since a user who wants better security - * would also enable signing on the mount. Having validation of - * negotiate info for signed connections helps reduce attack vectors + * Having validation of negotiate info helps reduce attack vectors. 
*/ - if (tcon->ses->server->sign == false) + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) return 0; /* validation requires signing */ + if (tcon->ses->user_name == NULL) { + cifs_dbg(FYI, "Can't validate negotiate: null user mount\n"); + return 0; /* validation requires signing */ + } + + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); + vneg_inbuf.Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, From dff37b58ca53b1978e02edf6f8c1dd681799342b Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:01 +0200 Subject: [PATCH 0394/1322] mlxsw: spectrum_switchdev: Change mc_router to mrouter Change the naming of mc_router to mrouter to keep consistency. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index d39ffbfcc436..22f8d7428d96 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -699,10 +699,10 @@ static int mlxsw_sp_port_attr_br_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, return -EINVAL; } -static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port, - struct switchdev_trans *trans, - struct net_device *orig_dev, - bool is_port_mc_router) +static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + bool is_port_mrouter) { struct mlxsw_sp_bridge_port *bridge_port; int err; @@ -720,12 +720,12 @@ static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port, err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, MLXSW_SP_FLOOD_TYPE_MC, - is_port_mc_router); + is_port_mrouter); if (err) return err; out: - bridge_port->mrouter = is_port_mc_router; + bridge_port->mrouter = is_port_mrouter; return 0; } @@ -793,9 +793,9 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev, attr->u.vlan_filtering); break; case SWITCHDEV_ATTR_ID_PORT_MROUTER: - err = mlxsw_sp_port_attr_mc_router_set(mlxsw_sp_port, trans, - attr->orig_dev, - attr->u.mrouter); + err = mlxsw_sp_port_attr_mrouter_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.mrouter); break; case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED: err = mlxsw_sp_port_mc_disabled_set(mlxsw_sp_port, trans, From 4cdc35e4ebf2e6b1cf4fe028eb9e711723f9199a Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:02 +0200 Subject: [PATCH 0395/1322] mlxsw: spectrum_switchdev: Add a ports bitmap to the mid db Add a bitmap of ports to the mid struct to hold the ports that are registered to this mid. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../mellanox/mlxsw/spectrum_switchdev.c | 20 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 7180d8f3de75..0424bee88407 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -95,6 +95,7 @@ struct mlxsw_sp_mid { u16 fid; u16 mid; unsigned int ref_count; + unsigned long *ports_in_mid; /* bits array */ }; enum mlxsw_sp_span_type { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 22f8d7428d96..0fde16a22b72 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1239,6 +1239,7 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, u16 fid) { struct mlxsw_sp_mid *mid; + size_t alloc_size; u16 mid_idx; mid_idx = find_first_zero_bit(mlxsw_sp->bridge->mids_bitmap, @@ -1250,6 +1251,14 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, if (!mid) return NULL; + alloc_size = sizeof(unsigned long) * + BITS_TO_LONGS(mlxsw_core_max_ports(mlxsw_sp->core)); + mid->ports_in_mid = kzalloc(alloc_size, GFP_KERNEL); + if (!mid->ports_in_mid) { + kfree(mid); + return NULL; + } + set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap); ether_addr_copy(mid->addr, addr); mid->fid = fid; @@ -1260,12 +1269,16 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, return mid; } -static int __mlxsw_sp_mc_dec_ref(struct mlxsw_sp *mlxsw_sp, +static int __mlxsw_sp_mc_dec_ref(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_mid *mid) { + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + + clear_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); if (--mid->ref_count == 0) { list_del(&mid->list); clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); + kfree(mid->ports_in_mid); kfree(mid); return 1; } @@ -1311,6 +1324,7 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, } } mid->ref_count++; + set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true, mid->ref_count == 1); @@ -1331,7 +1345,7 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, return 0; err_out: - __mlxsw_sp_mc_dec_ref(mlxsw_sp, mid); + __mlxsw_sp_mc_dec_ref(mlxsw_sp_port, mid); return err; } @@ -1437,7 +1451,7 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, netdev_err(dev, "Unable to remove port from SMID\n"); mid_idx = mid->mid; - if (__mlxsw_sp_mc_dec_ref(mlxsw_sp, mid)) { + if (__mlxsw_sp_mc_dec_ref(mlxsw_sp_port, mid)) { err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index, mid_idx, false); if (err) From 0161b9505ab59d4bfc0de66073c9240d1b05040d Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:03 +0200 Subject: [PATCH 0396/1322] mlxsw: spectrum_switchdev: Remove reference count from mid Since there is a bitmap for the ports registered to each mid, there is no need for a ref count, since it will always be the number of set bits in this bitmap. Any check of the ref count was replaced with checking if the bitmap is empty. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 - .../mellanox/mlxsw/spectrum_switchdev.c | 20 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 0424bee88407..6fd0afe4b7a3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -94,7 +94,6 @@ struct mlxsw_sp_mid { unsigned char addr[ETH_ALEN]; u16 fid; u16 mid; - unsigned int ref_count; unsigned long *ports_in_mid; /* bits array */ }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 0fde16a22b72..cb2275ed42ab 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1263,19 +1263,19 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, ether_addr_copy(mid->addr, addr); mid->fid = fid; mid->mid = mid_idx; - mid->ref_count = 0; list_add_tail(&mid->list, &mlxsw_sp->bridge->mids_list); return mid; } -static int __mlxsw_sp_mc_dec_ref(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_mid *mid) +static int mlxsw_sp_port_remove_from_mid(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mid *mid) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; clear_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); - if (--mid->ref_count == 0) { + if (bitmap_empty(mid->ports_in_mid, + mlxsw_core_max_ports(mlxsw_sp->core))) { list_del(&mid->list); clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); kfree(mid->ports_in_mid); @@ -1296,6 +1296,7 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_device *bridge_device; struct mlxsw_sp_bridge_port *bridge_port; struct mlxsw_sp_mid *mid; + bool is_new_mid = false; u16 fid_index; int err = 0; @@ -1322,18 +1323,17 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, netdev_err(dev, "Unable to allocate MC group\n"); return -ENOMEM; } + is_new_mid = true; } - mid->ref_count++; set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); - err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true, - mid->ref_count == 1); + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true, is_new_mid); if (err) { netdev_err(dev, "Unable to set SMID\n"); goto err_out; } - if (mid->ref_count == 1) { + if (is_new_mid) { err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index, mid->mid, true); if (err) { @@ -1345,7 +1345,7 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, return 0; err_out: - __mlxsw_sp_mc_dec_ref(mlxsw_sp_port, mid); + mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); return err; } @@ -1451,7 +1451,7 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, netdev_err(dev, "Unable to remove port from SMID\n"); mid_idx = mid->mid; - if (__mlxsw_sp_mc_dec_ref(mlxsw_sp_port, mid)) { + if (mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid)) { err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index, mid_idx, false); if (err) From b80888a9194f3aecd5edf6a7ede5c23d77bade8b Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:04 +0200 Subject: [PATCH 0397/1322] mlxsw: spectrum_switchdev: Save mids list per bridge device Instead of saving all the mids in the same list, save them per vlan device. This change allows a more efficient mid find. 
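To make that gain concrete: the lookup now only walks the multicast groups of one bridge device rather than every group in the switch. A stand-alone sketch of the scoped find, with simplified structures that mirror (but do not copy) __mlxsw_sp_mc_get above:

#include <linux/list.h>
#include <linux/etherdevice.h>

struct my_bridge_device {
	struct list_head mids_list;	/* per-bridge, not one global list */
};

struct my_mid {
	struct list_head list;
	unsigned char addr[ETH_ALEN];
	u16 fid;
};

static struct my_mid *my_mc_get(struct my_bridge_device *bridge_device,
				const unsigned char *addr, u16 fid)
{
	struct my_mid *mid;

	/* Walks only the groups that belong to this bridge device. */
	list_for_each_entry(mid, &bridge_device->mids_list, list)
		if (ether_addr_equal(mid->addr, addr) && mid->fid == fid)
			return mid;
	return NULL;
}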
Also, in the next patches, there will be added a lot of loops over all the mids in bridge device for multicast disable, mrouter change and ndb flush. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 49 +++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index cb2275ed42ab..2ba8a44e1933 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -67,7 +67,6 @@ struct mlxsw_sp_bridge { u32 ageing_time; bool vlan_enabled_exists; struct list_head bridges_list; - struct list_head mids_list; DECLARE_BITMAP(mids_bitmap, MLXSW_SP_MID_MAX); const struct mlxsw_sp_bridge_ops *bridge_8021q_ops; const struct mlxsw_sp_bridge_ops *bridge_8021d_ops; @@ -77,6 +76,7 @@ struct mlxsw_sp_bridge_device { struct net_device *dev; struct list_head list; struct list_head ports_list; + struct list_head mids_list; u8 vlan_enabled:1, multicast_enabled:1; const struct mlxsw_sp_bridge_ops *ops; @@ -161,6 +161,7 @@ mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge, } else { bridge_device->ops = bridge->bridge_8021d_ops; } + INIT_LIST_HEAD(&bridge_device->mids_list); list_add(&bridge_device->list, &bridge->bridges_list); return bridge_device; @@ -170,10 +171,17 @@ static void mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge, struct mlxsw_sp_bridge_device *bridge_device) { + struct mlxsw_sp_mid *mid, *tmp; + list_del(&bridge_device->list); if (bridge_device->vlan_enabled) bridge->vlan_enabled_exists = false; WARN_ON(!list_empty(&bridge_device->ports_list)); + list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) { + list_del(&mid->list); + clear_bit(mid->mid, bridge->mids_bitmap); + kfree(mid); + } kfree(bridge_device); } @@ -1221,22 +1229,25 @@ static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid, return err; } -static struct mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp *mlxsw_sp, - const unsigned char *addr, - u16 fid) +static struct +mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device, + const unsigned char *addr, + u16 fid) { struct mlxsw_sp_mid *mid; - list_for_each_entry(mid, &mlxsw_sp->bridge->mids_list, list) { + list_for_each_entry(mid, &bridge_device->mids_list, list) { if (ether_addr_equal(mid->addr, addr) && mid->fid == fid) return mid; } return NULL; } -static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, - const unsigned char *addr, - u16 fid) +static struct +mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_device *bridge_device, + const unsigned char *addr, + u16 fid) { struct mlxsw_sp_mid *mid; size_t alloc_size; @@ -1263,7 +1274,7 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, ether_addr_copy(mid->addr, addr); mid->fid = fid; mid->mid = mid_idx; - list_add_tail(&mid->list, &mlxsw_sp->bridge->mids_list); + list_add_tail(&mid->list, &bridge_device->mids_list); return mid; } @@ -1316,9 +1327,10 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); - mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, fid_index); + mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index); if (!mid) { - mid = __mlxsw_sp_mc_alloc(mlxsw_sp, mdb->addr, fid_index); + mid = 
__mlxsw_sp_mc_alloc(mlxsw_sp, bridge_device, mdb->addr, + fid_index); if (!mid) { netdev_err(dev, "Unable to allocate MC group\n"); return -ENOMEM; @@ -1440,7 +1452,7 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); - mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, fid_index); + mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index); if (!mid) { netdev_err(dev, "Unable to remove port from MC DB\n"); return -EINVAL; @@ -1995,17 +2007,6 @@ static void mlxsw_sp_fdb_fini(struct mlxsw_sp *mlxsw_sp) } -static void mlxsw_sp_mids_fini(struct mlxsw_sp *mlxsw_sp) -{ - struct mlxsw_sp_mid *mid, *tmp; - - list_for_each_entry_safe(mid, tmp, &mlxsw_sp->bridge->mids_list, list) { - list_del(&mid->list); - clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); - kfree(mid); - } -} - int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp) { struct mlxsw_sp_bridge *bridge; @@ -2017,7 +2018,6 @@ int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp) bridge->mlxsw_sp = mlxsw_sp; INIT_LIST_HEAD(&mlxsw_sp->bridge->bridges_list); - INIT_LIST_HEAD(&mlxsw_sp->bridge->mids_list); bridge->bridge_8021q_ops = &mlxsw_sp_bridge_8021q_ops; bridge->bridge_8021d_ops = &mlxsw_sp_bridge_8021d_ops; @@ -2028,7 +2028,6 @@ int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp) void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp) { mlxsw_sp_fdb_fini(mlxsw_sp); - mlxsw_sp_mids_fini(mlxsw_sp); WARN_ON(!list_empty(&mlxsw_sp->bridge->bridges_list)); kfree(mlxsw_sp->bridge); } From 5f9abc597cdd7a63433b7ebfcf26ee2746a76638 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:05 +0200 Subject: [PATCH 0398/1322] mlxsw: spectrum_switchdev: Break smid write function Break the smid write function into two, one that cleans the ports that might be still written there and one that changes an exiting mid entry. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 2ba8a44e1933..09ead97d9442 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1190,7 +1190,7 @@ mlxsw_sp_port_fdb_set(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr, - u16 fid, u16 mid, bool adding) + u16 fid, u16 mid_idx, bool adding) { char *sfd_pl; int err; @@ -1201,16 +1201,16 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr, mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0); mlxsw_reg_sfd_mc_pack(sfd_pl, 0, addr, fid, - MLXSW_REG_SFD_REC_ACTION_NOP, mid); + MLXSW_REG_SFD_REC_ACTION_NOP, mid_idx); err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); kfree(sfd_pl); return err; } -static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid, - bool add, bool clear_all_ports) +/* clean the an entry from the HW and write there a full new entry */ +static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, + u16 mid_idx) { - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; char *smid_pl; int err, i; @@ -1218,12 +1218,29 @@ static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid, if (!smid_pl) return -ENOMEM; - mlxsw_reg_smid_pack(smid_pl, mid, mlxsw_sp_port->local_port, add); - if (clear_all_ports) { - for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++) - if (mlxsw_sp->ports[i]) - mlxsw_reg_smid_port_mask_set(smid_pl, i, 1); + mlxsw_reg_smid_pack(smid_pl, mid_idx, 0, false); + for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++) { + if (mlxsw_sp->ports[i]) + mlxsw_reg_smid_port_mask_set(smid_pl, i, 1); } + + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); + kfree(smid_pl); + return err; +} + +static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, + u16 mid_idx, bool add) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char *smid_pl; + int err; + + smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL); + if (!smid_pl) + return -ENOMEM; + + mlxsw_reg_smid_pack(smid_pl, mid_idx, mlxsw_sp_port->local_port, add); err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); kfree(smid_pl); return err; @@ -1336,10 +1353,11 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, return -ENOMEM; } is_new_mid = true; + mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid->mid); } set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); - err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true, is_new_mid); + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true); if (err) { netdev_err(dev, "Unable to set SMID\n"); goto err_out; @@ -1458,7 +1476,7 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, return -EINVAL; } - err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false, false); + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); if (err) netdev_err(dev, "Unable to remove port from SMID\n"); From 73b433e803d2a3547ee38d1fb2a0bc6f3b03a6d9 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:06 +0200 Subject: [PATCH 0399/1322] mlxsw: spectrum_switchdev: Attach mid id allocation to HW write Attach mid getting and releasing mid id to the HW write / remove, and add a flag 
to indicate whether the mid is in the HW. It is done because mid id is also HW index to this mid. This change allows adding in the following patches the ability to have a mid in the mdb cache but not in the HW. It will be useful for being able to disable the multicast. It means that the mdb is being written / delete to the HW in the mid allocation / removing function, not after them. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../mellanox/mlxsw/spectrum_switchdev.c | 90 ++++++++++++------- 2 files changed, 57 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 6fd0afe4b7a3..e907ec446a73 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -94,6 +94,7 @@ struct mlxsw_sp_mid { unsigned char addr[ETH_ALEN]; u16 fid; u16 mid; + bool in_hw; unsigned long *ports_in_mid; /* bits array */ }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 09ead97d9442..9dd05d87b662 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1260,6 +1260,42 @@ mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device, return NULL; } +static bool +mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mid *mid) +{ + u16 mid_idx; + int err; + + mid_idx = find_first_zero_bit(mlxsw_sp->bridge->mids_bitmap, + MLXSW_SP_MID_MAX); + if (mid_idx == MLXSW_SP_MID_MAX) + return false; + + mid->mid = mid_idx; + err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx); + if (err) + return false; + + err = mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid_idx, + true); + if (err) + return false; + + set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap); + mid->in_hw = true; + return true; +} + +static int mlxsw_sp_mc_remove_mdb_entry(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mid *mid) +{ + clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); + mid->in_hw = false; + return mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid->mid, + false); +} + static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_bridge_device *bridge_device, @@ -1268,12 +1304,6 @@ mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_mid *mid; size_t alloc_size; - u16 mid_idx; - - mid_idx = find_first_zero_bit(mlxsw_sp->bridge->mids_bitmap, - MLXSW_SP_MID_MAX); - if (mid_idx == MLXSW_SP_MID_MAX) - return NULL; mid = kzalloc(sizeof(*mid), GFP_KERNEL); if (!mid) @@ -1281,36 +1311,43 @@ mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, alloc_size = sizeof(unsigned long) * BITS_TO_LONGS(mlxsw_core_max_ports(mlxsw_sp->core)); - mid->ports_in_mid = kzalloc(alloc_size, GFP_KERNEL); - if (!mid->ports_in_mid) { - kfree(mid); - return NULL; - } - set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap); + mid->ports_in_mid = kzalloc(alloc_size, GFP_KERNEL); + if (!mid->ports_in_mid) + goto err_ports_in_mid_alloc; + ether_addr_copy(mid->addr, addr); mid->fid = fid; - mid->mid = mid_idx; + mid->in_hw = false; + if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid)) + goto err_write_mdb_entry; + list_add_tail(&mid->list, &bridge_device->mids_list); return mid; + +err_write_mdb_entry: + kfree(mid->ports_in_mid); +err_ports_in_mid_alloc: + kfree(mid); + return NULL; } 
static int mlxsw_sp_port_remove_from_mid(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_mid *mid) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + int err = 0; clear_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); if (bitmap_empty(mid->ports_in_mid, mlxsw_core_max_ports(mlxsw_sp->core))) { + err = mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid); list_del(&mid->list); - clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); kfree(mid->ports_in_mid); kfree(mid); - return 1; } - return 0; + return err; } static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, @@ -1324,7 +1361,6 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_device *bridge_device; struct mlxsw_sp_bridge_port *bridge_port; struct mlxsw_sp_mid *mid; - bool is_new_mid = false; u16 fid_index; int err = 0; @@ -1352,8 +1388,6 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, netdev_err(dev, "Unable to allocate MC group\n"); return -ENOMEM; } - is_new_mid = true; - mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid->mid); } set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); @@ -1363,15 +1397,6 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, goto err_out; } - if (is_new_mid) { - err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index, - mid->mid, true); - if (err) { - netdev_err(dev, "Unable to set MC SFD\n"); - goto err_out; - } - } - return 0; err_out: @@ -1481,12 +1506,9 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, netdev_err(dev, "Unable to remove port from SMID\n"); mid_idx = mid->mid; - if (mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid)) { - err = mlxsw_sp_port_mdb_op(mlxsw_sp, mdb->addr, fid_index, - mid_idx, false); - if (err) - netdev_err(dev, "Unable to remove MC SFD\n"); - } + err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); + if (err) + netdev_err(dev, "Unable to remove MC SFD\n"); return err; } From 061e55bfb83e632afcd34130bb19fe7a32325b02 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:07 +0200 Subject: [PATCH 0400/1322] mlxsw: spectrum_switchdev: Break mid deletion into two function Break mid deletion into two function, so it will be possible in the future to delete a mid entry for other reasons then switchdev command (like port deletion). Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 9dd05d87b662..7f622de6331c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1468,6 +1468,25 @@ static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port, return 0; } +static int +__mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + struct mlxsw_sp_mid *mid) +{ + struct net_device *dev = mlxsw_sp_port->dev; + int err; + + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); + if (err) + netdev_err(dev, "Unable to remove port from SMID\n"); + + err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); + if (err) + netdev_err(dev, "Unable to remove MC SFD\n"); + + return err; +} + static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, const struct switchdev_obj_port_mdb *mdb) { @@ -1479,8 +1498,6 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_port *bridge_port; struct mlxsw_sp_mid *mid; u16 fid_index; - u16 mid_idx; - int err = 0; bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); if (!bridge_port) @@ -1501,16 +1518,7 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, return -EINVAL; } - err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); - if (err) - netdev_err(dev, "Unable to remove port from SMID\n"); - - mid_idx = mid->mid; - err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); - if (err) - netdev_err(dev, "Unable to remove MC SFD\n"); - - return err; + return __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid); } static int mlxsw_sp_port_obj_del(struct net_device *dev, From 846fd8a0e7dcd9f455a86dc17ddf0a51c124f9c0 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:08 +0200 Subject: [PATCH 0401/1322] mlxsw: spectrum_switchdev: Don't write mids to the HW when mc is disabled Don't write multicast related data to the HW when mc is disabled. Also, don't allocate mid id to new mids (so the remove function could know that they weren't wrote to the HW) Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 7f622de6331c..cea257a77d09 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1290,6 +1290,9 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_mc_remove_mdb_entry(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_mid *mid) { + if (!mid->in_hw) + return 0; + clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); mid->in_hw = false; return mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid->mid, @@ -1319,11 +1322,15 @@ mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, ether_addr_copy(mid->addr, addr); mid->fid = fid; mid->in_hw = false; + + if (!bridge_device->multicast_enabled) + goto out; + if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid)) goto err_write_mdb_entry; +out: list_add_tail(&mid->list, &bridge_device->mids_list); - return mid; err_write_mdb_entry: @@ -1391,6 +1398,9 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, } set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid); + if (!bridge_device->multicast_enabled) + return 0; + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true); if (err) { netdev_err(dev, "Unable to set SMID\n"); @@ -1476,9 +1486,12 @@ __mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *dev = mlxsw_sp_port->dev; int err; - err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); - if (err) - netdev_err(dev, "Unable to remove port from SMID\n"); + if (bridge_port->bridge_device->multicast_enabled) { + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); + + if (err) + netdev_err(dev, "Unable to remove port from SMID\n"); + } err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); if (err) From 2e3496cd3488729200ff0f1c6381b7016ecd41bd Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:09 +0200 Subject: [PATCH 0402/1322] mlxsw: spectrum_switchdev: Disable mdb when mc is disabled Remove all the mdb entries from the HW when mc is being disabled and re-write them when it is being enabled. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index cea257a77d09..79806af87b93 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -121,6 +121,11 @@ mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_bridge_port *bridge_port, u16 fid_index); +static void +mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_device + *bridge_device); + static struct mlxsw_sp_bridge_device * mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge, const struct net_device *br_dev) @@ -757,6 +762,12 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, if (!bridge_device) return 0; + if (bridge_device->multicast_enabled != !mc_disabled) { + bridge_device->multicast_enabled = !mc_disabled; + mlxsw_sp_bridge_mdb_mc_enable_sync(mlxsw_sp_port, + bridge_device); + } + list_for_each_entry(bridge_port, &bridge_device->ports_list, list) { enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC; bool member = mc_disabled ? true : bridge_port->mrouter; @@ -1207,9 +1218,8 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr, return err; } -/* clean the an entry from the HW and write there a full new entry */ -static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, - u16 mid_idx) +static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx, + long *ports_bitmap) { char *smid_pl; int err, i; @@ -1224,6 +1234,9 @@ static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, mlxsw_reg_smid_port_mask_set(smid_pl, i, 1); } + for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core)) + mlxsw_reg_smid_port_set(smid_pl, i, 1); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); kfree(smid_pl); return err; @@ -1273,7 +1286,8 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, return false; mid->mid = mid_idx; - err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx); + err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, + mid->ports_in_mid); if (err) return false; @@ -1414,6 +1428,25 @@ err_out: return err; } +static void +mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_device + *bridge_device) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_mid *mid; + bool mc_enabled; + + mc_enabled = bridge_device->multicast_enabled; + + list_for_each_entry(mid, &bridge_device->mids_list, list) { + if (mc_enabled) + mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid); + else + mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid); + } +} + static int mlxsw_sp_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans) From 218a8f8a6379bfd359e58f369b7b7660cd12e865 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:10 +0200 Subject: [PATCH 0403/1322] mlxsw: spectrum_switchdev: Use generic mc flood function Use the generic mc flood function to decide whether to flood mc to a port when mc is being enabled / disabled. Move this function in the file to avoid forward declaration. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 79806af87b93..19ac206879ff 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -742,6 +742,14 @@ out: return 0; } +static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port) +{ + const struct mlxsw_sp_bridge_device *bridge_device; + + bridge_device = bridge_port->bridge_device; + return !bridge_device->multicast_enabled ? true : bridge_port->mrouter; +} + static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, struct switchdev_trans *trans, struct net_device *orig_dev, @@ -770,7 +778,7 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, list_for_each_entry(bridge_port, &bridge_device->ports_list, list) { enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC; - bool member = mc_disabled ? true : bridge_port->mrouter; + bool member = mlxsw_sp_mc_flood(bridge_port); err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, @@ -829,14 +837,6 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev, return err; } -static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port) -{ - const struct mlxsw_sp_bridge_device *bridge_device; - - bridge_device = bridge_port->bridge_device; - return !bridge_device->multicast_enabled ? true : bridge_port->mrouter; -} - static int mlxsw_sp_port_vlan_fid_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, struct mlxsw_sp_bridge_port *bridge_port) From 9dad51bdaa4b1846bd9d5307b54dca74efa555ea Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:11 +0200 Subject: [PATCH 0404/1322] mlxsw: spectrum_switchdev: Flood mc when mc is disabled by user flag When multicast is disabled, flood mc packets only to port that are marked BR_MCAST_FLOOD (instead to all). Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 19ac206879ff..50c4d7c735df 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -262,7 +262,8 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device, bridge_port->dev = brport_dev; bridge_port->bridge_device = bridge_device; bridge_port->stp_state = BR_STATE_DISABLED; - bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC; + bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC | + BR_MCAST_FLOOD; INIT_LIST_HEAD(&bridge_port->vlans_list); list_add(&bridge_port->list, &bridge_device->ports_list); bridge_port->ref_count = 1; @@ -468,7 +469,8 @@ static int mlxsw_sp_port_attr_get(struct net_device *dev, &attr->u.brport_flags); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT: - attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD; + attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD | + BR_MCAST_FLOOD; break; default: return -EOPNOTSUPP; @@ -653,8 +655,18 @@ static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, if (err) return err; - memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags)); + if (bridge_port->bridge_device->multicast_enabled) + goto out; + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, + MLXSW_SP_FLOOD_TYPE_MC, + brport_flags & + BR_MCAST_FLOOD); + if (err) + return err; + +out: + memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags)); return 0; } @@ -747,7 +759,8 @@ static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port) const struct mlxsw_sp_bridge_device *bridge_device; bridge_device = bridge_port->bridge_device; - return !bridge_device->multicast_enabled ? true : bridge_port->mrouter; + return bridge_device->multicast_enabled ? bridge_port->mrouter : + bridge_port->flags & BR_MCAST_FLOOD; } static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, From bb5355b27c9da3786a2b5e1583c9d64f492ac7ad Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:12 +0200 Subject: [PATCH 0405/1322] mlxsw: spectrum_switchdev: Flush the mdb when a port is being removed When a port is being removed from a bridge, flush the bridge mdb to remove the mids of that port. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 50c4d7c735df..bc0787312a06 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -121,6 +121,10 @@ mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_bridge_port *bridge_port, u16 fid_index); +static void +mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port); + static void mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_device @@ -176,17 +180,11 @@ static void mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge, struct mlxsw_sp_bridge_device *bridge_device) { - struct mlxsw_sp_mid *mid, *tmp; - list_del(&bridge_device->list); if (bridge_device->vlan_enabled) bridge->vlan_enabled_exists = false; WARN_ON(!list_empty(&bridge_device->ports_list)); - list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) { - list_del(&mid->list); - clear_bit(mid->mid, bridge->mids_bitmap); - kfree(mid); - } + WARN_ON(!list_empty(&bridge_device->mids_list)); kfree(bridge_device); } @@ -987,24 +985,28 @@ mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan) struct mlxsw_sp_bridge_vlan *bridge_vlan; struct mlxsw_sp_bridge_port *bridge_port; u16 vid = mlxsw_sp_port_vlan->vid; - bool last; + bool last_port, last_vlan; if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021Q && mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021D)) return; bridge_port = mlxsw_sp_port_vlan->bridge_port; + last_vlan = list_is_singular(&bridge_port->vlans_list); bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid); - last = list_is_singular(&bridge_vlan->port_vlan_list); + last_port = list_is_singular(&bridge_vlan->port_vlan_list); list_del(&mlxsw_sp_port_vlan->bridge_vlan_node); mlxsw_sp_bridge_vlan_put(bridge_vlan); mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED); mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false); - if (last) + if (last_port) mlxsw_sp_bridge_port_fdb_flush(mlxsw_sp_port->mlxsw_sp, bridge_port, mlxsw_sp_fid_index(fid)); + if (last_vlan) + mlxsw_sp_bridge_port_mdb_flush(mlxsw_sp_port, bridge_port); + mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan); mlxsw_sp_bridge_port_put(mlxsw_sp_port->mlxsw_sp->bridge, bridge_port); @@ -1580,6 +1582,23 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, return __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid); } +static void +mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port) +{ + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_mid *mid, *tmp; + + bridge_device = bridge_port->bridge_device; + + list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) { + if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) { + __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, + mid); + } + } +} + static int mlxsw_sp_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { From 3fba877cb68cfbc1826dd4abc7b1a8fe862adb2a Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:13 +0200 Subject: [PATCH 0406/1322] mlxsw: spectrum_switchdev: Flood all mc packets to mrouter ports When mc is enabled, whenever a 
mc packet doesn't hit any mdb entry it is being flood to the ports marked as mrouters. However, all mc packets should be flooded to them even if they match an entry in the mdb. This patch adds the mrouter ports to every mdb entry that is being written to the HW. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 65 +++++++++++++++++-- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index bc0787312a06..146beaa6b2da 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1288,10 +1288,55 @@ mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device, return NULL; } +static void +mlxsw_sp_bridge_port_get_ports_bitmap(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_port *bridge_port, + unsigned long *ports_bitmap) +{ + struct mlxsw_sp_port *mlxsw_sp_port; + u64 max_lag_members, i; + int lag_id; + + if (!bridge_port->lagged) { + set_bit(bridge_port->system_port, ports_bitmap); + } else { + max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MAX_LAG_MEMBERS); + lag_id = bridge_port->lag_id; + for (i = 0; i < max_lag_members; i++) { + mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp, + lag_id, i); + if (mlxsw_sp_port) + set_bit(mlxsw_sp_port->local_port, + ports_bitmap); + } + } +} + +static void +mlxsw_sp_mc_get_mrouters_bitmap(unsigned long *flood_bitmap, + struct mlxsw_sp_bridge_device *bridge_device, + struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_bridge_port *bridge_port; + + list_for_each_entry(bridge_port, &bridge_device->ports_list, list) { + if (bridge_port->mrouter) { + mlxsw_sp_bridge_port_get_ports_bitmap(mlxsw_sp, + bridge_port, + flood_bitmap); + } + } +} + static bool mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_mid *mid) + struct mlxsw_sp_mid *mid, + struct mlxsw_sp_bridge_device *bridge_device) { + long *flood_bitmap; + int num_of_ports; + int alloc_size; u16 mid_idx; int err; @@ -1300,9 +1345,18 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, if (mid_idx == MLXSW_SP_MID_MAX) return false; + num_of_ports = mlxsw_core_max_ports(mlxsw_sp->core); + alloc_size = sizeof(long) * BITS_TO_LONGS(num_of_ports); + flood_bitmap = kzalloc(alloc_size, GFP_KERNEL); + if (!flood_bitmap) + return false; + + bitmap_copy(flood_bitmap, mid->ports_in_mid, num_of_ports); + mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp); + mid->mid = mid_idx; - err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, - mid->ports_in_mid); + err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap); + kfree(flood_bitmap); if (err) return false; @@ -1355,7 +1409,7 @@ mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, if (!bridge_device->multicast_enabled) goto out; - if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid)) + if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, bridge_device)) goto err_write_mdb_entry; out: @@ -1456,7 +1510,8 @@ mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, list_for_each_entry(mid, &bridge_device->mids_list, list) { if (mc_enabled) - mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid); + mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, + bridge_device); else mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid); } From 3ddda1178e41bbe26bb5c6ebf66ae3d0a87ac410 Mon Sep 17 00:00:00 2001 From: Nogah Frankel 
Date: Wed, 20 Sep 2017 16:15:14 +0200 Subject: [PATCH 0407/1322] mlxsw: spectrum_switchdev: Update the mdb of mrouter port change Whenever a port starts / stops being mrouter, update all the mdb entries in the HW to flood / stop flooding mc packets there. The change should happen only if the port is not in the mid. (If it is, the mid should flood mc packets to this port anyway) Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 146beaa6b2da..bf1a17557fb3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -130,6 +130,11 @@ mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_bridge_device *bridge_device); +static void +mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + bool add); + static struct mlxsw_sp_bridge_device * mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge, const struct net_device *br_dev) @@ -747,6 +752,8 @@ static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, if (err) return err; + mlxsw_sp_port_mrouter_update_mdb(mlxsw_sp_port, bridge_port, + is_port_mrouter); out: bridge_port->mrouter = is_port_mrouter; return 0; @@ -1517,6 +1524,22 @@ mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port, } } +static void +mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_bridge_port *bridge_port, + bool add) +{ + struct mlxsw_sp_bridge_device *bridge_device; + struct mlxsw_sp_mid *mid; + + bridge_device = bridge_port->bridge_device; + + list_for_each_entry(mid, &bridge_device->mids_list, list) { + if (!test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) + mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, add); + } +} + static int mlxsw_sp_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct switchdev_trans *trans) From 0166277706e57779f06b741d25a9e86d99726e2a Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:15 +0200 Subject: [PATCH 0408/1322] mlxsw: spectrum_switchdev: Remove mrouter flood in mdb flush In mdb flush the port is being removed from all the mids it is registered to. But if the port is mrouter, all the mids floods to it. This patch remove mrouter ports from mids it is not registered to in the mdb flush. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index bf1a17557fb3..459cedc23c47 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1673,6 +1673,9 @@ mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port, if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) { __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid); + } else if (bridge_device->multicast_enabled && + bridge_port->mrouter) { + mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); } } } From ded711c87a0411a7f3f56f8c575d7b642ee0110e Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 20 Sep 2017 16:15:16 +0200 Subject: [PATCH 0409/1322] mlxsw: spectrum_switchdev: Consider mrouter status for mdb changes When a mrouter is registered or leaves a mid, don't update the HW. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 459cedc23c47..0f9eac5f4ebf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1491,6 +1491,9 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, if (!bridge_device->multicast_enabled) return 0; + if (bridge_port->mrouter) + return 0; + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true); if (err) { netdev_err(dev, "Unable to set SMID\n"); @@ -1613,10 +1616,12 @@ __mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, int err; if (bridge_port->bridge_device->multicast_enabled) { - err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false); - - if (err) - netdev_err(dev, "Unable to remove port from SMID\n"); + if (bridge_port->bridge_device->multicast_enabled) { + err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, + false); + if (err) + netdev_err(dev, "Unable to remove port from SMID\n"); + } } err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid); From a90bcb86ae700c12432446c4aa1819e7b8e172ec Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Tue, 29 Aug 2017 11:20:32 -0700 Subject: [PATCH 0410/1322] iov_iter: fix page_copy_sane for compound pages Issue is that if the data crosses a page boundary inside a compound page, this check will incorrectly trigger a WARN_ON. To fix this, compute the order using the head of the compound page and adjust the offset to be relative to that head. 
Fixes: 72e809ed81ed ("iov_iter: sanity checks for copy to/from page primitives") Signed-off-by: Petar Penkov CC: Al Viro CC: Eric Dumazet Signed-off-by: Al Viro --- lib/iov_iter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 52c8dd6d8e82..1c1c06ddc20a 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -687,8 +687,10 @@ EXPORT_SYMBOL(_copy_from_iter_full_nocache); static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { - size_t v = n + offset; - if (likely(n <= v && v <= (PAGE_SIZE << compound_order(page)))) + struct page *head = compound_head(page); + size_t v = n + offset + page_address(page) - page_address(head); + + if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) return true; WARN_ON(1); return false; From 58aff0af757356065f33290d96a9cd46dfbcae88 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 Sep 2017 17:47:38 +0100 Subject: [PATCH 0411/1322] ipc/shm: Fix order of parameters when calling copy_compat_shmid_to_user Commit 553f770ef71b ("ipc: move compat shmctl to native") moved the compat IPC syscall handling into ipc/shm.c and refactored the struct accessors in the process. Unfortunately, the call to copy_compat_shmid_to_user when handling a compat {IPC,SHM}_STAT command gets the arguments the wrong way round, passing a kernel stack address as the user buffer (destination) and the user buffer as the kernel stack address (source). This patch fixes the parameter ordering so the buffers are accessed correctly. Cc: Al Viro Cc: Andrew Morton Signed-off-by: Will Deacon Signed-off-by: Al Viro --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 1b3adfe3c60e..1e2b1692ba2c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1237,7 +1237,7 @@ COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr) err = shmctl_stat(ns, shmid, cmd, &sem64); if (err < 0) return err; - if (copy_compat_shmid_to_user(&sem64, uptr, version)) + if (copy_compat_shmid_to_user(uptr, &sem64, version)) err = -EFAULT; return err; From 3e77adeea3c5393c9b624832f65441e92867f618 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 7 Sep 2017 16:35:40 +1000 Subject: [PATCH 0412/1322] powerpc/eeh: Create PHB PEs after EEH is initialized Otherwise we end up not yet having computed the right diag data size on powernv where EEH initialization is delayed, thus causing memory corruption later on when calling OPAL. 
Fixes: 5cb1f8fdddb7 ("powerpc/powernv/pci: Dynamically allocate PHB diag data") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Benjamin Herrenschmidt Acked-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 4 ++++ arch/powerpc/kernel/eeh_dev.c | 18 ------------------ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9e816787c0d4..116000b45531 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1019,6 +1019,10 @@ int eeh_init(void) } else if ((ret = eeh_ops->init())) return ret; + /* Initialize PHB PEs */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + eeh_dev_phb_init_dynamic(hose); + /* Initialize EEH event */ ret = eeh_event_init(); if (ret) diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index ad04ecd63c20..a34e6912c15e 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -78,21 +78,3 @@ void eeh_dev_phb_init_dynamic(struct pci_controller *phb) /* EEH PE for PHB */ eeh_phb_pe_create(phb); } - -/** - * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs - * - * Scan all the existing PHBs and create EEH devices for their OF - * nodes and their children OF nodes - */ -static int __init eeh_dev_phb_init(void) -{ - struct pci_controller *phb, *tmp; - - list_for_each_entry_safe(phb, tmp, &hose_list, list_node) - eeh_dev_phb_init_dynamic(phb); - - return 0; -} - -core_initcall(eeh_dev_phb_init); From 087ff6a5ae3052bb2835e191094b793789cb8817 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Sep 2017 17:02:51 -0400 Subject: [PATCH 0413/1322] powerpc/pseries: Fix "OF: ERROR: Bad of_node_put() on /cpus" during DLPAR Commit 215ee763f8cb ("powerpc: pseries: remove dlpar_attach_node dependency on full path") reworked dlpar_attach_node() to no longer look up the parent node "/cpus", but instead to have the parent node passed by the caller in the function parameter list. As a result dlpar_attach_node() is no longer responsible for freeing the reference to the parent node. However, commit 215ee763f8cb failed to remove the of_node_put(parent) call in dlpar_attach_node(), or to take into account that the reference to the parent in the caller dlpar_cpu_add() needs to be held until after dlpar_attach_node() returns. As a result doing repeated cpu add/remove dlpar operations will eventually result in the following error: OF: ERROR: Bad of_node_put() on /cpus CPU: 0 PID: 10896 Comm: drmgr Not tainted 4.13.0-autotest #1 Call Trace: dump_stack+0x15c/0x1f8 (unreliable) of_node_release+0x1a4/0x1c0 kobject_put+0x1a8/0x310 kobject_del+0xbc/0xf0 __of_detach_node_sysfs+0x144/0x210 of_detach_node+0xf0/0x180 dlpar_detach_node+0xc4/0x120 dlpar_cpu_remove+0x280/0x560 dlpar_cpu_release+0xbc/0x1b0 arch_cpu_release+0x6c/0xb0 cpu_release_store+0xa0/0x100 dev_attr_store+0x68/0xa0 sysfs_kf_write+0xa8/0xf0 kernfs_fop_write+0x2cc/0x400 __vfs_write+0x5c/0x340 vfs_write+0x1a8/0x3d0 SyS_write+0xa8/0x1a0 system_call+0x58/0x6c Fix the issue by removing the of_node_put(parent) call from dlpar_attach_node(), and ensuring that the reference to the parent node is properly held and released by the caller dlpar_cpu_add(). 
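The ownership rule being restored, condensed into straight-line form (an illustrative sketch of dlpar_cpu_add() after the fix, with warning messages and exact error codes simplified; the parent lookup is written as of_find_node_by_path("/cpus") per the description above):

    parent = of_find_node_by_path("/cpus");          /* caller takes the reference */
    if (!parent)
            return -ENODEV;                          /* error handling condensed */

    dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
    if (!dn) {
            dlpar_release_drc(drc_index);
            of_node_put(parent);                     /* drop it on the error path too */
            return -EINVAL;
    }

    rc = dlpar_attach_node(dn, parent);              /* callee no longer puts parent */
    of_node_put(parent);                             /* single put, after the last use */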
Fixes: 215ee763f8cb ("powerpc: pseries: remove dlpar_attach_node dependency on full path") Signed-off-by: Tyrel Datwyler Reported-by: Abdul Haleem [mpe: Add a comment in the code and frob the change log slightly] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dlpar.c | 1 - arch/powerpc/platforms/pseries/hotplug-cpu.c | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 783f36364690..e45b5f10645a 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -266,7 +266,6 @@ int dlpar_attach_node(struct device_node *dn, struct device_node *parent) return rc; } - of_node_put(dn->parent); return 0; } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fc0d8f97c03a..fadb95efbb9e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -462,15 +462,19 @@ static ssize_t dlpar_cpu_add(u32 drc_index) } dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent); - of_node_put(parent); if (!dn) { pr_warn("Failed call to configure-connector, drc index: %x\n", drc_index); dlpar_release_drc(drc_index); + of_node_put(parent); return -EINVAL; } rc = dlpar_attach_node(dn, parent); + + /* Regardless we are done with parent now */ + of_node_put(parent); + if (rc) { saved_rc = rc; pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n", From b537ca6fede69a281dc524983e5e633d79a10a08 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Sep 2017 17:02:52 -0400 Subject: [PATCH 0414/1322] powerpc/pseries: Fix parent_dn reference leak in add_dt_node() A reference to the parent device node is held by add_dt_node() for the node to be added. If the call to dlpar_configure_connector() fails add_dt_node() returns ENOENT and that reference is not freed. Add a call to of_node_put(parent_dn) prior to bailing out after a failed dlpar_configure_connector() call. Fixes: 8d5ff320766f ("powerpc/pseries: Make dlpar_configure_connector parent node aware") Cc: stable@vger.kernel.org # v3.12+ Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/mobility.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 210ce632d63e..f7042ad492ba 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -226,8 +226,10 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) return -ENOENT; dn = dlpar_configure_connector(drc_index, parent_dn); - if (!dn) + if (!dn) { + of_node_put(parent_dn); return -ENOENT; + } rc = dlpar_attach_node(dn, parent_dn); if (rc) From 0551968add53777fddd18f4ffb4e3bbc1f646d79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Sep 2017 11:54:44 +0200 Subject: [PATCH 0415/1322] Revert "genirq: Restrict effective affinity to interrupts actually using it" This reverts commit 74def747bcd09692bdbf8c6a15350795b0f11ca8. The change to the helper function is only correct for the /proc/irq/ readout usage, but breaks the existing x86 usage of that function. 
Reported-by: Yanko Kaneti Signed-off-by: Thomas Gleixner Cc: Marc Zyngier --- include/linux/irq.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index b99a784635ff..d4728bf6a537 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -783,10 +783,7 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) static inline struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { - if (!cpumask_empty(d->common->effective_affinity)) - return d->common->effective_affinity; - - return d->common->affinity; + return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) From 544e3bf4f0e8278400f19ca7918a3cdf2548b4eb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:22:08 +0200 Subject: [PATCH 0416/1322] reset: Restrict RESET_HSDK to ARC_SOC_HSDK or COMPILE_TEST The HSDK reset driver is only useful when building for an ARC HSDK platform. While at it, drop the "default n", as that is the default. Fixes: e0be864f14240cb1 ("ARC: reset: introduce HSDKv1 reset driver") Signed-off-by: Geert Uytterhoeven [p.zabel@pengutronix.de: rebased, renamed RESET_HSDK_V1 to RESET_HSDK] Signed-off-by: Philipp Zabel --- drivers/reset/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index a7c7d5a8c089..e2baecbb9dd3 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -37,7 +37,7 @@ config RESET_BERLIN config RESET_HSDK bool "Synopsys HSDK Reset Driver" depends on HAS_IOMEM - default n + depends on ARC_SOC_HSDK || COMPILE_TEST help This enables the reset controller driver for HSDK board. From 2bc84526d174a2a89c76438f049fc03ac259a159 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Sep 2017 16:44:35 -0600 Subject: [PATCH 0417/1322] Makefile: kselftest and kselftest-clean fail for make O=dir case kselftest and kselftest-clean targets fail when object directory is specified to relocate objects. Fix it so it can find the source tree to build from. make O=/tmp/kselftest_top kselftest make[1]: Entering directory '/tmp/kselftest_top' make[2]: Entering directory '/tmp/kselftest_top' make[2]: *** tools/testing/selftests: No such file or directory. Stop. 
make[2]: Leaving directory '/tmp/kselftest_top' ./linux-kselftest/Makefile:1185: recipe for target 'kselftest' failed make[1]: *** [kselftest] Error 2 make[1]: Leaving directory '/tmp/kselftest_top' Makefile:145: recipe for target 'sub-make' failed make: *** [sub-make] Error 2 Signed-off-by: Shuah Khan Acked-by: Masahiro Yamada --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 64cbc66cebca..9a70578922d3 100644 --- a/Makefile +++ b/Makefile @@ -1172,11 +1172,11 @@ headers_check: headers_install PHONY += kselftest kselftest: - $(Q)$(MAKE) -C tools/testing/selftests run_tests + $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests run_tests PHONY += kselftest-clean kselftest-clean: - $(Q)$(MAKE) -C tools/testing/selftests clean + $(Q)$(MAKE) -C $(srctree)/tools/testing/selftests clean PHONY += kselftest-merge kselftest-merge: From 8050ef2b83a18f628f9501af958fbff39443d58d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 6 Sep 2017 18:36:22 -0600 Subject: [PATCH 0418/1322] selftests: lib.mk: kselftest and kselftest-clean fail for make O=dir case kselftest and kselftest-clean targets fail when object directory is specified to relocate objects. Main Makefile make O= path clears the built-in defines LINK.c, COMPILE.S, LINK.S, and RM that are used in lib.mk to build and clean targets. Define them. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 4665463779f5..266d3ed4bb41 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -7,6 +7,7 @@ OUTPUT := $(shell pwd) endif TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) +TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) @@ -62,6 +63,11 @@ endef emit_tests: $(EMIT_TESTS) +# define if isn't already. It is undefined in make O= case. +ifeq ($(RM),) +RM := rm -f +endif + define CLEAN $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN) endef @@ -69,6 +75,15 @@ endef clean: $(CLEAN) +# When make O= with kselftest target from main level +# the following aren't defined. +# +ifneq ($(KBUILD_SRC),) +LINK.c = $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) +COMPILE.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c +LINK.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) +endif + $(OUTPUT)/%:%.c $(LINK.c) $^ $(LDLIBS) -o $@ From 52fd1d082398b928a86d4fdf33c9f3abe1bf7914 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Sep 2017 19:57:43 -0600 Subject: [PATCH 0419/1322] selftests: Makefile: clear LDFLAGS for make O=dir use-case kselftest target fails when object directory is specified to relocate objects. Inherited "LDFLAGS = -m" fails the test builds. Clear it. 
Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 26ce4f7168be..f4368db011ea 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -52,6 +52,10 @@ override LDFLAGS = override MAKEFLAGS = endif +ifneq ($(KBUILD_SRC),) +override LDFLAGS = +endif + BUILD := $(O) ifndef BUILD BUILD := $(KBUILD_OUTPUT) From e0a5696a23290c31c5ac2c76f0e7fe50a12c1fc6 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Sep 2017 20:04:26 -0600 Subject: [PATCH 0420/1322] selftests: lib.mk: fix test executable status check to use full path Fix test executable status check to use full path for make O=dir case,m when tests are relocated to user specified object directory. Without the full path, this check fails to find the file and fails the test. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 266d3ed4bb41..fd1cbbbca8d7 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -21,7 +21,7 @@ define RUN_TESTS test_num=`echo $$test_num+1 | bc`; \ echo "selftests: $$BASENAME_TEST"; \ echo "========================================"; \ - if [ ! -x $$BASENAME_TEST ]; then \ + if [ ! -x $$TEST ]; then \ echo "selftests: Warning: file $$BASENAME_TEST is not executable, correct this.";\ echo "not ok 1..$$test_num selftests: $$BASENAME_TEST [FAIL]"; \ else \ From e2fb65594cae9a016fab4639a5d22a914f1e16c8 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 8 Sep 2017 16:05:50 -0600 Subject: [PATCH 0421/1322] selftests: watchdog: fix to use TEST_GEN_PROGS and remove clean TEST_PROGS should be used for test scripts that don't ned to be built. Use TEST_GEN_PROGS instead which is intended for test executables. Remove clean target and let the common clean take care of cleaning. Signed-off-by: Shuah Khan --- tools/testing/selftests/watchdog/Makefile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/testing/selftests/watchdog/Makefile b/tools/testing/selftests/watchdog/Makefile index f863c664e3d1..ee068511fd0b 100644 --- a/tools/testing/selftests/watchdog/Makefile +++ b/tools/testing/selftests/watchdog/Makefile @@ -1,8 +1,3 @@ -TEST_PROGS := watchdog-test - -all: $(TEST_PROGS) +TEST_GEN_PROGS := watchdog-test include ../lib.mk - -clean: - rm -fr $(TEST_PROGS) From be16a244c199983656e58ddb10d80c67197e502f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 13:06:33 -0600 Subject: [PATCH 0422/1322] selftests: lib.mk: add TEST_CUSTOM_PROGS to allow custom test run/install Some tests such as sync can't use generic build rules in lib.mk and require custom rules. Currently there is no provision to allow custom builds and test such as sync use TEST_PROGS which is reserved for test shell scripts. Add TEST_CUSTOM_PROGS variable to lib.mk to run and install custom tests built by individual test make files. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index fd1cbbbca8d7..b4699c71aee4 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -6,6 +6,12 @@ ifeq (0,$(MAKELEVEL)) OUTPUT := $(shell pwd) endif +# The following are built by lib.mk common compile rules. 
+# TEST_CUSTOM_PROGS should be used by tests that require +# custom build rule and prevent common build rule use. +# TEST_PROGS are for test shell scripts. +# TEST_CUSTOM_PROGS and TEST_PROGS will be run by common run_tests +# and install targets. Common clean doesn't touch them. TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS)) TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED)) TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES)) @@ -31,7 +37,7 @@ define RUN_TESTS endef run_tests: all - $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_PROGS)) + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) define INSTALL_RULE @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ @@ -39,10 +45,10 @@ define INSTALL_RULE echo "rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/"; \ rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/; \ fi - @if [ "X$(TEST_GEN_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ + @if [ "X$(TEST_GEN_PROGS)$(TEST_CUSTOM_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ mkdir -p ${INSTALL_PATH}; \ - echo "rsync -a $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ - rsync -a $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ + echo "rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ + rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ fi endef @@ -54,7 +60,7 @@ else endif define EMIT_TESTS - @for TEST in $(TEST_GEN_PROGS) $(TEST_PROGS); do \ + @for TEST in $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS); do \ BASENAME_TEST=`basename $$TEST`; \ echo "(./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && echo \"selftests: $$BASENAME_TEST [PASS]\") || echo \"selftests: $$BASENAME_TEST [FAIL]\""; \ done; From 38f7251852a0cd47f34af4a6f84df0fadafa8ac7 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 13:46:44 -0600 Subject: [PATCH 0423/1322] selftests: sync: use TEST_CUSTOM_PROGS instead of TEST_PROGS lib.mk var TEST_CUSTOM_PROGS is for tests that need custom build rules. TEST_PROGS is used for test shell scripts. Fix it to use TEST_CUSTOM_PROGS. lib.mk will run and install them. Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile index 4981c6b6d050..43db80b71e80 100644 --- a/tools/testing/selftests/sync/Makefile +++ b/tools/testing/selftests/sync/Makefile @@ -2,9 +2,11 @@ CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra CFLAGS += -I../../../../usr/include/ LDFLAGS += -pthread -TEST_PROGS = sync_test +# lib.mk TEST_CUSTOM_PROGS var is for custome tests that need special +# build rules. lib.mk will run and install them. +TEST_CUSTOM_PROGS = sync_test -all: $(TEST_PROGS) +all: $(TEST_CUSTOM_PROGS) include ../lib.mk From b2fc6ade9fe2118591e7f51e21b51f65e36f138f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 14:14:41 -0600 Subject: [PATCH 0424/1322] selftests: sync: kselftest and kselftest-clean fail for make O=dir case sync test fails to build when object directory is specified to relocate object files. Fix it to specify the correct path. Fix clean target to remove objects. 
Also include simplified logic to use TEST_CUSTOM_PROGS in build and clean targets instead of hard-coding the test name each time. Signed-off-by: Shuah Khan --- tools/testing/selftests/sync/Makefile | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile index 43db80b71e80..8e04d0afcbd7 100644 --- a/tools/testing/selftests/sync/Makefile +++ b/tools/testing/selftests/sync/Makefile @@ -2,14 +2,16 @@ CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra CFLAGS += -I../../../../usr/include/ LDFLAGS += -pthread -# lib.mk TEST_CUSTOM_PROGS var is for custome tests that need special -# build rules. lib.mk will run and install them. -TEST_CUSTOM_PROGS = sync_test - -all: $(TEST_CUSTOM_PROGS) +.PHONY: all clean include ../lib.mk +# lib.mk TEST_CUSTOM_PROGS var is for custom tests that need special +# build rules. lib.mk will run and install them. + +TEST_CUSTOM_PROGS := $(OUTPUT)/sync_test +all: $(TEST_CUSTOM_PROGS) + OBJS = sync_test.o sync.o TESTS += sync_alloc.o @@ -20,6 +22,16 @@ TESTS += sync_stress_parallelism.o TESTS += sync_stress_consumer.o TESTS += sync_stress_merge.o -sync_test: $(OBJS) $(TESTS) +OBJS := $(patsubst %,$(OUTPUT)/%,$(OBJS)) +TESTS := $(patsubst %,$(OUTPUT)/%,$(TESTS)) -EXTRA_CLEAN := sync_test $(OBJS) $(TESTS) +$(TEST_CUSTOM_PROGS): $(TESTS) $(OBJS) + $(CC) -o $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) $(CFLAGS) $(LDFLAGS) + +$(OBJS): $(OUTPUT)/%.o: %.c + $(CC) -c $^ -o $@ + +$(TESTS): $(OUTPUT)/%.o: %.c + $(CC) -c $^ -o $@ + +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) From 1a940687e424080a6ed285f2de8b2f0d018edf3e Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 19:03:11 -0600 Subject: [PATCH 0425/1322] selftests: lib.mk: copy test scripts and test files for make O=dir run For make O=dir run_tests to work, test scripts, test files, and other dependencies need to be copied over to the object directory. Running tests from the object directory is necessary to avoid making the source tree dirty. Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index b4699c71aee4..f65886af7c0c 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -37,7 +37,18 @@ define RUN_TESTS endef run_tests: all +ifneq ($(KBUILD_SRC),) + @if [ "X$(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)" != "X" ]; then + @rsync -aq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT) + fi + @if [ "X$(TEST_PROGS)" != "X" ]; then + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(OUTPUT)/$(TEST_PROGS)) + else + $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS)) + fi +else $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) +endif define INSTALL_RULE @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ From 9bbe7dc05c1f80200621a626ec2266987f4b42b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Sep 2017 17:55:24 +0200 Subject: [PATCH 0426/1322] MIPS: MSP71xx: Include asm/setup.h msp71xx_defconfig can not be built at the in v4.14-rc1 arch/mips/pmcs-msp71xx/msp_smp.c:72:2: error: implicit declaration of function 'set_vi_handler' [-Werror=implicit-function-declaration] I don't know what caused the regression, but including the right header is the obvious fix. 
Signed-off-by: Arnd Bergmann Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17309/ Signed-off-by: Ralf Baechle --- arch/mips/pmcs-msp71xx/msp_smp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/pmcs-msp71xx/msp_smp.c b/arch/mips/pmcs-msp71xx/msp_smp.c index ffa0f7101a97..2b08242ade62 100644 --- a/arch/mips/pmcs-msp71xx/msp_smp.c +++ b/arch/mips/pmcs-msp71xx/msp_smp.c @@ -22,6 +22,8 @@ #include #include +#include + #ifdef CONFIG_MIPS_MT_SMP #define MIPS_CPU_IPI_RESCHED_IRQ 0 /* SW int 0 for resched */ #define MIPS_CPU_IPI_CALL_IRQ 1 /* SW int 1 for call */ From c22c8043105591a8b74142cf837604087cdba40b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 19 Sep 2017 14:11:22 +0100 Subject: [PATCH 0427/1322] MIPS: Fix input modify in __write_64bit_c0_split() The inline asm in __write_64bit_c0_split() modifies the 64-bit input operand by shifting the high register left by 32, and constructing the full 64-bit value in the low register (even on a 32-bit kernel), so if that value is used again it could cause breakage as GCC would assume the registers haven't changed when they have. To quote the GCC extended asm documentation: > Warning: Do not modify the contents of input-only operands (except for > inputs tied to outputs). The compiler assumes that on exit from the > asm statement these operands contain the same values as they had > before executing the statement. Avoid modifying the input by using a temporary variable as an output which is modified instead of the input and not otherwise used. The asm is always __volatile__ so GCC shouldn't optimise it out. The low register of the temporary output is written before the high register of the input is read, so we have two constraint alternatives, one where both use the same registers (for when the input value isn't subsequently used), and one with an early clobber on the output in case the low output uses the same register as the high input. This allows the resulting assembly to remain mostly unchanged. A diff of a MIPS32r6 kernel reveals only three differences, two in relation to write_c0_r10k_diag() in cpu_probe() (register allocation rearranged slightly but otherwise identical), and one in relation to write_c0_cvmmemctl2() in kvm_vz_local_flush_guesttlb_all(), but the octeon CPU is only supported on 64-bit kernels where __write_64bit_c0_split() isn't used so that shouldn't matter in practice. So there currently doesn't appear to be anything broken by this bug. 
Signed-off-by: James Hogan Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17315/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mipsregs.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index e4ed1bc9a734..a6810923b3f0 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -1377,29 +1377,32 @@ do { \ #define __write_64bit_c0_split(source, sel, val) \ do { \ + unsigned long long __tmp; \ unsigned long __flags; \ \ local_irq_save(__flags); \ if (sel == 0) \ __asm__ __volatile__( \ ".set\tmips64\n\t" \ - "dsll\t%L0, %L0, 32\n\t" \ + "dsll\t%L0, %L1, 32\n\t" \ "dsrl\t%L0, %L0, 32\n\t" \ - "dsll\t%M0, %M0, 32\n\t" \ + "dsll\t%M0, %M1, 32\n\t" \ "or\t%L0, %L0, %M0\n\t" \ "dmtc0\t%L0, " #source "\n\t" \ ".set\tmips0" \ - : : "r" (val)); \ + : "=&r,r" (__tmp) \ + : "r,0" (val)); \ else \ __asm__ __volatile__( \ ".set\tmips64\n\t" \ - "dsll\t%L0, %L0, 32\n\t" \ + "dsll\t%L0, %L1, 32\n\t" \ "dsrl\t%L0, %L0, 32\n\t" \ - "dsll\t%M0, %M0, 32\n\t" \ + "dsll\t%M0, %M1, 32\n\t" \ "or\t%L0, %L0, %M0\n\t" \ "dmtc0\t%L0, " #source ", " #sel "\n\t" \ ".set\tmips0" \ - : : "r" (val)); \ + : "=&r,r" (__tmp) \ + : "r,0" (val)); \ local_irq_restore(__flags); \ } while (0) From 8eba3651f1dad49c83bb7f8d672301dac4c6add6 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Tue, 12 Sep 2017 20:36:28 +0200 Subject: [PATCH 0428/1322] MIPS: PCI: fix pcibios_map_irq section mismatch Drop the __init from pcibios_map_irq() to make this section mis- match go away: WARNING: vmlinux.o(.text+0x56acd4): Section mismatch in reference from the function pcibios_scanbus() to the function .init.text:pcibios_map_irq() The function pcibios_scanbus() references the function __init pcibios_map_irq(). This is often because pcibios_scanbus lacks a __init annotation or the annotation of pcibios_map_irq is wrong. Run-Tested only on Alchemy. 
Signed-off-by: Manuel Lauss Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17267/ Signed-off-by: Ralf Baechle --- arch/mips/pci/fixup-capcella.c | 2 +- arch/mips/pci/fixup-cobalt.c | 2 +- arch/mips/pci/fixup-emma2rh.c | 2 +- arch/mips/pci/fixup-fuloong2e.c | 2 +- arch/mips/pci/fixup-ip32.c | 2 +- arch/mips/pci/fixup-lantiq.c | 2 +- arch/mips/pci/fixup-lemote2f.c | 2 +- arch/mips/pci/fixup-loongson3.c | 2 +- arch/mips/pci/fixup-malta.c | 2 +- arch/mips/pci/fixup-mpc30x.c | 2 +- arch/mips/pci/fixup-pmcmsp.c | 2 +- arch/mips/pci/fixup-sni.c | 2 +- arch/mips/pci/fixup-tb0219.c | 2 +- arch/mips/pci/fixup-tb0226.c | 2 +- arch/mips/pci/fixup-tb0287.c | 2 +- arch/mips/pci/pci-alchemy.c | 2 +- arch/mips/pci/pci-bcm47xx.c | 2 +- arch/mips/pci/pci-lasat.c | 2 +- arch/mips/pci/pci-mt7620.c | 2 +- arch/mips/pci/pci-octeon.c | 4 ++-- arch/mips/pci/pci-rt2880.c | 2 +- arch/mips/pci/pci-rt3883.c | 2 +- arch/mips/pci/pci-xlp.c | 2 +- arch/mips/pci/pci-xlr.c | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/mips/pci/fixup-capcella.c b/arch/mips/pci/fixup-capcella.c index 1c02f5737367..35a671595a37 100644 --- a/arch/mips/pci/fixup-capcella.c +++ b/arch/mips/pci/fixup-capcella.c @@ -38,7 +38,7 @@ static char irq_tab_capcella[][5] __initdata = { [14] = { -1, INTA, INTB, INTC, INTD } }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_capcella[slot][pin]; } diff --git a/arch/mips/pci/fixup-cobalt.c b/arch/mips/pci/fixup-cobalt.c index b3ab59318d91..62810d3fe99b 100644 --- a/arch/mips/pci/fixup-cobalt.c +++ b/arch/mips/pci/fixup-cobalt.c @@ -174,7 +174,7 @@ static char irq_tab_raq2[] __initdata = { [COBALT_PCICONF_ETH1] = ETH1_IRQ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (cobalt_board_id <= COBALT_BRD_ID_QUBE1) return irq_tab_qube1[slot]; diff --git a/arch/mips/pci/fixup-emma2rh.c b/arch/mips/pci/fixup-emma2rh.c index 19caf775c206..4832ac9f118a 100644 --- a/arch/mips/pci/fixup-emma2rh.c +++ b/arch/mips/pci/fixup-emma2rh.c @@ -85,7 +85,7 @@ static void emma2rh_pci_host_fixup(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NEC, PCI_DEVICE_ID_NEC_EMMA2RH, emma2rh_pci_host_fixup); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_map[slot][pin]; } diff --git a/arch/mips/pci/fixup-fuloong2e.c b/arch/mips/pci/fixup-fuloong2e.c index 50da773faede..b47c2771dc99 100644 --- a/arch/mips/pci/fixup-fuloong2e.c +++ b/arch/mips/pci/fixup-fuloong2e.c @@ -19,7 +19,7 @@ /* South bridge slot number is set by the pci probe process */ static u8 sb_slot = 5; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = 0; diff --git a/arch/mips/pci/fixup-ip32.c b/arch/mips/pci/fixup-ip32.c index 133685e215ee..ea29f5450be3 100644 --- a/arch/mips/pci/fixup-ip32.c +++ b/arch/mips/pci/fixup-ip32.c @@ -39,7 +39,7 @@ static char irq_tab_mace[][5] __initdata = { * irqs. I suppose a device without a pin A will thank us for doing it * right if there exists such a broken piece of crap. 
*/ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return irq_tab_mace[slot][pin]; } diff --git a/arch/mips/pci/fixup-lantiq.c b/arch/mips/pci/fixup-lantiq.c index 2b5427d3f35c..81530a13b349 100644 --- a/arch/mips/pci/fixup-lantiq.c +++ b/arch/mips/pci/fixup-lantiq.c @@ -23,7 +23,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/fixup-lemote2f.c b/arch/mips/pci/fixup-lemote2f.c index 95ab9a1bd010..7e5991e0e323 100644 --- a/arch/mips/pci/fixup-lemote2f.c +++ b/arch/mips/pci/fixup-lemote2f.c @@ -51,7 +51,7 @@ static char irq_tab[][5] __initdata = { {0, 0, 0, 0, 0}, /* 27: Unused */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; diff --git a/arch/mips/pci/fixup-loongson3.c b/arch/mips/pci/fixup-loongson3.c index 2b6d5e196f99..8a741c2c6685 100644 --- a/arch/mips/pci/fixup-loongson3.c +++ b/arch/mips/pci/fixup-loongson3.c @@ -32,7 +32,7 @@ static void print_fixup_info(const struct pci_dev *pdev) pdev->vendor, pdev->device, pdev->irq); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { print_fixup_info(dev); return dev->irq; diff --git a/arch/mips/pci/fixup-malta.c b/arch/mips/pci/fixup-malta.c index 40e920c653cc..1f5f25e39590 100644 --- a/arch/mips/pci/fixup-malta.c +++ b/arch/mips/pci/fixup-malta.c @@ -38,7 +38,7 @@ static char irq_tab[][5] __initdata = { {0, PCID, PCIA, PCIB, PCIC } /* 21: PCI Slot 4 */ }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int virq; virq = irq_tab[slot][pin]; diff --git a/arch/mips/pci/fixup-mpc30x.c b/arch/mips/pci/fixup-mpc30x.c index 8e4f8288eca2..5da62c76e271 100644 --- a/arch/mips/pci/fixup-mpc30x.c +++ b/arch/mips/pci/fixup-mpc30x.c @@ -34,7 +34,7 @@ static const int irq_tab_mpc30x[] __initconst = { [29] = MQ200_IRQ, }; -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (slot == 30) return internal_func_irqs[PCI_FUNC(dev->devfn)]; diff --git a/arch/mips/pci/fixup-pmcmsp.c b/arch/mips/pci/fixup-pmcmsp.c index fab405c21c2f..f2b7b1e4395b 100644 --- a/arch/mips/pci/fixup-pmcmsp.c +++ b/arch/mips/pci/fixup-pmcmsp.c @@ -202,7 +202,7 @@ int pcibios_plat_dev_init(struct pci_dev *dev) * RETURNS: IRQ number * ****************************************************************************/ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { #if !defined(CONFIG_PMC_MSP7120_GW) && !defined(CONFIG_PMC_MSP7120_EVAL) printk(KERN_WARNING "PCI: unknown board, no PCI IRQs assigned.\n"); diff --git a/arch/mips/pci/fixup-sni.c b/arch/mips/pci/fixup-sni.c index f67ebeeb4200..309e1c562959 100644 --- a/arch/mips/pci/fixup-sni.c +++ b/arch/mips/pci/fixup-sni.c @@ -130,7 +130,7 @@ static inline int is_rm300_revd(void) return (csmsr & 0xa0) == 0x20; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (sni_brd_type) { 
case SNI_BRD_PCI_TOWER_CPLUS: diff --git a/arch/mips/pci/fixup-tb0219.c b/arch/mips/pci/fixup-tb0219.c index d0b0083fbd27..cc581535f257 100644 --- a/arch/mips/pci/fixup-tb0219.c +++ b/arch/mips/pci/fixup-tb0219.c @@ -23,7 +23,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0226.c b/arch/mips/pci/fixup-tb0226.c index 4196ccf3ea3d..b827b5cad5fd 100644 --- a/arch/mips/pci/fixup-tb0226.c +++ b/arch/mips/pci/fixup-tb0226.c @@ -23,7 +23,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/mips/pci/fixup-tb0287.c b/arch/mips/pci/fixup-tb0287.c index 8c5039ed75d7..98f26285f2e3 100644 --- a/arch/mips/pci/fixup-tb0287.c +++ b/arch/mips/pci/fixup-tb0287.c @@ -22,7 +22,7 @@ #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { unsigned char bus; int irq = -1; diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c index e99ca7702d8a..f15ec98de2de 100644 --- a/arch/mips/pci/pci-alchemy.c +++ b/arch/mips/pci/pci-alchemy.c @@ -522,7 +522,7 @@ static int __init alchemy_pci_init(void) arch_initcall(alchemy_pci_init); -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct alchemy_pci_context *ctx = dev->sysdata; if (ctx && ctx->board_map_irq) diff --git a/arch/mips/pci/pci-bcm47xx.c b/arch/mips/pci/pci-bcm47xx.c index 76f16eaed0ad..230d7dd273e2 100644 --- a/arch/mips/pci/pci-bcm47xx.c +++ b/arch/mips/pci/pci-bcm47xx.c @@ -28,7 +28,7 @@ #include #include -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return 0; } diff --git a/arch/mips/pci/pci-lasat.c b/arch/mips/pci/pci-lasat.c index 40d2797d2bc4..47f4ee6bbb3b 100644 --- a/arch/mips/pci/pci-lasat.c +++ b/arch/mips/pci/pci-lasat.c @@ -61,7 +61,7 @@ arch_initcall(lasat_pci_setup); #define LASAT_IRQ_PCIC (LASAT_IRQ_BASE + 7) #define LASAT_IRQ_PCID (LASAT_IRQ_BASE + 8) -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { switch (slot) { case 1: diff --git a/arch/mips/pci/pci-mt7620.c b/arch/mips/pci/pci-mt7620.c index 4e633c1e7ff3..90fba9bf98da 100644 --- a/arch/mips/pci/pci-mt7620.c +++ b/arch/mips/pci/pci-mt7620.c @@ -361,7 +361,7 @@ static int mt7620_pci_probe(struct platform_device *pdev) return 0; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; u32 val; diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c index 9ee01936862e..771f5de6362d 100644 --- a/arch/mips/pci/pci-octeon.c +++ b/arch/mips/pci/pci-octeon.c @@ -59,7 +59,7 @@ union octeon_pci_address { } s; }; -int __initconst (*octeon_pcibios_map_irq)(const struct pci_dev *dev, +int (*octeon_pcibios_map_irq)(const struct pci_dev *dev, u8 slot, u8 pin); enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; @@ -74,7 +74,7 @@ enum octeon_dma_bar_type octeon_dma_bar_type = OCTEON_DMA_BAR_TYPE_INVALID; * as it goes through each bridge. 
* Returns Interrupt number for the device */ -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { if (octeon_pcibios_map_irq) return octeon_pcibios_map_irq(dev, slot, pin); diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c index d6360fe73d05..711cdccdf65b 100644 --- a/arch/mips/pci/pci-rt2880.c +++ b/arch/mips/pci/pci-rt2880.c @@ -181,7 +181,7 @@ static inline void rt2880_pci_write_u32(unsigned long reg, u32 val) spin_unlock_irqrestore(&rt2880_pci_lock, flags); } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { u16 cmd; int irq = -1; diff --git a/arch/mips/pci/pci-rt3883.c b/arch/mips/pci/pci-rt3883.c index 04f8ea953297..958899ffe99c 100644 --- a/arch/mips/pci/pci-rt3883.c +++ b/arch/mips/pci/pci-rt3883.c @@ -564,7 +564,7 @@ err_put_intc_node: return err; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return of_irq_parse_and_map_pci(dev, slot, pin); } diff --git a/arch/mips/pci/pci-xlp.c b/arch/mips/pci/pci-xlp.c index 7babf01600cb..9eff9137f78e 100644 --- a/arch/mips/pci/pci-xlp.c +++ b/arch/mips/pci/pci-xlp.c @@ -205,7 +205,7 @@ int xlp_socdev_to_node(const struct pci_dev *lnkdev) return PCI_SLOT(lnkdev->devfn) / 8; } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { struct pci_dev *lnkdev; int lnkfunc, node; diff --git a/arch/mips/pci/pci-xlr.c b/arch/mips/pci/pci-xlr.c index 26d2dabef281..2a1c81a129ba 100644 --- a/arch/mips/pci/pci-xlr.c +++ b/arch/mips/pci/pci-xlr.c @@ -315,7 +315,7 @@ static void xls_pcie_ack_b(struct irq_data *d) } } -int __init pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) +int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { return get_irq_vector(dev); } From 6af1de2e4ec49635905aaed31d073a0d92c8d3bf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 6 Sep 2017 14:58:53 +0200 Subject: [PATCH 0429/1322] ath10k: mark PM functions as __maybe_unused When CONFIG_PM_SLEEP is disabled, we get a compile-time warning: drivers/net/wireless/ath/ath10k/pci.c:3417:12: error: 'ath10k_pci_pm_resume' defined but not used [-Werror=unused-function] static int ath10k_pci_pm_resume(struct device *dev) ^~~~~~~~~~~~~~~~~~~~ drivers/net/wireless/ath/ath10k/pci.c:3401:12: error: 'ath10k_pci_pm_suspend' defined but not used [-Werror=unused-function] static int ath10k_pci_pm_suspend(struct device *dev) Rather than fixing the #ifdef, this just marks both functions as __maybe_unused, which is a more robust way to do this. 
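For reference, the idiom looks roughly like the following hypothetical driver sketch (simplified, not the actual ath10k code): the callbacks always compile, and SIMPLE_DEV_PM_OPS only wires them into the dev_pm_ops table when CONFIG_PM_SLEEP is enabled, so the unused-function warning disappears without any #ifdef bookkeeping.

#include <linux/device.h>
#include <linux/pm.h>

static __maybe_unused int example_pm_suspend(struct device *dev)
{
	/* quiesce the hardware here */
	return 0;
}

static __maybe_unused int example_pm_resume(struct device *dev)
{
	/* bring the hardware back up here */
	return 0;
}

static SIMPLE_DEV_PM_OPS(example_pm_ops, example_pm_suspend,
			 example_pm_resume);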
Fixes: 32faa3f0ee50 ("ath10k: add the PCI PM core suspend/resume ops") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/ath10k/pci.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c index bc1633945a56..195dafb98131 100644 --- a/drivers/net/wireless/ath/ath10k/pci.c +++ b/drivers/net/wireless/ath/ath10k/pci.c @@ -3396,9 +3396,7 @@ static void ath10k_pci_remove(struct pci_dev *pdev) MODULE_DEVICE_TABLE(pci, ath10k_pci_id_table); -#ifdef CONFIG_PM - -static int ath10k_pci_pm_suspend(struct device *dev) +static __maybe_unused int ath10k_pci_pm_suspend(struct device *dev) { struct ath10k *ar = dev_get_drvdata(dev); int ret; @@ -3414,7 +3412,7 @@ static int ath10k_pci_pm_suspend(struct device *dev) return ret; } -static int ath10k_pci_pm_resume(struct device *dev) +static __maybe_unused int ath10k_pci_pm_resume(struct device *dev) { struct ath10k *ar = dev_get_drvdata(dev); int ret; @@ -3433,7 +3431,6 @@ static int ath10k_pci_pm_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(ath10k_pci_pm_ops, ath10k_pci_pm_suspend, ath10k_pci_pm_resume); -#endif static struct pci_driver ath10k_pci_driver = { .name = "ath10k_pci", From 2e1c42391ff2556387b3cb6308b24f6f65619feb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 21 Sep 2017 16:58:48 +0200 Subject: [PATCH 0430/1322] USB: core: harden cdc_parse_cdc_header Andrey Konovalov reported a possible out-of-bounds problem for the cdc_parse_cdc_header function. He writes: It looks like cdc_parse_cdc_header() doesn't validate buflen before accessing buffer[1], buffer[2] and so on. The only check present is while (buflen > 0). So fix this issue up by properly validating the buffer length matches what the descriptor says it is. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 4c38ea41ae96..371a07d874a3 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -2069,6 +2069,10 @@ int cdc_parse_cdc_header(struct usb_cdc_parsed_header *hdr, elength = 1; goto next_desc; } + if ((buflen < elength) || (elength < 3)) { + dev_err(&intf->dev, "invalid descriptor buffer length\n"); + break; + } if (buffer[1] != USB_DT_CS_INTERFACE) { dev_err(&intf->dev, "skipping garbage\n"); goto next_desc; From 51a9a8284e43642fc3e85810fd54f4c245d23a14 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 10:03:12 +0100 Subject: [PATCH 0431/1322] x86/xen: clean up clang build warning In the case where sizeof(maddr) != sizeof(long) p is initialized and never read and clang throws a warning on this. 
Move declaration of p to clean up the clang build warning: warning: Value stored to 'p' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/hypercall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9606688caa4b..e089c1675a7c 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -552,13 +552,13 @@ static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { - u32 *p = (u32 *) &desc; - mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; mcl->args[1] = *(unsigned long *)&desc; } else { + u32 *p = (u32 *)&desc; + mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; mcl->args[2] = *p++; From 4f88836d4f806d212361eb426bc8a6ce897dbef9 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:22 +0530 Subject: [PATCH 0432/1322] drivers: net: de4x: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/de4x5.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index 0affee9c8aa2..299812e92db7 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -1147,9 +1147,8 @@ de4x5_hw_init(struct net_device *dev, u_long iobase, struct device *gendev) lp->timeout = -1; lp->gendev = gendev; spin_lock_init(&lp->lock); - init_timer(&lp->timer); - lp->timer.function = (void (*)(unsigned long))de4x5_ast; - lp->timer.data = (unsigned long)dev; + setup_timer(&lp->timer, (void (*)(unsigned long))de4x5_ast, + (unsigned long)dev); de4x5_parse_params(dev); /* From cdc91b31b81abaa2cf491fd5e9007f4bcd45bc68 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:23 +0530 Subject: [PATCH 0433/1322] drivers: net: b44: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/b44.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index a1125d10c825..42e44fc03a18 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -1474,10 +1474,8 @@ static int b44_open(struct net_device *dev) goto out; } - init_timer(&bp->timer); + setup_timer(&bp->timer, b44_timer, (unsigned long)bp); bp->timer.expires = jiffies + HZ; - bp->timer.data = (unsigned long) bp; - bp->timer.function = b44_timer; add_timer(&bp->timer); b44_enable_ints(bp); From 334e4a7d5505f59a741b0549f41082e29a914439 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:24 +0530 Subject: [PATCH 0434/1322] drivers: net: pcnet32: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/pcnet32.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index 7f60d17819ce..e46153654016 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -1970,9 +1970,8 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) lp->options |= PCNET32_PORT_MII; } - init_timer(&lp->watchdog_timer); - lp->watchdog_timer.data = (unsigned long)dev; - lp->watchdog_timer.function = (void *)&pcnet32_watchdog; + setup_timer(&lp->watchdog_timer, (void *)&pcnet32_watchdog, + (unsigned long)dev); /* The PCNET32-specific entries in the device structure. */ dev->netdev_ops = &pcnet32_netdev_ops; From 27dd0852644966c43079d3f812ec37e3e3fc864e Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:25 +0530 Subject: [PATCH 0435/1322] drivers: net: brcm80211: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index aaed4ab503ad..ab3f22353154 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -3260,9 +3260,8 @@ static void brcmf_init_escan(struct brcmf_cfg80211_info *cfg) brcmf_cfg80211_escan_handler); cfg->escan_info.escan_state = WL_ESCAN_STATE_IDLE; /* Init scan_timeout timer */ - init_timer(&cfg->escan_timeout); - cfg->escan_timeout.data = (unsigned long) cfg; - cfg->escan_timeout.function = brcmf_escan_timeout; + setup_timer(&cfg->escan_timeout, brcmf_escan_timeout, + (unsigned long)cfg); INIT_WORK(&cfg->escan_timeout_work, brcmf_cfg80211_escan_timeout_worker); } From ba4cc08793a58a97cffc2769acaa53fff4433332 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:26 +0530 Subject: [PATCH 0436/1322] drivers : net: niu: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/niu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 6a4e8e1bbd90..bde19b307d0d 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6123,10 +6123,8 @@ static int niu_open(struct net_device *dev) err = niu_init_hw(np); if (!err) { - init_timer(&np->timer); + setup_timer(&np->timer, niu_timer, (unsigned long)np); np->timer.expires = jiffies + HZ; - np->timer.data = (unsigned long) np; - np->timer.function = niu_timer; err = niu_enable_interrupts(np, 1); if (err) From c3bd81cccbaa89da70047f1dcc73443f889735a8 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:27 +0530 Subject: [PATCH 0437/1322] drivers: net: bcm63xx: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 4f3845a58126..f8bbbbfca06e 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1857,9 +1857,8 @@ static int bcm_enet_probe(struct platform_device *pdev) spin_lock_init(&priv->rx_lock); /* init rx timeout (used for oom) */ - init_timer(&priv->rx_timeout); - priv->rx_timeout.function = bcm_enet_refill_rx_timer; - priv->rx_timeout.data = (unsigned long)dev; + setup_timer(&priv->rx_timeout, bcm_enet_refill_rx_timer, + (unsigned long)dev); /* init the mib update lock&work */ mutex_init(&priv->mib_update_lock); From b0b404bd9ba28edb795b999e3c35596e236cc5dd Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:28 +0530 Subject: [PATCH 0438/1322] drivers: net: declance: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/declance.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 82cc81385033..9bdf81c2cd00 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -1246,9 +1246,9 @@ static int dec_lance_probe(struct device *bdev, const int type) * can occur from interrupts (ex. IPv6). So we * use a timer to try again later when necessary. -DaveM */ - init_timer(&lp->multicast_timer); - lp->multicast_timer.data = (unsigned long) dev; - lp->multicast_timer.function = lance_set_multicast_retry; + setup_timer(&lp->multicast_timer, lance_set_multicast_retry, + (unsigned long)dev); + ret = register_netdev(dev); if (ret) { From aa0c72859972f209d7d07654037bd974c11f93ed Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:29 +0530 Subject: [PATCH 0439/1322] drivers: net: am79c961: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/am79c961a.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/am79c961a.c b/drivers/net/ethernet/amd/am79c961a.c index b11e910850f7..0612dbee00d2 100644 --- a/drivers/net/ethernet/amd/am79c961a.c +++ b/drivers/net/ethernet/amd/am79c961a.c @@ -728,9 +728,7 @@ static int am79c961_probe(struct platform_device *pdev) am79c961_banner(); spin_lock_init(&priv->chip_lock); - init_timer(&priv->timer); - priv->timer.data = (unsigned long)dev; - priv->timer.function = am79c961_timer; + setup_timer(&priv->timer, am79c961_timer, (unsigned long)dev); if (am79c961_hw_init(dev)) goto release; From 07b6901f61813aa547c5a25e118f977022fec9eb Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:30 +0530 Subject: [PATCH 0440/1322] drivers: net: et131x: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/agere/et131x.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c index 54eff90e2f02..658e92f79d36 100644 --- a/drivers/net/ethernet/agere/et131x.c +++ b/drivers/net/ethernet/agere/et131x.c @@ -3624,11 +3624,10 @@ static int et131x_open(struct net_device *netdev) int result; /* Start the timer to track NIC errors */ - init_timer(&adapter->error_timer); + setup_timer(&adapter->error_timer, et131x_error_timer_handler, + (unsigned long)adapter); adapter->error_timer.expires = jiffies + msecs_to_jiffies(TX_ERROR_PERIOD); - adapter->error_timer.function = et131x_error_timer_handler; - adapter->error_timer.data = (unsigned long)adapter; add_timer(&adapter->error_timer); result = request_irq(irq, et131x_isr, From 1e153e554fc8a3744d1fa8a36d4cb3b4cfbdc3da Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:31 +0530 Subject: [PATCH 0441/1322] drivers: net: appletalk: cops: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/appletalk/cops.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c index 486e1e6997fc..caf04284711a 100644 --- a/drivers/net/appletalk/cops.c +++ b/drivers/net/appletalk/cops.c @@ -424,9 +424,7 @@ static int cops_open(struct net_device *dev) */ if(lp->board==TANGENT) /* Poll 20 times per second */ { - init_timer(&cops_timer); - cops_timer.function = cops_poll; - cops_timer.data = (unsigned long)dev; + setup_timer(&cops_timer, cops_poll, (unsigned long)dev); cops_timer.expires = jiffies + HZ/20; add_timer(&cops_timer); } From 531f3ce953d4c74989eb1ece7ce37a3717532a13 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:32 +0530 Subject: [PATCH 0442/1322] drivers: net: rsi_91x: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/wireless/rsi/rsi_91x_hal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c index 070dfd68bb83..7ad286d6e9a7 100644 --- a/drivers/net/wireless/rsi/rsi_91x_hal.c +++ b/drivers/net/wireless/rsi/rsi_91x_hal.c @@ -411,9 +411,8 @@ static void bl_cmd_timeout(unsigned long priv) static int bl_start_cmd_timer(struct rsi_hw *adapter, u32 timeout) { - init_timer(&adapter->bl_cmd_timer); - adapter->bl_cmd_timer.data = (unsigned long)adapter; - adapter->bl_cmd_timer.function = (void *)&bl_cmd_timeout; + setup_timer(&adapter->bl_cmd_timer, (void *)&bl_cmd_timeout, + (unsigned long)adapter); adapter->bl_cmd_timer.expires = (msecs_to_jiffies(timeout) + jiffies); adapter->blcmd_timer_expired = false; From e7bbad4487ae4005904d00a0a04622f07fadbc5b Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:33 +0530 Subject: [PATCH 0443/1322] drivers: net: atp: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/realtek/atp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c index bed34684994f..bdc3833fab7e 100644 --- a/drivers/net/ethernet/realtek/atp.c +++ b/drivers/net/ethernet/realtek/atp.c @@ -438,10 +438,8 @@ static int net_open(struct net_device *dev) hardware_init(dev); - init_timer(&lp->timer); + setup_timer(&lp->timer, atp_timed_checker, (unsigned long)dev); lp->timer.expires = jiffies + TIMED_CHECKER; - lp->timer.data = (unsigned long)dev; - lp->timer.function = atp_timed_checker; /* timer handler */ add_timer(&lp->timer); netif_start_queue(dev); From f40c9d5aea0fdf5454464bc477898cc981ff9715 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:34 +0530 Subject: [PATCH 0444/1322] drivers: net: ns83820: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/natsemi/ns83820.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 729095db3e08..99d3c7884a4a 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -1652,9 +1652,7 @@ static int ns83820_open(struct net_device *ndev) writel(0, dev->base + TXDP_HI); writel(desc, dev->base + TXDP); - init_timer(&dev->tx_watchdog); - dev->tx_watchdog.data = (unsigned long)ndev; - dev->tx_watchdog.function = ns83820_tx_watch; + setup_timer(&dev->tx_watchdog, ns83820_tx_watch, (unsigned long)ndev); mod_timer(&dev->tx_watchdog, jiffies + 2*HZ); netif_start_queue(ndev); /* FIXME: wait for phy to come up */ From 82a8c6745169ec932473658c28679069a7ded95a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:35 +0530 Subject: [PATCH 0445/1322] drivers: net: ixgb: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index 5a713199653c..1e6ec2277d54 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -508,9 +508,8 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->part_num = ixgb_get_ee_pba_number(&adapter->hw); - init_timer(&adapter->watchdog_timer); - adapter->watchdog_timer.function = ixgb_watchdog; - adapter->watchdog_timer.data = (unsigned long)adapter; + setup_timer(&adapter->watchdog_timer, ixgb_watchdog, + (unsigned long)adapter); INIT_WORK(&adapter->tx_timeout_task, ixgb_tx_timeout_task); From 88e8aa172596d2eda971d3553d52a8e877805e90 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:36 +0530 Subject: [PATCH 0446/1322] drivers: net: sundance: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/dlink/sundance.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index 2704bcf023be..6ca9e981ad57 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -913,10 +913,8 @@ static int netdev_open(struct net_device *dev) ioread16(ioaddr + MACCtrl1), ioread16(ioaddr + MACCtrl0)); /* Set the timer to check for link beat. */ - init_timer(&np->timer); + setup_timer(&np->timer, netdev_timer, (unsigned long)dev); np->timer.expires = jiffies + 3*HZ; - np->timer.data = (unsigned long)dev; - np->timer.function = netdev_timer; /* timer handler */ add_timer(&np->timer); /* Enable interrupts by setting the interrupt mask. */ From 4896ad68ec3803d01a0d9fead64451377ea1ec5f Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:37 +0530 Subject: [PATCH 0447/1322] drivers: net: tg3: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 656e6af70f0a..d8d5f207c759 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -11087,9 +11087,7 @@ static void tg3_timer_init(struct tg3 *tp) tp->asf_multiplier = (HZ / tp->timer_offset) * TG3_FW_UPDATE_FREQ_SEC; - init_timer(&tp->timer); - tp->timer.data = (unsigned long) tp; - tp->timer.function = tg3_timer; + setup_timer(&tp->timer, tg3_timer, (unsigned long)tp); } static void tg3_timer_start(struct tg3 *tp) From f347bd6b5f6599fe67d040758947dbf9bdd89195 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:38 +0530 Subject: [PATCH 0448/1322] drivers: net: sdla: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/wan/sdla.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 236c62538036..0cc48902dbb9 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -1617,10 +1617,8 @@ static void setup_sdla(struct net_device *dev) flp->deassoc = sdla_deassoc; flp->dlci_conf = sdla_dlci_conf; - init_timer(&flp->timer); + setup_timer(&flp->timer, sdla_poll, (unsigned long)dev); flp->timer.expires = 1; - flp->timer.data = (unsigned long) dev; - flp->timer.function = sdla_poll; } static struct net_device *sdla; From dffec39fb185837468486b6bbd00b18f35e38c82 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:39 +0530 Subject: [PATCH 0449/1322] drivers: net: cisco_hdlc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/wan/hdlc_cisco.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index a408abc25512..c696d42f4502 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -293,10 +293,8 @@ static void cisco_start(struct net_device *dev) st->up = st->txseq = st->rxseq = 0; spin_unlock_irqrestore(&st->lock, flags); - init_timer(&st->timer); + setup_timer(&st->timer, cisco_timer, (unsigned long)dev); st->timer.expires = jiffies + HZ; /* First poll after 1 s */ - st->timer.function = cisco_timer; - st->timer.data = (unsigned long)dev; add_timer(&st->timer); } From 18df06c2cafea69b19ab5f274865b89c3ab0f715 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:40 +0530 Subject: [PATCH 0450/1322] drivers: net: slip: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/slip/slip.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 436dd78c396a..eb8a18991d8c 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -763,12 +763,8 @@ static struct slip *sl_alloc(dev_t line) sl->mode = SL_MODE_DEFAULT; #ifdef CONFIG_SLIP_SMART /* initialize timer_list struct */ - init_timer(&sl->keepalive_timer); - sl->keepalive_timer.data = (unsigned long)sl; - sl->keepalive_timer.function = sl_keepalive; - init_timer(&sl->outfill_timer); - sl->outfill_timer.data = (unsigned long)sl; - sl->outfill_timer.function = sl_outfill; + setup_timer(&sl->keepalive_timer, sl_keepalive, (unsigned long)sl); + setup_timer(&sl->outfill_timer, sl_outfill, (unsigned long)sl); #endif slip_devs[i] = dev; return sl; From e998092f7b7c2ce3199db91177aa43397a3f76d7 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:41 +0530 Subject: [PATCH 0451/1322] drivers: net: spider_net: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/toshiba/spider_net.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c index cec9e70ab995..a913538d3213 100644 --- a/drivers/net/ethernet/toshiba/spider_net.c +++ b/drivers/net/ethernet/toshiba/spider_net.c @@ -2256,16 +2256,14 @@ spider_net_setup_netdev(struct spider_net_card *card) pci_set_drvdata(card->pdev, netdev); - init_timer(&card->tx_timer); - card->tx_timer.function = - (void (*)(unsigned long)) spider_net_cleanup_tx_ring; - card->tx_timer.data = (unsigned long) card; + setup_timer(&card->tx_timer, + (void(*)(unsigned long))spider_net_cleanup_tx_ring, + (unsigned long)card); netdev->irq = card->pdev->irq; card->aneg_count = 0; - init_timer(&card->aneg_timer); - card->aneg_timer.function = spider_net_link_phy; - card->aneg_timer.data = (unsigned long) card; + setup_timer(&card->aneg_timer, spider_net_link_phy, + (unsigned long)card); netif_napi_add(netdev, &card->napi, spider_net_poll, SPIDER_NET_NAPI_WEIGHT); From f891f36603dff38a15390e5c950e8ac66f73352b Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:42 +0530 Subject: [PATCH 0452/1322] drivers: net: sun: cassini: use setup_timer() helper. 
Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/cassini.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index 382993c1561c..a74d78f64af9 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -5039,10 +5039,7 @@ static int cas_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) spin_lock_init(&cp->stat_lock[N_TX_RINGS]); mutex_init(&cp->pm_mutex); - init_timer(&cp->link_timer); - cp->link_timer.function = cas_link_timer; - cp->link_timer.data = (unsigned long) cp; - + setup_timer(&cp->link_timer, cas_link_timer, (unsigned long)cp); #if 1 /* Just in case the implementation of atomic operations * change so that an explicit initialization is necessary. From ba98e9e2eb32ba8f604f947fa656ef2071d22fa0 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:43 +0530 Subject: [PATCH 0453/1322] drivers: net: natsemi: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/natsemi/natsemi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index 18af2a23a933..dedeacd0bbca 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -1571,10 +1571,8 @@ static int netdev_open(struct net_device *dev) dev->name, (int)readl(ioaddr + ChipCmd)); /* Set the timer to check for link beat. */ - init_timer(&np->timer); + setup_timer(&np->timer, netdev_timer, (unsigned long)dev); np->timer.expires = round_jiffies(jiffies + NATSEMI_TIMER_FREQ); - np->timer.data = (unsigned long)dev; - np->timer.function = netdev_timer; /* timer handler */ add_timer(&np->timer); return 0; From 3e436a25fcca449cbb044c7483116772873f3e28 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:44 +0530 Subject: [PATCH 0454/1322] drivers: net: winbond-840: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/winbond-840.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c index 32d7229544fa..6f88d687b6d2 100644 --- a/drivers/net/ethernet/dec/tulip/winbond-840.c +++ b/drivers/net/ethernet/dec/tulip/winbond-840.c @@ -655,10 +655,8 @@ static int netdev_open(struct net_device *dev) netdev_dbg(dev, "Done netdev_open()\n"); /* Set the timer to check for link beat. */ - init_timer(&np->timer); + setup_timer(&np->timer, netdev_timer, (unsigned long)dev); np->timer.expires = jiffies + 1*HZ; - np->timer.data = (unsigned long)dev; - np->timer.function = netdev_timer; /* timer handler */ add_timer(&np->timer); return 0; out_err: From 7afd516ff75e967873d7bdcb8f9b1180c2400b57 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:45 +0530 Subject: [PATCH 0455/1322] drivers: net: enic: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cisco/enic/enic_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index d24ee1ad3be1..4a11baffe02d 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -2846,9 +2846,8 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Setup notification timer, HW reset task, and wq locks */ - init_timer(&enic->notify_timer); - enic->notify_timer.function = enic_notify_timer; - enic->notify_timer.data = (unsigned long)enic; + setup_timer(&enic->notify_timer, enic_notify_timer, + (unsigned long)enic); enic_set_rx_coal_setting(enic); INIT_WORK(&enic->reset, enic_reset); From c41326fbb3a7d64c329267d20c58dd9cc8f22a47 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:46 +0530 Subject: [PATCH 0456/1322] drivers: net: bnx2: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index e3af1f3cb61f..b3055a76dfbf 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -8462,10 +8462,8 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bnx2_set_default_link(bp); bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX; - init_timer(&bp->timer); + setup_timer(&bp->timer, bnx2_timer, (unsigned long)bp); bp->timer.expires = RUN_AT(BNX2_TIMER_INTERVAL); - bp->timer.data = (unsigned long) bp; - bp->timer.function = bnx2_timer; #ifdef BCM_CNIC if (bnx2_shmem_rd(bp, BNX2_ISCSI_INITIATOR) & BNX2_ISCSI_INITIATOR_EN) From f7c11175bdece128c0b4c4a43e6781b9216db4d9 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:47 +0530 Subject: [PATCH 0457/1322] drivers: net: xen-netback: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index ee8ed9da00ad..dcfcb153918c 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -520,8 +520,7 @@ int xenvif_init_queue(struct xenvif_queue *queue) queue->credit_bytes = queue->remaining_credit = ~0UL; queue->credit_usec = 0UL; - init_timer(&queue->credit_timeout); - queue->credit_timeout.function = xenvif_tx_credit_callback; + setup_timer(&queue->credit_timeout, xenvif_tx_credit_callback, 0UL); queue->credit_window_start = get_jiffies_64(); queue->rx_queue_max = XENVIF_RX_QUEUE_BYTES; From 55d3cef4ee68d7d54c457c7deeb0e393dd386fdc Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:48 +0530 Subject: [PATCH 0458/1322] drivers: net: atmel: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/wireless/atmel/atmel.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c index b68436b23a63..e816d53c2c05 100644 --- a/drivers/net/wireless/atmel/atmel.c +++ b/drivers/net/wireless/atmel/atmel.c @@ -1579,11 +1579,10 @@ struct net_device *init_atmel_card(unsigned short irq, unsigned long port, priv->default_beacon_period = priv->beacon_period = 100; priv->listen_interval = 1; - init_timer(&priv->management_timer); + setup_timer(&priv->management_timer, atmel_management_timer, + (unsigned long)dev); spin_lock_init(&priv->irqlock); spin_lock_init(&priv->timerlock); - priv->management_timer.function = atmel_management_timer; - priv->management_timer.data = (unsigned long) dev; dev->netdev_ops = &atmel_netdev_ops; dev->wireless_handlers = &atmel_handler_def; From 0625d739cc61d186b65b8619ee8c41da2fd24894 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:49 +0530 Subject: [PATCH 0459/1322] drivers: net: hippi: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/hippi/rrunner.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c index 71ddadbf2368..76cc140774a2 100644 --- a/drivers/net/hippi/rrunner.c +++ b/drivers/net/hippi/rrunner.c @@ -1229,10 +1229,8 @@ static int rr_open(struct net_device *dev) /* Set the timer to switch to check for link beat and perhaps switch to an alternate media type. */ - init_timer(&rrpriv->timer); + setup_timer(&rrpriv->timer, rr_timer, (unsigned long)dev); rrpriv->timer.expires = RUN_AT(5*HZ); /* 5 sec. watchdog */ - rrpriv->timer.data = (unsigned long)dev; - rrpriv->timer.function = rr_timer; /* timer handler */ add_timer(&rrpriv->timer); netif_start_queue(dev); From 32db034501d37c60c433d24e9faa41c1fa3136e5 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:50 +0530 Subject: [PATCH 0460/1322] drivers: net: smsc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/smsc/epic100.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index 6a0e1d4b597c..2a9724898fcf 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -739,10 +739,8 @@ static int epic_open(struct net_device *dev) /* Set the timer to switch to check for link beat and perhaps switch to an alternate media type. */ - init_timer(&ep->timer); + setup_timer(&ep->timer, epic_timer, (unsigned long)dev); ep->timer.expires = jiffies + 3*HZ; - ep->timer.data = (unsigned long)dev; - ep->timer.function = epic_timer; /* timer handler */ add_timer(&ep->timer); return rc; From d4d8db71db1bf602623e859e6c3e700b604c2072 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:51 +0530 Subject: [PATCH 0461/1322] drivers: net: qlogic: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qla3xxx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 2991179c2fd0..05479d435469 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -3891,10 +3891,8 @@ static int ql3xxx_probe(struct pci_dev *pdev, INIT_DELAYED_WORK(&qdev->tx_timeout_work, ql_tx_timeout_work); INIT_DELAYED_WORK(&qdev->link_state_work, ql_link_state_machine_work); - init_timer(&qdev->adapter_timer); - qdev->adapter_timer.function = ql3xxx_timer; + setup_timer(&qdev->adapter_timer, ql3xxx_timer, (unsigned long)qdev); qdev->adapter_timer.expires = jiffies + HZ * 2; /* two second delay */ - qdev->adapter_timer.data = (unsigned long)qdev; if (!cards_found) { pr_alert("%s\n", DRV_STRING); From 4a9c07ed71c2b8d755ee585264f80dd2d82a8066 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:52 +0530 Subject: [PATCH 0462/1322] drivers: net: e1000e: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e1000e/netdev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 327dfe5bedc0..8436c5f2c3e8 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -7252,13 +7252,10 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_eeprom; } - init_timer(&adapter->watchdog_timer); - adapter->watchdog_timer.function = e1000_watchdog; - adapter->watchdog_timer.data = (unsigned long)adapter; - - init_timer(&adapter->phy_info_timer); - adapter->phy_info_timer.function = e1000_update_phy_info; - adapter->phy_info_timer.data = (unsigned long)adapter; + setup_timer(&adapter->watchdog_timer, e1000_watchdog, + (unsigned long)adapter); + setup_timer(&adapter->phy_info_timer, e1000_update_phy_info, + (unsigned long)adapter); INIT_WORK(&adapter->reset_task, e1000_reset_task); INIT_WORK(&adapter->watchdog_task, e1000_watchdog_task); From af25c31d4bf4bc7d4dea4c093af8b7704ef24f81 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:53 +0530 Subject: [PATCH 0463/1322] drivers: net: amd: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/sunlance.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c index 291ca5187f12..0183ffb9d3ba 100644 --- a/drivers/net/ethernet/amd/sunlance.c +++ b/drivers/net/ethernet/amd/sunlance.c @@ -1459,9 +1459,8 @@ no_link_test: * can occur from interrupts (ex. IPv6). So we * use a timer to try again later when necessary. 
-DaveM */ - init_timer(&lp->multicast_timer); - lp->multicast_timer.data = (unsigned long) dev; - lp->multicast_timer.function = lance_set_multicast_retry; + setup_timer(&lp->multicast_timer, lance_set_multicast_retry, + (unsigned long)dev); if (register_netdev(dev)) { printk(KERN_ERR "SunLance: Cannot register device.\n"); From cec55a92a98fb3205cc4f127d9b70b5a965f3bd7 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:54 +0530 Subject: [PATCH 0464/1322] drivers: net: amd8111e: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/amd8111e.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 7b5df562f30f..7f22af6e37e0 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -1883,9 +1883,8 @@ static int amd8111e_probe_one(struct pci_dev *pdev, /* Initialize software ipg timer */ if(lp->options & OPTION_DYN_IPG_ENABLE){ - init_timer(&lp->ipg_data.ipg_timer); - lp->ipg_data.ipg_timer.data = (unsigned long) dev; - lp->ipg_data.ipg_timer.function = (void *)&amd8111e_config_ipg; + setup_timer(&lp->ipg_data.ipg_timer, + (void *)&amd8111e_config_ipg, (unsigned long)dev); lp->ipg_data.ipg_timer.expires = jiffies + IPG_CONVERGE_JIFFIES; lp->ipg_data.ipg = DEFAULT_IPG; From fb4de582a22282fa0f9665fd363dfd3e073c40f6 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:55 +0530 Subject: [PATCH 0465/1322] drivers: net: eql: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/eql.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/eql.c b/drivers/net/eql.c index fe13bfea30ac..fccce4b47778 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -178,10 +178,8 @@ static void __init eql_setup(struct net_device *dev) { equalizer_t *eql = netdev_priv(dev); - init_timer(&eql->timer); - eql->timer.data = (unsigned long) eql; + setup_timer(&eql->timer, eql_timer, (unsigned long)eql); eql->timer.expires = jiffies + EQL_DEFAULT_RESCHED_IVAL; - eql->timer.function = eql_timer; spin_lock_init(&eql->queue.lock); INIT_LIST_HEAD(&eql->queue.all_slaves); From 82f5d72da8755cf73387a007ad323a86ea0b5663 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:56 +0530 Subject: [PATCH 0466/1322] drivers: net: can: usb: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/can/usb/peak_usb/pcan_usb.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 838545ce468d..7e10dbdded28 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -798,9 +798,8 @@ static int pcan_usb_init(struct peak_usb_device *dev) int err; /* initialize a timer needed to wait for hardware restart */ - init_timer(&pdev->restart_timer); - pdev->restart_timer.function = pcan_usb_restart; - pdev->restart_timer.data = (unsigned long)dev; + setup_timer(&pdev->restart_timer, pcan_usb_restart, + (unsigned long)dev); /* * explicit use of dev_xxx() instead of netdev_xxx() here: From 9f5ca8816b5ccc1fd13baf46e012fb5228e57763 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:57 +0530 Subject: [PATCH 0467/1322] drivers: net: can: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/can/grcan.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index a7be12d9a139..8570cfdaea75 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -1626,13 +1626,11 @@ static int grcan_setup_netdev(struct platform_device *ofdev, spin_lock_init(&priv->lock); if (priv->need_txbug_workaround) { - init_timer(&priv->rr_timer); - priv->rr_timer.function = grcan_running_reset; - priv->rr_timer.data = (unsigned long)dev; + setup_timer(&priv->rr_timer, grcan_running_reset, + (unsigned long)dev); - init_timer(&priv->hang_timer); - priv->hang_timer.function = grcan_initiate_running_reset; - priv->hang_timer.data = (unsigned long)dev; + setup_timer(&priv->hang_timer, grcan_initiate_running_reset, + (unsigned long)dev); } netif_napi_add(dev, &priv->napi, grcan_poll, GRCAN_NAPI_WEIGHT); From fe9bfe207e200f901361681bf1497386068f1df2 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:58 +0530 Subject: [PATCH 0468/1322] drivers: net: arcnet: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/arcnet/arcnet.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index fcfccbb3d9a2..13236b2cdf13 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -450,9 +450,7 @@ struct net_device *alloc_arcdev(const char *name) lp->dev = dev; spin_lock_init(&lp->lock); - init_timer(&lp->timer); - lp->timer.data = (unsigned long) dev; - lp->timer.function = arcnet_timer; + setup_timer(&lp->timer, arcnet_timer, (unsigned long)dev); } return dev; From b9496b6b9100d824033390312dd64244f959a156 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:34:59 +0530 Subject: [PATCH 0469/1322] drivers: net: ath6kl: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/wireless/ath/ath6kl/txrx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath6kl/txrx.c b/drivers/net/wireless/ath/ath6kl/txrx.c index e6b2517e6334..4e5cc2b7045a 100644 --- a/drivers/net/wireless/ath/ath6kl/txrx.c +++ b/drivers/net/wireless/ath/ath6kl/txrx.c @@ -1753,9 +1753,7 @@ void aggr_conn_init(struct ath6kl_vif *vif, struct aggr_info *aggr_info, aggr_conn->aggr_sz = AGGR_SZ_DEFAULT; aggr_conn->dev = vif->ndev; - init_timer(&aggr_conn->timer); - aggr_conn->timer.function = aggr_timeout; - aggr_conn->timer.data = (unsigned long) aggr_conn; + setup_timer(&aggr_conn->timer, aggr_timeout, (unsigned long)aggr_conn); aggr_conn->aggr_info = aggr_info; aggr_conn->timer_scheduled = false; From 6d2bcc14f5731e9357f15d41f7c5677a72e354f9 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:00 +0530 Subject: [PATCH 0470/1322] drivers: net: sun: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sungem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index fa607d062cb3..b75ab8f44968 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -2910,9 +2910,7 @@ static int gem_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) gp->msg_enable = DEFAULT_MSG; - init_timer(&gp->link_timer); - gp->link_timer.function = gem_link_timer; - gp->link_timer.data = (unsigned long) gp; + setup_timer(&gp->link_timer, gem_link_timer, (unsigned long)gp); INIT_WORK(&gp->reset_task, gem_reset_task); From ac803d1c5f62937dc142a35dafd180f09b9f9c83 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:01 +0530 Subject: [PATCH 0471/1322] drivers: net: sis900: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/sis/sis900.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index 40bd88362e3d..cb61247b0526 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -1065,10 +1065,8 @@ sis900_open(struct net_device *net_dev) /* Set the timer to switch to check for link beat and perhaps switch to an alternate media type. */ - init_timer(&sis_priv->timer); + setup_timer(&sis_priv->timer, sis900_timer, (unsigned long)net_dev); sis_priv->timer.expires = jiffies + HZ; - sis_priv->timer.data = (unsigned long)net_dev; - sis_priv->timer.function = sis900_timer; add_timer(&sis_priv->timer); return 0; From f1ce56ce5d2a18f5d61ec335aaf5aad978fa9cd6 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:02 +0530 Subject: [PATCH 0472/1322] drivers: net: packetengines: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/packetengines/yellowfin.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c index fa7770da6ef8..33c241f52a71 100644 --- a/drivers/net/ethernet/packetengines/yellowfin.c +++ b/drivers/net/ethernet/packetengines/yellowfin.c @@ -632,10 +632,8 @@ static int yellowfin_open(struct net_device *dev) } /* Set the timer to check for link beat. */ - init_timer(&yp->timer); + setup_timer(&yp->timer, yellowfin_timer, (unsigned long)dev); yp->timer.expires = jiffies + 3*HZ; - yp->timer.data = (unsigned long)dev; - yp->timer.function = yellowfin_timer; /* timer handler */ add_timer(&yp->timer); out: return rc; From 590deff6e7a898fadf3f1fd6425937d481913fb1 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:03 +0530 Subject: [PATCH 0473/1322] drivers: net: mlx5: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 8aea0a065e56..a89a68ce53ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -320,15 +320,13 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; - init_timer(&health->timer); + setup_timer(&health->timer, poll_health, (unsigned long)dev); health->sick = 0; clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); health->health = &dev->iseg->health; health->health_counter = &dev->iseg->health_counter; - health->timer.data = (unsigned long)dev; - health->timer.function = poll_health; health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL); add_timer(&health->timer); } From d2a0012e7632a588683ad6320529659c4cd27131 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:04 +0530 Subject: [PATCH 0474/1322] drivers: net: mlx4: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 53daa6ca5d83..de0f9e5e42ec 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -277,7 +277,7 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) phys_addr_t addr; INIT_LIST_HEAD(&priv->catas_err.list); - init_timer(&priv->catas_err.timer); + setup_timer(&priv->catas_err.timer, poll_catas, (unsigned long)dev); priv->catas_err.map = NULL; if (!mlx4_is_slave(dev)) { @@ -293,8 +293,6 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) } } - priv->catas_err.timer.data = (unsigned long) dev; - priv->catas_err.timer.function = poll_catas; priv->catas_err.timer.expires = round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); add_timer(&priv->catas_err.timer); From 636873890c63c892fca5ccab8af3a9f3607eb1fc Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:05 +0530 Subject: [PATCH 0475/1322] drivers: net: pxa168: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/pxa168_eth.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 993724959a7c..91b1c154fd29 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1496,9 +1496,8 @@ static int pxa168_eth_probe(struct platform_device *pdev) netif_napi_add(dev, &pep->napi, pxa168_rx_poll, pep->rx_ring_size); memset(&pep->timeout, 0, sizeof(struct timer_list)); - init_timer(&pep->timeout); - pep->timeout.function = rxq_refill_timer_wrapper; - pep->timeout.data = (unsigned long)pep; + setup_timer(&pep->timeout, rxq_refill_timer_wrapper, + (unsigned long)pep); pep->smi_bus = mdiobus_alloc(); if (!pep->smi_bus) { From 34b0cf069d174b2615769102d75b4ce687addeb9 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:06 +0530 Subject: [PATCH 0476/1322] drivers: net: fealnx: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/fealnx.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c index e92859dab7ae..c8982313d850 100644 --- a/drivers/net/ethernet/fealnx.c +++ b/drivers/net/ethernet/fealnx.c @@ -909,17 +909,13 @@ static int netdev_open(struct net_device *dev) printk(KERN_DEBUG "%s: Done netdev_open().\n", dev->name); /* Set the timer to check for link beat. */ - init_timer(&np->timer); + setup_timer(&np->timer, netdev_timer, (unsigned long)dev); np->timer.expires = RUN_AT(3 * HZ); - np->timer.data = (unsigned long) dev; - np->timer.function = netdev_timer; /* timer handler */ add_timer(&np->timer); - init_timer(&np->reset_timer); - np->reset_timer.data = (unsigned long) dev; - np->reset_timer.function = reset_timer; + setup_timer(&np->reset_timer, reset_timer, (unsigned long)dev); np->reset_timer_armed = 0; return rc; } From a76aec2ac51f8a74659cff9b19f712e8fb984393 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:07 +0530 Subject: [PATCH 0477/1322] drivers: net: dmfe: use setup_timer() helper. 
Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/dmfe.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c index 07e10a45beaa..6585f737d08b 100644 --- a/drivers/net/ethernet/dec/tulip/dmfe.c +++ b/drivers/net/ethernet/dec/tulip/dmfe.c @@ -596,10 +596,8 @@ static int dmfe_open(struct net_device *dev) netif_wake_queue(dev); /* set and active a timer process */ - init_timer(&db->timer); + setup_timer(&db->timer, dmfe_timer, (unsigned long)dev); db->timer.expires = DMFE_TIMER_WUT + HZ * 2; - db->timer.data = (unsigned long)dev; - db->timer.function = dmfe_timer; add_timer(&db->timer); return 0; From 6c43824477c2ac722325ba460c2ce683c48fb76b Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:08 +0530 Subject: [PATCH 0478/1322] drivers: net: bnxt: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index aacec8bc19d5..c25f5b555adf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7190,9 +7190,7 @@ static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev) bp->stats_coal_ticks = BNXT_DEF_STATS_COAL_TICKS; - init_timer(&bp->timer); - bp->timer.data = (unsigned long)bp; - bp->timer.function = bnxt_timer; + setup_timer(&bp->timer, bnxt_timer, (unsigned long)bp); bp->current_interval = BNXT_TIMER_INTERVAL; clear_bit(BNXT_STATE_OPEN, &bp->state); From cac40a458ae68b41955cc55a80fc9e0166e429b1 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:09 +0530 Subject: [PATCH 0479/1322] drivers: net: amd: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/a2065.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amd/a2065.c b/drivers/net/ethernet/amd/a2065.c index e22f976a0d18..998d30e050a6 100644 --- a/drivers/net/ethernet/amd/a2065.c +++ b/drivers/net/ethernet/amd/a2065.c @@ -733,10 +733,9 @@ static int a2065_init_one(struct zorro_dev *z, dev->watchdog_timeo = 5*HZ; dev->dma = 0; - init_timer(&priv->multicast_timer); - priv->multicast_timer.data = (unsigned long) dev; - priv->multicast_timer.function = - (void (*)(unsigned long))lance_set_multicast; + setup_timer(&priv->multicast_timer, + (void(*)(unsigned long))lance_set_multicast, + (unsigned long)dev); err = register_netdev(dev); if (err) { From 7c214194de36217c06b0dc4ed64d9cf251b261df Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:10 +0530 Subject: [PATCH 0480/1322] drivers: net: adi: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/adi/bfin_mac.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/adi/bfin_mac.c b/drivers/net/ethernet/adi/bfin_mac.c index a251de8d9a91..0658cde1586a 100644 --- a/drivers/net/ethernet/adi/bfin_mac.c +++ b/drivers/net/ethernet/adi/bfin_mac.c @@ -1650,9 +1650,8 @@ static int bfin_mac_probe(struct platform_device *pdev) ndev->netdev_ops = &bfin_mac_netdev_ops; ndev->ethtool_ops = &bfin_mac_ethtool_ops; - init_timer(&lp->tx_reclaim_timer); - lp->tx_reclaim_timer.data = (unsigned long)lp; - lp->tx_reclaim_timer.function = tx_reclaim_skb_timeout; + setup_timer(&lp->tx_reclaim_timer, tx_reclaim_skb_timeout, + (unsigned long)lp); lp->flags = 0; netif_napi_add(ndev, &lp->napi, bfin_mac_poll, CONFIG_BFIN_RX_DESC_NUM); From 13e96b93ff5f1093592419e1f84962d4a266bd3e Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:11 +0530 Subject: [PATCH 0481/1322] drivers: net: can: sja1000: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/can/sja1000/peak_pcmcia.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/can/sja1000/peak_pcmcia.c b/drivers/net/can/sja1000/peak_pcmcia.c index dd56133cc461..4b8758e10bd4 100644 --- a/drivers/net/can/sja1000/peak_pcmcia.c +++ b/drivers/net/can/sja1000/peak_pcmcia.c @@ -692,9 +692,7 @@ static int pcan_probe(struct pcmcia_device *pdev) } /* init the timer which controls the leds */ - init_timer(&card->led_timer); - card->led_timer.function = pcan_led_timer; - card->led_timer.data = (unsigned long)card; + setup_timer(&card->led_timer, pcan_led_timer, (unsigned long)card); /* request the given irq */ err = request_irq(pdev->irq, &pcan_isr, IRQF_SHARED, PCC_NAME, card); From 7890d5341999968b94dbdf9e30ad32f37f82ea10 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:12 +0530 Subject: [PATCH 0482/1322] drivers: net: caif: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/caif/caif_hsi.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c index 438966bf51c2..fed75e75207a 100644 --- a/drivers/net/caif/caif_hsi.c +++ b/drivers/net/caif/caif_hsi.c @@ -1211,17 +1211,14 @@ static int cfhsi_open(struct net_device *ndev) init_waitqueue_head(&cfhsi->flush_fifo_wait); /* Setup the inactivity timer. */ - init_timer(&cfhsi->inactivity_timer); - cfhsi->inactivity_timer.data = (unsigned long)cfhsi; - cfhsi->inactivity_timer.function = cfhsi_inactivity_tout; + setup_timer(&cfhsi->inactivity_timer, cfhsi_inactivity_tout, + (unsigned long)cfhsi); /* Setup the slowpath RX timer. */ - init_timer(&cfhsi->rx_slowpath_timer); - cfhsi->rx_slowpath_timer.data = (unsigned long)cfhsi; - cfhsi->rx_slowpath_timer.function = cfhsi_rx_slowpath; + setup_timer(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath, + (unsigned long)cfhsi); /* Setup the aggregation timer. */ - init_timer(&cfhsi->aggregation_timer); - cfhsi->aggregation_timer.data = (unsigned long)cfhsi; - cfhsi->aggregation_timer.function = cfhsi_aggregation_tout; + setup_timer(&cfhsi->aggregation_timer, cfhsi_aggregation_tout, + (unsigned long)cfhsi); /* Activate HSI interface. 
*/ res = cfhsi->ops->cfhsi_up(cfhsi->ops); From ba7400ed88adcd07af0dc004be1cf5ab2443cb44 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:13 +0530 Subject: [PATCH 0483/1322] drivers: net: appletalk: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/appletalk/ltpc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c index ac755d2950a6..e4aa374caa4d 100644 --- a/drivers/net/appletalk/ltpc.c +++ b/drivers/net/appletalk/ltpc.c @@ -1165,9 +1165,7 @@ struct net_device * __init ltpc_probe(void) dev->irq = 0; /* polled mode -- 20 times per second */ /* this is really, really slow... should it poll more often? */ - init_timer(<pc_timer); - ltpc_timer.function=ltpc_poll; - ltpc_timer.data = (unsigned long) dev; + setup_timer(<pc_timer, ltpc_poll, (unsigned long)dev); ltpc_timer.expires = jiffies + HZ/20; add_timer(<pc_timer); From f2803332f2ef86ad29e247350f697bad4eb1aa93 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:14 +0530 Subject: [PATCH 0484/1322] drivers: net: dscc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/wan/dscc4.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index a043fb1367bd..64f176496da4 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -1127,10 +1127,8 @@ static int dscc4_open(struct net_device *dev) done: netif_start_queue(dev); - init_timer(&dpriv->timer); + setup_timer(&dpriv->timer, dscc4_timer, (unsigned long)dev); dpriv->timer.expires = jiffies + 10*HZ; - dpriv->timer.data = (unsigned long)dev; - dpriv->timer.function = dscc4_timer; add_timer(&dpriv->timer); netif_carrier_on(dev); From 8d81fe753c1572573be6d529a44b1506e3b0425d Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:15 +0530 Subject: [PATCH 0485/1322] drivers: net: hdlc_ppp: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/wan/hdlc_ppp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c index 0d2e00ece804..c7721c729541 100644 --- a/drivers/net/wan/hdlc_ppp.c +++ b/drivers/net/wan/hdlc_ppp.c @@ -610,9 +610,7 @@ static void ppp_start(struct net_device *dev) for (i = 0; i < IDX_COUNT; i++) { struct proto *proto = &ppp->protos[i]; proto->dev = dev; - init_timer(&proto->timer); - proto->timer.function = ppp_timer; - proto->timer.data = (unsigned long)proto; + setup_timer(&proto->timer, ppp_timer, (unsigned long)proto); proto->state = CLOSED; } ppp->protos[IDX_LCP].pid = PID_LCP; From 7e47fc264e2837d971720ccf1f8b35648c2626ea Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:16 +0530 Subject: [PATCH 0486/1322] drivers: net: hamradio: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/hamradio/6pack.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 021a8ec411ab..97fe8dfb602d 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -623,9 +623,7 @@ static int sixpack_open(struct tty_struct *tty) netif_start_queue(dev); - init_timer(&sp->tx_t); - sp->tx_t.function = sp_xmit_on_air; - sp->tx_t.data = (unsigned long) sp; + setup_timer(&sp->tx_t, sp_xmit_on_air, (unsigned long)sp); init_timer(&sp->resync_t); From 9d90725f33ebd0f30790c26eb5e9e0a098567895 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:17 +0530 Subject: [PATCH 0487/1322] drivers: net: cpsw_ale: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index ddd43e09111e..cd1185e66133 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -859,9 +859,7 @@ void cpsw_ale_start(struct cpsw_ale *ale) cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1); cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1); - init_timer(&ale->timer); - ale->timer.data = (unsigned long)ale; - ale->timer.function = cpsw_ale_timer; + setup_timer(&ale->timer, cpsw_ale_timer, (unsigned long)ale); if (ale->ageout) { ale->timer.expires = jiffies + ale->ageout; add_timer(&ale->timer); From 997decfb6aeaa9be41ff557741845bb9fb4bf5bc Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:18 +0530 Subject: [PATCH 0488/1322] drivers: net: stmmac: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 1763e48c84e2..f41661a04f23 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2217,10 +2217,8 @@ static void stmmac_init_tx_coalesce(struct stmmac_priv *priv) { priv->tx_coal_frames = STMMAC_TX_FRAMES; priv->tx_coal_timer = STMMAC_COAL_TX_TIMER; - init_timer(&priv->txtimer); + setup_timer(&priv->txtimer, stmmac_tx_timer, (unsigned long)priv); priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer); - priv->txtimer.data = (unsigned long)priv; - priv->txtimer.function = stmmac_tx_timer; add_timer(&priv->txtimer); } From 9be5813a29e5b3379db30d00319682fe965febe5 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:19 +0530 Subject: [PATCH 0489/1322] drivers: net: packetengines: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/packetengines/hamachi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c index 482b85e4d665..77bc7cca8980 100644 --- a/drivers/net/ethernet/packetengines/hamachi.c +++ b/drivers/net/ethernet/packetengines/hamachi.c @@ -979,10 +979,8 @@ static int hamachi_open(struct net_device *dev) dev->name, readw(ioaddr + RxStatus), readw(ioaddr + TxStatus)); } /* Set the timer to check for link beat. */ - init_timer(&hmp->timer); + setup_timer(&hmp->timer, hamachi_timer, (unsigned long)dev); hmp->timer.expires = RUN_AT((24*HZ)/10); /* 2.4 sec. */ - hmp->timer.data = (unsigned long)dev; - hmp->timer.function = hamachi_timer; /* timer handler */ add_timer(&hmp->timer); return 0; From 7d8fb3a7742513bb5434be704e0b0bf785032f45 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:20 +0530 Subject: [PATCH 0490/1322] drivers: net: i40evf: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 1825d956bb00..c243f9da95ae 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -2686,9 +2686,8 @@ static void i40evf_init_task(struct work_struct *work) ether_addr_copy(netdev->perm_addr, adapter->hw.mac.addr); } - init_timer(&adapter->watchdog_timer); - adapter->watchdog_timer.function = &i40evf_watchdog_timer; - adapter->watchdog_timer.data = (unsigned long)adapter; + setup_timer(&adapter->watchdog_timer, &i40evf_watchdog_timer, + (unsigned long)adapter); mod_timer(&adapter->watchdog_timer, jiffies + 1); adapter->tx_desc_count = I40EVF_DEFAULT_TXD; From 99e3aa1ea47d0804a69ea7948ddd1251bcd1a635 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:21 +0530 Subject: [PATCH 0491/1322] drivers: net: uli526x: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/uli526x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c index 7fc248efc4ba..5fbbc0caba99 100644 --- a/drivers/net/ethernet/dec/tulip/uli526x.c +++ b/drivers/net/ethernet/dec/tulip/uli526x.c @@ -491,10 +491,8 @@ static int uli526x_open(struct net_device *dev) netif_wake_queue(dev); /* set and active a timer process */ - init_timer(&db->timer); + setup_timer(&db->timer, uli526x_timer, (unsigned long)dev); db->timer.expires = ULI526X_TIMER_WUT + HZ * 2; - db->timer.data = (unsigned long)dev; - db->timer.function = uli526x_timer; add_timer(&db->timer); return 0; From 570ba3e82befbba7649e459fedc4aab27510ef44 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:22 +0530 Subject: [PATCH 0492/1322] drivers: net: enic: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cisco/enic/enic_clsf.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.h b/drivers/net/ethernet/cisco/enic/enic_clsf.h index 6aa9f89d073b..4bfbf25f9ddc 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.h +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.h @@ -19,9 +19,8 @@ void enic_flow_may_expire(unsigned long data); static inline void enic_rfs_timer_start(struct enic *enic) { - init_timer(&enic->rfs_h.rfs_may_expire); - enic->rfs_h.rfs_may_expire.function = enic_flow_may_expire; - enic->rfs_h.rfs_may_expire.data = (unsigned long)enic; + setup_timer(&enic->rfs_h.rfs_may_expire, enic_flow_may_expire, + (unsigned long)enic); mod_timer(&enic->rfs_h.rfs_may_expire, jiffies + HZ/4); } From 66f06890305eb2c8200cefbc3d6405ff6baef47e Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:23 +0530 Subject: [PATCH 0493/1322] drivers: net: cxgb: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb/sge.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index 0f13a7f7c1d3..75e439918700 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -2075,9 +2075,8 @@ struct sge *t1_sge_create(struct adapter *adapter, struct sge_params *p) goto nomem_port; } - init_timer(&sge->tx_reclaim_timer); - sge->tx_reclaim_timer.data = (unsigned long)sge; - sge->tx_reclaim_timer.function = sge_tx_reclaim_cb; + setup_timer(&sge->tx_reclaim_timer, sge_tx_reclaim_cb, + (unsigned long)sge); if (is_T2(sge->adapter)) { init_timer(&sge->espibug_timer); From 804dea920b66fa5813278fc55eaa5b2ae39ab110 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:24 +0530 Subject: [PATCH 0494/1322] drivers: net: bnx2x: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index c12b4d3e946e..54d1571384a0 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12414,10 +12414,8 @@ static int bnx2x_init_bp(struct bnx2x *bp) bp->current_interval = CHIP_REV_IS_SLOW(bp) ? 5*HZ : HZ; - init_timer(&bp->timer); + setup_timer(&bp->timer, bnx2x_timer, (unsigned long)bp); bp->timer.expires = jiffies + bp->current_interval; - bp->timer.data = (unsigned long) bp; - bp->timer.function = bnx2x_timer; if (SHMEM2_HAS(bp, dcbx_lldp_params_offset) && SHMEM2_HAS(bp, dcbx_lldp_dcbx_stat_offset) && From 19569c88b938145e90cc7d6ab45d7fa0edd07beb Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 22:35:25 +0530 Subject: [PATCH 0495/1322] drivers: net: lmc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/wan/lmc/lmc_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index 4698450c77d1..ae69d65158e6 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -1084,10 +1084,8 @@ static int lmc_open(struct net_device *dev) * Setup a timer for the watchdog on probe, and start it running. * Since lmc_ok == 0, it will be a NOP for now. */ - init_timer (&sc->timer); + setup_timer(&sc->timer, lmc_watchdog, (unsigned long)dev); sc->timer.expires = jiffies + HZ; - sc->timer.data = (unsigned long) dev; - sc->timer.function = lmc_watchdog; add_timer (&sc->timer); lmc_trace(dev, "lmc_open out"); From c8e1812960eeae42e2183154927028511c4bc566 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:45:36 +0300 Subject: [PATCH 0496/1322] net_sched: always reset qdisc backlog in qdisc_reset() SKB stored in qdisc->gso_skb also counted into backlog. Some qdiscs don't reset backlog to zero in ->reset(), for example sfq just dequeue and free all queued skb. Signed-off-by: Konstantin Khlebnikov Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..bf8c81e07c70 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -685,6 +685,7 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); From 21f4d5cc25ec0e6e8eb8420dd2c399e6d2fc7d14 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Sep 2017 15:46:11 +0300 Subject: [PATCH 0497/1322] net_sched/hfsc: fix curve activation in hfsc_change_class() If real-time or fair-share curves are enabled in hfsc_change_class() class isn't inserted into rb-trees yet. Thus init_ed() and init_vf() must be called in place of update_ed() and update_vf(). Remove isn't required because for now curves cannot be disabled. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. 
Miller --- net/sched/sch_hfsc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index daaf214e5201..3f88b75488b0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -958,6 +958,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } if (cl != NULL) { + int old_flags; + if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) @@ -978,6 +980,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_lock(sch); + old_flags = cl->cl_flags; + if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) @@ -986,10 +990,21 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { - if (cl->cl_flags & HFSC_RSC) - update_ed(cl, qdisc_peek_len(cl->qdisc)); - if (cl->cl_flags & HFSC_FSC) - update_vf(cl, 0, cur_time); + int len = qdisc_peek_len(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) { + if (old_flags & HFSC_RSC) + update_ed(cl, len); + else + init_ed(cl, len); + } + + if (cl->cl_flags & HFSC_FSC) { + if (old_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + else + init_vf(cl, len); + } } sch_tree_unlock(sch); From 95ec66968571bf0af0a22effdc1b9d9e62ea6630 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:56 -0700 Subject: [PATCH 0498/1322] samples/bpf: Use getppid instead of getpgrp for array map stress When cross-compiling the bpf sample map_perf_test for aarch64, I find that __NR_getpgrp is undefined. This causes build errors. This syscall is deprecated and requires defining __ARCH_WANT_SYSCALL_DEPRECATED. To avoid having to define that, just use a different syscall (getppid) for the array map stress test. Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/map_perf_test_kern.c | 2 +- samples/bpf/map_perf_test_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 098c857f1eda..2b2ffb97018b 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getpgrp") +SEC("kprobe/sys_getppid") int stress_array_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index f388254896f6..a0310fc70057 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -282,7 +282,7 @@ static void test_array_lookup(int cpu) start_time = time_get_ns(); for (i = 0; i < max_cnt; i++) - syscall(__NR_getpgrp, 0); + syscall(__NR_getppid, 0); printf("%d:array_lookup %lld lookups per sec\n", cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } From 876e88e3273e300895e308bd660c6cfaabb03cd5 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:57 -0700 Subject: [PATCH 0499/1322] samples/bpf: Enable cross compiler support When cross compiling, bpf samples use HOSTCC for compiling the non-BPF part of the sample, however what we really want is to use the cross compiler to build for the cross target since that is what will load and run the BPF sample. Detect this and compile samples correctly. Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index cf17c7932a6e..13f74b67ca44 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -177,6 +177,11 @@ HOSTLOADLIBES_syscall_tp += -lelf LLC ?= llc CLANG ?= clang +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +HOSTCC = $(CROSS_COMPILE)gcc +endif + # Trick to allow make to be run from this directory all: $(MAKE) -C ../../ $(CURDIR)/ From b655fc1c2ee1c2b2d07c0b6f432798b91202718c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:58 -0700 Subject: [PATCH 0500/1322] samples/bpf: Fix pt_regs issues when cross-compiling BPF samples fail to build when cross-compiling for ARM64 because of incorrect pt_regs param selection. This is because clang defines __x86_64__ and bpf_headers thinks we're building for x86. Since clang is building for the BPF target, it shouldn't make assumptions about what target the BPF program is going to run on. To fix this, lets pass ARCH so the header knows which target the BPF program is being compiled for and can use the correct pt_regs code. Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/Makefile | 2 +- tools/testing/selftests/bpf/bpf_helpers.h | 56 ++++++++++++++++++++--- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 13f74b67ca44..ebc2ad69b62c 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -230,7 +230,7 @@ $(obj)/%.o: $(src)/%.c $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ -I$(srctree)/tools/testing/selftests/bpf/ \ -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ - -Wno-compare-distinct-pointer-types \ + -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option \ diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 36fb9161b34a..4875395b0b52 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -109,7 +109,47 @@ static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = static int (*bpf_skb_change_head)(void *, int len, int flags) = (void *) BPF_FUNC_skb_change_head; +/* Scan the ARCH passed in from ARCH env variable (see Makefile) */ +#if defined(__TARGET_ARCH_x86) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_s930x) + #define bpf_target_s930x + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm64) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_mips) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__TARGET_ARCH_powerpc) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_sparc) + #define bpf_target_sparc + #define bpf_target_defined +#else + #undef bpf_target_defined +#endif + +/* Fall back to what the compiler says */ +#ifndef bpf_target_defined #if defined(__x86_64__) + #define bpf_target_x86 +#elif defined(__s390x__) + #define bpf_target_s930x +#elif defined(__aarch64__) + #define bpf_target_arm64 +#elif defined(__mips__) + #define bpf_target_mips +#elif defined(__powerpc__) + #define bpf_target_powerpc +#elif defined(__sparc__) + #define bpf_target_sparc +#endif +#endif + 
+#if defined(bpf_target_x86) #define PT_REGS_PARM1(x) ((x)->di) #define PT_REGS_PARM2(x) ((x)->si) @@ -122,7 +162,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->ip) -#elif defined(__s390x__) +#elif defined(bpf_target_s390x) #define PT_REGS_PARM1(x) ((x)->gprs[2]) #define PT_REGS_PARM2(x) ((x)->gprs[3]) @@ -135,7 +175,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_SP(x) ((x)->gprs[15]) #define PT_REGS_IP(x) ((x)->psw.addr) -#elif defined(__aarch64__) +#elif defined(bpf_target_arm64) #define PT_REGS_PARM1(x) ((x)->regs[0]) #define PT_REGS_PARM2(x) ((x)->regs[1]) @@ -148,7 +188,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->pc) -#elif defined(__mips__) +#elif defined(bpf_target_mips) #define PT_REGS_PARM1(x) ((x)->regs[4]) #define PT_REGS_PARM2(x) ((x)->regs[5]) @@ -161,7 +201,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_SP(x) ((x)->regs[29]) #define PT_REGS_IP(x) ((x)->cp0_epc) -#elif defined(__powerpc__) +#elif defined(bpf_target_powerpc) #define PT_REGS_PARM1(x) ((x)->gpr[3]) #define PT_REGS_PARM2(x) ((x)->gpr[4]) @@ -172,7 +212,7 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->nip) -#elif defined(__sparc__) +#elif defined(bpf_target_sparc) #define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) #define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) @@ -182,6 +222,8 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_RET(x) ((x)->u_regs[UREG_I7]) #define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) #define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) + +/* Should this also be a bpf_target check for the sparc case? */ #if defined(__arch64__) #define PT_REGS_IP(x) ((x)->tpc) #else @@ -190,10 +232,10 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #endif -#ifdef __powerpc__ +#ifdef bpf_target_powerpc #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP -#elif defined(__sparc__) +#elif bpf_target_sparc #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP #else From 8bf2ac25a96c69985a1a9fbbad7da22ae4343a38 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:59 -0700 Subject: [PATCH 0501/1322] samples/bpf: Add documentation on cross compilation Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/README.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index 79f9a58f1872..5f27e4faca50 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + +Cross compiling samples +----------------------- +In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH +environment variables before calling make. This will direct make to build +samples for the cross target. 
+ +export ARCH=arm64 +export CROSS_COMPILE="aarch64-linux-gnu-" +make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang From fe2502e49b58606580c77b3d84e42f946de182d8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 20 Sep 2017 09:18:45 -0700 Subject: [PATCH 0502/1322] net_sched: remove cls_flower idr on failure Fixes: c15ab236d69d ("net/sched: Change cls_flower to use IDR") Cc: Chris Mi Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a267e77c6de..d230cb4c8094 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -922,28 +922,28 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; - goto errout; + goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) - goto errout; + goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; - goto errout; + goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) - goto errout; + goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { @@ -952,7 +952,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &mask.key, fnew); if (err) - goto errout; + goto errout_idr; } if (!tc_in_hw(fnew->flags)) @@ -981,6 +981,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(tb); return 0; +errout_idr: + if (fnew->handle) + idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Sep 2017 18:26:53 +0200 Subject: [PATCH 0503/1322] net: avoid a full fib lookup when rp_filter is disabled. Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") a full fib lookup is needed even if the rp_filter is disabled, if accept_local is false - which is the default. What we really need in the above scenario is just checking that the source IP address is not local, and in most case we can do that is a cheaper way looking up the ifaddr hash table. This commit adds a helper for such lookup, and uses it to validate the src address when rp_filter is disabled and no 'local' routes are created by the user space in the relevant namespace. A new ipv4 netns flag is added to account for such routes. We need that to preserve the same behavior we had before this patch. It also drops the checks to bail early from __fib_validate_source, added by the commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") they do not give any measurable performance improvement: if we do the lookup with are on a slower path. This improves UDP performances for unconnected sockets when rp_filter is disabled by 5% and also gives small but measurable performance improvement for TCP flood scenarios. v1 -> v2: - use the ifaddr lookup helper in __ip_dev_find(), as suggested by Eric - fall-back to full lookup if custom local routes are present Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 30 ++++++++++++++++++------------ net/ipv4/fib_frontend.c | 22 +++++++++++++++++----- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index fb3f809e34e4..751d051f0bc7 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -179,6 +179,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20d061c805e3..20720721da4b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; + bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..7ce22a2c07ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f02819134ba2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -345,9 +345,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +399,26 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct 
in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* if no local routes are added from user space we can check + * for local addresses looking-up the ifaddr table + */ + if (net->ipv4.fib_has_custom_local_routes) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +769,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } From 0abfd494deefdbab66ac03c1181a614285e7d90c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 20 Sep 2017 12:28:05 -0400 Subject: [PATCH 0504/1322] net: dsa: use dedicated CPU port Each port in DSA has its own dedicated CPU port currently available in its parent switch's ds->ports[port].cpu_dp. Use it instead of getting the unique tree CPU port, which will be deprecated soon. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 4 ++-- drivers/net/dsa/bcm_sf2.c | 6 +++--- drivers/net/dsa/mt7530.c | 4 ++-- drivers/net/dsa/mv88e6060.c | 2 +- drivers/net/dsa/qca8k.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index a9f2a5b55a5e..d4ce092def83 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1336,7 +1336,7 @@ EXPORT_SYMBOL(b53_fdb_dump); int b53_br_join(struct dsa_switch *ds, int port, struct net_device *br) { struct b53_device *dev = ds->priv; - s8 cpu_port = ds->dst->cpu_dp->index; + s8 cpu_port = ds->ports[port].cpu_dp->index; u16 pvlan, reg; unsigned int i; @@ -1382,7 +1382,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br) { struct b53_device *dev = ds->priv; struct b53_vlan *vl = &dev->vlans[0]; - s8 cpu_port = ds->dst->cpu_dp->index; + s8 cpu_port = ds->ports[port].cpu_dp->index; unsigned int i; u16 pvlan, reg, pvid; diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 0072a959db5b..898d5642b516 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -661,7 +661,7 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds) static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port, struct ethtool_wolinfo *wol) { - struct net_device *p = ds->dst->cpu_dp->netdev; + struct net_device *p = ds->ports[port].cpu_dp->netdev; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); struct ethtool_wolinfo pwol; @@ -684,9 +684,9 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port, static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port, struct ethtool_wolinfo *wol) { - struct net_device *p = ds->dst->cpu_dp->netdev; + struct net_device *p = ds->ports[port].cpu_dp->netdev; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - s8 cpu_port = ds->dst->cpu_dp->index; + s8 cpu_port = ds->ports[port].cpu_dp->index; struct ethtool_wolinfo pwol; p->ethtool_ops->get_wol(p, &pwol); diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index c142b97add2c..faa3b88d2206 
100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -928,11 +928,11 @@ mt7530_setup(struct dsa_switch *ds) struct device_node *dn; struct mt7530_dummy_poll p; - /* The parent node of cpu_dp->netdev which holds the common system + /* The parent node of master netdev which holds the common system * controller also is the container for two GMACs nodes representing * as two netdev instances. */ - dn = ds->dst->cpu_dp->netdev->dev.of_node->parent; + dn = ds->ports[MT7530_CPU_PORT].netdev->dev.of_node->parent; priv->ethernet = syscon_node_to_regmap(dn); if (IS_ERR(priv->ethernet)) return PTR_ERR(priv->ethernet); diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index dce7fa57eb55..621cdc46ad81 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -176,7 +176,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) ((p & 0xf) << PORT_VLAN_MAP_DBNUM_SHIFT) | (dsa_is_cpu_port(ds, p) ? ds->enabled_port_mask : - BIT(ds->dst->cpu_dp->index))); + BIT(ds->ports[p].cpu_dp->index))); /* Port Association Vector: when learning source addresses * of packets, add the address to the address database using diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 5ada7a41449c..82f09711ac1a 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -506,7 +506,7 @@ qca8k_setup(struct dsa_switch *ds) pr_warn("regmap initialization failed"); /* Initialize CPU port pad mode (xMII type, delays...) */ - phy_mode = of_get_phy_mode(ds->dst->cpu_dp->dn); + phy_mode = of_get_phy_mode(ds->ports[QCA8K_CPU_PORT].dn); if (phy_mode < 0) { pr_err("Can't find phy-mode for master device\n"); return phy_mode; From 0ab09befdbb7ca9b969d6206108629ddff43876e Mon Sep 17 00:00:00 2001 From: Alex Ng Date: Wed, 20 Sep 2017 11:17:35 -0700 Subject: [PATCH 0505/1322] hv_netvsc: fix send buffer failure on MTU change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If MTU is changed the host would reject the send buffer change. This problem is result of recent change to allow changing send buffer size. Every time we change the MTU, we store the previous net_device section count before destroying the buffer, but we don’t store the previous section size. When we reinitialize the buffer, its size is calculated by multiplying the previous count and previous size. Since we continuously increase the MTU, the host returns us a decreasing count value while the section size is reinitialized to 1728 bytes every time. This eventually leads to a condition where the calculated buf_size is so small that the host rejects it. Fixes: 8b5327975ae1 ("netvsc: allow controlling send/recv buffer size") Signed-off-by: Alex Ng Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc.c | 7 ++----- drivers/net/hyperv/netvsc_drv.c | 8 ++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d98cdfb1536b..5176be76ca7d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -150,6 +150,8 @@ struct netvsc_device_info { u32 num_chn; u32 send_sections; u32 recv_sections; + u32 send_section_size; + u32 recv_section_size; }; enum rndis_device_state { diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index a5511b7326af..8d5077fb0492 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -76,9 +76,6 @@ static struct netvsc_device *alloc_net_device(void) net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT; net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT; - net_device->recv_section_size = NETVSC_RECV_SECTION_SIZE; - net_device->send_section_size = NETVSC_SEND_SECTION_SIZE; - init_completion(&net_device->channel_init_wait); init_waitqueue_head(&net_device->subchan_open); INIT_WORK(&net_device->subchan_work, rndis_set_subchannel); @@ -262,7 +259,7 @@ static int netvsc_init_buf(struct hv_device *device, int ret = 0; /* Get receive buffer area. */ - buf_size = device_info->recv_sections * net_device->recv_section_size; + buf_size = device_info->recv_sections * device_info->recv_section_size; buf_size = roundup(buf_size, PAGE_SIZE); net_device->recv_buf = vzalloc(buf_size); @@ -344,7 +341,7 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; /* Now setup the send buffer. */ - buf_size = device_info->send_sections * net_device->send_section_size; + buf_size = device_info->send_sections * device_info->send_section_size; buf_size = round_up(buf_size, PAGE_SIZE); net_device->send_buf = vzalloc(buf_size); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d4902ee5f260..a32ae02e1b6c 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -848,7 +848,9 @@ static int netvsc_set_channels(struct net_device *net, device_info.num_chn = count; device_info.ring_size = ring_size; device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(dev, nvdev); @@ -963,7 +965,9 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) device_info.ring_size = ring_size; device_info.num_chn = nvdev->num_chn; device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(hdev, nvdev); @@ -1485,7 +1489,9 @@ static int netvsc_set_ringparam(struct net_device *ndev, device_info.num_chn = nvdev->num_chn; device_info.ring_size = ring_size; device_info.send_sections = new_tx; + device_info.send_section_size = nvdev->send_section_size; device_info.recv_sections = new_rx; + device_info.recv_section_size = nvdev->recv_section_size; netif_device_detach(ndev); was_opened = rndis_filter_opened(nvdev); @@ -1934,7 +1940,9 @@ static int netvsc_probe(struct hv_device *dev, device_info.ring_size = ring_size; device_info.num_chn = VRSS_CHANNEL_DEFAULT; device_info.send_sections = NETVSC_DEFAULT_TX; + device_info.send_section_size = 
NETVSC_SEND_SECTION_SIZE; device_info.recv_sections = NETVSC_DEFAULT_RX; + device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE; nvdev = rndis_filter_device_add(dev, &device_info); if (IS_ERR(nvdev)) { From 4a7a3860caac1a8779e8c459d8abe21b111798d6 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Wed, 20 Sep 2017 15:32:53 -0500 Subject: [PATCH 0506/1322] net: qcom/emac: add software control for pause frame mode The EMAC has the option of sending only a single pause frame when flow control is enabled and the RX queue is full. Although sending only one pause frame has little value, this would allow admins to enable automatic flow control without having to worry about the EMAC flooding nearby switches with pause frames if the kernel hangs. The option is enabled by using the single-pause-mode private flag. Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- .../net/ethernet/qualcomm/emac/emac-ethtool.c | 30 +++++++++++++++++++ drivers/net/ethernet/qualcomm/emac/emac-mac.c | 22 ++++++++++++++ drivers/net/ethernet/qualcomm/emac/emac.c | 3 ++ drivers/net/ethernet/qualcomm/emac/emac.h | 3 ++ 4 files changed, 58 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c index bbe24639aa5a..c8c6231b87f3 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c @@ -88,6 +88,8 @@ static void emac_set_msglevel(struct net_device *netdev, u32 data) static int emac_get_sset_count(struct net_device *netdev, int sset) { switch (sset) { + case ETH_SS_PRIV_FLAGS: + return 1; case ETH_SS_STATS: return EMAC_STATS_LEN; default: @@ -100,6 +102,10 @@ static void emac_get_strings(struct net_device *netdev, u32 stringset, u8 *data) unsigned int i; switch (stringset) { + case ETH_SS_PRIV_FLAGS: + strcpy(data, "single-pause-mode"); + break; + case ETH_SS_STATS: for (i = 0; i < EMAC_STATS_LEN; i++) { strlcpy(data, emac_ethtool_stat_strings[i], @@ -230,6 +236,27 @@ static int emac_get_regs_len(struct net_device *netdev) return EMAC_MAX_REG_SIZE * sizeof(u32); } +#define EMAC_PRIV_ENABLE_SINGLE_PAUSE BIT(0) + +static int emac_set_priv_flags(struct net_device *netdev, u32 flags) +{ + struct emac_adapter *adpt = netdev_priv(netdev); + + adpt->single_pause_mode = !!(flags & EMAC_PRIV_ENABLE_SINGLE_PAUSE); + + if (netif_running(netdev)) + return emac_reinit_locked(adpt); + + return 0; +} + +static u32 emac_get_priv_flags(struct net_device *netdev) +{ + struct emac_adapter *adpt = netdev_priv(netdev); + + return adpt->single_pause_mode ? EMAC_PRIV_ENABLE_SINGLE_PAUSE : 0; +} + static const struct ethtool_ops emac_ethtool_ops = { .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, @@ -253,6 +280,9 @@ static const struct ethtool_ops emac_ethtool_ops = { .get_regs_len = emac_get_regs_len, .get_regs = emac_get_regs, + + .set_priv_flags = emac_set_priv_flags, + .get_priv_flags = emac_get_priv_flags, }; void emac_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index bcd4708b3745..0ea3ca09c689 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -551,6 +551,28 @@ static void emac_mac_start(struct emac_adapter *adpt) mac &= ~(HUGEN | VLAN_STRIP | TPAUSE | SIMR | HUGE | MULTI_ALL | DEBUG_MODE | SINGLE_PAUSE_MODE); + /* Enable single-pause-frame mode if requested. 
+ * + * If enabled, the EMAC will send a single pause frame when the RX + * queue is full. This normally leads to packet loss because + * the pause frame disables the remote MAC only for 33ms (the quanta), + * and then the remote MAC continues sending packets even though + * the RX queue is still full. + * + * If disabled, the EMAC sends a pause frame every 31ms until the RX + * queue is no longer full. Normally, this is the preferred + * method of operation. However, when the system is hung (e.g. + * cores are halted), the EMAC interrupt handler is never called + * and so the RX queue fills up quickly and stays full. The resuling + * non-stop "flood" of pause frames sometimes has the effect of + * disabling nearby switches. In some cases, other nearby switches + * are also affected, shutting down the entire network. + * + * The user can enable or disable single-pause-frame mode + * via ethtool. + */ + mac |= adpt->single_pause_mode ? SINGLE_PAUSE_MODE : 0; + writel_relaxed(csr1, adpt->csr + EMAC_EMAC_WRAPPER_CSR1); writel_relaxed(mac, adpt->base + EMAC_MAC_CTRL); diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 60850bfa3d32..759543512117 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -443,6 +443,9 @@ static void emac_init_adapter(struct emac_adapter *adpt) /* default to automatic flow control */ adpt->automatic = true; + + /* Disable single-pause-frame mode by default */ + adpt->single_pause_mode = false; } /* Get the clock */ diff --git a/drivers/net/ethernet/qualcomm/emac/emac.h b/drivers/net/ethernet/qualcomm/emac/emac.h index 8ee4ec6aef2e..d7c9f44209d4 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.h +++ b/drivers/net/ethernet/qualcomm/emac/emac.h @@ -363,6 +363,9 @@ struct emac_adapter { bool tx_flow_control; bool rx_flow_control; + /* True == use single-pause-frame mode. */ + bool single_pause_mode; + /* Ring parameter */ u8 tpd_burst; u8 rfd_burst; From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:13 -0700 Subject: [PATCH 0507/1322] net: ethtool: Add back transceiver type Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") deprecated the ethtool_cmd::transceiver field, which was fine in premise, except that the PHY library was actually using it to report the type of transceiver: internal or external. Use the first word of the reserved field to put this __u8 transceiver field back in. It is made read-only, and we don't expect the ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this is mostly for the legacy path where we do: ethtool_get_settings() -> dev->ethtool_ops->get_link_ksettings() -> convert_link_ksettings_to_legacy_settings() to have no information loss compared to the legacy get_settings API. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++++- net/core/ethtool.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c041dae8e2c..5bd1b1de4ea0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1753,6 +1753,8 @@ enum ethtool_reset_flags { * %ethtool_link_mode_bit_indices for the link modes, and other * link features that the link partner advertised through * autonegotiation; 0 if unknown or not applicable. 
Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. * * If autonegotiation is disabled, the speed and @duplex represent the * fixed link mode and are writable if the driver supports multiple @@ -1804,7 +1806,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix; __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; - __u32 reserved[8]; + __u8 transceiver; + __u8 reserved1[3]; + __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } From ceb628134a75564d7bfa8e4ef902e6e588339e11 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:14 -0700 Subject: [PATCH 0508/1322] net: phy: Keep reporting transceiver type With commit 2d55173e71b0 ("phy: add generic function to support ksetting support"), we lost the ability to report the transceiver type like we used to. Now that we have added back the transceiver type to ethtool_link_settings, we can report it back like we used to and have no loss of information. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Fixes: 2d55173e71b0 ("phy: add generic function to support ksetting support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e842d2cd1ee7..2b1e67bc1e73 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -373,7 +373,8 @@ void phy_ethtool_ksettings_get(struct phy_device *phydev, cmd->base.port = PORT_BNC; else cmd->base.port = PORT_MII; - + cmd->base.transceiver = phy_is_internal(phydev) ? + XCVR_INTERNAL : XCVR_EXTERNAL; cmd->base.phy_address = phydev->mdio.addr; cmd->base.autoneg = phydev->autoneg; cmd->base.eth_tp_mdix_ctrl = phydev->mdix_ctrl; From 8a7ffeb795f864dd605b579c05934cba95dc8ad3 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:36 +0530 Subject: [PATCH 0509/1322] lan78xx: Fix for eeprom read/write when device auto suspend Fix for eeprom read/write when device auto suspend Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. 
Miller --- drivers/net/usb/lan78xx.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index b99a7fb09f8e..fcf85ae37435 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1265,30 +1265,46 @@ static int lan78xx_ethtool_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, u8 *data) { struct lan78xx_net *dev = netdev_priv(netdev); + int ret; + + ret = usb_autopm_get_interface(dev->intf); + if (ret) + return ret; ee->magic = LAN78XX_EEPROM_MAGIC; - return lan78xx_read_raw_eeprom(dev, ee->offset, ee->len, data); + ret = lan78xx_read_raw_eeprom(dev, ee->offset, ee->len, data); + + usb_autopm_put_interface(dev->intf); + + return ret; } static int lan78xx_ethtool_set_eeprom(struct net_device *netdev, struct ethtool_eeprom *ee, u8 *data) { struct lan78xx_net *dev = netdev_priv(netdev); + int ret; + + ret = usb_autopm_get_interface(dev->intf); + if (ret) + return ret; /* Allow entire eeprom update only */ if ((ee->magic == LAN78XX_EEPROM_MAGIC) && (ee->offset == 0) && (ee->len == 512) && (data[0] == EEPROM_INDICATOR)) - return lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); + ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); else if ((ee->magic == LAN78XX_OTP_MAGIC) && (ee->offset == 0) && (ee->len == 512) && (data[0] == OTP_INDICATOR_1)) - return lan78xx_write_raw_otp(dev, ee->offset, ee->len, data); + ret = lan78xx_write_raw_otp(dev, ee->offset, ee->len, data); - return -EINVAL; + usb_autopm_put_interface(dev->intf); + + return ret; } static void lan78xx_get_strings(struct net_device *netdev, u32 stringset, From c077682282dd34c75e8477c22dffe7c9aebc6e98 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:37 +0530 Subject: [PATCH 0510/1322] lan78xx: Allow EEPROM write for less than MAX_EEPROM_SIZE Allow EEPROM write for less than MAX_EEPROM_SIZE Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index fcf85ae37435..f8c63eec8353 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1290,11 +1290,10 @@ static int lan78xx_ethtool_set_eeprom(struct net_device *netdev, if (ret) return ret; - /* Allow entire eeprom update only */ - if ((ee->magic == LAN78XX_EEPROM_MAGIC) && - (ee->offset == 0) && - (ee->len == 512) && - (data[0] == EEPROM_INDICATOR)) + /* Invalid EEPROM_INDICATOR at offset zero will result in a failure + * to load data from EEPROM + */ + if (ee->magic == LAN78XX_EEPROM_MAGIC) ret = lan78xx_write_raw_eeprom(dev, ee->offset, ee->len, data); else if ((ee->magic == LAN78XX_OTP_MAGIC) && (ee->offset == 0) && From e365280521029c9366bab038915274ddaa1b7195 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Thu, 21 Sep 2017 02:36:38 +0530 Subject: [PATCH 0511/1322] lan78xx: Use default values loaded from EEPROM/OTP after reset Use default value of auto duplex and auto speed values loaded from EEPROM/OTP after reset. The LAN78xx allows platform configurations to be loaded from EEPROM/OTP. Ex: When external phy is connected, the MAC can be configured to have correct auto speed, auto duplex, auto polarity configured from the EEPROM/OTP. 
Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index f8c63eec8353..0161f77641fa 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2449,7 +2449,6 @@ static int lan78xx_reset(struct lan78xx_net *dev) /* LAN7801 only has RGMII mode */ if (dev->chipid == ID_REV_CHIP_ID_7801_) buf &= ~MAC_CR_GMII_EN_; - buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_; ret = lan78xx_write_reg(dev, MAC_CR, buf); ret = lan78xx_read_reg(dev, MAC_TX, &buf); From 4fa7b718881a5358286903b796968cea80993820 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 20 Sep 2017 19:31:57 -0400 Subject: [PATCH 0512/1322] net: dsa: better scoping of slave functions A few DSA slave functions take a dsa_slave_priv pointer as first argument, whereas the scope of the slave.c functions is the slave net_device structure. Fix this and rename dsa_netpoll_send_skb to dsa_slave_netpoll_send_skb. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6fc9eb094267..4a98942a6135 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -385,10 +385,12 @@ static int dsa_slave_port_attr_get(struct net_device *dev, return 0; } -static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p, - struct sk_buff *skb) +static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, + struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER + struct dsa_slave_priv *p = netdev_priv(dev); + if (p->netpoll) netpoll_send_skb(p->netpoll, skb); #else @@ -422,7 +424,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) - return dsa_netpoll_send_skb(p, nskb); + return dsa_slave_netpoll_send_skb(dev, nskb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType @@ -736,9 +738,9 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev, } static struct dsa_mall_tc_entry * -dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p, - unsigned long cookie) +dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { + struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) @@ -820,7 +822,7 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, if (!ds->ops->port_mirror_del) return; - mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie); + mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; @@ -1027,10 +1029,9 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, } /* slave device setup *******************************************************/ -static int dsa_slave_phy_connect(struct dsa_slave_priv *p, - struct net_device *slave_dev, - int addr) +static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) { + struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); @@ -1046,9 +1047,9 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p, p->phy_interface); } -static int 
dsa_slave_phy_setup(struct dsa_slave_priv *p, - struct net_device *slave_dev) +static int dsa_slave_phy_setup(struct net_device *slave_dev) { + struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; @@ -1088,7 +1089,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ if (!phy_is_fixed && phy_id >= 0 && (ds->phys_mii_mask & (1 << phy_id))) { - ret = dsa_slave_phy_connect(p, slave_dev, phy_id); + ret = dsa_slave_phy_connect(slave_dev, phy_id); if (ret) { netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); of_node_put(phy_dn); @@ -1111,7 +1112,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, * MDIO bus instead */ if (!p->phy) { - ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index); + ret = dsa_slave_phy_connect(slave_dev, p->dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->dp->index, ret); @@ -1233,7 +1234,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) netif_carrier_off(slave_dev); - ret = dsa_slave_phy_setup(p, slave_dev); + ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); unregister_netdev(slave_dev); From de40fc5d210f2c31f2c25a9b920861276a71b70d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 20 Sep 2017 19:32:14 -0400 Subject: [PATCH 0513/1322] net: dsa: add port fdb dump Dumping a DSA port's FDB entries is not specific to a DSA slave, so add a dsa_port_fdb_dump function, similarly to dsa_port_fdb_add and dsa_port_fdb_del. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 + net/dsa/port.c | 11 +++++++++++ net/dsa/slave.c | 9 ++------- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f616b3444418..9803952a5b40 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -128,6 +128,7 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); diff --git a/net/dsa/port.c b/net/dsa/port.c index 659676ba3f8b..76d43a82d397 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -173,6 +173,17 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_fdb_dump) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_dump(ds, port, cb, data); +} + int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4a98942a6135..02ace7d462c4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -263,16 +263,11 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, }; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; int err; - if (!ds->ops->port_fdb_dump) - return -EOPNOTSUPP; - - err = ds->ops->port_fdb_dump(ds, dp->index, - dsa_slave_port_fdb_do_dump, - &dump); + err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, 
&dump); *idx = dump.idx; + return err; } From f0ef1f4f2b772c0a1c8b35a6ae3edf974cc110dd Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 08:24:27 +0200 Subject: [PATCH 0514/1322] net: stmmac: Cocci spatch "of_table" Make sure (of/i2c/platform)_device_id tables are NULL terminated. Found by coccinelle spatch "misc/of_table.cocci" Signed-off-by: Thomas Meyer Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index a366b3747eeb..8a280b48e3a9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -315,6 +315,7 @@ static int stmmac_dt_phy(struct plat_stmmacenet_data *plat, { .compatible = "allwinner,sun8i-h3-emac" }, { .compatible = "allwinner,sun8i-v3s-emac" }, { .compatible = "allwinner,sun50i-a64-emac" }, + {}, }; /* If phy-handle property is passed from DT, use it as the PHY */ From b6cd4b5895848968e8fee93fc5e3dc8babc40b9e Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 08:15:26 +0200 Subject: [PATCH 0515/1322] e100: Cocci spatch "pool_zalloc-simple" Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0. Found by coccinelle spatch "api/alloc/pool_zalloc-simple.cocci" Signed-off-by: Thomas Meyer Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e100.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 4d10270ddf8f..184f11242f56 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -1910,11 +1910,10 @@ static int e100_alloc_cbs(struct nic *nic) nic->cb_to_use = nic->cb_to_send = nic->cb_to_clean = NULL; nic->cbs_avail = 0; - nic->cbs = pci_pool_alloc(nic->cbs_pool, GFP_KERNEL, - &nic->cbs_dma_addr); + nic->cbs = pci_pool_zalloc(nic->cbs_pool, GFP_KERNEL, + &nic->cbs_dma_addr); if (!nic->cbs) return -ENOMEM; - memset(nic->cbs, 0, count * sizeof(struct cb)); for (cb = nic->cbs, i = 0; i < count; cb++, i++) { cb->next = (i + 1 < count) ? cb + 1 : nic->cbs; From 09579ac803a3638344b8544b5940793d5358673e Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:26 +0200 Subject: [PATCH 0516/1322] net/smc: add missing dev_put In the infiniband part, SMC currently uses get_netdev which calls dev_hold on the returned net device. However, the SMC code never calls dev_put on that net device resulting in a wrong reference count. This patch adds a dev_put after the usage of the net device to fix the issue. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_ib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 547e0e113b17..0b5852299158 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); if (ndev) { memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + dev_put(ndev); } else if (!rc) { memcpy(&smcibdev->mac[ibport - 1][0], &smcibdev->gid[ibport - 1].raw[8], 3); From 846e344eb7229018457d6d6fc1ab0cc0a167692f Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 21 Sep 2017 09:16:27 +0200 Subject: [PATCH 0517/1322] net/smc: add receive timeout check The SMC receive function currently lacks a timeout check under the condition that no data were received and no data are available. This patch adds such a check. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_rx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b17a333e9bb0..3e631ae4b6b6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -148,6 +148,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, read_done = sock_intr_errno(timeo); break; } + if (!timeo) + return -EAGAIN; } if (!atomic_read(&conn->bytes_to_rcv)) { From 731b008560e6dfaf5fb297543f17bbe9bb868f3c Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:28 +0200 Subject: [PATCH 0518/1322] net/smc: take RCU read lock for routing cache lookup smc_netinfo_by_tcpsk() looks up the routing cache. Such a lookup requires protection by an RCU read lock. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..2e8d2dabac0c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, u8 *prefix_len) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; int len; @@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); /* analyze IPv4 specific data of net_device belonging to TCP socket */ - for_ifa(dst->dev->ip_ptr) { - if (ifa->ifa_address != addr.sin_addr.s_addr) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) continue; *prefix_len = inet_mask_len(ifa->ifa_mask); *subnet = ifa->ifa_address & ifa->ifa_mask; rc = 0; break; - } endfor_ifa(dst->dev->ip_ptr); + } endfor_ifa(in_dev); + rcu_read_unlock(); out_rel: dst_release(dst); From a6832c3acdb2ceb099ec3c385777fbaa6d5a5fd6 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:29 +0200 Subject: [PATCH 0519/1322] net/smc: adjust net_device refcount smc_pnet_fill_entry() uses dev_get_by_name() adding a refcount to ndev. The following smc_pnet_enter() has to reduce the refcount if the entry to be added exists already in the pnet table. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_pnet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 78f7af28ae4f..31f8453c25c5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -181,8 +181,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) sizeof(new_pnetelem->ndev->name)) || smc_pnet_same_ibname(pnetelem, new_pnetelem->smcibdev->ibdev->name, - new_pnetelem->ib_port)) + new_pnetelem->ib_port)) { + dev_put(pnetelem->ndev); goto found; + } } list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); rc = 0; From 8301fa44b41b1ca46a0547809eb173112559dc51 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:30 +0200 Subject: [PATCH 0520/1322] net/smc: adapt send request completion notification The solicited flag is meaningful for the receive completion queue. Ask for next work completion of any type on the send queue. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_wr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab56bda66783..525d91e0d57e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -244,7 +244,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) int rc; ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); From 5bc11ddbdf7fc6681db5c3f9a92cdee0f19cee1e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:31 +0200 Subject: [PATCH 0521/1322] net/smc: longer delay for client link group removal Client link group creation always follows the server linkgroup creation. If peer creates a new server link group, client has to create a new client link group. If peer reuses a server link group for a new connection, client has to reuse its client link group as well. This patch introduces a longer delay for client link group removal to make sure this link group still exists, once the peer decides to reuse a server link group. This avoids out-of-sync conditions for link groups. If already scheduled, modify the delay. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1a16d51e2330..20b66e79c5d6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,8 +25,9 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_LGR_NUM_INCR 256 -#define SMC_LGR_FREE_DELAY (600 * HZ) +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ @@ -107,8 +108,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (reduced && !lgr->conns_num) - schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); + if (!reduced || lgr->conns_num) + return; + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? 
SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) From bfbedfd38378c1ad9df84469403d69c17b074b66 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:32 +0200 Subject: [PATCH 0522/1322] net/smc: terminate link group if out-of-sync is received An out-of-sync condition can just be detected by the client. If the server receives a CLC DECLINE message indicating an out-of-sync condition for the link groups, the server must clean up the out-of-sync link group. There is no need for an extra third parameter in smc_clc_send_decline(). Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 ++---- net/smc/smc_clc.c | 10 +++++----- net/smc/smc_clc.h | 3 +-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2e8d2dabac0c..745f145d4c4d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -513,7 +513,7 @@ decline_rdma: /* RDMA setup failed, switch back to TCP */ smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code, 0); + rc = smc_clc_send_decline(smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } @@ -808,8 +808,6 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ goto decline_rdma; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -903,7 +901,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code, 0); + rc = smc_clc_send_decline(new_smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3934913ab835..b7dd2743fb5c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -95,9 +95,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; - if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) - == SMC_CLC_DECL_SYNCERR) + if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = true; + smc_lgr_terminate(smc->conn.lgr); + } } out: @@ -105,8 +106,7 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) { struct smc_clc_msg_decline dclc; struct msghdr msg; @@ -118,7 +118,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = out_of_sync ? 1 : 0; + dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 
1 : 0; memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 13db8ce177c9..1c55414041d4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -106,8 +106,7 @@ struct smc_ib_device; int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport); int smc_clc_send_confirm(struct smc_sock *smc); From 18e537cd58e8d6932719bfa79cb96a1fbc639199 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:33 +0200 Subject: [PATCH 0523/1322] net/smc: introduce a delay The number of outstanding work requests is limited. If all work requests are in use, tx processing is postponed to another scheduling of the tx worker. Switch to a delayed worker to have a gap for tx completion queue events before the next retry. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 2 +- net/smc/smc_close.c | 12 +++++++----- net/smc/smc_tx.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 6e44313e4467..0ccd6fa387ad 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -149,7 +149,7 @@ struct smc_connection { atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ - struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. 
TCP rcv_nxt diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 3c2e166b5d22..5201bc103bd8 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -208,7 +208,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ @@ -234,7 +234,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_err != ECONNABORTED) { /* confirm close from peer */ @@ -263,7 +263,9 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - cancel_work_sync(&conn->tx_work); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; smc_close_wait_tx_pends(smc); @@ -425,7 +427,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* send close wr request */ rc = smc_close_wr(conn); @@ -439,7 +441,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* confirm close from peer */ rc = smc_close_wr(conn); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..3866573288dd 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,8 @@ #include "smc_cdc.h" #include "smc_tx.h" +#define SMC_TX_WORK_DELAY HZ + /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() @@ -406,7 +408,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -430,7 +433,7 @@ out_unlock: */ static void smc_tx_work(struct work_struct *work) { - struct smc_connection *conn = container_of(work, + struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -468,7 +471,8 @@ void smc_tx_consumer_update(struct smc_connection *conn) if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); return; } smc_curs_write(&conn->rx_curs_confirmed, @@ -487,6 +491,6 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); spin_lock_init(&smc->conn.send_lock); } From 8c96feeeb39ba0b89c6121f71e8f7aa2a4d46671 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:16:34 +0200 Subject: [PATCH 0524/1322] net/smc: no close wait in case of process shut down Usually socket closing is delayed if there is still data available in the send buffer to be transmitted. If a process is killed, the delay should be avoided. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_close.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 5201bc103bd8..f0d16fb825f7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -174,15 +174,15 @@ int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -413,13 +413,14 @@ void smc_close_sock_put_work(struct work_struct *work) int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; From 51957bc53aa7ca60d63f1ba0d9e3d887562b1723 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Sep 2017 09:17:34 +0200 Subject: [PATCH 0525/1322] net/smc: parameter cleanup in smc_cdc_get_free_slot() Use the smc_connection as first parameter with smc_cdc_get_free_slot(). This is just a small code cleanup, no functional change. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 7 ++++--- net/smc/smc_cdc.h | 3 ++- net/smc/smc_tx.c | 6 ++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a7294edbc221..5ef97e5a5f78 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -62,10 +62,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, bh_unlock_sock(&smc->sk); } -int smc_cdc_get_free_slot(struct smc_link *link, +int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, struct smc_cdc_tx_pend **pend) { + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, (struct smc_wr_tx_pend_priv **)pend); } @@ -118,8 +120,7 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, - &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc) return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 8e1d76f26007..56f883d1159c 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -206,7 +206,8 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, struct smc_cdc_tx_pend; -int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf, +int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_wr_buf **wr_buf, struct smc_cdc_tx_pend **pend); void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..e2228f6d1c25 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -394,8 +394,7 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) int rc; 
spin_lock_bh(&conn->send_lock); - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, - &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -463,8 +462,7 @@ void smc_tx_consumer_update(struct smc_connection *conn) ((to_confirm > conn->rmbe_update_limit) && ((to_confirm > (conn->rmbe_size / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], - &wr_buf, &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { From e1f6198e221f472c03b88e352432a01076ec8647 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Thu, 21 Sep 2017 12:50:47 +0530 Subject: [PATCH 0526/1322] cxgb4: avoid stall while shutting down the adapter do not wait for completion while deleting the filters when the adapter is shutting down because we may not get the response as interrupts will be disabled. Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 7 ++++++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index ea72d2d2e1b4..c4e997fdff64 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -549,6 +549,7 @@ enum { /* adapter flags */ MASTER_PF = (1 << 7), FW_OFLD_CONN = (1 << 9), ROOT_NO_RELAXED_ORDERING = (1 << 10), + SHUTTING_DOWN = (1 << 11), }; enum { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 45b5853ca2f1..97ead2c66751 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -191,7 +191,8 @@ static int del_filter_wr(struct adapter *adapter, int fidx) return -ENOMEM; fwr = __skb_put(skb, len); - t4_mk_filtdelwr(f->tid, fwr, adapter->sge.fw_evtq.abs_id); + t4_mk_filtdelwr(f->tid, fwr, (adapter->flags & SHUTTING_DOWN) ? -1 + : adapter->sge.fw_evtq.abs_id); /* Mark the filter as "pending" and ship off the Filter Work Request. * When we get the Work Request Reply we'll clear the pending status. 
@@ -636,6 +637,10 @@ int cxgb4_del_filter(struct net_device *dev, int filter_id) struct filter_ctx ctx; int ret; + /* If we are shutting down the adapter do not wait for completion */ + if (netdev2adap(dev)->flags & SHUTTING_DOWN) + return __cxgb4_del_filter(dev, filter_id, NULL); + init_completion(&ctx.completion); ret = __cxgb4_del_filter(dev, filter_id, &ctx); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 92d9d795d874..5fe81a4e26a6 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5254,6 +5254,8 @@ static void remove_one(struct pci_dev *pdev) return; } + adapter->flags |= SHUTTING_DOWN; + if (adapter->pf == 4) { int i; @@ -5339,6 +5341,8 @@ static void shutdown_one(struct pci_dev *pdev) return; } + adapter->flags |= SHUTTING_DOWN; + if (adapter->pf == 4) { int i; From 8701352b22a2f82c814283131587e970a56b69a8 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Thu, 21 Sep 2017 12:05:25 +0200 Subject: [PATCH 0527/1322] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl Currently, there is a difference in netlink events received when an interface is modified through bridge ioctl() or through netlink. This patch generates additional events when an interface is added to or removed from a bridge via ioctl(). When adding then removing an interface from a bridge with netlink, we get: 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 Deleted 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff When using ioctl(): 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 Deleted 5: dummy1: mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff Without this patch, the last netlink notification is not sent. Signed-off-by: Vincent Bernat Reviewed-by: Stephen Hemminger Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/bridge/br_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..66cd98772051 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) else ret = br_del_if(br, dev); + if (!ret) + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + return ret; } From 2ed343f98178ff232b504bbd006b34db07835082 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 16:29:33 +0530 Subject: [PATCH 0528/1322] net:nfc: use setup_timer Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/nfc/core.c b/net/nfc/core.c index 5cf33df888c3..e5e23c2cbe74 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1094,9 +1094,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->targets_generation = 1; if (ops->check_presence) { - init_timer(&dev->check_pres_timer); - dev->check_pres_timer.data = (unsigned long)dev; - dev->check_pres_timer.function = nfc_check_pres_timeout; + setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout, + (unsigned long)dev); INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } From 802be571348681a8ee052850e47b9652de7e05d4 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 18:17:55 +0530 Subject: [PATCH 0529/1322] net: wan : hdlc: use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/wan/hdlc_fr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index 78596e42a3f3..425a47ffed25 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1003,11 +1003,10 @@ static void fr_start(struct net_device *dev) state(hdlc)->n391cnt = 0; state(hdlc)->txseq = state(hdlc)->rxseq = 0; - init_timer(&state(hdlc)->timer); + setup_timer(&state(hdlc)->timer, fr_timer, + (unsigned long)dev); /* First poll after 1 s */ state(hdlc)->timer.expires = jiffies + HZ; - state(hdlc)->timer.function = fr_timer; - state(hdlc)->timer.data = (unsigned long)dev; add_timer(&state(hdlc)->timer); } else fr_set_link_state(1, dev); From e53a84b2b47119cbb895a4a83e6675ec50cf9fe5 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 18:24:15 +0530 Subject: [PATCH 0530/1322] net: usb: catc: use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- drivers/net/usb/catc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index dbc90313f472..aeb62e17d19d 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -805,9 +805,7 @@ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id spin_lock_init(&catc->tx_lock); spin_lock_init(&catc->ctrl_lock); - init_timer(&catc->timer); - catc->timer.data = (long) catc; - catc->timer.function = catc_stats_timer; + setup_timer(&catc->timer, catc_stats_timer, (long)catc); catc->ctrl_urb = usb_alloc_urb(0, GFP_KERNEL); catc->tx_urb = usb_alloc_urb(0, GFP_KERNEL); From 8447779637172809060e5064afdf52f16a09aa13 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Thu, 21 Sep 2017 18:32:58 +0530 Subject: [PATCH 0531/1322] net: ti: netcp: use setup_timer Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_ethss.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index 28cb38af1a34..4ad821655e51 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -3616,9 +3616,8 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev, } spin_unlock_bh(&gbe_dev->hw_stats_lock); - init_timer(&gbe_dev->timer); - gbe_dev->timer.data = (unsigned long)gbe_dev; - gbe_dev->timer.function = netcp_ethss_timer; + setup_timer(&gbe_dev->timer, netcp_ethss_timer, + (unsigned long)gbe_dev); gbe_dev->timer.expires = jiffies + GBE_TIMER_INTERVAL; add_timer(&gbe_dev->timer); *inst_priv = gbe_dev; From e8b95728f724797f958912fd9b765a695595d3a6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 1 Sep 2017 17:13:43 -0700 Subject: [PATCH 0532/1322] Input: uinput - avoid FF flush when destroying device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normally, when input device supporting force feedback effects is being destroyed, we try to "flush" currently playing effects, so that the physical device does not continue vibrating (or executing other effects). Unfortunately this does not work well for uinput as flushing of the effects deadlocks with the destroy action: - if device is being destroyed because the file descriptor is being closed, then there is noone to even service FF requests; - if device is being destroyed because userspace sent UI_DEV_DESTROY, while theoretically it could be possible to service FF requests, userspace is unlikely to do so (they'd need to make sure FF handling happens on a separate thread) even if kernel solves the issue with FF ioctls deadlocking with UI_DEV_DESTROY ioctl on udev->mutex. To avoid lockups like the one below, let's install a custom input device flush handler, and avoid trying to flush force feedback effects when we destroying the device, and instead rely on uinput to shut off the device properly. NMI watchdog: Watchdog detected hard LOCKUP on cpu 3 ... 
<> [] _raw_spin_lock_irqsave+0x37/0x40 [] complete+0x1d/0x50 [] uinput_request_done+0x3c/0x40 [uinput] [] uinput_request_submit.part.7+0x47/0xb0 [uinput] [] uinput_dev_erase_effect+0x5b/0x76 [uinput] [] erase_effect+0xad/0xf0 [] flush_effects+0x4d/0x90 [] input_flush_device+0x40/0x60 [] evdev_cleanup+0xac/0xc0 [] evdev_disconnect+0x2b/0x60 [] __input_unregister_device+0xac/0x150 [] input_unregister_device+0x47/0x70 [] uinput_destroy_device+0xb5/0xc0 [uinput] [] uinput_ioctl_handler.isra.9+0x65e/0x740 [uinput] [] ? do_futex+0x12b/0xad0 [] uinput_ioctl+0x18/0x20 [uinput] [] do_vfs_ioctl+0x298/0x480 [] ? security_file_ioctl+0x43/0x60 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x71 Reported-by: Rodrigo Rivas Costa Reported-by: Clément VUCHENER Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=193741 Signed-off-by: Dmitry Torokhov --- drivers/input/ff-core.c | 13 ++++++++++--- drivers/input/misc/uinput.c | 18 ++++++++++++++++++ include/linux/input.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/input/ff-core.c b/drivers/input/ff-core.c index 8f2042432c85..66a46c84e28f 100644 --- a/drivers/input/ff-core.c +++ b/drivers/input/ff-core.c @@ -237,9 +237,15 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) EXPORT_SYMBOL_GPL(input_ff_erase); /* - * flush_effects - erase all effects owned by a file handle + * input_ff_flush - erase all effects owned by a file handle + * @dev: input device to erase effect from + * @file: purported owner of the effects + * + * This function erases all force-feedback effects associated with + * the given owner from specified device. Note that @file may be %NULL, + * in which case all effects will be erased. */ -static int flush_effects(struct input_dev *dev, struct file *file) +int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; @@ -255,6 +261,7 @@ static int flush_effects(struct input_dev *dev, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events @@ -343,7 +350,7 @@ int input_ff_create(struct input_dev *dev, unsigned int max_effects) mutex_init(&ff->mutex); dev->ff = ff; - dev->flush = flush_effects; + dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 022be0e22eba..2cff40be8860 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -230,6 +230,18 @@ static int uinput_dev_erase_effect(struct input_dev *dev, int effect_id) return uinput_request_submit(udev, &request); } +static int uinput_dev_flush(struct input_dev *dev, struct file *file) +{ + /* + * If we are called with file == NULL that means we are tearing + * down the device, and therefore we can not handle FF erase + * requests: either we are handling UI_DEV_DESTROY (and holding + * the udev->mutex), or the file descriptor is closed and there is + * nobody on the other side anymore. + */ + return file ? 
input_ff_flush(dev, file) : 0; +} + static void uinput_destroy_device(struct uinput_device *udev) { const char *name, *phys; @@ -297,6 +309,12 @@ static int uinput_create_device(struct uinput_device *udev) dev->ff->playback = uinput_dev_playback; dev->ff->set_gain = uinput_dev_set_gain; dev->ff->set_autocenter = uinput_dev_set_autocenter; + /* + * The standard input_ff_flush() implementation does + * not quite work for uinput as we can't reasonably + * handle FF requests during device teardown. + */ + dev->flush = uinput_dev_flush; } error = input_register_device(udev->dev); diff --git a/include/linux/input.h b/include/linux/input.h index a65e3b24fb18..fb5e23c7ed98 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -529,6 +529,7 @@ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file); int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file); +int input_ff_flush(struct input_dev *dev, struct file *file); int input_ff_create_memless(struct input_dev *dev, void *data, int (*play_effect)(struct input_dev *, void *, struct ff_effect *)); From 6b4877c7bdc6ae39ce03716df7caeecf204697eb Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 6 Sep 2017 16:22:59 -0700 Subject: [PATCH 0533/1322] Input: uinput - avoid crash when sending FF request to device going away If FF request comes in while uinput device is going away, uinput_request_send() will fail with -ENODEV, and uinput_request_submit() will attempt to mark the slot as unused by calling uinput_request_done(). Unfortunately in this case we haven't initialized request->done completion yet, and we get a crash: [ 39.402036] BUG: spinlock bad magic on CPU#1, fftest/3108 [ 39.402046] lock: 0xffff88006a93bb00, .magic: 00000000, .owner: /39, .owner_cpu: 1217155072 [ 39.402055] CPU: 1 PID: 3108 Comm: fftest Tainted: G W 4.13.0+ #15 [ 39.402059] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET37W (1.22 ) 07/04/2017 [ 39.402064] 0000000000000086 f0fad82f3ceaa120 ffff88006a93b9a0 ffffffff9de941bb [ 39.402077] ffff88026df8ae00 ffff88006a93bb00 ffff88006a93b9c0 ffffffff9dca62b7 [ 39.402088] ffff88006a93bb00 ffff88006a93baf8 ffff88006a93b9e0 ffffffff9dca62e7 [ 39.402099] Call Trace: [ 39.402112] [] dump_stack+0x4d/0x63 [ 39.402123] [] spin_dump+0x97/0x9c [ 39.402130] [] spin_bug+0x2b/0x2d [ 39.402138] [] do_raw_spin_lock+0x28/0xfd [ 39.402147] [] _raw_spin_lock_irqsave+0x19/0x1f [ 39.402154] [] complete+0x1d/0x48 [ 39.402162] [] 0xffffffffc04f30af [ 39.402167] [] 0xffffffffc04f468c [ 39.402177] [] ? __slab_free+0x22f/0x359 [ 39.402184] [] ? tk_clock_read+0xc/0xe [ 39.402189] [] 0xffffffffc04f471f [ 39.402195] [] ? __wake_up+0x44/0x4b [ 39.402200] [] ? 0xffffffffc04f3240 [ 39.402207] [] erase_effect+0xa1/0xd2 [ 39.402214] [] input_ff_flush+0x43/0x5c [ 39.402219] [] 0xffffffffc04f32ad [ 39.402227] [] input_flush_device+0x3d/0x51 [ 39.402234] [] evdev_flush+0x49/0x5c [ 39.402243] [] filp_close+0x3f/0x65 [ 39.402253] [] put_files_struct+0x66/0xc1 [ 39.402261] [] exit_files+0x47/0x4e [ 39.402270] [] do_exit+0x483/0x969 [ 39.402278] [] ? recalc_sigpending_tsk+0x3d/0x44 [ 39.402285] [] do_group_exit+0x42/0xb0 [ 39.402293] [] get_signal+0x58d/0x5bf [ 39.402300] [] do_signal+0x37/0x53e [ 39.402307] [] ? evdev_ioctl_handler+0xac8/0xb04 [ 39.402314] [] ? evdev_ioctl+0x10/0x12 [ 39.402321] [] ? 
do_vfs_ioctl+0x42e/0x501 [ 39.402328] [] prepare_exit_to_usermode+0x66/0x90 [ 39.402333] [] syscall_return_slowpath+0xe3/0xec [ 39.402339] [] int_ret_from_sys_call+0x25/0x8f While we could solve this by simply initializing the completion earlier, we are better off rearranging the code a bit so we avoid calling complete() on requests that we did not send out. This patch consolidates marking request slots as free in one place (in uinput_request_submit(), the same place where we acquire them) and having everyone else simply signal completion of the requests. Fixes: 00ce756ce53a ("Input: uinput - mark failed submission requests as free") Signed-off-by: Dmitry Torokhov --- drivers/input/misc/uinput.c | 39 ++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 2cff40be8860..443151de90c6 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -98,14 +98,15 @@ static int uinput_request_reserve_slot(struct uinput_device *udev, uinput_request_alloc_id(udev, request)); } -static void uinput_request_done(struct uinput_device *udev, - struct uinput_request *request) +static void uinput_request_release_slot(struct uinput_device *udev, + unsigned int id) { /* Mark slot as available */ - udev->requests[request->id] = NULL; - wake_up(&udev->requests_waitq); + spin_lock(&udev->requests_lock); + udev->requests[id] = NULL; + spin_unlock(&udev->requests_lock); - complete(&request->done); + wake_up(&udev->requests_waitq); } static int uinput_request_send(struct uinput_device *udev, @@ -138,20 +139,22 @@ static int uinput_request_send(struct uinput_device *udev, static int uinput_request_submit(struct uinput_device *udev, struct uinput_request *request) { - int error; + int retval; - error = uinput_request_reserve_slot(udev, request); - if (error) - return error; + retval = uinput_request_reserve_slot(udev, request); + if (retval) + return retval; - error = uinput_request_send(udev, request); - if (error) { - uinput_request_done(udev, request); - return error; - } + retval = uinput_request_send(udev, request); + if (retval) + goto out; wait_for_completion(&request->done); - return request->retval; + retval = request->retval; + + out: + uinput_request_release_slot(udev, request->id); + return retval; } /* @@ -169,7 +172,7 @@ static void uinput_flush_requests(struct uinput_device *udev) request = udev->requests[i]; if (request) { request->retval = -ENODEV; - uinput_request_done(udev, request); + complete(&request->done); } } @@ -957,7 +960,7 @@ static long uinput_ioctl_handler(struct file *file, unsigned int cmd, } req->retval = ff_up.retval; - uinput_request_done(udev, req); + complete(&req->done); goto out; case UI_END_FF_ERASE: @@ -973,7 +976,7 @@ static long uinput_ioctl_handler(struct file *file, unsigned int cmd, } req->retval = ff_erase.retval; - uinput_request_done(udev, req); + complete(&req->done); goto out; } From 127b8e2692b6c145955bb71e482ca1bc6ce10027 Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Tue, 19 Sep 2017 13:44:06 +0200 Subject: [PATCH 0534/1322] dt-bindings: clk: stm32h7: fix clock-cell size The clock-cell size is 1 on the stm32h7 platform. 
Signed-off-by: Gabriel Fernandez Fixes: 3e4d618b0722 ("clk: stm32h7: Add stm32h743 clock driver") Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt b/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt index a135504c7d57..cac24ee10b72 100644 --- a/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt +++ b/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt @@ -32,7 +32,7 @@ Example: compatible = "st,stm32h743-rcc", "st,stm32-rcc"; reg = <0x58024400 0x400>; #reset-cells = <1>; - #clock-cells = <2>; + #clock-cells = <1>; clocks = <&clk_hse>, <&clk_lse>, <&clk_i2s_ckin>; st,syscfg = <&pwrcfg>; From a1f3316dd7b5ce740c774697c664e2e60d095794 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Sep 2017 18:18:23 -0700 Subject: [PATCH 0535/1322] ipv4: Move fib_has_custom_local_routes outside of IP_MULTIPLE_TABLES. > net/ipv4/fib_frontend.c: In function 'fib_validate_source': > net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > if (net->ipv4.fib_has_custom_local_routes) > ^ > net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute': > net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > net->ipv4.fib_has_custom_local_routes = true; > ^ Fixes: 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20720721da4b..8387f099115e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,10 +49,10 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; - bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif + bool fib_has_custom_local_routes; #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; #endif From 059fbe8b5171960bd5c2371bb327ac18733773de Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 21 Sep 2017 13:27:02 +0200 Subject: [PATCH 0536/1322] net: phy: Fix truncation of large IRQ numbers in phy_attached_print() Given NR_IRQS is 2048 on sparc64, and even 32784 on alpha, 3 digits is not enough to represent interrupt numbers on all architectures. Hence PHY interrupt numbers may be truncated during printing. Increase the buffer size from 4 to 8 bytes to fix this. Fixes: 5e369aefdce4818c ("net: stmmac: Delete dead code for MDIO registration") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 8cf0c5901f95..67f25ac29025 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -879,7 +879,7 @@ void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) { const char *drv_name = phydev->drv ? 
phydev->drv->name : "unbound"; char *irq_str; - char irq_num[4]; + char irq_num[8]; switch(phydev->irq) { case PHY_POLL: From a424120765009df13957af20ba4d18e531cfe643 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2017 07:50:28 -0700 Subject: [PATCH 0537/1322] net: vrf: remove skb_dst_force() after skb_dst_set() skb_dst_set(skb, dst) installs a normal (refcounted) dst, there is no point using skb_dst_force(skb) Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9b243e6f3008..cc18b7b11612 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -132,7 +132,6 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, skb_orphan(skb); skb_dst_set(skb, dst); - skb_dst_force(skb); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing From 411d788a23f7e20b8fc51b548c7204fdecc9d22e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Sep 2017 17:36:08 +0200 Subject: [PATCH 0538/1322] test_rhashtable: remove initdata annotation kbuild test robot reported a section mismatch warning w. gcc 4.x: WARNING: lib/test_rhashtable.o(.text+0x139e): Section mismatch in reference from the function rhltable_insert.clone.3() to the variable .init.data:rhlt so remove this annotation. Fixes: cdd4de372ea06 ("test_rhashtable: add test case for rhl_table interface") Reported-by: kbuild test robot Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- lib/test_rhashtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index de4d0584631a..8e83cbdc049c 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -251,7 +251,7 @@ static s64 __init test_rhashtable(struct rhashtable *ht, struct test_obj *array, } static struct rhashtable ht; -static struct rhltable rhlt __initdata; +static struct rhltable rhlt; static int __init test_rhltable(unsigned int entries) { From 222d7dbd258dad4cd5241c43ef818141fad5a87a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2017 09:15:46 -0700 Subject: [PATCH 0539/1322] net: prevent dst uses after free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In linux-4.13, Wei worked hard to convert dst to a traditional refcounted model, removing GC. We now want to make sure a dst refcount can not transition from 0 back to 1. The problem here is that input path attached a not refcounted dst to an skb. Then later, because packet is forwarded and hits skb_dst_force() before exiting RCU section, we might try to take a refcount on one dst that is about to be freed, if another cpu saw 1 -> 0 transition in dst_release() and queued the dst for freeing after one RCU grace period. Lets unify skb_dst_force() and skb_dst_force_safe(), since we should always perform the complete check against dst refcount, and not assume it is not zero. Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005 [ 989.919496] skb_dst_force+0x32/0x34 [ 989.919498] __dev_queue_xmit+0x1ad/0x482 [ 989.919501] ? eth_header+0x28/0xc6 [ 989.919502] dev_queue_xmit+0xb/0xd [ 989.919504] neigh_connected_output+0x9b/0xb4 [ 989.919507] ip_finish_output2+0x234/0x294 [ 989.919509] ? ipt_do_table+0x369/0x388 [ 989.919510] ip_finish_output+0x12c/0x13f [ 989.919512] ip_output+0x53/0x87 [ 989.919513] ip_forward_finish+0x53/0x5a [ 989.919515] ip_forward+0x2cb/0x3e6 [ 989.919516] ? 
pskb_trim_rcsum.part.9+0x4b/0x4b [ 989.919518] ip_rcv_finish+0x2e2/0x321 [ 989.919519] ip_rcv+0x26f/0x2eb [ 989.919522] ? vlan_do_receive+0x4f/0x289 [ 989.919523] __netif_receive_skb_core+0x467/0x50b [ 989.919526] ? tcp_gro_receive+0x239/0x239 [ 989.919529] ? inet_gro_receive+0x226/0x238 [ 989.919530] __netif_receive_skb+0x4d/0x5f [ 989.919532] netif_receive_skb_internal+0x5c/0xaf [ 989.919533] napi_gro_receive+0x45/0x81 [ 989.919536] ixgbe_poll+0xc8a/0xf09 [ 989.919539] ? kmem_cache_free_bulk+0x1b6/0x1f7 [ 989.919540] net_rx_action+0xf4/0x266 [ 989.919543] __do_softirq+0xa8/0x19d [ 989.919545] irq_exit+0x5d/0x6b [ 989.919546] do_IRQ+0x9c/0xb5 [ 989.919548] common_interrupt+0x93/0x93 [ 989.919548] Similarly dst_clone() can use dst_hold() helper to have additional debugging, as a follow up to commit 44ebe79149ff ("net: add debug atomic_inc_not_zero() in dst_hold()") In net-next we will convert dst atomic_t to refcount_t for peace of mind. Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag") Signed-off-by: Eric Dumazet Cc: Wei Wang Reported-by: Paweł Staszewski Bisected-by: Paweł Staszewski Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/dst.h | 22 ++++------------------ include/net/route.h | 2 +- include/net/sock.h | 2 +- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 93568bd0a352..06a6765da074 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) - atomic_inc(&dst->__refcnt); + dst_hold(dst); return dst; } @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb __skb_dst_copy(nskb, oskb->_skb_refdst); } -/** - * skb_dst_force - makes sure skb dst is refcounted - * @skb: buffer - * - * If dst is not yet refcounted, let's do it - */ -static inline void skb_dst_force(struct sk_buff *skb) -{ - if (skb_dst_is_noref(skb)) { - WARN_ON(!rcu_read_lock_held()); - skb->_skb_refdst &= ~SKB_DST_NOREF; - dst_clone(skb_dst(skb)); - } -} - /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry @@ -339,16 +324,17 @@ static inline bool dst_hold_safe(struct dst_entry *dst) } /** - * skb_dst_force_safe - makes sure skb dst is refcounted + * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. 
*/ -static inline void skb_dst_force_safe(struct sk_buff *skb) +static inline void skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); + WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; diff --git a/include/net/route.h b/include/net/route.h index 1b09a9368c68..57dfc6850d37 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -190,7 +190,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) { - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!skb_dst(skb)) err = -EINVAL; } diff --git a/include/net/sock.h b/include/net/sock.h index 03a362568357..a6b9a8d1a6df 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -856,7 +856,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; From 4ba72fc080ad44a5c1e93449ec070cd4d331803f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 21 Sep 2017 22:34:54 +0200 Subject: [PATCH 0540/1322] drm/sun4i: cec: Enable back CEC-pin framework Now that the cec-pin framework has been merged, we can remove the safeguard that were preventing the CEC part of the sun4i HDMI driver and actually start to use it. Signed-off-by: Hans Verkuil Signed-off-by: Maxime Ripard --- drivers/gpu/drm/sun4i/Kconfig | 2 +- drivers/gpu/drm/sun4i/sun4i_hdmi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/sun4i/Kconfig b/drivers/gpu/drm/sun4i/Kconfig index 06f05302ee75..882d85db9053 100644 --- a/drivers/gpu/drm/sun4i/Kconfig +++ b/drivers/gpu/drm/sun4i/Kconfig @@ -26,7 +26,7 @@ config DRM_SUN4I_HDMI_CEC bool "Allwinner A10 HDMI CEC Support" depends on DRM_SUN4I_HDMI select CEC_CORE - depends on CEC_PIN + select CEC_PIN help Choose this option if you have an Allwinner SoC with an HDMI controller and want to use CEC. diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi.h b/drivers/gpu/drm/sun4i/sun4i_hdmi.h index 1457750988da..a1f8cba251a2 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi.h +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi.h @@ -15,7 +15,7 @@ #include #include -#include +#include #define SUN4I_HDMI_CTRL_REG 0x004 #define SUN4I_HDMI_CTRL_ENABLE BIT(31) From a4fd4a724d6c30ad671046d83be2e9be2f11d275 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:02:05 -0400 Subject: [PATCH 0541/1322] usb-storage: fix bogus hardware error messages for ATA pass-thru devices Ever since commit a621bac3044e ("scsi_lib: correctly retry failed zero length REQ_TYPE_FS commands"), people have been getting bogus error messages for USB disk drives using ATA pass-thru. For example: [ 1344.880193] sd 6:0:0:0: [sdb] Attached SCSI disk [ 1345.069152] sd 6:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_ERROR driverbyte=DRIVER_SENSE [ 1345.069159] sd 6:0:0:0: [sdb] tag#0 Sense Key : Hardware Error [current] [descriptor] [ 1345.069162] sd 6:0:0:0: [sdb] tag#0 Add. 
Sense: No additional sense information [ 1345.069168] sd 6:0:0:0: [sdb] tag#0 CDB: ATA command pass through(16) 85 06 20 00 00 00 00 00 00 00 00 00 00 00 e5 00 [ 1345.172252] sd 6:0:0:0: [sdb] tag#0 FAILED Result: hostbyte=DID_ERROR driverbyte=DRIVER_SENSE [ 1345.172258] sd 6:0:0:0: [sdb] tag#0 Sense Key : Hardware Error [current] [descriptor] [ 1345.172261] sd 6:0:0:0: [sdb] tag#0 Add. Sense: No additional sense information [ 1345.172266] sd 6:0:0:0: [sdb] tag#0 CDB: ATA command pass through(12)/Blank a1 06 20 da 00 00 4f c2 00 b0 00 00 These messages can be quite annoying, because programs like udisks2 provoke them every 10 minutes or so. Other programs can also have this effect, such as those in smartmontools. I don't fully understand how that commit induced the SCSI core to log these error messages, but the underlying cause for them is code added to usb-storage by commit f1a0743bc0e7 ("USB: storage: When a device returns no sense data, call it a Hardware Error"). At the time it was necessary to do this, in order to prevent an infinite retry loop with some not-so-great mass storage devices. However, the ATA pass-thru protocol uses SCSI sense data to return command status values, and some devices always report Check Condition status for ATA pass-thru commands to ensure that the host retrieves the sense data, even if the command succeeded. This violates the USB mass-storage protocol (Check Condition status is supposed to mean the command failed), but we can't help that. This patch attempts to mitigate the problem of these bogus error reports by changing usb-storage. The HARDWARE ERROR sense key will be inserted only for commands that aren't ATA pass-thru. Thanks to Ewan Milne for pointing out that this mechanism was present in usb-storage. 8 years after writing it, I had completely forgotten its existence. Signed-off-by: Alan Stern Tested-by: Kris Lindgren Ref: https://bugzilla.redhat.com/show_bug.cgi?id=1351305 CC: Ewan D. Milne CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/transport.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 1a59f335b063..a3ccb899df60 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -834,13 +834,25 @@ Retry_Sense: if (result == USB_STOR_TRANSPORT_GOOD) { srb->result = SAM_STAT_GOOD; srb->sense_buffer[0] = 0x0; + } + + /* + * ATA-passthru commands use sense data to report + * the command completion status, and often devices + * return Check Condition status when nothing is + * wrong. + */ + else if (srb->cmnd[0] == ATA_16 || + srb->cmnd[0] == ATA_12) { + /* leave the data alone */ + } /* * If there was a problem, report an unspecified * hardware error to prevent the higher layers from * entering an infinite retry loop. */ - } else { + else { srb->result = DID_ERROR << 16; if ((sshdr.response_code & 0x72) == 0x72) srb->sense_buffer[1] = HARDWARE_ERROR; From 113f6eb6d50cfa5e2a1cdcf1678b12661fa272ab Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 15:59:30 -0400 Subject: [PATCH 0542/1322] usb-storage: unusual_devs entry to fix write-access regression for Seagate external drives Kris Lindgren reports that without the NO_WP_DETECT flag, his Seagate external disk drive fails all write accesses. This regression dates back approximately to the start of the 4.x kernel releases. 
Signed-off-by: Alan Stern Reported-by: Kris Lindgren CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 5a70c33ef0e0..eb06d88b41d6 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1459,6 +1459,13 @@ UNUSUAL_DEV( 0x0bc2, 0x3010, 0x0000, 0x0000, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_SANE_SENSE ), +/* Reported by Kris Lindgren */ +UNUSUAL_DEV( 0x0bc2, 0x3332, 0x0000, 0x9999, + "Seagate", + "External", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_WP_DETECT ), + UNUSUAL_DEV( 0x0d49, 0x7310, 0x0000, 0x9999, "Maxtor", "USB to SATA", From fd085bb1766d6a598f53af2308374a546a49775a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:40 +0300 Subject: [PATCH 0543/1322] stm class: Fix a use-after-free For reasons unknown, the stm_source removal path uses device_destroy() to kill the underlying device object. Because device_destroy() uses devt to look for the device to destroy and the fact that stm_source devices don't have one (or all have the same one), it just picks the first device in the class, which may well be the wrong one. That is, loading stm_console and stm_heartbeat and then removing both will die in dereferencing a freed object. Since this should have been device_unregister() in the first place, use it instead of device_destroy(). Signed-off-by: Alexander Shishkin Fixes: 7bd1d4093c2 ("stm class: Introduce an abstraction for System Trace Module devices") Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/stm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 9414900575d8..f129869e05a9 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -1119,7 +1119,7 @@ void stm_source_unregister_device(struct stm_source_data *data) stm_source_link_drop(src); - device_destroy(&stm_source_class, src->dev.devt); + device_unregister(&src->dev); } EXPORT_SYMBOL_GPL(stm_source_unregister_device); From 920ce7c33db25cf4acb4ade3ae8c93bd23dfd730 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:41 +0300 Subject: [PATCH 0544/1322] intel_th: pci: Add Cedar Fork PCH support This adds Intel(R) Trace Hub PCI ID for Cedar Fork PCH. Signed-off-by: Alexander Shishkin Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index bc9cebc30526..00ee60d9789e 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -158,6 +158,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x9da6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Cedar Fork PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x18e1), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { 0 }, }; From 24600840c74112ad04a9ddd99d7d7f731dcaa1cb Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:42 +0300 Subject: [PATCH 0545/1322] intel_th: pci: Add Lewisburg PCH support This adds Intel(R) Trace Hub PCI ID for Lewisburg PCH. 
Signed-off-by: Alexander Shishkin Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 00ee60d9789e..c2a2ce8ee541 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -143,6 +143,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x19e1), .driver_data = (kernel_ulong_t)0, }, + { + /* Lewisburg PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa1a6), + .driver_data = (kernel_ulong_t)0, + }, { /* Gemini Lake */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x318e), From 33c150c2ee4a65a59190a124b45d05b1abf9478e Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 21 Sep 2017 23:41:47 -0700 Subject: [PATCH 0546/1322] vmbus: don't acquire the mutex in vmbus_hvsock_device_unregister() Due to commit 54a66265d675 ("Drivers: hv: vmbus: Fix rescind handling"), we need this patch to resolve the below deadlock: after we get the mutex in vmbus_hvsock_device_unregister() and call vmbus_device_unregister() -> device_unregister() -> ... -> device_release() -> vmbus_device_release(), we'll get a deadlock, because vmbus_device_release() tries to get the same mutex. Signed-off-by: Dexuan Cui Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Cc: stable@vger.kernel.org (4.13 and above) Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel_mgmt.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 060df71c2e8b..bcbb031f7263 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -936,14 +936,10 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) { - mutex_lock(&vmbus_connection.channel_mutex); - BUG_ON(!is_hvsock_channel(channel)); channel->rescind = true; vmbus_device_unregister(channel->device_obj); - - mutex_unlock(&vmbus_connection.channel_mutex); } EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); From 549e658a0919e355a2b2144dc380b3729bef7f3e Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 21 Sep 2017 23:41:48 -0700 Subject: [PATCH 0547/1322] Drivers: hv: fcopy: restore correct transfer length Till recently the expected length of bytes read by the daemon did depend on the context. It was either hv_start_fcopy or hv_do_fcopy. The daemon had a buffer size of two pages, which was much larger than needed. Now the expected length of bytes read by the daemon changed slightly. For START_FILE_COPY it is still the size of hv_start_fcopy. But for WRITE_TO_FILE and the other operations it is as large as the buffer that arrived via vmbus. In case of WRITE_TO_FILE that is slightly larger than a struct hv_do_fcopy. Since the buffer in the daemon was still larger everything was fine. Currently, the daemon reads only what is actually needed. The new buffer layout is as large as a struct hv_do_fcopy, for the WRITE_TO_FILE operation. Since the kernel expects a slightly larger size, hvt_op_read will return -EINVAL because the daemon will read slightly less than expected. Address this by restoring the expected buffer size in case of WRITE_TO_FILE. Fixes: 'c7e490fc23eb ("Drivers: hv: fcopy: convert to hv_utils_transport")' Fixes: '3f2baa8a7d2e ("Tools: hv: update buffer handling in hv_fcopy_daemon")' Signed-off-by: Olaf Hering Signed-off-by: K. Y. 
Srinivasan Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_fcopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index daa75bd41f86..2364281d8593 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -170,6 +170,10 @@ static void fcopy_send_data(struct work_struct *dummy) out_src = smsg_out; break; + case WRITE_TO_FILE: + out_src = fcopy_transaction.fcopy_msg; + out_len = sizeof(struct hv_do_fcopy); + break; default: out_src = fcopy_transaction.fcopy_msg; out_len = fcopy_transaction.recv_len; From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Fri, 22 Sep 2017 07:53:15 +0200 Subject: [PATCH 0548/1322] KVM: nVMX: fix HOST_CR3/HOST_CR4 cache For nested virt we maintain multiple VMCS that can run on a vCPU. So it is incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in vCPU-wide data structures. Hyper-V nested on KVM runs into this consistently for me with PCID enabled. CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3) fires, and the currently loaded VMCS is updated. Then we switch from L2 to L1 and the next exit reverts CR3 to its old value. Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant") Signed-off-by: Ladi Prosek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0726ca7a1b02..c83d28b0ab05 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -200,6 +200,8 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ + unsigned long vmcs_host_cr4; /* May not match real cr4 */ struct list_head loaded_vmcss_on_cpu_link; }; @@ -600,8 +602,6 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -5178,12 +5178,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. 
*/ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -9274,15 +9274,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->host_state.vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the From e001fa78d44d0b5c7b1498d1e4a038740efa3b1e Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 15 Sep 2017 15:26:14 +1000 Subject: [PATCH 0549/1322] KVM: PPC: Book3S HV: Check for updated HDSISR on P9 HDSI exception On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage Interrupt (HDSI) the HDSISR is not updated at all. To work around this we put a canary value into the HDSISR before returning to a guest and then check for this canary when we take an HDSI. If we find the canary on an HDSI, we know the hardware didn't update the HDSISR. In this case we return to the guest to retake the HDSI, which should correctly update the HDSISR on the second entry. After talking to Paulus we've applied this workaround to all POWER9 CPUs. The workaround of returning to the guest shouldn't ever be triggered on a well-behaving CPU. The extra instructions should have negligible performance impact. Signed-off-by: Michael Neuling Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 17936f82d3c7..ec69fa45d5a2 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1121,6 +1121,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION mtspr SPRN_PPR, r0 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + +/* Move canary into DSISR to check for later */ +BEGIN_FTR_SECTION + li r0, 0x7fff + mtspr SPRN_HDSISR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) @@ -1956,9 +1963,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) kvmppc_hdsi: ld r3, VCPU_KVM(r9) lbz r0, KVM_RADIX(r3) - cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR +BEGIN_FTR_SECTION + /* Look for DSISR canary. If we find it, retry instruction */ + cmpdi r6, 0x7fff + beq 6f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + cmpwi r0, 0 bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h From e87be9b29c22852ec300826e3b1d551b00c1eb7a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Sep 2017 14:30:43 +0200 Subject: [PATCH 0550/1322] mmc: tmio: remove broken and noisy debug macro Some change for v4.14 broke the debug output for TMIO. But since it was not helpful to me and too noisy for my taste anyhow, let's just remove it instead of fixing it. We'll find something better if we ever need it... 
Signed-off-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 47 -------------------------------- 1 file changed, 47 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 12cf8288d663..a7293e186e03 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -129,50 +129,6 @@ static int tmio_mmc_next_sg(struct tmio_mmc_host *host) #define CMDREQ_TIMEOUT 5000 -#ifdef CONFIG_MMC_DEBUG - -#define STATUS_TO_TEXT(a, status, i) \ - do { \ - if ((status) & TMIO_STAT_##a) { \ - if ((i)++) \ - printk(KERN_DEBUG " | "); \ - printk(KERN_DEBUG #a); \ - } \ - } while (0) - -static void pr_debug_status(u32 status) -{ - int i = 0; - - pr_debug("status: %08x = ", status); - STATUS_TO_TEXT(CARD_REMOVE, status, i); - STATUS_TO_TEXT(CARD_INSERT, status, i); - STATUS_TO_TEXT(SIGSTATE, status, i); - STATUS_TO_TEXT(WRPROTECT, status, i); - STATUS_TO_TEXT(CARD_REMOVE_A, status, i); - STATUS_TO_TEXT(CARD_INSERT_A, status, i); - STATUS_TO_TEXT(SIGSTATE_A, status, i); - STATUS_TO_TEXT(CMD_IDX_ERR, status, i); - STATUS_TO_TEXT(STOPBIT_ERR, status, i); - STATUS_TO_TEXT(ILL_FUNC, status, i); - STATUS_TO_TEXT(CMD_BUSY, status, i); - STATUS_TO_TEXT(CMDRESPEND, status, i); - STATUS_TO_TEXT(DATAEND, status, i); - STATUS_TO_TEXT(CRCFAIL, status, i); - STATUS_TO_TEXT(DATATIMEOUT, status, i); - STATUS_TO_TEXT(CMDTIMEOUT, status, i); - STATUS_TO_TEXT(RXOVERFLOW, status, i); - STATUS_TO_TEXT(TXUNDERRUN, status, i); - STATUS_TO_TEXT(RXRDY, status, i); - STATUS_TO_TEXT(TXRQ, status, i); - STATUS_TO_TEXT(ILL_ACCESS, status, i); - printk("\n"); -} - -#else -#define pr_debug_status(s) do { } while (0) -#endif - static void tmio_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable) { struct tmio_mmc_host *host = mmc_priv(mmc); @@ -762,9 +718,6 @@ irqreturn_t tmio_mmc_irq(int irq, void *devid) status = sd_ctrl_read16_and_16_as_32(host, CTL_STATUS); ireg = status & TMIO_MASK_IRQ & ~host->sdcard_irq_mask; - pr_debug_status(status); - pr_debug_status(ireg); - /* Clear the status except the interrupt status */ sd_ctrl_write32_as_16_and_16(host, CTL_STATUS, TMIO_MASK_IRQ); From c51b46dd5b9950436b6b1f8189e93e1ad380cee1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 20 Sep 2017 22:19:57 +0100 Subject: [PATCH 0551/1322] staging: rtl8723bs: add missing range check on id The value of the u8 id needs to be upper bounds checked to ensure the cam_cache array on the adapter dvobj is not indexed outside of the allowed range of 0..TOTAL_CAM_ENTRY-1. This can currently occur if id is >= TOTAL_CAM_ENTRY when calling write_cam_from_cache. Fix this by adding an upper range check. 
Detected by CoverityScan, CID#1428464 ("Use of untrusted scalar value") Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/os_dep/rtw_proc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/rtl8723bs/os_dep/rtw_proc.c b/drivers/staging/rtl8723bs/os_dep/rtw_proc.c index 92277457aba4..ce1dd6f9036f 100644 --- a/drivers/staging/rtl8723bs/os_dep/rtw_proc.c +++ b/drivers/staging/rtl8723bs/os_dep/rtw_proc.c @@ -311,6 +311,8 @@ static ssize_t proc_set_cam(struct file *file, const char __user *buffer, size_t if (num < 2) return count; + if (id >= TOTAL_CAM_ENTRY) + return -EINVAL; if (strcmp("c", cmd) == 0) { _clear_cam_entry(adapter, id); From ec14121931a24f8d3678b8a9c408adee3b21d465 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 20 Sep 2017 18:34:18 +0100 Subject: [PATCH 0552/1322] staging: rtl8723bs: avoid null pointer dereference on pmlmepriv There is a check to see if pmlmepriv is null before vfree'ing pmlmepriv->free_bss_buf hence implying pmlmepriv could potenially be null. However, a previous call to rtw_free_mlme_priv_ie_data can also dereference pmlmepriv, so move this call so that it is only called if pmlmepriv non-null. Detected by CoverityScan, CID#1077739 ("Dereference before null check") Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/core/rtw_mlme.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/staging/rtl8723bs/core/rtw_mlme.c b/drivers/staging/rtl8723bs/core/rtw_mlme.c index 6b778206a1a3..cb8a95aabd6c 100644 --- a/drivers/staging/rtl8723bs/core/rtw_mlme.c +++ b/drivers/staging/rtl8723bs/core/rtw_mlme.c @@ -119,9 +119,8 @@ void rtw_free_mlme_priv_ie_data(struct mlme_priv *pmlmepriv) void _rtw_free_mlme_priv(struct mlme_priv *pmlmepriv) { - rtw_free_mlme_priv_ie_data(pmlmepriv); - if (pmlmepriv) { + rtw_free_mlme_priv_ie_data(pmlmepriv); if (pmlmepriv->free_bss_buf) { vfree(pmlmepriv->free_bss_buf); } From 6ae033689d7b1a419def78e8e990b0eab8bb6419 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 18 Sep 2017 15:16:08 +0300 Subject: [PATCH 0553/1322] mmc: sdhci-pci: Fix voltage switch for some Intel host controllers Some Intel host controllers (e.g. CNP) use an ACPI device-specific method to ensure correct voltage switching. Fix voltage switch for those, by adding a call to the DSM. 
Signed-off-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index bbaddf18a1b3..d0ccc6729fd2 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -392,6 +392,7 @@ static const struct sdhci_pci_fixes sdhci_intel_pch_sdio = { enum { INTEL_DSM_FNS = 0, + INTEL_DSM_V18_SWITCH = 3, INTEL_DSM_DRV_STRENGTH = 9, INTEL_DSM_D3_RETUNE = 10, }; @@ -557,6 +558,19 @@ static void intel_hs400_enhanced_strobe(struct mmc_host *mmc, sdhci_writel(host, val, INTEL_HS400_ES_REG); } +static void sdhci_intel_voltage_switch(struct sdhci_host *host) +{ + struct sdhci_pci_slot *slot = sdhci_priv(host); + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct device *dev = &slot->chip->pdev->dev; + u32 result = 0; + int err; + + err = intel_dsm(intel_host, dev, INTEL_DSM_V18_SWITCH, &result); + pr_debug("%s: %s DSM error %d result %u\n", + mmc_hostname(host->mmc), __func__, err, result); +} + static const struct sdhci_ops sdhci_intel_byt_ops = { .set_clock = sdhci_set_clock, .set_power = sdhci_intel_set_power, @@ -565,6 +579,7 @@ static const struct sdhci_ops sdhci_intel_byt_ops = { .reset = sdhci_reset, .set_uhs_signaling = sdhci_set_uhs_signaling, .hw_reset = sdhci_pci_hw_reset, + .voltage_switch = sdhci_intel_voltage_switch, }; static void byt_read_dsm(struct sdhci_pci_slot *slot) From c9adcdbc653b52e4be834f535eefec833f9ca6b1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 21 Sep 2017 14:03:29 +0800 Subject: [PATCH 0554/1322] ALSA: pcm: Fix structure definition for X32 ABI X32 ABI uses the 64bit timespec in addition to 64bit alignment of 64bit values. We have added compat ABI for these ioctls, but this patch adds one missing padding into 'struct snd_pcm_mmap_status_x32' to fix incompatibilities. Signed-off-by: Baolin Wang Signed-off-by: Takashi Iwai --- sound/core/pcm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index 3a1cc7b97e46..b719d0bd833e 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -547,6 +547,7 @@ struct snd_pcm_mmap_status_x32 { u32 pad2; /* alignment */ struct timespec tstamp; s32 suspended_state; + s32 pad3; struct timespec audio_tstamp; } __packed; From c0d05cde2a685cd6486390c62be684ce456d84d6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 21 Sep 2017 11:20:58 +0100 Subject: [PATCH 0555/1322] iommu/of: Remove PCI host bridge node check of_pci_iommu_init() tries to be clever and stop its alias walk at the device represented by master_np, in case of weird PCI topologies where the bridge to the IOMMU and the rest of the system is not at the root. It turns out this is a bit short-sighted, since there are plenty of other callers of pci_for_each_dma_alias() which would also need the same behaviour in that situation, and the only platform so far with such a topology (Cavium ThunderX2) already solves it more generally via a PCI quirk. As this check is effectively redundant, and returning a boolean value as an int is a bit broken anyway, let's just get rid of it. 
Reported-by: Jean-Philippe Brucker Fixes: d87beb749281 ("iommu/of: Handle PCI aliases properly") Signed-off-by: Robin Murphy Tested-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- drivers/iommu/of_iommu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index e60e3dba85a0..50947ebb6d17 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -157,10 +157,7 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) err = of_iommu_xlate(info->dev, &iommu_spec); of_node_put(iommu_spec.np); - if (err) - return err; - - return info->np == pdev->bus->dev.of_node; + return err; } const struct iommu_ops *of_iommu_configure(struct device *dev, From a88dc7ba15cd4f4ef5102b5185f8fa7ff86e54e1 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 20 Sep 2017 12:26:38 +0530 Subject: [PATCH 0556/1322] drivers/perf: arm_pmu_acpi: Release memory obtained by kasprintf Free the memory region if arm_pmu_acpi_probe() is not successful. Acked-by: Will Deacon Signed-off-by: Arvind Yadav Signed-off-by: Catalin Marinas --- drivers/perf/arm_pmu_acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 0a9b78705ee8..3303dd8d8eb5 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -235,6 +235,7 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn) ret = armpmu_register(pmu); if (ret) { pr_warn("Failed to register PMU for CPU%d\n", cpu); + kfree(pmu->name); return ret; } } From bfc81a8bc18e3c4ba0cbaa7666ff76be2f998991 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 22 Sep 2017 16:18:53 +0200 Subject: [PATCH 0557/1322] ALSA: usb-audio: Check out-of-bounds access by corrupted buffer descriptor When a USB-audio device receives a maliciously adjusted or corrupted buffer descriptor, the USB-audio driver may access an out-of-bounds value at its parser. 
This was detected by syzkaller, something like: BUG: KASAN: slab-out-of-bounds in usb_audio_probe+0x27b2/0x2ab0 Read of size 1 at addr ffff88006b83a9e8 by task kworker/0:1/24 CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.14.0-rc1-42251-gebb2c2437d80 #224 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x22f/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x19/0x20 mm/kasan/report.c:427 snd_usb_create_streams sound/usb/card.c:248 usb_audio_probe+0x27b2/0x2ab0 sound/usb/card.c:605 usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_set_configuration+0x104e/0x1870 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4903 hub_port_connect_change drivers/usb/core/hub.c:5009 port_event drivers/usb/core/hub.c:5115 hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195 process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119 worker_thread+0x221/0x1850 kernel/workqueue.c:2253 kthread+0x3a1/0x470 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 This patch adds checks for out-of-bounds accesses at appropriate places and bails out when it goes out of the given buffer. 
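The pattern behind the added checks, reduced to a stand-alone sketch (ordinary user-space C with a simplified, hypothetical header struct rather than the real UAC descriptor layout): first make sure the fixed-size header fits in the bytes that remain, and only then trust the length field read from the device.

    #include <stdio.h>
    #include <stddef.h>

    /* Simplified stand-in for a length-prefixed descriptor header. */
    struct hdr {
        unsigned char bLength;
        unsigned char bInCollection;
    };

    /* Return 0 if a whole header fits into the remaining buffer, -1 otherwise. */
    static int header_fits(const unsigned char *buf, size_t rest_bytes)
    {
        const struct hdr *h = (const struct hdr *)buf;

        if (rest_bytes < sizeof(*h))    /* not even the fixed part is present */
            return -1;
        if (rest_bytes < h->bLength)    /* declared length overruns the buffer */
            return -1;
        return 0;
    }

    int main(void)
    {
        unsigned char corrupt[2] = { 0xff, 0x01 };    /* bLength claims 255 bytes */

        printf("%d\n", header_fits(corrupt, sizeof(corrupt)));    /* prints -1 */
        return 0;
    }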
Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/card.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sound/usb/card.c b/sound/usb/card.c index 3dc36d913550..23d1d23aefec 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -221,6 +221,7 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) struct usb_interface_descriptor *altsd; void *control_header; int i, protocol; + int rest_bytes; /* find audiocontrol interface */ host_iface = &usb_ifnum_to_if(dev, ctrlif)->altsetting[0]; @@ -235,6 +236,15 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) return -EINVAL; } + rest_bytes = (void *)(host_iface->extra + host_iface->extralen) - + control_header; + + /* just to be sure -- this shouldn't hit at all */ + if (rest_bytes <= 0) { + dev_err(&dev->dev, "invalid control header\n"); + return -EINVAL; + } + switch (protocol) { default: dev_warn(&dev->dev, @@ -245,11 +255,21 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) case UAC_VERSION_1: { struct uac1_ac_header_descriptor *h1 = control_header; + if (rest_bytes < sizeof(*h1)) { + dev_err(&dev->dev, "too short v1 buffer descriptor\n"); + return -EINVAL; + } + if (!h1->bInCollection) { dev_info(&dev->dev, "skipping empty audio interface (v1)\n"); return -EINVAL; } + if (rest_bytes < h1->bLength) { + dev_err(&dev->dev, "invalid buffer length (v1)\n"); + return -EINVAL; + } + if (h1->bLength < sizeof(*h1) + h1->bInCollection) { dev_err(&dev->dev, "invalid UAC_HEADER (v1)\n"); return -EINVAL; From c4fa6c43ce4b427350cfbb659436bfe3d9e09a1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 21 Sep 2017 09:54:13 -0400 Subject: [PATCH 0558/1322] cgroup: Reinit cgroup_taskset structure before cgroup_migrate_execute() returns The cgroup_taskset structure within the larger cgroup_mgctx structure is supposed to be used once and then discarded. That is not really the case in the hotplug code path: cpuset_hotplug_workfn() - cgroup_transfer_tasks() - cgroup_migrate() - cgroup_migrate_add_task() - cgroup_migrate_execute() In this case, the cgroup_migrate() function is called multiple time with the same cgroup_mgctx structure to transfer the tasks from one cgroup to another one-by-one. The second time cgroup_migrate() is called, the cgroup_taskset will be in an incorrect state and so may cause the system to panic. 
For example, [ 150.888410] Faulting instruction address: 0xc0000000001db648 [ 150.888414] Oops: Kernel access of bad area, sig: 11 [#1] [ 150.888417] SMP NR_CPUS=2048 [ 150.888417] NUMA [ 150.888419] pSeries : [ 150.888545] NIP [c0000000001db648] cpuset_can_attach+0x58/0x1b0 [ 150.888548] LR [c0000000001db638] cpuset_can_attach+0x48/0x1b0 [ 150.888551] Call Trace: [ 150.888554] [c0000005f65cb940] [c0000000001db638] cpuset_can_attach+0x48/0x1b 0 (unreliable) [ 150.888559] [c0000005f65cb9a0] [c0000000001cff04] cgroup_migrate_execute+0xc4/0x4b0 [ 150.888563] [c0000005f65cba20] [c0000000001d7d14] cgroup_transfer_tasks+0x1d4/0x370 [ 150.888568] [c0000005f65cbb70] [c0000000001ddcb0] cpuset_hotplug_workfn+0x710/0x8f0 [ 150.888572] [c0000005f65cbc80] [c00000000012032c] process_one_work+0x1ac/0x4d0 [ 150.888576] [c0000005f65cbd20] [c0000000001206f8] worker_thread+0xa8/0x5b0 [ 150.888580] [c0000005f65cbdc0] [c0000000001293f8] kthread+0x168/0x1b0 [ 150.888584] [c0000005f65cbe30] [c00000000000b368] ret_from_kernel_thread+0x5c/0x74 To allow reuse of the cgroup_mgctx structure, some fields in that structure are now re-initialized at the end of cgroup_migrate_execute() function call so that the structure can be reused again in a later iteration without causing problem. This bug was introduced in the commit e595cd706982 ("group: track migration context in cgroup_mgctx") in 4.11. This commit moves the cgroup_taskset initialization out of cgroup_migrate(). The commit 10467270fb3 ("cgroup: don't call migration methods if there are no tasks to migrate") helped, but did not completely resolve the problem. Fixes: e595cd706982bff0211e6fafe5a108421e747fbc ("group: track migration context in cgroup_mgctx") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.11+ --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..44857278eb8a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2311,6 +2311,14 @@ out_release_tset: list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); + + /* + * Re-initialize the cgroup_taskset structure in case it is reused + * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() + * iteration. + */ + tset->nr_tasks = 0; + tset->csets = &tset->src_csets; return ret; } From e6f9bc34d3779cb7b6a337afed5de8be3f0fab77 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Thu, 31 Aug 2017 09:30:34 -0700 Subject: [PATCH 0559/1322] IB/core: Fix for core panic Build with the latest patches resulted in panic: 11384.486289] BUG: unable to handle kernel NULL pointer dereference at (null) [11384.486293] IP: (null) [11384.486295] PGD 0 [11384.486295] P4D 0 [11384.486296] [11384.486299] Oops: 0010 [#1] SMP ......... snip ...... 
[11384.486401] CPU: 0 PID: 968 Comm: kworker/0:1H Tainted: G W O 4.13.0-a-stream-20170825 #1 [11384.486402] Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0014.121820151719 12/18/2015 [11384.486418] Workqueue: ib-comp-wq ib_cq_poll_work [ib_core] [11384.486419] task: ffff880850579680 task.stack: ffffc90007fec000 [11384.486420] RIP: 0010: (null) [11384.486420] RSP: 0018:ffffc90007fef970 EFLAGS: 00010206 [11384.486421] RAX: ffff88084cfe8000 RBX: ffff88084dce4000 RCX: ffffc90007fef978 [11384.486422] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff88084cfe8000 [11384.486422] RBP: ffffc90007fefab0 R08: 0000000000000000 R09: ffff88084dce4080 [11384.486423] R10: ffffffffa02d7f60 R11: 0000000000000000 R12: ffff88105af65a00 [11384.486423] R13: ffff88084dce4000 R14: 000000000000c000 R15: 000000000000c000 [11384.486424] FS: 0000000000000000(0000) GS:ffff88085f400000(0000) knlGS:0000000000000000 [11384.486425] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [11384.486425] CR2: 0000000000000000 CR3: 0000000001c09000 CR4: 00000000001406f0 [11384.486426] Call Trace: [11384.486431] ? is_valid_mcast_lid.isra.21+0xfb/0x110 [ib_core] [11384.486436] ib_attach_mcast+0x6f/0xa0 [ib_core] [11384.486441] ipoib_mcast_attach+0x81/0x190 [ib_ipoib] [11384.486443] ipoib_mcast_join_complete+0x354/0xb40 [ib_ipoib] [11384.486448] mcast_work_handler+0x330/0x6c0 [ib_core] [11384.486452] join_handler+0x101/0x220 [ib_core] [11384.486455] ib_sa_mcmember_rec_callback+0x54/0x80 [ib_core] [11384.486459] recv_handler+0x3a/0x60 [ib_core] [11384.486462] ib_mad_recv_done+0x423/0x9b0 [ib_core] [11384.486466] __ib_process_cq+0x5d/0xb0 [ib_core] [11384.486469] ib_cq_poll_work+0x20/0x60 [ib_core] [11384.486472] process_one_work+0x149/0x360 [11384.486474] worker_thread+0x4d/0x3c0 [11384.486487] kthread+0x109/0x140 [11384.486488] ? rescuer_thread+0x380/0x380 [11384.486489] ? kthread_park+0x60/0x60 [11384.486490] ? kthread_park+0x60/0x60 [11384.486493] ret_from_fork+0x25/0x30 [11384.486493] Code: Bad RIP value. [11384.486493] Code: Bad RIP value. [11384.486496] RIP: (null) RSP: ffffc90007fef970 [11384.486497] CR2: 0000000000000000 [11384.486531] ---[ end trace b1acec6fb4ff6e75 ]--- [11384.532133] Kernel panic - not syncing: Fatal exception [11384.536541] Kernel Offset: disabled [11384.969491] ---[ end Kernel panic - not syncing: Fatal exception [11384.976875] sched: Unexpected reschedule of offline CPU#1! [11384.983646] ------------[ cut here ]------------ Rdma device driver may not have implemented (*get_link_layer)() so it can not be called directly. Should use appropriate helper function. 
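The crash boils down to calling an optional method pointer that the driver never filled in. A stand-alone sketch of the guard pattern a helper can centralize (plain C with made-up names; this is not the rdma_port_get_link_layer() implementation):

    #include <stdio.h>

    struct port_ops {
        /* Optional: a driver may leave this NULL. */
        const char *(*get_link_layer)(int port);
    };

    /* Helper that never jumps through a NULL pointer; the fallback value
     * here is purely illustrative.
     */
    static const char *port_link_layer(const struct port_ops *ops, int port)
    {
        if (ops->get_link_layer)
            return ops->get_link_layer(port);
        return "unknown";
    }

    int main(void)
    {
        struct port_ops ops = { .get_link_layer = NULL };   /* unimplemented */

        /* Calling ops.get_link_layer(1) directly would fault on a NULL
         * pointer, much like the oops above; the helper stays safe.
         */
        printf("%s\n", port_link_layer(&ops, 1));
        return 0;
    }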
Reviewed-by: Yuval Shaia Fixes: 523633359224 ("IB/core: Fix the validations of a multicast LID in attach or detach operations") Cc: stable@kernel.org # 4.13 Reviewed-by: Dennis Dalessandro Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ee9e27dc799b..de57d6c11a25 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1646,7 +1646,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) */ if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { if (attr.qp_state >= IB_QPS_INIT) { - if (qp->device->get_link_layer(qp->device, attr.port_num) != + if (rdma_port_get_link_layer(qp->device, attr.port_num) != IB_LINK_LAYER_INFINIBAND) return true; goto lid_check; @@ -1655,7 +1655,7 @@ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) /* Can't get a quick answer, iterate over all ports */ for (port = 0; port < qp->device->phys_port_cnt; port++) - if (qp->device->get_link_layer(qp->device, port) != + if (rdma_port_get_link_layer(qp->device, port) != IB_LINK_LAYER_INFINIBAND) num_eth_ports++; From 786de92b3cb26012d3d0f00ee37adf14527f35c4 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Sep 2017 11:56:49 -0400 Subject: [PATCH 0560/1322] USB: uas: fix bug in handling of alternate settings The uas driver has a subtle bug in the way it handles alternate settings. The uas_find_uas_alt_setting() routine returns an altsetting value (the bAlternateSetting number in the descriptor), but uas_use_uas_driver() then treats that value as an index to the intf->altsetting array, which it isn't. Normally this doesn't cause any problems because the various alternate settings have bAlternateSetting values 0, 1, 2, ..., so the value is equal to the index in the array. But this is not guaranteed, and Andrey Konovalov used the syzkaller fuzzer with KASAN to get a slab-out-of-bounds error by violating this assumption. This patch fixes the bug by making uas_find_uas_alt_setting() return a pointer to the altsetting entry rather than either the value or the index. Pointers are less subject to misinterpretation. 
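The value-versus-index confusion is easy to demonstrate in a stand-alone sketch (plain C with hypothetical alternate-setting numbers, not the uas data structures): bAlternateSetting is a field reported by the device, so nothing forces it to equal the entry's position in the altsetting array.

    #include <stdio.h>

    struct alt_desc {
        int bAlternateSetting;    /* value reported by the device */
    };

    int main(void)
    {
        /* A hypothetical interface whose alternate settings are numbered
         * 0 and 2: the value 2 is stored at index 1.
         */
        struct alt_desc altsetting[] = { { 0 }, { 2 } };
        int value = altsetting[1].bAlternateSetting;

        /* Using the value as an array index would read the wrong slot or
         * run past the end of the array, which is the bug being fixed.
         */
        printf("value %d lives at index 1, not index %d\n", value, value);
        return 0;
    }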
Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Oliver Neukum CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/uas-detect.h | 15 ++++++++------- drivers/usb/storage/uas.c | 10 +++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/usb/storage/uas-detect.h b/drivers/usb/storage/uas-detect.h index f58caa9e6a27..a155cd02bce2 100644 --- a/drivers/usb/storage/uas-detect.h +++ b/drivers/usb/storage/uas-detect.h @@ -9,7 +9,8 @@ static int uas_is_interface(struct usb_host_interface *intf) intf->desc.bInterfaceProtocol == USB_PR_UAS); } -static int uas_find_uas_alt_setting(struct usb_interface *intf) +static struct usb_host_interface *uas_find_uas_alt_setting( + struct usb_interface *intf) { int i; @@ -17,10 +18,10 @@ static int uas_find_uas_alt_setting(struct usb_interface *intf) struct usb_host_interface *alt = &intf->altsetting[i]; if (uas_is_interface(alt)) - return alt->desc.bAlternateSetting; + return alt; } - return -ENODEV; + return NULL; } static int uas_find_endpoints(struct usb_host_interface *alt, @@ -58,14 +59,14 @@ static int uas_use_uas_driver(struct usb_interface *intf, struct usb_device *udev = interface_to_usbdev(intf); struct usb_hcd *hcd = bus_to_hcd(udev->bus); unsigned long flags = id->driver_info; - int r, alt; - + struct usb_host_interface *alt; + int r; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) + if (!alt) return 0; - r = uas_find_endpoints(&intf->altsetting[alt], eps); + r = uas_find_endpoints(alt, eps); if (r < 0) return 0; diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index cfb1e3bbd434..63cf981ed81c 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -873,14 +873,14 @@ MODULE_DEVICE_TABLE(usb, uas_usb_ids); static int uas_switch_interface(struct usb_device *udev, struct usb_interface *intf) { - int alt; + struct usb_host_interface *alt; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) - return alt; + if (!alt) + return -ENODEV; - return usb_set_interface(udev, - intf->altsetting[0].desc.bInterfaceNumber, alt); + return usb_set_interface(udev, alt->desc.bInterfaceNumber, + alt->desc.bAlternateSetting); } static int uas_configure_endpoints(struct uas_dev_info *devinfo) From 6e76c01e71551cb221c1f3deacb9dcd9a7346784 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:12:01 -0400 Subject: [PATCH 0561/1322] USB: gadgetfs: fix copy_to_user while holding spinlock The gadgetfs driver has a long-outstanding FIXME, regarding a call of copy_to_user() made while holding a spinlock. This patch fixes the issue by dropping the spinlock and using the dev->udc_usage mechanism introduced by another recent patch to guard against status changes while the lock isn't held. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov CC: Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 684900fcfe24..956b3dc7c3a4 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -983,11 +983,14 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); -// FIXME don't call this with the spinlock held ... 
+ ++dev->udc_usage; + spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; + spin_lock_irq(&dev->lock); + --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } From 520b72fc64debf8a86c3853b8e486aa5982188f0 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:23:58 -0400 Subject: [PATCH 0562/1322] USB: gadgetfs: Fix crash caused by inadequate synchronization The gadgetfs driver (drivers/usb/gadget/legacy/inode.c) was written before the UDC and composite frameworks were adopted; it is a legacy driver. As such, it expects that once bound to a UDC controller, it will not be unbound until it unregisters itself. However, the UDC framework does unbind function drivers while they are still registered. When this happens, it can cause the gadgetfs driver to misbehave or crash. For example, userspace can cause a crash by opening the device file and doing an ioctl call before setting up a configuration (found by Andrey Konovalov using the syzkaller fuzzer). This patch adds checks and synchronization to prevent these bad behaviors. It adds a udc_usage counter that the driver increments at times when it is using a gadget interface without holding the private spinlock. The unbind routine waits for this counter to go to 0 before returning, thereby ensuring that the UDC is no longer in use. The patch also adds a check in the dev_ioctl() routine to make sure the driver is bound to a UDC before dereferencing the gadget pointer, and it makes destroy_ep_files() synchronize with the endpoint I/O routines, to prevent the user from accessing an endpoint data structure after it has been removed. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 41 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 956b3dc7c3a4..5c28bee327e1 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -28,7 +28,7 @@ #include #include #include - +#include #include #include @@ -116,6 +116,7 @@ enum ep0_state { struct dev_data { spinlock_t lock; refcount_t count; + int udc_usage; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; @@ -513,9 +514,9 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } - spin_unlock(&epdata->dev->lock); usb_ep_free_request(ep, req); + spin_unlock(&epdata->dev->lock); put_ep(epdata); } @@ -939,9 +940,11 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { + ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); + --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; @@ -1134,6 +1137,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; + ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; @@ -1145,6 +1149,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) GFP_KERNEL); } 
spin_lock_irq(&dev->lock); + --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else @@ -1246,9 +1251,21 @@ static long dev_ioctl (struct file *fd, unsigned code, unsigned long value) struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; - if (gadget->ops->ioctl) + spin_lock_irq(&dev->lock); + if (dev->state == STATE_DEV_OPENED || + dev->state == STATE_DEV_UNBOUND) { + /* Not bound to a UDC */ + } else if (gadget->ops->ioctl) { + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); + ret = gadget->ops->ioctl (gadget, code, value); + spin_lock_irq(&dev->lock); + --dev->udc_usage; + } + spin_unlock_irq(&dev->lock); + return ret; } @@ -1466,10 +1483,12 @@ delegate: if (value < 0) break; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); + --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; @@ -1493,8 +1512,12 @@ delegate: req->length = value; req->zero = value < w_length; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); + spin_lock(&dev->lock); + --dev->udc_usage; + spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; @@ -1521,21 +1544,24 @@ static void destroy_ep_files (struct dev_data *dev) /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); + spin_unlock_irq (&dev->lock); + dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ + mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; + mutex_unlock(&ep->lock); + wake_up (&ep->wait); put_ep (ep); - spin_unlock_irq (&dev->lock); - /* break link to dcache */ inode_lock(parent); d_delete (dentry); @@ -1606,6 +1632,11 @@ gadgetfs_unbind (struct usb_gadget *gadget) spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; + while (dev->udc_usage > 0) { + spin_unlock_irq(&dev->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dev->lock); + } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); From 1fbbb78f25d1291274f320462bf6908906f538db Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:22:00 -0400 Subject: [PATCH 0563/1322] USB: g_mass_storage: Fix deadlock when driver is unbound As a holdover from the old g_file_storage gadget, the g_mass_storage legacy gadget driver attempts to unregister itself when its main operating thread terminates (if it hasn't been unregistered already). This is not strictly necessary; it was never more than an attempt to have the gadget fail cleanly if something went wrong and the main thread was killed. However, now that the UDC core manages gadget drivers independently of UDC drivers, this scheme doesn't work any more. A simple test: modprobe dummy-hcd modprobe g-mass-storage file=... rmmod dummy-hcd ends up in a deadlock with the following backtrace: sysrq: SysRq : Show Blocked State task PC stack pid father file-storage D 0 1130 2 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_preempt_disabled+0xd/0xf __mutex_lock.isra.1+0x129/0x224 ? 
_raw_spin_unlock_irqrestore+0x12/0x14 __mutex_lock_slowpath+0x12/0x14 mutex_lock+0x28/0x2b usb_gadget_unregister_driver+0x29/0x9b [udc_core] usb_composite_unregister+0x10/0x12 [libcomposite] msg_cleanup+0x1d/0x20 [g_mass_storage] msg_thread_exits+0xd/0xdd7 [g_mass_storage] fsg_main_thread+0x1395/0x13d6 [usb_f_mass_storage] ? __schedule+0x573/0x58c kthread+0xd9/0xdb ? do_set_interface+0x25c/0x25c [usb_f_mass_storage] ? init_completion+0x1e/0x1e ret_from_fork+0x19/0x24 rmmod D 0 1155 683 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_timeout+0x26/0xbc ? __schedule+0x573/0x58c do_wait_for_common+0xb3/0x128 ? usleep_range+0x81/0x81 ? wake_up_q+0x3f/0x3f wait_for_common+0x2e/0x45 wait_for_completion+0x17/0x19 fsg_common_put+0x34/0x81 [usb_f_mass_storage] fsg_free_inst+0x13/0x1e [usb_f_mass_storage] usb_put_function_instance+0x1a/0x25 [libcomposite] msg_unbind+0x2a/0x42 [g_mass_storage] __composite_unbind+0x4a/0x6f [libcomposite] composite_unbind+0x12/0x14 [libcomposite] usb_gadget_remove_driver+0x4f/0x77 [udc_core] usb_del_gadget_udc+0x52/0xcc [udc_core] dummy_udc_remove+0x27/0x2c [dummy_hcd] platform_drv_remove+0x1d/0x31 device_release_driver_internal+0xe9/0x16d device_release_driver+0x11/0x13 bus_remove_device+0xd2/0xe2 device_del+0x19f/0x221 ? selinux_capable+0x22/0x27 platform_device_del+0x21/0x63 platform_device_unregister+0x10/0x1a cleanup+0x20/0x817 [dummy_hcd] SyS_delete_module+0x10c/0x197 ? ____fput+0xd/0xf ? task_work_run+0x55/0x62 ? prepare_exit_to_usermode+0x65/0x75 do_fast_syscall_32+0x86/0xc3 entry_SYSENTER_32+0x4e/0x7c What happens is that removing the dummy-hcd driver causes the UDC core to unbind the gadget driver, which it does while holding the udc_lock mutex. The unbind routine in g_mass_storage tells the main thread to exit and waits for it to terminate. But as mentioned above, when the main thread exits it tries to unregister the mass-storage function driver. Via the composite framework this ends up calling usb_gadget_unregister_driver(), which tries to acquire the udc_lock mutex. The result is deadlock. The simplest way to fix the problem is not to be so clever: The main thread doesn't have to unregister the function driver. The side effects won't be so terrible; if the gadget is still attached to a USB host when the main thread is killed, it will appear to the host as though the gadget's firmware has crashed -- a reasonably accurate interpretation, and an all-too-common occurrence for USB mass-storage devices. In fact, the code to unregister the driver when the main thread exits is specific to g-mass-storage; it is not used when f-mass-storage is included as a function in a larger composite device. Therefore the entire mechanism responsible for this (the fsg_operations structure with its ->thread_exits method, the fsg_common_set_ops() routine, and the msg_thread_exits() callback routine) can all be eliminated. Even the msg_registered bitflag can be removed, because now the driver is unregistered in only one place rather than in two places. 
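The inversion can be reduced to a self-contained sketch (hypothetical demo_* names, not driver code): the stopper holds a mutex while waiting for a kthread to exit, and the kthread's exit path needs that same mutex, exactly as with udc_lock above:

static DEFINE_MUTEX(demo_udc_lock);
static struct task_struct *demo_thread;
/* setup elsewhere: demo_thread = kthread_run(demo_thread_fn, NULL, "demo"); */

static int demo_thread_fn(void *unused)
{
	while (!kthread_should_stop())
		msleep(100);
	mutex_lock(&demo_udc_lock);	/* exit path: blocks forever */
	mutex_unlock(&demo_udc_lock);
	return 0;
}

static void demo_unbind(void)
{
	mutex_lock(&demo_udc_lock);	/* like udc_lock in the UDC core */
	kthread_stop(demo_thread);	/* waits for demo_thread_fn(): deadlock */
	mutex_unlock(&demo_udc_lock);
}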
Signed-off-by: Alan Stern CC: Acked-by: Felipe Balbi Acked-by: Michal Nazarewicz Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_mass_storage.c | 27 +++++--------------- drivers/usb/gadget/function/f_mass_storage.h | 14 ---------- drivers/usb/gadget/legacy/mass_storage.c | 26 +++---------------- 3 files changed, 10 insertions(+), 57 deletions(-) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index d6bd0244b008..5153e29870c3 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -307,8 +307,6 @@ struct fsg_common { struct completion thread_notifier; struct task_struct *thread_task; - /* Callback functions. */ - const struct fsg_operations *ops; /* Gadget's private data. */ void *private_data; @@ -2438,6 +2436,7 @@ static void handle_exception(struct fsg_common *common) static int fsg_main_thread(void *common_) { struct fsg_common *common = common_; + int i; /* * Allow the thread to be killed by a signal, but set the signal mask @@ -2476,21 +2475,16 @@ static int fsg_main_thread(void *common_) common->thread_task = NULL; spin_unlock_irq(&common->lock); - if (!common->ops || !common->ops->thread_exits - || common->ops->thread_exits(common) < 0) { - int i; + /* Eject media from all LUNs */ - down_write(&common->filesem); - for (i = 0; i < ARRAY_SIZE(common->luns); i++) { - struct fsg_lun *curlun = common->luns[i]; - if (!curlun || !fsg_lun_is_open(curlun)) - continue; + down_write(&common->filesem); + for (i = 0; i < ARRAY_SIZE(common->luns); i++) { + struct fsg_lun *curlun = common->luns[i]; + if (curlun && fsg_lun_is_open(curlun)) fsg_lun_close(curlun); - curlun->unit_attention_data = SS_MEDIUM_NOT_PRESENT; - } - up_write(&common->filesem); } + up_write(&common->filesem); /* Let fsg_unbind() know the thread has exited */ complete_and_exit(&common->thread_notifier, 0); @@ -2681,13 +2675,6 @@ void fsg_common_remove_luns(struct fsg_common *common) } EXPORT_SYMBOL_GPL(fsg_common_remove_luns); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops) -{ - common->ops = ops; -} -EXPORT_SYMBOL_GPL(fsg_common_set_ops); - void fsg_common_free_buffers(struct fsg_common *common) { _fsg_common_free_buffers(common->buffhds, common->fsg_num_buffers); diff --git a/drivers/usb/gadget/function/f_mass_storage.h b/drivers/usb/gadget/function/f_mass_storage.h index d3902313b8ac..dc05ca0c4359 100644 --- a/drivers/usb/gadget/function/f_mass_storage.h +++ b/drivers/usb/gadget/function/f_mass_storage.h @@ -60,17 +60,6 @@ struct fsg_module_parameters { struct fsg_common; /* FSF callback functions */ -struct fsg_operations { - /* - * Callback function to call when thread exits. If no - * callback is set or it returns value lower then zero MSF - * will force eject all LUNs it operates on (including those - * marked as non-removable or with prevent_medium_removal flag - * set). 
- */ - int (*thread_exits)(struct fsg_common *common); -}; - struct fsg_lun_opts { struct config_group group; struct fsg_lun *lun; @@ -142,9 +131,6 @@ void fsg_common_remove_lun(struct fsg_lun *lun); void fsg_common_remove_luns(struct fsg_common *common); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops); - int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg, unsigned int id, const char *name, const char **name_pfx); diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index e99ab57ee3e5..fcba59782f26 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -107,15 +107,6 @@ static unsigned int fsg_num_buffers = CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS; FSG_MODULE_PARAMETERS(/* no prefix */, mod_data); -static unsigned long msg_registered; -static void msg_cleanup(void); - -static int msg_thread_exits(struct fsg_common *common) -{ - msg_cleanup(); - return 0; -} - static int msg_do_config(struct usb_configuration *c) { struct fsg_opts *opts; @@ -154,9 +145,6 @@ static struct usb_configuration msg_config_driver = { static int msg_bind(struct usb_composite_dev *cdev) { - static const struct fsg_operations ops = { - .thread_exits = msg_thread_exits, - }; struct fsg_opts *opts; struct fsg_config config; int status; @@ -173,8 +161,6 @@ static int msg_bind(struct usb_composite_dev *cdev) if (status) goto fail; - fsg_common_set_ops(opts->common, &ops); - status = fsg_common_set_cdev(opts->common, cdev, config.can_stall); if (status) goto fail_set_cdev; @@ -256,18 +242,12 @@ MODULE_LICENSE("GPL"); static int __init msg_init(void) { - int ret; - - ret = usb_composite_probe(&msg_driver); - set_bit(0, &msg_registered); - - return ret; + return usb_composite_probe(&msg_driver); } module_init(msg_init); -static void msg_cleanup(void) +static void __exit msg_cleanup(void) { - if (test_and_clear_bit(0, &msg_registered)) - usb_composite_unregister(&msg_driver); + usb_composite_unregister(&msg_driver); } module_exit(msg_cleanup); From 3d318605f5e32ff44fb290d9b67573b34213c4c8 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 13 Sep 2017 09:52:32 -0700 Subject: [PATCH 0564/1322] iw_cxgb4: put ep reference in pass_accept_req() The listening endpoint should always be dereferenced at the end of pass_accept_req(). Fixes: f86fac79afec ("RDMA/iw_cxgb4: atomic find and reference for listening endpoints") Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index ceaa2fa54d32..83322dbc4711 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2594,9 +2594,9 @@ fail: c4iw_put_ep(&child_ep->com); reject: reject_cr(dev, hwtid, skb); +out: if (parent_ep) c4iw_put_ep(&parent_ep->com); -out: return 0; } From 3c8415cc7aff467faba25841fb859660ac14a04e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Sep 2017 11:52:33 -0700 Subject: [PATCH 0565/1322] iw_cxgb4: drop listen destroy replies if no ep found If the thread waiting for a CLOSE_LISTSRV_RPL times out and bails, then we need to handle a subsequent CPL if it arrives and the stid has been released. In this case silently drop it. 
Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 83322dbc4711..19297e408820 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2333,9 +2333,14 @@ static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int stid = GET_TID(rpl); struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); + if (!ep) { + pr_debug("%s stid %d lookup failure!\n", __func__, stid); + goto out; + } pr_debug("%s ep %p\n", __func__, ep); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); c4iw_put_ep(&ep->com); +out: return 0; } From 8b1bbf36b7452c4acb20e91948eaa5e225ea6978 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Sep 2017 11:52:34 -0700 Subject: [PATCH 0566/1322] iw_cxgb4: remove the stid on listen create failure If a listen create fails, then the server tid (stid) is incorrectly left in the stid idr table, which can cause a touch-after-free if the stid is looked up and the already freed endpoint is touched. So make sure and remove it in the error path. Cc: stable@vger.kernel.org Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 19297e408820..daf7a56e5d7e 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3462,7 +3462,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = ep; goto out; } - + remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid); cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: From 05f5c38576475439f3124a3e6d62db68de83f7be Mon Sep 17 00:00:00 2001 From: KT Liao Date: Fri, 22 Sep 2017 10:00:57 -0700 Subject: [PATCH 0567/1322] Input: elan_i2c - extend Flash-Write delay The original 20ms delay is only marginally enough delay after a block write operation during firmware update. Let's increase the delay to ensure that the controller finishes up storing the page to avoid failures in the firmware updates. Signed-off-by: KT Liao Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/elan_i2c_i2c.c b/drivers/input/mouse/elan_i2c_i2c.c index 15b1330606c1..e19eb60b3d2f 100644 --- a/drivers/input/mouse/elan_i2c_i2c.c +++ b/drivers/input/mouse/elan_i2c_i2c.c @@ -598,7 +598,7 @@ static int elan_i2c_write_fw_block(struct i2c_client *client, } /* Wait for F/W to update one page ROM data. */ - msleep(20); + msleep(35); error = elan_i2c_read_cmd(client, ETP_I2C_IAP_CTRL_CMD, val); if (error) { From af3c79beff48fb2bd1c79d02688004cf57215acf Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 7 Sep 2017 13:38:18 +0300 Subject: [PATCH 0568/1322] IB/ipoib: Suppress the retry related completion errors IPoIB doesn't support transport/rnr retry schemes as per RFC so those errors are expected. No need to flood the log files with them. 
Tested-by: Michael Nowak Tested-by: Rafael Alejandro Peralez Tested-by: Liwen Huang Tested-by: Hong Liu Reviewed-by: Mukesh Kacker Reported-by: Rajiv Raja Signed-off-by: Santosh Shilimkar Signed-off-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 14b62f7472b4..7774654c2ccb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -823,12 +823,18 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR) { struct ipoib_neigh *neigh; - if (wc->status != IB_WC_RNR_RETRY_EXC_ERR) - ipoib_warn(priv, "failed cm send event (status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + /* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle, + * so don't make waves. + */ + if (wc->status == IB_WC_RNR_RETRY_EXC_ERR || + wc->status == IB_WC_RETRY_EXC_ERR) + ipoib_dbg(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err 0x%x)\n", + __func__, wc->status, wr_id, wc->vendor_err); else - ipoib_dbg(priv, "failed cm send event (status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + ipoib_warn(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err 0x%x)\n", + __func__, wc->status, wr_id, wc->vendor_err); spin_lock_irqsave(&priv->lock, flags); neigh = tx->neigh; From 06564f60859bdf7e73d70ae35d7e285e96ae9c46 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 11 Sep 2017 17:03:13 +0100 Subject: [PATCH 0569/1322] IB/ocrdma: fix incorrect fall-through on switch statement In the case where mbox_status is OCRDMA_MBX_STATUS_FAILED and add_status is OCRDMA_MBX_STATUS_FAILED err_num is assigned -EAGAIN however the case OCRDMA_MBX_STATUS_FAILED is missing a break and falls through to the default case which then re-assigns err_num to -EFAULT. Fix this so that err_num is assigned to -EAGAIN for the add_status OCRDMA_MBX_STATUS_FAILED case and -EFAULT otherwise. Detected by CoverityScan CID#703125 ("Missing break in switch") Fixes: fe2caefcdf58 ("RDMA/ocrdma: Add driver for Emulex OneConnect IBoE RDMA adapter") Signed-off-by: Colin Ian King Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index dcb5942f9fb5..65b166cc7437 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -252,7 +252,10 @@ static int ocrdma_get_mbx_errno(u32 status) case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES: err_num = -EAGAIN; break; + default: + err_num = -EFAULT; } + break; default: err_num = -EFAULT; } From cbafad87e1507044c7d442087d41d5e3d432cc4e Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 18 Sep 2017 12:28:48 +0100 Subject: [PATCH 0570/1322] IB/mlx5: fix debugfs cleanup If delay_drop_debugfs_init() fails in any of the operations to create debugfs, it is calling delay_drop_debugfs_cleanup() as part of its cleanup. But delay_drop_debugfs_cleanup() checks for 'dbg' and since we have not yet pointed 'dbg' to the debugfs we need to cleanup, the cleanup fails and we are left with stray debugfs elements and also a memory leak. 
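The fix follows the usual "publish the object before the first failure point" pattern, roughly as in this generic sketch (hypothetical foo_* names, not the mlx5 code):

struct foo_dbg { struct dentry *dir; };
struct foo_dev { struct foo_dbg *dbg; };

static void foo_debugfs_cleanup(struct foo_dev *dev)
{
	if (!dev->dbg)
		return;			/* nothing published yet */
	debugfs_remove_recursive(dev->dbg->dir);
	kfree(dev->dbg);
	dev->dbg = NULL;
}

static int foo_debugfs_init(struct foo_dev *dev)
{
	struct foo_dbg *dbg = kzalloc(sizeof(*dbg), GFP_KERNEL);

	if (!dbg)
		return -ENOMEM;
	dev->dbg = dbg;			/* set the pointer first ...     */

	dbg->dir = debugfs_create_dir("foo", NULL);
	if (!dbg->dir)
		goto out_cleanup;	/* ... so cleanup can find it    */
	return 0;

out_cleanup:
	foo_debugfs_cleanup(dev);	/* sees dev->dbg, frees everything */
	return -ENOMEM;
}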
Fixes: 4a5fd5d2965c ("IB/mlx5: Add necessary delay drop assignment") Signed-off-by: Sudip Mukherjee Acked-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ab3c562d5ba7..05fb4bdff6a0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3837,11 +3837,13 @@ static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) if (!dbg) return -ENOMEM; + dev->delay_drop.dbg = dbg; + dbg->dir_debugfs = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root); if (!dbg->dir_debugfs) - return -ENOMEM; + goto out_debugfs; dbg->events_cnt_debugfs = debugfs_create_atomic_t("num_timeout_events", 0400, @@ -3865,8 +3867,6 @@ static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev) if (!dbg->timeout_debugfs) goto out_debugfs; - dev->delay_drop.dbg = dbg; - return 0; out_debugfs: From e13547bc181ae6c279adf0df054717787f24ee89 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 19 Sep 2017 13:22:13 +0300 Subject: [PATCH 0571/1322] IB/bnxt_re: Fix frame stack compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce stack size by dynamically allocating memory instead of declaring large struct on the stack: drivers/infiniband/hw/bnxt_re/ib_verbs.c: In function ‘bnxt_re_query_qp’: drivers/infiniband/hw/bnxt_re/ib_verbs.c:1600:1: warning: the frame size of 1216 bytes is larger than 1024 bytes [-Wframe-larger-than=] } ^ Cc: Selvin Xavier Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Signed-off-by: Leon Romanovsky Acked-by: Selvin Xavier Reviewed-by: Jonathan Toppins Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 67 +++++++++++++----------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 01eee15bbd65..03caf48af8e2 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1551,43 +1551,46 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_re_dev *rdev = qp->rdev; - struct bnxt_qplib_qp qplib_qp; + struct bnxt_qplib_qp *qplib_qp; int rc; - memset(&qplib_qp, 0, sizeof(struct bnxt_qplib_qp)); - qplib_qp.id = qp->qplib_qp.id; - qplib_qp.ah.host_sgid_index = qp->qplib_qp.ah.host_sgid_index; + qplib_qp = kzalloc(sizeof(*qplib_qp), GFP_KERNEL); + if (!qplib_qp) + return -ENOMEM; - rc = bnxt_qplib_query_qp(&rdev->qplib_res, &qplib_qp); + qplib_qp->id = qp->qplib_qp.id; + qplib_qp->ah.host_sgid_index = qp->qplib_qp.ah.host_sgid_index; + + rc = bnxt_qplib_query_qp(&rdev->qplib_res, qplib_qp); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to query HW QP"); - return rc; + goto out; } - qp_attr->qp_state = __to_ib_qp_state(qplib_qp.state); - qp_attr->en_sqd_async_notify = qplib_qp.en_sqd_async_notify ? 1 : 0; - qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp.access); - qp_attr->pkey_index = qplib_qp.pkey_index; - qp_attr->qkey = qplib_qp.qkey; + qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state); + qp_attr->en_sqd_async_notify = qplib_qp->en_sqd_async_notify ? 
1 : 0; + qp_attr->qp_access_flags = __to_ib_access_flags(qplib_qp->access); + qp_attr->pkey_index = qplib_qp->pkey_index; + qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; - rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp.ah.flow_label, - qplib_qp.ah.host_sgid_index, - qplib_qp.ah.hop_limit, - qplib_qp.ah.traffic_class); - rdma_ah_set_dgid_raw(&qp_attr->ah_attr, qplib_qp.ah.dgid.data); - rdma_ah_set_sl(&qp_attr->ah_attr, qplib_qp.ah.sl); - ether_addr_copy(qp_attr->ah_attr.roce.dmac, qplib_qp.ah.dmac); - qp_attr->path_mtu = __to_ib_mtu(qplib_qp.path_mtu); - qp_attr->timeout = qplib_qp.timeout; - qp_attr->retry_cnt = qplib_qp.retry_cnt; - qp_attr->rnr_retry = qplib_qp.rnr_retry; - qp_attr->min_rnr_timer = qplib_qp.min_rnr_timer; - qp_attr->rq_psn = qplib_qp.rq.psn; - qp_attr->max_rd_atomic = qplib_qp.max_rd_atomic; - qp_attr->sq_psn = qplib_qp.sq.psn; - qp_attr->max_dest_rd_atomic = qplib_qp.max_dest_rd_atomic; - qp_init_attr->sq_sig_type = qplib_qp.sig_type ? IB_SIGNAL_ALL_WR : - IB_SIGNAL_REQ_WR; - qp_attr->dest_qp_num = qplib_qp.dest_qpn; + rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->ah.flow_label, + qplib_qp->ah.host_sgid_index, + qplib_qp->ah.hop_limit, + qplib_qp->ah.traffic_class); + rdma_ah_set_dgid_raw(&qp_attr->ah_attr, qplib_qp->ah.dgid.data); + rdma_ah_set_sl(&qp_attr->ah_attr, qplib_qp->ah.sl); + ether_addr_copy(qp_attr->ah_attr.roce.dmac, qplib_qp->ah.dmac); + qp_attr->path_mtu = __to_ib_mtu(qplib_qp->path_mtu); + qp_attr->timeout = qplib_qp->timeout; + qp_attr->retry_cnt = qplib_qp->retry_cnt; + qp_attr->rnr_retry = qplib_qp->rnr_retry; + qp_attr->min_rnr_timer = qplib_qp->min_rnr_timer; + qp_attr->rq_psn = qplib_qp->rq.psn; + qp_attr->max_rd_atomic = qplib_qp->max_rd_atomic; + qp_attr->sq_psn = qplib_qp->sq.psn; + qp_attr->max_dest_rd_atomic = qplib_qp->max_dest_rd_atomic; + qp_init_attr->sq_sig_type = qplib_qp->sig_type ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + qp_attr->dest_qp_num = qplib_qp->dest_qpn; qp_attr->cap.max_send_wr = qp->qplib_qp.sq.max_wqe; qp_attr->cap.max_send_sge = qp->qplib_qp.sq.max_sge; @@ -1596,7 +1599,9 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->cap.max_inline_data = qp->qplib_qp.max_inline_data; qp_init_attr->cap = qp_attr->cap; - return 0; +out: + kfree(qplib_qp); + return rc; } /* Routine for sending QP1 packets for RoCE V1 an V2 From 01df7f5a77b926c2d790cf66d942d2a68d4ad5de Mon Sep 17 00:00:00 2001 From: Adit Ranadive Date: Thu, 21 Sep 2017 15:56:21 -0700 Subject: [PATCH 0572/1322] RDMA/vmw_pvrdma: Fix reporting correct opcodes for completion Since the IB_WC_BIND_MW opcode has been dropped, set the correct IB WC opcode explicitly. 
Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Reviewed-by: Aditya Sarwade Reviewed-by: Jorgen Hansen Signed-off-by: Adit Ranadive Signed-off-by: Bryan Tan Signed-off-by: Doug Ledford --- drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 29 +++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 663a0c301c43..984aa3484928 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -416,9 +416,34 @@ static inline enum ib_wc_status pvrdma_wc_status_to_ib( return (enum ib_wc_status)status; } -static inline int pvrdma_wc_opcode_to_ib(int opcode) +static inline int pvrdma_wc_opcode_to_ib(unsigned int opcode) { - return opcode; + switch (opcode) { + case PVRDMA_WC_SEND: + return IB_WC_SEND; + case PVRDMA_WC_RDMA_WRITE: + return IB_WC_RDMA_WRITE; + case PVRDMA_WC_RDMA_READ: + return IB_WC_RDMA_READ; + case PVRDMA_WC_COMP_SWAP: + return IB_WC_COMP_SWAP; + case PVRDMA_WC_FETCH_ADD: + return IB_WC_FETCH_ADD; + case PVRDMA_WC_LOCAL_INV: + return IB_WC_LOCAL_INV; + case PVRDMA_WC_FAST_REG_MR: + return IB_WC_REG_MR; + case PVRDMA_WC_MASKED_COMP_SWAP: + return IB_WC_MASKED_COMP_SWAP; + case PVRDMA_WC_MASKED_FETCH_ADD: + return IB_WC_MASKED_FETCH_ADD; + case PVRDMA_WC_RECV: + return IB_WC_RECV; + case PVRDMA_WC_RECV_RDMA_WITH_IMM: + return IB_WC_RECV_RDMA_WITH_IMM; + default: + return IB_WC_SEND; + } } static inline int pvrdma_wc_flags_to_ib(int flags) From cd9100ca9e78eb03bf5a0ee2ba98ebdc8cc5880e Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:09 -0500 Subject: [PATCH 0573/1322] i40iw: Fail open if there are no available MSI-X vectors Check number of available MSI-X vectors for i40iw. If there are no available vectors, fail the open. Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index cc742c3132c6..5965b59a2f83 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1400,6 +1400,11 @@ static enum i40iw_status_code i40iw_save_msix_info(struct i40iw_device *iwdev, u32 i; u32 size; + if (!ldev->msix_count) { + i40iw_pr_err("No MSI-X vectors\n"); + return I40IW_ERR_CONFIG; + } + iwdev->msix_count = ldev->msix_count; size = sizeof(struct i40iw_msix_vector) * iwdev->msix_count; @@ -1550,7 +1555,7 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, status = i40iw_save_msix_info(iwdev, ldev); if (status) - goto exit; + return status; iwdev->hw.dev_context = (void *)ldev->pcidev; iwdev->hw.hw_addr = ldev->hw_addr; status = i40iw_allocate_dma_mem(&iwdev->hw, From 47fb3c1610b28dd435b892762280eebf04fe9adf Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:10 -0500 Subject: [PATCH 0574/1322] i40iw: Prevent multiple netdev event notifier registrations Netdev event notifier registration/de-registration is not synchronized with a lock and there is a possibility of a duplicate registration of notifier before the unregister completes. Register netdev event notifiers during module init and de-register them at module exit. This avoids the need to tie the registration to first netdev client interface open and de-registration to last client interface close and the synchronization to achieve it. 
This also fixes a crash due to duplicate registration. BUG: unable to handle kernel paging request at ffffffffa0d60388 IP: [] notifier_call_chain+0x3d/0x70 PGD 190d067 PUD 190e063 PMD 76c840067 PTE 0 Oops: 0000 [#1] SMP Modules linked in: i40e(OF-) fuse btrfs zlib_deflate raid6_pq xor vfat msdos [..] e1000e vxlan ip_tunnel ptp pps_core i2c_core video [last unloaded: i40iw] CPU: 1 PID: 27101 Comm: modprobe Tainted: GF W O-------------- 3.10.0-229.el7.x86_64 #1 Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./Q87M-D2H, BIOS F7 01/17/2014 task: ffff88076e8a96c0 ti: ffff8806959c8000 task.ti: ffff8806959c8000 RIP: 0010:[] [] notifier_call_chain+0x3d/0x70 RSP: 0018:ffff8806959cbb38 EFLAGS: 00010282 RAX: ffffffffa0d60380 RBX: 00000000fffffffd RCX: 0000000000000000 0708] RDX: 0000000000000000 RSI: ffff88081227a000 RDI: 0000000000000002 RBP: ffff8806959cbb60 R08: 0000000000000246 R09: 000000000000700c R10: ffff88080e16ea40 R11: 00000000000ae8df R12: ffffffffa0d60380 R13: 0000000000000002 R14: ffff88076e738800 R15: 0000000000000000 FS: 00007f604ef4a740(0000) GS:ffff88083e240000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa0d60388 CR3: 0000000753cd2000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffffffff819e73a0 0000000000000000 0000000000000002 ffff88076e738800 00000000ffffffff ffff8806959cbba0 ffffffff8109d61d 0000000000000000 0000000000000000 ffff88076e738800 0000000000000000 ffff88076e738800 Call Trace: [] __blocking_notifier_call_chain+0x4d/0x70 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0x154/0x2b0 [] inetdev_event+0x182/0x530 [] notifier_call_chain+0x4c/0x70 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x2d/0x60 [] rollback_registered_many+0x105/0x220 [] rollback_registered+0x40/0x70 [] unregister_netdevice_queue+0x48/0x80 [] unregister_netdev+0x1c/0x30 [] i40e_vsi_release+0x2a9/0x2b0 [i40e] [] i40e_remove+0x128/0x2b0 [i40e] [] pci_device_remove+0x3b/0xb0 [] __device_release_driver+0x7f/0xf0 [] driver_detach+0xb8/0xc0 [] bus_remove_driver+0x9b/0x120 [] driver_unregister+0x2c/0x50 [] pci_unregister_driver+0x2c/0x90 [] i40e_exit_module+0x10/0x23 [i40e] [] SyS_delete_module+0x16b/0x2d0 [] ? 
do_notify_resume+0x9c/0xb0 [] system_call_fastpath+0x16/0x1b Code: e5 41 57 4d 89 c7 41 56 49 89 d6 41 55 49 89 f5 41 54 53 89 cb 75 14 eb 3d 0f 1f 44 00 00 83 eb 01 74 25 4d 85 e4 74 20 4c 89 e0 <4c> 8b 60 08 4c 89 f2 4c 89 ee 48 89 c7 ff 10 4d 85 ff 74 04 41 RIP [] notifier_call_chain+0x3d/0x70 Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw.h | 1 - drivers/infiniband/hw/i40iw/i40iw_main.c | 32 ++++++++++++----------- drivers/infiniband/hw/i40iw/i40iw_utils.c | 6 ++--- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 9b1566468744..a65e4cbdce2f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -201,7 +201,6 @@ enum init_completion_state { CEQ_CREATED, ILQ_CREATED, IEQ_CREATED, - INET_NOTIFIER, IP_ADDR_REGISTERED, RDMA_DEV_REGISTERED }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 5965b59a2f83..27590ae21881 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -99,8 +99,6 @@ static struct notifier_block i40iw_net_notifier = { .notifier_call = i40iw_net_event }; -static atomic_t i40iw_notifiers_registered; - /** * i40iw_find_i40e_handler - find a handler given a client info * @ldev: pointer to a client info @@ -1376,11 +1374,20 @@ error: */ static void i40iw_register_notifiers(void) { - if (atomic_inc_return(&i40iw_notifiers_registered) == 1) { - register_inetaddr_notifier(&i40iw_inetaddr_notifier); - register_inet6addr_notifier(&i40iw_inetaddr6_notifier); - register_netevent_notifier(&i40iw_net_notifier); - } + register_inetaddr_notifier(&i40iw_inetaddr_notifier); + register_inet6addr_notifier(&i40iw_inetaddr6_notifier); + register_netevent_notifier(&i40iw_net_notifier); +} + +/** + * i40iw_unregister_notifiers - unregister tcp ip notifiers + */ + +static void i40iw_unregister_notifiers(void) +{ + unregister_netevent_notifier(&i40iw_net_notifier); + unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); + unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); } /** @@ -1467,12 +1474,6 @@ static void i40iw_deinit_device(struct i40iw_device *iwdev) if (!iwdev->reset) i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx); /* fallthrough */ - case INET_NOTIFIER: - if (!atomic_dec_return(&i40iw_notifiers_registered)) { - unregister_netevent_notifier(&i40iw_net_notifier); - unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); - unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); - } /* fallthrough */ case PBLE_CHUNK_MEM: i40iw_destroy_pble_pool(dev, iwdev->pble_rsrc); @@ -1672,8 +1673,6 @@ static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client) break; iwdev->init_state = PBLE_CHUNK_MEM; iwdev->virtchnl_wq = alloc_ordered_workqueue("iwvch", WQ_MEM_RECLAIM); - i40iw_register_notifiers(); - iwdev->init_state = INET_NOTIFIER; status = i40iw_add_mac_ip(iwdev); if (status) break; @@ -2023,6 +2022,8 @@ static int __init i40iw_init_module(void) i40iw_client.type = I40E_CLIENT_IWARP; spin_lock_init(&i40iw_handler_lock); ret = i40e_register_client(&i40iw_client); + i40iw_register_notifiers(); + return ret; } @@ -2034,6 +2035,7 @@ static int __init i40iw_init_module(void) */ static void __exit i40iw_exit_module(void) { + i40iw_unregister_notifiers(); i40e_unregister_client(&i40iw_client); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c 
b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 62f1f45b8737..e52dbbb4165e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -160,7 +160,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, return NOTIFY_DONE; iwdev = &hdl->device; - if (iwdev->init_state < INET_NOTIFIER) + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) return NOTIFY_DONE; netdev = iwdev->ldev->netdev; @@ -217,7 +217,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, return NOTIFY_DONE; iwdev = &hdl->device; - if (iwdev->init_state < INET_NOTIFIER) + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) return NOTIFY_DONE; netdev = iwdev->ldev->netdev; @@ -266,7 +266,7 @@ int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void * if (!iwhdl) return NOTIFY_DONE; iwdev = &iwhdl->device; - if (iwdev->init_state < INET_NOTIFIER) + if (iwdev->init_state < IP_ADDR_REGISTERED || iwdev->closing) return NOTIFY_DONE; p = (__be32 *)neigh->primary_key; i40iw_copy_ip_ntohl(local_ipaddr, p); From 471b370d52a4a461bf855ff542b544f3e4a5cb3a Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:11 -0500 Subject: [PATCH 0575/1322] i40iw: Call i40iw_cm_disconn on modify QP to disconnect If QP modify to closing/terminate/error fails, connection is not torn down as there is no corresponding asynchronous event that will initiate the teardown. Add explicit call to i40iw_cm_disconn if not waiting in modify QP, otherwise schedule it in CM timer. Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1aa411034a27..28b3d02d511b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1027,7 +1027,19 @@ int i40iw_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, iwqp->hw_tcp_state = I40IW_TCP_STATE_CLOSED; iwqp->last_aeq = I40IW_AE_RESET_SENT; spin_unlock_irqrestore(&iwqp->lock, flags); + i40iw_cm_disconn(iwqp); } + } else { + spin_lock_irqsave(&iwqp->lock, flags); + if (iwqp->cm_id) { + if (atomic_inc_return(&iwqp->close_timer_started) == 1) { + iwqp->cm_id->add_ref(iwqp->cm_id); + i40iw_schedule_cm_timer(iwqp->cm_node, + (struct i40iw_puda_buf *)iwqp, + I40IW_TIMER_TYPE_CLOSE, 1, 0); + } + } + spin_unlock_irqrestore(&iwqp->lock, flags); } } return 0; From dfc612b3407e88913a58db00b3bca93685d4f4f9 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Tue, 19 Sep 2017 09:19:12 -0500 Subject: [PATCH 0576/1322] i40iw: Add missing VLAN priority Set the VLAN priority which is in the upper 3 bits of the VLAN tag field in the QP context. 
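For reference, the 802.1Q tag control information is 16 bits with the priority (PCP) in bits 15:13 above the DEI bit and the 12-bit VLAN ID, so building the tag amounts to the following sketch (build_vlan_tci() is a hypothetical helper; I40IW_VLAN_PRIO_SHIFT is the constant added below):

/* | PCP (3 bits) | DEI (1 bit) | VLAN ID (12 bits) | */
static u16 build_vlan_tci(u16 vlan_id, u8 user_pri)
{
	return ((u16)user_pri << I40IW_VLAN_PRIO_SHIFT) |
	       (vlan_id & VLAN_VID_MASK);
}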
Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 3 ++- drivers/infiniband/hw/i40iw/i40iw_cm.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 14f36ba4e5be..b7215448bb63 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -3255,7 +3255,8 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, tcp_info->snd_mss = cpu_to_le32(((u32)cm_node->tcp_cntxt.mss)); if (cm_node->vlan_id < VLAN_TAG_PRESENT) { tcp_info->insert_vlan_tag = true; - tcp_info->vlan_tag = cpu_to_le16(cm_node->vlan_id); + tcp_info->vlan_tag = cpu_to_le16(((u16)cm_node->user_pri << I40IW_VLAN_PRIO_SHIFT) | + cm_node->vlan_id); } if (cm_node->ipv4) { tcp_info->src_port = cpu_to_le16(cm_node->loc_port); diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 2e52e38ffcf3..8626e7f1fdd3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -71,6 +71,8 @@ #define I40IW_HW_IRD_SETTING_32 32 #define I40IW_HW_IRD_SETTING_64 64 +#define I40IW_VLAN_PRIO_SHIFT 13 + enum ietf_mpa_flags { IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */ IETF_MPA_FLAGS_CRC = 0x40, /* receive Markers */ From f16dc0aa5ea20a2cf173e82ade5f05bfecaa850a Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 19 Sep 2017 09:19:13 -0500 Subject: [PATCH 0577/1322] i40iw: Add support for port reuse on active side connections During OpenMPI scale up testing, we observe rdma_connect failures if ports are reused on multiple connections. This is because the Control Queue-Pair (CQP) command to add the reused port to Accelerated Port Bit VectorTable (APBVT) fails as there already exists an entry. Check for duplicate port before invoking the CQP command to add APBVT entry and delete the entry only if the port is not in use. 
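In short, a per-cm_core bitmap of active-side ports decides when the shared APBVT entry is really added or removed; conceptually (a sketch with a hypothetical wrapper name, and the real code additionally serializes this under cm_core->ht_lock):

static int demo_apbvt_add_once(struct i40iw_device *iwdev,
			       struct i40iw_cm_core *cm_core, u16 port)
{
	/* Only the first active-side connection on a port adds the entry. */
	if (test_and_set_bit(port, cm_core->active_side_ports))
		return 0;
	return i40iw_manage_apbvt(iwdev, port, I40IW_MANAGE_APBVT_ADD);
}

static void demo_apbvt_del_if_last(struct i40iw_device *iwdev,
				   struct i40iw_cm_core *cm_core, u16 port)
{
	/* i40iw_port_in_use(..., true) clears the port bit when unused. */
	if (!i40iw_port_in_use(cm_core, port, true))
		i40iw_manage_apbvt(iwdev, port, I40IW_MANAGE_APBVT_DEL);
}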
Signed-off-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 151 ++++++++++++------------- drivers/infiniband/hw/i40iw/i40iw_cm.h | 3 + 2 files changed, 78 insertions(+), 76 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index b7215448bb63..5230dd3c938c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1504,23 +1504,40 @@ static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, } /** - * listen_port_in_use - determine if port is in use - * @port: Listen port number + * i40iw_port_in_use - determine if port is in use + * @port: port number + * @active_side: flag for listener side vs active side */ -static bool i40iw_listen_port_in_use(struct i40iw_cm_core *cm_core, u16 port) +static bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port, bool active_side) { struct i40iw_cm_listener *listen_node; + struct i40iw_cm_node *cm_node; unsigned long flags; bool ret = false; - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { - if (listen_node->loc_port == port) { - ret = true; - break; + if (active_side) { + /* search connected node list */ + spin_lock_irqsave(&cm_core->ht_lock, flags); + list_for_each_entry(cm_node, &cm_core->connected_nodes, list) { + if (cm_node->loc_port == port) { + ret = true; + break; + } } + if (!ret) + clear_bit(port, cm_core->active_side_ports); + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + } else { + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + if (listen_node->loc_port == port) { + ret = true; + break; + } + } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); } - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return ret; } @@ -1868,7 +1885,7 @@ static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); if (listener->iwdev) { - if (apbvt_del && !i40iw_listen_port_in_use(cm_core, listener->loc_port)) + if (apbvt_del && !i40iw_port_in_use(cm_core, listener->loc_port, false)) i40iw_manage_apbvt(listener->iwdev, listener->loc_port, I40IW_MANAGE_APBVT_DEL); @@ -2247,21 +2264,21 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) if (cm_node->listener) { i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); } else { - if (!i40iw_listen_port_in_use(cm_core, cm_node->loc_port) && - cm_node->apbvt_set) { + if (!i40iw_port_in_use(cm_core, cm_node->loc_port, true) && cm_node->apbvt_set) { i40iw_manage_apbvt(cm_node->iwdev, cm_node->loc_port, I40IW_MANAGE_APBVT_DEL); - i40iw_get_addr_info(cm_node, &nfo); - if (cm_node->qhash_set) { - i40iw_manage_qhash(cm_node->iwdev, - &nfo, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - cm_node->qhash_set = 0; - } + cm_node->apbvt_set = 0; + } + i40iw_get_addr_info(cm_node, &nfo); + if (cm_node->qhash_set) { + i40iw_manage_qhash(cm_node->iwdev, + &nfo, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + cm_node->qhash_set = 0; } } @@ -3738,10 +3755,8 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in *raddr; struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; - bool qhash_set = false; - int apbvt_set = 0; - int err = 0; - enum i40iw_status_code status; + int ret = 0; + unsigned long 
flags; ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -3790,32 +3805,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_info.user_pri = rt_tos2priority(cm_id->tos); i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_DCB, "%s TOS:[%d] UP:[%d]\n", __func__, cm_id->tos, cm_info.user_pri); - if ((cm_info.ipv4 && (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr)) || - (!cm_info.ipv4 && memcmp(laddr6->sin6_addr.in6_u.u6_addr32, - raddr6->sin6_addr.in6_u.u6_addr32, - sizeof(laddr6->sin6_addr.in6_u.u6_addr32)))) { - status = i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_ADD, - NULL, - true); - if (status) - return -EINVAL; - qhash_set = true; - } - status = i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD); - if (status) { - i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - return -EINVAL; - } - - apbvt_set = 1; cm_id->add_ref(cm_id); cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev, conn_param->private_data_len, @@ -3823,17 +3812,40 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) &cm_info); if (IS_ERR(cm_node)) { - err = PTR_ERR(cm_node); - goto err_out; + ret = PTR_ERR(cm_node); + cm_id->rem_ref(cm_id); + return ret; } + if ((cm_info.ipv4 && (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr)) || + (!cm_info.ipv4 && memcmp(laddr6->sin6_addr.in6_u.u6_addr32, + raddr6->sin6_addr.in6_u.u6_addr32, + sizeof(laddr6->sin6_addr.in6_u.u6_addr32)))) { + if (i40iw_manage_qhash(iwdev, &cm_info, I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_ADD, NULL, true)) { + ret = -EINVAL; + goto err; + } + cm_node->qhash_set = true; + } + + spin_lock_irqsave(&iwdev->cm_core.ht_lock, flags); + if (!test_and_set_bit(cm_info.loc_port, iwdev->cm_core.active_side_ports)) { + spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD)) { + ret = -EINVAL; + goto err; + } + } else { + spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + } + + cm_node->apbvt_set = true; i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && !cm_node->ord_size) cm_node->ord_size = 1; - cm_node->apbvt_set = apbvt_set; - cm_node->qhash_set = qhash_set; iwqp->cm_node = cm_node; cm_node->iwqp = iwqp; iwqp->cm_id = cm_id; @@ -3841,11 +3853,9 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (cm_node->state != I40IW_CM_STATE_OFFLOADED) { cm_node->state = I40IW_CM_STATE_SYN_SENT; - err = i40iw_send_syn(cm_node, 0); - if (err) { - i40iw_rem_ref_cm_node(cm_node); - goto err_out; - } + ret = i40iw_send_syn(cm_node, 0); + if (ret) + goto err; } i40iw_debug(cm_node->dev, @@ -3854,9 +3864,10 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->rem_port, cm_node, cm_node->cm_id); + return 0; -err_out: +err: if (cm_info.ipv4) i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, @@ -3868,22 +3879,10 @@ err_out: "Api - connect() FAILED: dest addr=%pI6", cm_info.rem_addr); - if (qhash_set) - i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - - if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core, - cm_info.loc_port)) - i40iw_manage_apbvt(iwdev, - cm_info.loc_port, - I40IW_MANAGE_APBVT_DEL); + i40iw_rem_ref_cm_node(cm_node); 
cm_id->rem_ref(cm_id); iwdev->cm_core.stats_connect_errs++; - return err; + return ret; } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 8626e7f1fdd3..45abef76295b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -71,6 +71,7 @@ #define I40IW_HW_IRD_SETTING_32 32 #define I40IW_HW_IRD_SETTING_64 64 +#define MAX_PORTS 65536 #define I40IW_VLAN_PRIO_SHIFT 13 enum ietf_mpa_flags { @@ -413,6 +414,8 @@ struct i40iw_cm_core { spinlock_t ht_lock; /* manage hash table */ spinlock_t listen_list_lock; /* listen list */ + unsigned long active_side_ports[BITS_TO_LONGS(MAX_PORTS)]; + u64 stats_nodes_created; u64 stats_nodes_destroyed; u64 stats_listen_created; From 432654df90f2a7d8ac8438905208074604ed3e00 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 11 Sep 2017 21:41:43 +0200 Subject: [PATCH 0578/1322] parisc: Fix too large frame size warnings The parisc architecture has larger stack frames than most other architectures on 32-bit kernels. Increase the maximum allowed stack frame to 1280 bytes for parisc to avoid warnings in the do_sys_poll() and pat_memconfig() functions. Signed-off-by: Helge Deller --- lib/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b19c491cbc4e..2689b7c50c52 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -219,7 +219,8 @@ config FRAME_WARN range 0 8192 default 0 if KASAN default 2048 if GCC_PLUGIN_LATENT_ENTROPY - default 1024 if !64BIT + default 1280 if (!64BIT && PARISC) + default 1024 if (!64BIT && !PARISC) default 2048 if 64BIT help Tell gcc to warn at build time for stack frames larger than this. From e77900abfd8be4e207412d8b7752dbb9838e2571 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:05:02 +0200 Subject: [PATCH 0579/1322] parisc: Stop unwinding at start of stack Check stack pointer if we are reaching the stack end and stop unwinding if we do. This fixes early backtraces and avoids showing unrealistic call stacks. Signed-off-by: Helge Deller --- arch/parisc/kernel/unwind.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 48dc7d4d20bb..caab39dfa95d 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -279,6 +280,17 @@ static void unwind_frame_regs(struct unwind_frame_info *info) info->prev_sp = sp - 64; info->prev_ip = 0; + + /* The stack is at the end inside the thread_union + * struct. If we reach data, we have reached the + * beginning of the stack and should stop unwinding. 
*/ + if (info->prev_sp >= (unsigned long) task_thread_info(info->t) && + info->prev_sp < ((unsigned long) task_thread_info(info->t) + + THREAD_SZ_ALGN)) { + info->prev_sp = 0; + break; + } + if (get_user(tmp, (unsigned long *)(info->prev_sp - RP_OFFSET))) break; info->prev_ip = tmp; From 08b8a99b2c5ea8da4d3dd55056881d12baea1e04 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:17:10 +0200 Subject: [PATCH 0580/1322] parisc: Move start_parisc() into init section Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index dee6f9d6a153..a31e91c1782b 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -398,9 +399,8 @@ static int __init parisc_init(void) } arch_initcall(parisc_init); -void start_parisc(void) +void __init start_parisc(void) { - extern void start_kernel(void); extern void early_trap_init(void); int ret, cpunum; From 77089c5274fe2f72db5a2cd956d0d308aed08e68 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:15:09 +0200 Subject: [PATCH 0581/1322] parisc: Add wrapper for pdc_instr() firmware function Signed-off-by: Helge Deller --- arch/parisc/include/asm/pdc.h | 1 + arch/parisc/kernel/firmware.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h index 26b4455baa83..510341f62d97 100644 --- a/arch/parisc/include/asm/pdc.h +++ b/arch/parisc/include/asm/pdc.h @@ -280,6 +280,7 @@ void setup_pdc(void); /* in inventory.c */ /* wrapper-functions from pdc.c */ int pdc_add_valid(unsigned long address); +int pdc_instr(unsigned int *instr); int pdc_chassis_info(struct pdc_chassis_info *chassis_info, void *led_info, unsigned long len); int pdc_chassis_disp(unsigned long disp); int pdc_chassis_warn(unsigned long *warn); diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index ab80e5c6f651..6d471c00c71a 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -232,6 +232,26 @@ int pdc_add_valid(unsigned long address) } EXPORT_SYMBOL(pdc_add_valid); +/** + * pdc_instr - Get instruction that invokes PDCE_CHECK in HPMC handler. + * @instr: Pointer to variable which will get instruction opcode. + * + * The return value is PDC_OK (0) in case call succeeded. + */ +int __init pdc_instr(unsigned int *instr) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&pdc_lock, flags); + retval = mem_pdc_call(PDC_INSTR, 0UL, __pa(pdc_result)); + convert_to_wide(pdc_result); + *instr = pdc_result[0]; + spin_unlock_irqrestore(&pdc_lock, flags); + + return retval; +} + /** * pdc_chassis_info - Return chassis information. * @result: The return buffer. From 8d771b143fe2e3941fc8a32926d21410004578c0 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 17 Sep 2017 21:28:11 +0200 Subject: [PATCH 0582/1322] parisc: Add PDCE_CHECK instruction to HPMC handler According to the programming note at page 1-31 of the PA 1.1 Firmware Architecture document, one should use the PDC_INSTR firmware function to get the instruction that invokes a PDCE_CHECK in the HPMC handler. This patch follows this note and sets the instruction which has been a nop up until now. Testing on a C3000 and C8000 showed that this firmware call isn't implemented on those machines, so maybe it's only needed on older ones. 
Signed-off-by: Helge Deller --- arch/parisc/kernel/traps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 991654c88eec..230333157fe3 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -817,7 +817,7 @@ void __init initialize_ivt(const void *iva) u32 check = 0; u32 *ivap; u32 *hpmcp; - u32 length; + u32 length, instr; if (strcmp((const char *)iva, "cows can fly")) panic("IVT invalid"); @@ -827,6 +827,14 @@ void __init initialize_ivt(const void *iva) for (i = 0; i < 8; i++) *ivap++ = 0; + /* + * Use PDC_INSTR firmware function to get instruction that invokes + * PDCE_CHECK in HPMC handler. See programming note at page 1-31 of + * the PA 1.1 Firmware Architecture document. + */ + if (pdc_instr(&instr) == PDC_OK) + ivap[0] = instr; + /* Compute Checksum for HPMC handler */ length = os_hpmc_size; ivap[7] = length; From ea6976483fb0ced259fbaa9e4f68a2cdcee7e312 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Sep 2017 17:55:24 +0200 Subject: [PATCH 0583/1322] parisc: Check if initrd was loaded into broken RAM While scanning the PDT for reported broken memory modules, warn if the initrd was coincidentally loaded into bad memory. Signed-off-by: Helge Deller --- arch/parisc/kernel/pdt.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index 05730a83895c..00aed082969b 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -216,8 +217,16 @@ void __init pdc_pdt_init(void) } for (i = 0; i < pdt_status.pdt_entries; i++) { + unsigned long addr; + report_mem_err(pdt_entry[i]); + addr = pdt_entry[i] & PDT_ADDR_PHYS_MASK; + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && + addr >= initrd_start && addr < initrd_end) + pr_crit("CRITICAL: initrd possibly broken " + "due to bad memory!\n"); + /* mark memory page bad */ memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); } From a7e6601f70a53957b1d01c321319f0237bba5202 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 21 Sep 2017 21:22:27 +0200 Subject: [PATCH 0584/1322] parisc: Move init_per_cpu() into init section Signed-off-by: Helge Deller --- arch/parisc/include/asm/smp.h | 1 + arch/parisc/kernel/processor.c | 2 +- arch/parisc/kernel/setup.c | 2 +- arch/parisc/kernel/smp.c | 3 +-- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h index a5dc9066c6d8..ad9c9c3b4136 100644 --- a/arch/parisc/include/asm/smp.h +++ b/arch/parisc/include/asm/smp.h @@ -1,6 +1,7 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H +extern int init_per_cpu(int cpuid); #if defined(CONFIG_SMP) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index a778bd3c107c..e120d63c1b28 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -317,7 +317,7 @@ void __init collect_boot_cpu_data(void) * * o Enable CPU profiling hooks. 
*/ -int init_per_cpu(int cpunum) +int __init init_per_cpu(int cpunum) { int ret; struct pdc_coproc_cfg coproc_cfg; diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index a31e91c1782b..f7d0c3b33d70 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -49,6 +49,7 @@ #include #include #include +#include static char __initdata command_line[COMMAND_LINE_SIZE]; @@ -116,7 +117,6 @@ void __init dma_ops_init(void) } #endif -extern int init_per_cpu(int cpuid); extern void collect_boot_cpu_data(void); void __init setup_arch(char **cmdline_p) diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 63365106ea19..30c28ab14540 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -255,12 +255,11 @@ void arch_send_call_function_single_ipi(int cpu) static void __init smp_cpu_init(int cpunum) { - extern int init_per_cpu(int); /* arch/parisc/kernel/processor.c */ extern void init_IRQ(void); /* arch/parisc/kernel/irq.c */ extern void start_cpu_itimer(void); /* arch/parisc/kernel/time.c */ /* Set modes and Enable floating point coprocessor */ - (void) init_per_cpu(cpunum); + init_per_cpu(cpunum); disable_sr_hashing(); From 606f95e4255845155f62504a9e1f12665b1853c8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 21 Sep 2017 21:52:08 +0200 Subject: [PATCH 0585/1322] parisc: Add HWPOISON page fault handler code Commit 24587380f61d ("parisc: Add MADV_HWPOISON and MADV_SOFT_OFFLINE") added the necessary constants to handle hardware-poisoning. Those were needed to support the page deallocation feature from firmware. But I completely missed to add the relevant fault handler code. This now showed up when I ran the madvise07 testcase from the Linux Test Project, which failed with a kernel BUG at arch/parisc/mm/fault.c:320. With this patch the parisc kernel now behaves like other platforms and gives the same kernel syslog warnings when poisoning pages. Signed-off-by: Helge Deller --- arch/parisc/mm/fault.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 5b101f6a5607..e247edbca68e 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -261,7 +262,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, struct task_struct *tsk; struct mm_struct *mm; unsigned long acc_type; - int fault; + int fault = 0; unsigned int flags; if (faulthandler_disabled()) @@ -315,7 +316,8 @@ good_area: goto out_of_memory; else if (fault & VM_FAULT_SIGSEGV) goto bad_area; - else if (fault & VM_FAULT_SIGBUS) + else if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) goto bad_area; BUG(); } @@ -352,8 +354,7 @@ bad_area: if (user_mode(regs)) { struct siginfo si; - - show_signal_msg(regs, code, address, tsk, vma); + unsigned int lsb = 0; switch (code) { case 15: /* Data TLB miss fault/Data page fault */ @@ -386,6 +387,30 @@ bad_area: si.si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR; break; } + +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %08lx\n", + tsk->comm, tsk->pid, address); + si.si_signo = SIGBUS; + si.si_code = BUS_MCEERR_AR; + } +#endif + + /* + * Either small page or large page may be poisoned. + * In other words, VM_FAULT_HWPOISON_LARGE and + * VM_FAULT_HWPOISON are mutually exclusive. 
+ */ + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + else if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + else + show_signal_msg(regs, code, address, tsk, vma); + si.si_addr_lsb = lsb; + si.si_errno = 0; si.si_addr = (void __user *) address; force_sig_info(si.si_signo, &si, current); From f9b941baa4b78dafcbc4790c0ed2fbbe2aafed06 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 31 Aug 2017 09:27:28 +0530 Subject: [PATCH 0586/1322] bnxt_re: Fix update of qplib_qp.mtu when modified The MTU value in the qplib_qp.mtu should be consistent with whatever mtu was set during INIT to RTR.The Next PSN and number of packets are calculated based on this member in the qplib_qp structure. Signed-off-by: Narender Reddy Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 03caf48af8e2..11bbf3e748b0 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1436,11 +1436,14 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; qp->qplib_qp.path_mtu = __from_ib_mtu(qp_attr->path_mtu); + qp->qplib_qp.mtu = ib_mtu_enum_to_int(qp_attr->path_mtu); } else if (qp_attr->qp_state == IB_QPS_RTR) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PATH_MTU; qp->qplib_qp.path_mtu = __from_ib_mtu(iboe_get_mtu(rdev->netdev->mtu)); + qp->qplib_qp.mtu = + ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); } if (qp_attr_mask & IB_QP_TIMEOUT) { From 2b6376305dcb2cb79d0599cf3b9ce98aa860cb4f Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:29 +0530 Subject: [PATCH 0587/1322] bnxt_re: Stop issuing further cmds to FW once a cmd times out Once a cmd to FW times out(after 20s) it is reasonable to assume the FW or atleast the control path is dead. No point issuing further cmds to the FW as each subsequent cmd with another 20s timeout will cascade resulting in unnecessary traces and/or NMI Lockups. 
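The mechanism is a sticky "firmware dead" flag: the first timeout sets a bit in rcfw->flags, and every later submission tests it up front and fails immediately instead of waiting through another 20s timeout. Condensed from the hunks below:

	/* __send_message(): refuse new cmds once the FW stopped answering */
	if (test_bit(FIRMWARE_TIMED_OUT, &rcfw->flags))
		return -ETIMEDOUT;

	/* timeout path of bnxt_qplib_rcfw_send_message(): mark the FW dead */
	set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags);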
Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 4 ++++ drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 391bb7006e8f..2bdb1562bd21 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -107,6 +107,9 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, return -EINVAL; } + if (test_bit(FIRMWARE_TIMED_OUT, &rcfw->flags)) + return -ETIMEDOUT; + /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ @@ -226,6 +229,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* timed out */ dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec", cookie, opcode, RCFW_CMD_WAIT_TIME_MS); + set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags); return rc; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 0ed312f17c8d..85b16da287f9 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -162,8 +162,9 @@ struct bnxt_qplib_rcfw { unsigned long *cmdq_bitmap; u32 bmap_size; unsigned long flags; -#define FIRMWARE_INITIALIZED_FLAG 1 +#define FIRMWARE_INITIALIZED_FLAG BIT(0) #define FIRMWARE_FIRST_FLAG BIT(31) +#define FIRMWARE_TIMED_OUT BIT(3) wait_queue_head_t waitq; int (*aeq_handler)(struct bnxt_qplib_rcfw *, struct creq_func_event *); From 55311d055175cada8249d39436371afe790df699 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 31 Aug 2017 09:27:30 +0530 Subject: [PATCH 0588/1322] bnxt_re: Fix compare and swap atomic operands Driver must assign the user supplied compare/swap values in the wqe to successfully complete the atomic compare and swap operation. Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 11bbf3e748b0..0dbdbe1616ab 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1916,6 +1916,7 @@ static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: wqe->type = BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP; + wqe->atomic.cmp_data = atomic_wr(wr)->compare_add; wqe->atomic.swap_data = atomic_wr(wr)->swap; break; case IB_WR_ATOMIC_FETCH_AND_ADD: From 027c892924eac22f5ca8db4100bccde423be797d Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:31 +0530 Subject: [PATCH 0589/1322] bnxt_re: Free up devices in module_exit path Clean up all devices added to the bnxt_re_dev_list in the module_exit entry point. 
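The exit path uses the usual splice-under-lock pattern: while holding the device-list mutex the global list is moved onto a private list, the mutex is dropped, and only then is each adapter stopped, unregistered and freed, so none of the teardown calls run under the lock. Condensed from the hunk below:

	LIST_HEAD(to_be_deleted);

	mutex_lock(&bnxt_re_dev_lock);
	if (!list_empty(&bnxt_re_dev_list))
		list_splice_init(&bnxt_re_dev_list, &to_be_deleted);
	mutex_unlock(&bnxt_re_dev_lock);

	list_for_each_entry(rdev, &to_be_deleted, list) {
		bnxt_re_dev_stop(rdev);		/* quiesce the adapter */
		bnxt_re_ib_unreg(rdev, true);	/* drop the IB registration */
		bnxt_re_remove_one(rdev);
		bnxt_re_dev_unreg(rdev);	/* finally free the device */
	}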
Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 82d1cbc27aee..00a3b743c156 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1375,6 +1375,22 @@ err_netdev: static void __exit bnxt_re_mod_exit(void) { + struct bnxt_re_dev *rdev; + LIST_HEAD(to_be_deleted); + + mutex_lock(&bnxt_re_dev_lock); + /* Free all adapter allocated resources */ + if (!list_empty(&bnxt_re_dev_list)) + list_splice_init(&bnxt_re_dev_list, &to_be_deleted); + mutex_unlock(&bnxt_re_dev_lock); + + list_for_each_entry(rdev, &to_be_deleted, list) { + dev_info(rdev_to_dev(rdev), "Unregistering Device"); + bnxt_re_dev_stop(rdev); + bnxt_re_ib_unreg(rdev, true); + bnxt_re_remove_one(rdev); + bnxt_re_dev_unreg(rdev); + } unregister_netdevice_notifier(&bnxt_re_netdev_notifier); if (bnxt_re_wq) destroy_workqueue(bnxt_re_wq); From d5917307bb1caa9cb0a915951c57f4cdbacca443 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:32 +0530 Subject: [PATCH 0590/1322] bnxt_re: Fix race between the netdev register and unregister events Upon receipt of the NETDEV_REGISTER event from the netdev notifier chain, the IB stack registration is spawned off to a workqueue since that also requires an rtnl lock. There could be 2 kinds of races between the NETDEV_REGISTER and the NETDEV_UNREGISTER event handling. a)The NETDEV_UNREGISTER event is received in rapid succession after the NETDEV_REGISTER event even before the work queue got a chance to run. b)The NETDEV_UNREGISTER event is received while the workqueue that handles registration with the IB stack is still in progress. Handle both the races with a bit flag that is set just before the work item is queued and cleared in the workqueue after the event is handled just before the workqueue item is freed. While adding the new flag, it was noted that the flags are all used in *_bit() operations which expect a bit number and not a literal constant with a bit set. So change the numbers to be bit numbers. 
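Concretely, the flag brackets the lifetime of the deferred work item: it is set just before queue_work(), cleared (after a barrier) at the end of the work handler, and the NETDEV_UNREGISTER path bails out while it is set, relying on the notifier re-delivering the event later because the netdev reference is still held. A condensed view of the three places involved:

	/* notifier: defer the event to the workqueue */
	set_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags);
	queue_work(bnxt_re_wq, &re_work->work);

	/* NETDEV_UNREGISTER: skip teardown while the work is still in flight */
	if (test_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags))
		goto exit;

	/* bnxt_re_task(): mark the event as fully handled */
	smp_mb__before_atomic();
	clear_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags);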
Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 12 +++++++----- drivers/infiniband/hw/bnxt_re/main.c | 8 ++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b3ad37fec578..a25f9d240880 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -93,11 +93,13 @@ struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; unsigned long flags; -#define BNXT_RE_FLAG_NETDEV_REGISTERED 0 -#define BNXT_RE_FLAG_IBDEV_REGISTERED 1 -#define BNXT_RE_FLAG_GOT_MSIX 2 -#define BNXT_RE_FLAG_RCFW_CHANNEL_EN 8 -#define BNXT_RE_FLAG_QOS_WORK_REG 16 +#define BNXT_RE_FLAG_NETDEV_REGISTERED 0 +#define BNXT_RE_FLAG_IBDEV_REGISTERED 1 +#define BNXT_RE_FLAG_GOT_MSIX 2 +#define BNXT_RE_FLAG_HAVE_L2_REF 3 +#define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 +#define BNXT_RE_FLAG_QOS_WORK_REG 5 +#define BNXT_RE_FLAG_TASK_IN_PROG 6 struct net_device *netdev; unsigned int version, major, minor; struct bnxt_en_dev *en_dev; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 00a3b743c156..29c3d7e254af 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1259,6 +1259,8 @@ static void bnxt_re_task(struct work_struct *work) default: break; } + smp_mb__before_atomic(); + clear_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags); kfree(re_work); } @@ -1317,6 +1319,11 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, break; case NETDEV_UNREGISTER: + /* netdev notifier will call NETDEV_UNREGISTER again later since + * we are still holding the reference to the netdev + */ + if (test_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags)) + goto exit; bnxt_re_ib_unreg(rdev, false); bnxt_re_remove_one(rdev); bnxt_re_dev_unreg(rdev); @@ -1335,6 +1342,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, re_work->vlan_dev = (real_dev == netdev ? NULL : netdev); INIT_WORK(&re_work->work, bnxt_re_task); + set_bit(BNXT_RE_FLAG_TASK_IN_PROG, &rdev->flags); queue_work(bnxt_re_wq, &re_work->work); } } From 74828b128115033ff25d4140d732a05a36eaeaf0 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:33 +0530 Subject: [PATCH 0591/1322] bnxt_re: Remove RTNL lock dependency in bnxt_re_query_port When there is a NETDEV_UNREGISTER event, bnxt_re driver calls ib_unregister_device() (RTNL lock held). ib_unregister_device attempts to flush a worker queue scheduled by ib_core and that queue might have a pending ib_query_port(). ib_query_port in turn calls bnxt_re_query_port(), which while querying the link speed using ib_get_eth_speed(), tries to acquire the rtnl_lock() which was already held by NETDEV_UNREGISTER. Fixing the issue by removing the link speed query from bnxt_re_query_port() Now the speed is queried post a successful ib_register_device or whenever there is a NETDEV_CHANGE event. 
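In other words, the driver now caches the link attributes in the rdev: they are sampled once right after ib_register_device() succeeds and refreshed when a NETDEV_CHANGE event is processed, and bnxt_re_query_port() only copies the cached values, so it no longer needs rtnl_lock(). Condensed from the hunks below:

	/* after registration, and again when NETDEV_CHANGE is processed */
	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
			 &rdev->active_width);

	/* bnxt_re_query_port(): no ethtool/rtnl involvement any more */
	port_attr->active_speed = rdev->active_speed;
	port_attr->active_width = rdev->active_width;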
Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 11 +++-------- drivers/infiniband/hw/bnxt_re/main.c | 4 ++++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index a25f9d240880..ecbac91b2e14 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -110,6 +110,8 @@ struct bnxt_re_dev { struct delayed_work worker; u8 cur_prio_map; + u8 active_speed; + u8 active_width; /* FP Notification Queue (CQ & SRQ) */ struct tasklet_struct nq_task; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 0dbdbe1616ab..7430ef07a0e1 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -259,14 +259,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, port_attr->sm_sl = 0; port_attr->subnet_timeout = 0; port_attr->init_type_reply = 0; - /* call the underlying netdev's ethtool hooks to query speed settings - * for which we acquire rtnl_lock _only_ if it's registered with - * IB stack to avoid race in the NETDEV_UNREG path - */ - if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) - if (ib_get_eth_speed(ibdev, port_num, &port_attr->active_speed, - &port_attr->active_width)) - return -EINVAL; + port_attr->active_speed = rdev->active_speed; + port_attr->active_width = rdev->active_width; + return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 29c3d7e254af..e7450ea92aa9 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1161,6 +1161,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } } set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); + ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, + &rdev->active_width); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE); @@ -1255,6 +1257,8 @@ static void bnxt_re_task(struct work_struct *work) else if (netif_carrier_ok(rdev->netdev)) bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); + ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, + &rdev->active_width); break; default: break; From 1993519be8bc86342687c61d8d11c3ade62b3b84 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 31 Aug 2017 09:27:34 +0530 Subject: [PATCH 0592/1322] bnxt_re: Fix memory leak in FRMR path This patch fixes a memory leak issue when alloc_mr is used. mr->pages and mr->npages are used only in alloc_mr path. mr->pages is allocated when alloc_mr is called or in the case of FRMR, while creating the MR. mr->npages is updated only when the MR created is used i.e. after invoking map_mr_sg verb, before data transfer. In the dereg_mr path, if mr->npages is 0, driver ends up not freeing the memory created. Removing the npages check from the dereg_mr path for kernel consumers. 
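The key point is the lifecycle of the two fields: mr->pages is allocated in alloc_mr (or when the FRMR is created), while mr->npages only becomes non-zero once map_mr_sg is called, so gating the free on npages leaks any MR that was allocated but never mapped. Condensed from the hunk below:

	/* dereg_mr: free whenever pages were allocated, even if npages == 0 */
	if (mr->pages) {
		rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res,
							&mr->qplib_frpl);
		kfree(mr->pages);
	}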
Signed-off-by: Selvin Xavier Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 7430ef07a0e1..d7be4e4227ce 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3066,7 +3066,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr) return rc; } - if (mr->npages && mr->pages) { + if (mr->pages) { rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, &mr->qplib_frpl); kfree(mr->pages); From 89aaca54ba60e91f02c1c168fbef5d71f71a6d43 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Thu, 31 Aug 2017 09:27:35 +0530 Subject: [PATCH 0593/1322] bnxt_re: Don't issue cmd to delete GID for QP1 GID entry before the QP is destroyed FW needs the 0th GID Entry in the Table to be preserved before it's corresponding QP1 is deleted, else it will fail the cmd. Check for the same and return to prevent error msg being logged for cmd failure. Signed-off-by: Somnath Kotur Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index d7be4e4227ce..0d89621d9fe8 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -314,6 +314,7 @@ int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, struct bnxt_re_gid_ctx *ctx, **ctx_tbl; struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; + struct bnxt_qplib_gid *gid_to_del; /* Delete the entry from the hardware */ ctx = *context; @@ -323,11 +324,25 @@ int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, if (sgid_tbl && sgid_tbl->active) { if (ctx->idx >= sgid_tbl->max) return -EINVAL; + gid_to_del = &sgid_tbl->tbl[ctx->idx]; + /* DEL_GID is called in WQ context(netdevice_event_work_handler) + * or via the ib_unregister_device path. In the former case QP1 + * may not be destroyed yet, in which case just return as FW + * needs that entry to be present and will fail it's deletion. 
+ * We could get invoked again after QP1 is destroyed OR get an + * ADD_GID call with a different GID value for the same index + * where we issue MODIFY_GID cmd to update the GID entry -- TBD + */ + if (ctx->idx == 0 && + rdma_link_local_addr((struct in6_addr *)gid_to_del) && + ctx->refcnt == 1 && rdev->qp1_sqp) { + dev_dbg(rdev_to_dev(rdev), + "Trying to delete GID0 while QP1 is alive\n"); + return -EFAULT; + } ctx->refcnt--; if (!ctx->refcnt) { - rc = bnxt_qplib_del_sgid(sgid_tbl, - &sgid_tbl->tbl[ctx->idx], - true); + rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del, true); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to remove GID: %#x", rc); @@ -811,6 +826,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) kfree(rdev->sqp_ah); kfree(rdev->qp1_sqp); + rdev->qp1_sqp = NULL; + rdev->sqp_ah = NULL; } if (!IS_ERR_OR_NULL(qp->rumem)) From 19fe43a54fb67b6cc8857e65c78e1dc8aa2e97a3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 6 Jul 2017 10:56:21 +0200 Subject: [PATCH 0594/1322] apparmor: Fix shadowed local variable in unpack_trans_table() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit with W=2: security/apparmor/policy_unpack.c: In function ‘unpack_trans_table’: security/apparmor/policy_unpack.c:469: warning: declaration of ‘pos’ shadows a previous local security/apparmor/policy_unpack.c:451: warning: shadowed declaration is here Rename the old "pos" to "saved_pos" to fix this. Fixes: 5379a3312024a8be ("apparmor: support v7 transition format compatible with label_parse") Signed-off-by: Geert Uytterhoeven Reviewed-by: Serge Hallyn Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index c600f4dd1783..2d5a1a007b06 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -448,7 +448,7 @@ fail: */ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) { - void *pos = e->pos; + void *saved_pos = e->pos; /* exec table is optional */ if (unpack_nameX(e, AA_STRUCT, "xtable")) { @@ -511,7 +511,7 @@ static bool unpack_trans_table(struct aa_ext *e, struct aa_profile *profile) fail: aa_free_domain_entries(&profile->file.trans); - e->pos = pos; + e->pos = saved_pos; return 0; } From 86aea56f14929ff1c05eca1776e9068e907429d5 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Sat, 8 Jul 2017 20:50:21 +0100 Subject: [PATCH 0595/1322] apparmor: Fix logical error in verify_header() verify_header() is currently checking whether interface version is less than 5 *and* greater than 7, which always evaluates to false. Instead it should check whether it is less than 5 *or* greater than 7. 
Signed-off-by: Christos Gkekas Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 2d5a1a007b06..bda0dce3b582 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -832,7 +832,7 @@ static int verify_header(struct aa_ext *e, int required, const char **ns) * if not specified use previous version * Mask off everything that is not kernel abi version */ - if (VERSION_LT(e->version, v5) && VERSION_GT(e->version, v7)) { + if (VERSION_LT(e->version, v5) || VERSION_GT(e->version, v7)) { audit_iface(NULL, NULL, NULL, "unsupported interface version", e, error); return error; From 5d314a81eca29b01939930c1c596dfa44937e970 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Jul 2017 10:39:20 +0300 Subject: [PATCH 0596/1322] apparmor: Fix an error code in aafs_create() We accidentally forgot to set the error code on this path. It means we return NULL instead of an error pointer. I looked through a bunch of callers and I don't think it really causes a big issue, but the documentation says we're supposed to return error pointers here. Signed-off-by: Dan Carpenter Acked-by: Serge Hallyn Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 853c2ec8e0c9..2caeb748070c 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -248,8 +248,10 @@ static struct dentry *aafs_create(const char *name, umode_t mode, inode_lock(dir); dentry = lookup_one_len(name, parent, strlen(name)); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); goto fail_lock; + } if (d_really_is_positive(dentry)) { error = -EEXIST; From c5561700c9cb951ec3a33a0914c840423b09d7c9 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Mon, 31 Jul 2017 23:44:37 -0700 Subject: [PATCH 0597/1322] apparmor: Redundant condition: prev_ns. in [label.c:1498] Reported-by: David Binderman Signed-off-by: John Johansen --- security/apparmor/label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index e052eaba1cf6..e324f4df3e34 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1495,7 +1495,7 @@ static int aa_profile_snxprint(char *str, size_t size, struct aa_ns *view, view = profiles_ns(profile); if (view != profile->ns && - (!prev_ns || (prev_ns && *prev_ns != profile->ns))) { + (!prev_ns || (*prev_ns != profile->ns))) { if (prev_ns) *prev_ns = profile->ns; ns_name = aa_ns_name(view, profile->ns, From cd1dbf76b23d5ab2cba5e657fe20b1e236a408cc Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 22:56:22 -0700 Subject: [PATCH 0598/1322] apparmor: add the ability to mediate signals Add signal mediation where the signal can be mediated based on the signal, direction, or the label or the peer/target. The signal perms are verified on a cross check to ensure policy consistency in the case of incremental policy load/replacement. The optimization of skipping the cross check when policy is guaranteed to be consistent (single compile unit) remains to be done. 
policy rules have the form of SIGNAL_RULE = [ QUALIFIERS ] 'signal' [ SIGNAL ACCESS PERMISSIONS ] [ SIGNAL SET ] [ SIGNAL PEER ] SIGNAL ACCESS PERMISSIONS = SIGNAL ACCESS | SIGNAL ACCESS LIST SIGNAL ACCESS LIST = '(' Comma or space separated list of SIGNAL ACCESS ')' SIGNAL ACCESS = ( 'r' | 'w' | 'rw' | 'read' | 'write' | 'send' | 'receive' ) SIGNAL SET = 'set' '=' '(' SIGNAL LIST ')' SIGNAL LIST = Comma or space separated list of SIGNALS SIGNALS = ( 'hup' | 'int' | 'quit' | 'ill' | 'trap' | 'abrt' | 'bus' | 'fpe' | 'kill' | 'usr1' | 'segv' | 'usr2' | 'pipe' | 'alrm' | 'term' | 'stkflt' | 'chld' | 'cont' | 'stop' | 'stp' | 'ttin' | 'ttou' | 'urg' | 'xcpu' | 'xfsz' | 'vtalrm' | 'prof' | 'winch' | 'io' | 'pwr' | 'sys' | 'emt' | 'exists' | 'rtmin+0' ... 'rtmin+32' ) SIGNAL PEER = 'peer' '=' AARE eg. signal, # allow all signals signal send set=(hup, kill) peer=foo, Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/apparmorfs.c | 7 ++ security/apparmor/include/apparmor.h | 1 + security/apparmor/include/audit.h | 2 + security/apparmor/include/ipc.h | 6 ++ security/apparmor/include/sig_names.h | 95 +++++++++++++++++++++++++ security/apparmor/ipc.c | 99 +++++++++++++++++++++++++++ security/apparmor/lsm.c | 21 ++++++ 7 files changed, 231 insertions(+) create mode 100644 security/apparmor/include/sig_names.h diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2caeb748070c..a5f9e1aa51f7 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -32,6 +32,7 @@ #include "include/audit.h" #include "include/context.h" #include "include/crypto.h" +#include "include/ipc.h" #include "include/policy_ns.h" #include "include/label.h" #include "include/policy.h" @@ -2129,6 +2130,11 @@ static struct aa_sfs_entry aa_sfs_entry_ptrace[] = { { } }; +static struct aa_sfs_entry aa_sfs_entry_signal[] = { + AA_SFS_FILE_STRING("mask", AA_SFS_SIG_MASK), + { } +}; + static struct aa_sfs_entry aa_sfs_entry_domain[] = { AA_SFS_FILE_BOOLEAN("change_hat", 1), AA_SFS_FILE_BOOLEAN("change_hatv", 1), @@ -2179,6 +2185,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("rlimit", aa_sfs_entry_rlimit), AA_SFS_DIR("caps", aa_sfs_entry_caps), AA_SFS_DIR("ptrace", aa_sfs_entry_ptrace), + AA_SFS_DIR("signal", aa_sfs_entry_signal), AA_SFS_DIR("query", aa_sfs_entry_query), { } }; diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index aaf893f4e4f5..962a20a75e01 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -28,6 +28,7 @@ #define AA_CLASS_RLIMITS 5 #define AA_CLASS_DOMAIN 6 #define AA_CLASS_PTRACE 9 +#define AA_CLASS_SIGNAL 10 #define AA_CLASS_LABEL 16 #define AA_CLASS_LAST AA_CLASS_LABEL diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index c68839a44351..d9a156ae11b9 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -86,6 +86,7 @@ enum audit_type { #define OP_SHUTDOWN "socket_shutdown" #define OP_PTRACE "ptrace" +#define OP_SIGNAL "signal" #define OP_EXEC "exec" @@ -126,6 +127,7 @@ struct apparmor_audit_data { long pos; const char *ns; } iface; + int signal; struct { int rlim; unsigned long max; diff --git a/security/apparmor/include/ipc.h b/security/apparmor/include/ipc.h index 656fdb81c8a0..5ffc218d1e74 100644 --- a/security/apparmor/include/ipc.h +++ b/security/apparmor/include/ipc.h @@ -27,8 +27,14 @@ struct aa_profile; #define AA_PTRACE_PERM_MASK (AA_PTRACE_READ | 
AA_PTRACE_TRACE | \ AA_MAY_BE_READ | AA_MAY_BE_TRACED) +#define AA_SIGNAL_PERM_MASK (MAY_READ | MAY_WRITE) + +#define AA_SFS_SIG_MASK "hup int quit ill trap abrt bus fpe kill usr1 " \ + "segv usr2 pipe alrm term stkflt chld cont stop stp ttin ttou urg " \ + "xcpu xfsz vtalrm prof winch io pwr sys emt lost" int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, u32 request); +int aa_may_signal(struct aa_label *sender, struct aa_label *target, int sig); #endif /* __AA_IPC_H */ diff --git a/security/apparmor/include/sig_names.h b/security/apparmor/include/sig_names.h new file mode 100644 index 000000000000..0d4395f231ca --- /dev/null +++ b/security/apparmor/include/sig_names.h @@ -0,0 +1,95 @@ +#include + +#define SIGUNKNOWN 0 +#define MAXMAPPED_SIG 35 +/* provide a mapping of arch signal to internal signal # for mediation + * those that are always an alias SIGCLD for SIGCLHD and SIGPOLL for SIGIO + * map to the same entry those that may/or may not get a separate entry + */ +static const int sig_map[MAXMAPPED_SIG] = { + [0] = MAXMAPPED_SIG, /* existence test */ + [SIGHUP] = 1, + [SIGINT] = 2, + [SIGQUIT] = 3, + [SIGILL] = 4, + [SIGTRAP] = 5, /* -, 5, - */ + [SIGABRT] = 6, /* SIGIOT: -, 6, - */ + [SIGBUS] = 7, /* 10, 7, 10 */ + [SIGFPE] = 8, + [SIGKILL] = 9, + [SIGUSR1] = 10, /* 30, 10, 16 */ + [SIGSEGV] = 11, + [SIGUSR2] = 12, /* 31, 12, 17 */ + [SIGPIPE] = 13, + [SIGALRM] = 14, + [SIGTERM] = 15, + [SIGSTKFLT] = 16, /* -, 16, - */ + [SIGCHLD] = 17, /* 20, 17, 18. SIGCHLD -, -, 18 */ + [SIGCONT] = 18, /* 19, 18, 25 */ + [SIGSTOP] = 19, /* 17, 19, 23 */ + [SIGTSTP] = 20, /* 18, 20, 24 */ + [SIGTTIN] = 21, /* 21, 21, 26 */ + [SIGTTOU] = 22, /* 22, 22, 27 */ + [SIGURG] = 23, /* 16, 23, 21 */ + [SIGXCPU] = 24, /* 24, 24, 30 */ + [SIGXFSZ] = 25, /* 25, 25, 31 */ + [SIGVTALRM] = 26, /* 26, 26, 28 */ + [SIGPROF] = 27, /* 27, 27, 29 */ + [SIGWINCH] = 28, /* 28, 28, 20 */ + [SIGIO] = 29, /* SIGPOLL: 23, 29, 22 */ + [SIGPWR] = 30, /* 29, 30, 19. SIGINFO 29, -, - */ +#ifdef SIGSYS + [SIGSYS] = 31, /* 12, 31, 12. 
often SIG LOST/UNUSED */ +#endif +#ifdef SIGEMT + [SIGEMT] = 32, /* 7, - , 7 */ +#endif +#if defined(SIGLOST) && SIGPWR != SIGLOST /* sparc */ + [SIGLOST] = 33, /* unused on Linux */ +#endif +#if defined(SIGLOST) && defined(SIGSYS) && SIGLOST != SIGSYS + [SIGUNUSED] = 34, /* -, 31, - */ +#endif +}; + +/* this table is ordered post sig_map[sig] mapping */ +static const char *const sig_names[MAXMAPPED_SIG + 1] = { + "unknown", + "hup", + "int", + "quit", + "ill", + "trap", + "abrt", + "bus", + "fpe", + "kill", + "usr1", + "segv", + "usr2", + "pipe", + "alrm", + "term", + "stkflt", + "chld", + "cont", + "stop", + "stp", + "ttin", + "ttou", + "urg", + "xcpu", + "xfsz", + "vtalrm", + "prof", + "winch", + "io", + "pwr", + "sys", + "emt", + "lost", + "unused", + + "exists", /* always last existence test mapped to MAXMAPPED_SIG */ +}; + diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 11e66b5bbc42..66fb9ede9447 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -20,6 +20,7 @@ #include "include/context.h" #include "include/policy.h" #include "include/ipc.h" +#include "include/sig_names.h" /** * audit_ptrace_mask - convert mask to permission string @@ -121,3 +122,101 @@ int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, } +static inline int map_signal_num(int sig) +{ + if (sig > SIGRTMAX) + return SIGUNKNOWN; + else if (sig >= SIGRTMIN) + return sig - SIGRTMIN + 128; /* rt sigs mapped to 128 */ + else if (sig <= MAXMAPPED_SIG) + return sig_map[sig]; + return SIGUNKNOWN; +} + +/** + * audit_file_mask - convert mask to permission string + * @buffer: buffer to write string to (NOT NULL) + * @mask: permission mask to convert + */ +static void audit_signal_mask(struct audit_buffer *ab, u32 mask) +{ + if (mask & MAY_READ) + audit_log_string(ab, "receive"); + if (mask & MAY_WRITE) + audit_log_string(ab, "send"); +} + +/** + * audit_cb - call back for signal specific audit fields + * @ab: audit_buffer (NOT NULL) + * @va: audit struct to audit values of (NOT NULL) + */ +static void audit_signal_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + if (aad(sa)->request & AA_SIGNAL_PERM_MASK) { + audit_log_format(ab, " requested_mask="); + audit_signal_mask(ab, aad(sa)->request); + if (aad(sa)->denied & AA_SIGNAL_PERM_MASK) { + audit_log_format(ab, " denied_mask="); + audit_signal_mask(ab, aad(sa)->denied); + } + } + if (aad(sa)->signal <= MAXMAPPED_SIG) + audit_log_format(ab, " signal=%s", sig_names[aad(sa)->signal]); + else + audit_log_format(ab, " signal=rtmin+%d", + aad(sa)->signal - 128); + audit_log_format(ab, " peer="); + aa_label_xaudit(ab, labels_ns(aad(sa)->label), aad(sa)->peer, + FLAGS_NONE, GFP_ATOMIC); +} + +/* TODO: update to handle compound name&name2, conditionals */ +static void profile_match_signal(struct aa_profile *profile, const char *label, + int signal, struct aa_perms *perms) +{ + unsigned int state; + + /* TODO: secondary cache check */ + state = aa_dfa_next(profile->policy.dfa, + profile->policy.start[AA_CLASS_SIGNAL], + signal); + state = aa_dfa_match(profile->policy.dfa, state, label); + aa_compute_perms(profile->policy.dfa, state, perms); +} + +static int profile_signal_perm(struct aa_profile *profile, + struct aa_profile *peer, u32 request, + struct common_audit_data *sa) +{ + struct aa_perms perms; + + if (profile_unconfined(profile) || + !PROFILE_MEDIATES(profile, AA_CLASS_SIGNAL)) + return 0; + + aad(sa)->peer = &peer->label; + profile_match_signal(profile, peer->base.hname, aad(sa)->signal, + 
&perms); + aa_apply_modes_to_perms(profile, &perms); + return aa_check_perms(profile, &perms, request, sa, audit_signal_cb); +} + +static int aa_signal_cross_perm(struct aa_profile *sender, + struct aa_profile *target, + struct common_audit_data *sa) +{ + return xcheck(profile_signal_perm(sender, target, MAY_WRITE, sa), + profile_signal_perm(target, sender, MAY_READ, sa)); +} + +int aa_may_signal(struct aa_label *sender, struct aa_label *target, int sig) +{ + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_SIGNAL); + + aad(&sa)->signal = map_signal_num(sig); + return xcheck_labels_profiles(sender, target, aa_signal_cross_perm, + &sa); +} diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 867bcd154c7e..af22f3dfbcce 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -656,6 +656,26 @@ static int apparmor_task_setrlimit(struct task_struct *task, return error; } +static int apparmor_task_kill(struct task_struct *target, struct siginfo *info, + int sig, u32 secid) +{ + struct aa_label *cl, *tl; + int error; + + if (secid) + /* TODO: after secid to label mapping is done. + * Dealing with USB IO specific behavior + */ + return 0; + cl = __begin_current_label_crit_section(); + tl = aa_get_task_label(target); + error = aa_may_signal(cl, tl, sig); + aa_put_label(tl); + __end_current_label_crit_section(cl); + + return error; +} + static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -697,6 +717,7 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(bprm_secureexec, apparmor_bprm_secureexec), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), + LSM_HOOK_INIT(task_kill, apparmor_task_kill), }; /* From 2ea3ffb7782a84da33a8382f13ebd016da50079b Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:04:47 -0700 Subject: [PATCH 0599/1322] apparmor: add mount mediation Add basic mount mediation. That allows controlling based on basic mount parameters. It does not include special mount parameters for apparmor, super block labeling, or any triggers for apparmor namespace parameter modifications on pivot root. default userspace policy rules have the form of MOUNT RULE = ( MOUNT | REMOUNT | UMOUNT ) MOUNT = [ QUALIFIERS ] 'mount' [ MOUNT CONDITIONS ] [ SOURCE FILEGLOB ] [ '->' MOUNTPOINT FILEGLOB ] REMOUNT = [ QUALIFIERS ] 'remount' [ MOUNT CONDITIONS ] MOUNTPOINT FILEGLOB UMOUNT = [ QUALIFIERS ] 'umount' [ MOUNT CONDITIONS ] MOUNTPOINT FILEGLOB MOUNT CONDITIONS = [ ( 'fstype' | 'vfstype' ) ( '=' | 'in' ) MOUNT FSTYPE EXPRESSION ] [ 'options' ( '=' | 'in' ) MOUNT FLAGS EXPRESSION ] MOUNT FSTYPE EXPRESSION = ( MOUNT FSTYPE LIST | MOUNT EXPRESSION ) MOUNT FSTYPE LIST = Comma separated list of valid filesystem and virtual filesystem types (eg ext4, debugfs, etc) MOUNT FLAGS EXPRESSION = ( MOUNT FLAGS LIST | MOUNT EXPRESSION ) MOUNT FLAGS LIST = Comma separated list of MOUNT FLAGS. 
MOUNT FLAGS = ( 'ro' | 'rw' | 'nosuid' | 'suid' | 'nodev' | 'dev' | 'noexec' | 'exec' | 'sync' | 'async' | 'remount' | 'mand' | 'nomand' | 'dirsync' | 'noatime' | 'atime' | 'nodiratime' | 'diratime' | 'bind' | 'rbind' | 'move' | 'verbose' | 'silent' | 'loud' | 'acl' | 'noacl' | 'unbindable' | 'runbindable' | 'private' | 'rprivate' | 'slave' | 'rslave' | 'shared' | 'rshared' | 'relatime' | 'norelatime' | 'iversion' | 'noiversion' | 'strictatime' | 'nouser' | 'user' ) MOUNT EXPRESSION = ( ALPHANUMERIC | AARE ) ... PIVOT ROOT RULE = [ QUALIFIERS ] pivot_root [ oldroot=OLD PUT FILEGLOB ] [ NEW ROOT FILEGLOB ] SOURCE FILEGLOB = FILEGLOB MOUNTPOINT FILEGLOB = FILEGLOB eg. mount, mount /dev/foo, mount options=ro /dev/foo -> /mnt/, mount options in (ro,atime) /dev/foo -> /mnt/, mount options=ro options=atime, Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/Makefile | 2 +- security/apparmor/apparmorfs.c | 8 +- security/apparmor/domain.c | 4 +- security/apparmor/include/apparmor.h | 1 + security/apparmor/include/audit.h | 11 + security/apparmor/include/domain.h | 5 + security/apparmor/include/mount.h | 54 +++ security/apparmor/lsm.c | 64 +++ security/apparmor/mount.c | 696 +++++++++++++++++++++++++++ 9 files changed, 841 insertions(+), 4 deletions(-) create mode 100644 security/apparmor/include/mount.h create mode 100644 security/apparmor/mount.c diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index a16b195274de..81a34426d024 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o secid.o file.o policy_ns.o label.o + resource.o secid.o file.o policy_ns.o label.o mount.o apparmor-$(CONFIG_SECURITY_APPARMOR_HASH) += crypto.o clean-files := capability_names.h rlim_names.h diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index a5f9e1aa51f7..8fa6c898c44b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2159,9 +2159,14 @@ static struct aa_sfs_entry aa_sfs_entry_policy[] = { { } }; +static struct aa_sfs_entry aa_sfs_entry_mount[] = { + AA_SFS_FILE_STRING("mask", "mount umount pivot_root"), + { } +}; + static struct aa_sfs_entry aa_sfs_entry_ns[] = { AA_SFS_FILE_BOOLEAN("profile", 1), - AA_SFS_FILE_BOOLEAN("pivot_root", 1), + AA_SFS_FILE_BOOLEAN("pivot_root", 0), { } }; @@ -2180,6 +2185,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("policy", aa_sfs_entry_policy), AA_SFS_DIR("domain", aa_sfs_entry_domain), AA_SFS_DIR("file", aa_sfs_entry_file), + AA_SFS_DIR("mount", aa_sfs_entry_mount), AA_SFS_DIR("namespaces", aa_sfs_entry_ns), AA_SFS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), AA_SFS_DIR("rlimit", aa_sfs_entry_rlimit), diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index d0594446ae3f..ffc8c75a6785 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -374,8 +374,8 @@ static const char *next_name(int xtype, const char *name) * * Returns: refcounted label, or NULL on failure (MAYBE NULL) */ -static struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, - const char **name) +struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, + const char **name) { struct aa_label *label = NULL; u32 xtype = xindex & AA_X_TYPE_MASK; diff --git a/security/apparmor/include/apparmor.h 
b/security/apparmor/include/apparmor.h index 962a20a75e01..829082c35faa 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -27,6 +27,7 @@ #define AA_CLASS_NET 4 #define AA_CLASS_RLIMITS 5 #define AA_CLASS_DOMAIN 6 +#define AA_CLASS_MOUNT 7 #define AA_CLASS_PTRACE 9 #define AA_CLASS_SIGNAL 10 #define AA_CLASS_LABEL 16 diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index d9a156ae11b9..c3fe1c5ef3bc 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -71,6 +71,10 @@ enum audit_type { #define OP_FMPROT "file_mprotect" #define OP_INHERIT "file_inherit" +#define OP_PIVOTROOT "pivotroot" +#define OP_MOUNT "mount" +#define OP_UMOUNT "umount" + #define OP_CREATE "create" #define OP_POST_CREATE "post_create" #define OP_BIND "bind" @@ -132,6 +136,13 @@ struct apparmor_audit_data { int rlim; unsigned long max; } rlim; + struct { + const char *src_name; + const char *type; + const char *trans; + const char *data; + unsigned long flags; + } mnt; }; }; diff --git a/security/apparmor/include/domain.h b/security/apparmor/include/domain.h index bab5810b6e9a..db27403346c5 100644 --- a/security/apparmor/include/domain.h +++ b/security/apparmor/include/domain.h @@ -15,6 +15,8 @@ #include #include +#include "label.h" + #ifndef __AA_DOMAIN_H #define __AA_DOMAIN_H @@ -29,6 +31,9 @@ struct aa_domain { #define AA_CHANGE_ONEXEC 4 #define AA_CHANGE_STACK 8 +struct aa_label *x_table_lookup(struct aa_profile *profile, u32 xindex, + const char **name); + int apparmor_bprm_set_creds(struct linux_binprm *bprm); int apparmor_bprm_secureexec(struct linux_binprm *bprm); diff --git a/security/apparmor/include/mount.h b/security/apparmor/include/mount.h new file mode 100644 index 000000000000..25d6067fa6ef --- /dev/null +++ b/security/apparmor/include/mount.h @@ -0,0 +1,54 @@ +/* + * AppArmor security module + * + * This file contains AppArmor file mediation function definitions. + * + * Copyright 2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#ifndef __AA_MOUNT_H +#define __AA_MOUNT_H + +#include +#include + +#include "domain.h" +#include "policy.h" + +/* mount perms */ +#define AA_MAY_PIVOTROOT 0x01 +#define AA_MAY_MOUNT 0x02 +#define AA_MAY_UMOUNT 0x04 +#define AA_AUDIT_DATA 0x40 +#define AA_MNT_CONT_MATCH 0x40 + +#define AA_MS_IGNORE_MASK (MS_KERNMOUNT | MS_NOSEC | MS_ACTIVE | MS_BORN) + +int aa_remount(struct aa_label *label, const struct path *path, + unsigned long flags, void *data); + +int aa_bind_mount(struct aa_label *label, const struct path *path, + const char *old_name, unsigned long flags); + + +int aa_mount_change_type(struct aa_label *label, const struct path *path, + unsigned long flags); + +int aa_move_mount(struct aa_label *label, const struct path *path, + const char *old_name); + +int aa_new_mount(struct aa_label *label, const char *dev_name, + const struct path *path, const char *type, unsigned long flags, + void *data); + +int aa_umount(struct aa_label *label, struct vfsmount *mnt, int flags); + +int aa_pivotroot(struct aa_label *label, const struct path *old_path, + const struct path *new_path); + +#endif /* __AA_MOUNT_H */ diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index af22f3dfbcce..4ad0b3a45142 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -38,6 +38,7 @@ #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" +#include "include/mount.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; @@ -511,6 +512,65 @@ static int apparmor_file_mprotect(struct vm_area_struct *vma, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0); } +static int apparmor_sb_mount(const char *dev_name, const struct path *path, + const char *type, unsigned long flags, void *data) +{ + struct aa_label *label; + int error = 0; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + flags &= ~AA_MS_IGNORE_MASK; + + label = __begin_current_label_crit_section(); + if (!unconfined(label)) { + if (flags & MS_REMOUNT) + error = aa_remount(label, path, flags, data); + else if (flags & MS_BIND) + error = aa_bind_mount(label, path, dev_name, flags); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | + MS_UNBINDABLE)) + error = aa_mount_change_type(label, path, flags); + else if (flags & MS_MOVE) + error = aa_move_mount(label, path, dev_name); + else + error = aa_new_mount(label, dev_name, path, type, + flags, data); + } + __end_current_label_crit_section(label); + + return error; +} + +static int apparmor_sb_umount(struct vfsmount *mnt, int flags) +{ + struct aa_label *label; + int error = 0; + + label = __begin_current_label_crit_section(); + if (!unconfined(label)) + error = aa_umount(label, mnt, flags); + __end_current_label_crit_section(label); + + return error; +} + +static int apparmor_sb_pivotroot(const struct path *old_path, + const struct path *new_path) +{ + struct aa_label *label; + int error = 0; + + label = aa_get_current_label(); + if (!unconfined(label)) + error = aa_pivotroot(label, old_path, new_path); + aa_put_label(label); + + return error; +} + static int apparmor_getprocattr(struct task_struct *task, char *name, char **value) { @@ -682,6 +742,10 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), + LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), + LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), + LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), + 
LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c new file mode 100644 index 000000000000..82a64b58041d --- /dev/null +++ b/security/apparmor/mount.c @@ -0,0 +1,696 @@ +/* + * AppArmor security module + * + * This file contains AppArmor mediation of files + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include + +#include "include/apparmor.h" +#include "include/audit.h" +#include "include/context.h" +#include "include/domain.h" +#include "include/file.h" +#include "include/match.h" +#include "include/mount.h" +#include "include/path.h" +#include "include/policy.h" + + +static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags) +{ + if (flags & MS_RDONLY) + audit_log_format(ab, "ro"); + else + audit_log_format(ab, "rw"); + if (flags & MS_NOSUID) + audit_log_format(ab, ", nosuid"); + if (flags & MS_NODEV) + audit_log_format(ab, ", nodev"); + if (flags & MS_NOEXEC) + audit_log_format(ab, ", noexec"); + if (flags & MS_SYNCHRONOUS) + audit_log_format(ab, ", sync"); + if (flags & MS_REMOUNT) + audit_log_format(ab, ", remount"); + if (flags & MS_MANDLOCK) + audit_log_format(ab, ", mand"); + if (flags & MS_DIRSYNC) + audit_log_format(ab, ", dirsync"); + if (flags & MS_NOATIME) + audit_log_format(ab, ", noatime"); + if (flags & MS_NODIRATIME) + audit_log_format(ab, ", nodiratime"); + if (flags & MS_BIND) + audit_log_format(ab, flags & MS_REC ? ", rbind" : ", bind"); + if (flags & MS_MOVE) + audit_log_format(ab, ", move"); + if (flags & MS_SILENT) + audit_log_format(ab, ", silent"); + if (flags & MS_POSIXACL) + audit_log_format(ab, ", acl"); + if (flags & MS_UNBINDABLE) + audit_log_format(ab, flags & MS_REC ? ", runbindable" : + ", unbindable"); + if (flags & MS_PRIVATE) + audit_log_format(ab, flags & MS_REC ? ", rprivate" : + ", private"); + if (flags & MS_SLAVE) + audit_log_format(ab, flags & MS_REC ? ", rslave" : + ", slave"); + if (flags & MS_SHARED) + audit_log_format(ab, flags & MS_REC ? 
", rshared" : + ", shared"); + if (flags & MS_RELATIME) + audit_log_format(ab, ", relatime"); + if (flags & MS_I_VERSION) + audit_log_format(ab, ", iversion"); + if (flags & MS_STRICTATIME) + audit_log_format(ab, ", strictatime"); + if (flags & MS_NOUSER) + audit_log_format(ab, ", nouser"); +} + +/** + * audit_cb - call back for mount specific audit fields + * @ab: audit_buffer (NOT NULL) + * @va: audit struct to audit values of (NOT NULL) + */ +static void audit_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + if (aad(sa)->mnt.type) { + audit_log_format(ab, " fstype="); + audit_log_untrustedstring(ab, aad(sa)->mnt.type); + } + if (aad(sa)->mnt.src_name) { + audit_log_format(ab, " srcname="); + audit_log_untrustedstring(ab, aad(sa)->mnt.src_name); + } + if (aad(sa)->mnt.trans) { + audit_log_format(ab, " trans="); + audit_log_untrustedstring(ab, aad(sa)->mnt.trans); + } + if (aad(sa)->mnt.flags) { + audit_log_format(ab, " flags=\""); + audit_mnt_flags(ab, aad(sa)->mnt.flags); + audit_log_format(ab, "\""); + } + if (aad(sa)->mnt.data) { + audit_log_format(ab, " options="); + audit_log_untrustedstring(ab, aad(sa)->mnt.data); + } +} + +/** + * audit_mount - handle the auditing of mount operations + * @profile: the profile being enforced (NOT NULL) + * @op: operation being mediated (NOT NULL) + * @name: name of object being mediated (MAYBE NULL) + * @src_name: src_name of object being mediated (MAYBE_NULL) + * @type: type of filesystem (MAYBE_NULL) + * @trans: name of trans (MAYBE NULL) + * @flags: filesystem idependent mount flags + * @data: filesystem mount flags + * @request: permissions requested + * @perms: the permissions computed for the request (NOT NULL) + * @info: extra information message (MAYBE NULL) + * @error: 0 if operation allowed else failure error code + * + * Returns: %0 or error on failure + */ +static int audit_mount(struct aa_profile *profile, const char *op, + const char *name, const char *src_name, + const char *type, const char *trans, + unsigned long flags, const void *data, u32 request, + struct aa_perms *perms, const char *info, int error) +{ + int audit_type = AUDIT_APPARMOR_AUTO; + DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, op); + + if (likely(!error)) { + u32 mask = perms->audit; + + if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) + mask = 0xffff; + + /* mask off perms that are not being force audited */ + request &= mask; + + if (likely(!request)) + return 0; + audit_type = AUDIT_APPARMOR_AUDIT; + } else { + /* only report permissions that were denied */ + request = request & ~perms->allow; + + if (request & perms->kill) + audit_type = AUDIT_APPARMOR_KILL; + + /* quiet known rejects, assumes quiet and kill do not overlap */ + if ((request & perms->quiet) && + AUDIT_MODE(profile) != AUDIT_NOQUIET && + AUDIT_MODE(profile) != AUDIT_ALL) + request &= ~perms->quiet; + + if (!request) + return error; + } + + aad(&sa)->name = name; + aad(&sa)->mnt.src_name = src_name; + aad(&sa)->mnt.type = type; + aad(&sa)->mnt.trans = trans; + aad(&sa)->mnt.flags = flags; + if (data && (perms->audit & AA_AUDIT_DATA)) + aad(&sa)->mnt.data = data; + aad(&sa)->info = info; + aad(&sa)->error = error; + + return aa_audit(audit_type, profile, &sa, audit_cb); +} + +/** + * match_mnt_flags - Do an ordered match on mount flags + * @dfa: dfa to match against + * @state: state to start in + * @flags: mount flags to match against + * + * Mount flags are encoded as an ordered match. 
This is done instead of + * checking against a simple bitmask, to allow for logical operations + * on the flags. + * + * Returns: next state after flags match + */ +static unsigned int match_mnt_flags(struct aa_dfa *dfa, unsigned int state, + unsigned long flags) +{ + unsigned int i; + + for (i = 0; i <= 31 ; ++i) { + if ((1 << i) & flags) + state = aa_dfa_next(dfa, state, i + 1); + } + + return state; +} + +/** + * compute_mnt_perms - compute mount permission associated with @state + * @dfa: dfa to match against (NOT NULL) + * @state: state match finished in + * + * Returns: mount permissions + */ +static struct aa_perms compute_mnt_perms(struct aa_dfa *dfa, + unsigned int state) +{ + struct aa_perms perms; + + perms.kill = 0; + perms.allow = dfa_user_allow(dfa, state); + perms.audit = dfa_user_audit(dfa, state); + perms.quiet = dfa_user_quiet(dfa, state); + perms.xindex = dfa_user_xindex(dfa, state); + + return perms; +} + +static const char * const mnt_info_table[] = { + "match succeeded", + "failed mntpnt match", + "failed srcname match", + "failed type match", + "failed flags match", + "failed data match" +}; + +/* + * Returns 0 on success else element that match failed in, this is the + * index into the mnt_info_table above + */ +static int do_match_mnt(struct aa_dfa *dfa, unsigned int start, + const char *mntpnt, const char *devname, + const char *type, unsigned long flags, + void *data, bool binary, struct aa_perms *perms) +{ + unsigned int state; + + AA_BUG(!dfa); + AA_BUG(!perms); + + state = aa_dfa_match(dfa, start, mntpnt); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 1; + + if (devname) + state = aa_dfa_match(dfa, state, devname); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 2; + + if (type) + state = aa_dfa_match(dfa, state, type); + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 3; + + state = match_mnt_flags(dfa, state, flags); + if (!state) + return 4; + *perms = compute_mnt_perms(dfa, state); + if (perms->allow & AA_MAY_MOUNT) + return 0; + + /* only match data if not binary and the DFA flags data is expected */ + if (data && !binary && (perms->allow & AA_MNT_CONT_MATCH)) { + state = aa_dfa_null_transition(dfa, state); + if (!state) + return 4; + + state = aa_dfa_match(dfa, state, data); + if (!state) + return 5; + *perms = compute_mnt_perms(dfa, state); + if (perms->allow & AA_MAY_MOUNT) + return 0; + } + + /* failed at end of flags match */ + return 4; +} + + +static int path_flags(struct aa_profile *profile, const struct path *path) +{ + AA_BUG(!profile); + AA_BUG(!path); + + return profile->path_flags | + (S_ISDIR(path->dentry->d_inode->i_mode) ? 
PATH_IS_DIR : 0); +} + +/** + * match_mnt_path_str - handle path matching for mount + * @profile: the confining profile + * @mntpath: for the mntpnt (NOT NULL) + * @buffer: buffer to be used to lookup mntpath + * @devnme: string for the devname/src_name (MAY BE NULL OR ERRPTR) + * @type: string for the dev type (MAYBE NULL) + * @flags: mount flags to match + * @data: fs mount data (MAYBE NULL) + * @binary: whether @data is binary + * @devinfo: error str if (IS_ERR(@devname)) + * + * Returns: 0 on success else error + */ +static int match_mnt_path_str(struct aa_profile *profile, + const struct path *mntpath, char *buffer, + const char *devname, const char *type, + unsigned long flags, void *data, bool binary, + const char *devinfo) +{ + struct aa_perms perms = { }; + const char *mntpnt = NULL, *info = NULL; + int pos, error; + + AA_BUG(!profile); + AA_BUG(!mntpath); + AA_BUG(!buffer); + + error = aa_path_name(mntpath, path_flags(profile, mntpath), buffer, + &mntpnt, &info, profile->disconnected); + if (error) + goto audit; + if (IS_ERR(devname)) { + error = PTR_ERR(devname); + devname = NULL; + info = devinfo; + goto audit; + } + + error = -EACCES; + pos = do_match_mnt(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + mntpnt, devname, type, flags, data, binary, &perms); + if (pos) { + info = mnt_info_table[pos]; + goto audit; + } + error = 0; + +audit: + return audit_mount(profile, OP_MOUNT, mntpnt, devname, type, NULL, + flags, data, AA_MAY_MOUNT, &perms, info, error); +} + +/** + * match_mnt - handle path matching for mount + * @profile: the confining profile + * @mntpath: for the mntpnt (NOT NULL) + * @buffer: buffer to be used to lookup mntpath + * @devpath: path devname/src_name (MAYBE NULL) + * @devbuffer: buffer to be used to lookup devname/src_name + * @type: string for the dev type (MAYBE NULL) + * @flags: mount flags to match + * @data: fs mount data (MAYBE NULL) + * @binary: whether @data is binary + * + * Returns: 0 on success else error + */ +static int match_mnt(struct aa_profile *profile, const struct path *path, + char *buffer, struct path *devpath, char *devbuffer, + const char *type, unsigned long flags, void *data, + bool binary) +{ + const char *devname = NULL, *info = NULL; + int error = -EACCES; + + AA_BUG(!profile); + AA_BUG(devpath && !devbuffer); + + if (devpath) { + error = aa_path_name(devpath, path_flags(profile, devpath), + devbuffer, &devname, &info, + profile->disconnected); + if (error) + devname = ERR_PTR(error); + } + + return match_mnt_path_str(profile, path, buffer, devname, type, flags, + data, binary, info); +} + +int aa_remount(struct aa_label *label, const struct path *path, + unsigned long flags, void *data) +{ + struct aa_profile *profile; + char *buffer = NULL; + bool binary; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + binary = path->dentry->d_sb->s_type->fs_flags & FS_BINARY_MOUNTDATA; + + get_buffers(buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, NULL, NULL, NULL, + flags, data, binary)); + put_buffers(buffer); + + return error; +} + +int aa_bind_mount(struct aa_label *label, const struct path *path, + const char *dev_name, unsigned long flags) +{ + struct aa_profile *profile; + char *buffer = NULL, *old_buffer = NULL; + struct path old_path; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + if (!dev_name || !*dev_name) + return -EINVAL; + + flags &= MS_REC | MS_BIND; + + error = kern_path(dev_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (error) + return error; + + 
get_buffers(buffer, old_buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, &old_path, old_buffer, + NULL, flags, NULL, false)); + put_buffers(buffer, old_buffer); + path_put(&old_path); + + return error; +} + +int aa_mount_change_type(struct aa_label *label, const struct path *path, + unsigned long flags) +{ + struct aa_profile *profile; + char *buffer = NULL; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + /* These are the flags allowed by do_change_type() */ + flags &= (MS_REC | MS_SILENT | MS_SHARED | MS_PRIVATE | MS_SLAVE | + MS_UNBINDABLE); + + get_buffers(buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, NULL, NULL, NULL, + flags, NULL, false)); + put_buffers(buffer); + + return error; +} + +int aa_move_mount(struct aa_label *label, const struct path *path, + const char *orig_name) +{ + struct aa_profile *profile; + char *buffer = NULL, *old_buffer = NULL; + struct path old_path; + int error; + + AA_BUG(!label); + AA_BUG(!path); + + if (!orig_name || !*orig_name) + return -EINVAL; + + error = kern_path(orig_name, LOOKUP_FOLLOW, &old_path); + if (error) + return error; + + get_buffers(buffer, old_buffer); + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, &old_path, old_buffer, + NULL, MS_MOVE, NULL, false)); + put_buffers(buffer, old_buffer); + path_put(&old_path); + + return error; +} + +int aa_new_mount(struct aa_label *label, const char *dev_name, + const struct path *path, const char *type, unsigned long flags, + void *data) +{ + struct aa_profile *profile; + char *buffer = NULL, *dev_buffer = NULL; + bool binary = true; + int error; + int requires_dev = 0; + struct path tmp_path, *dev_path = NULL; + + AA_BUG(!label); + AA_BUG(!path); + + if (type) { + struct file_system_type *fstype; + + fstype = get_fs_type(type); + if (!fstype) + return -ENODEV; + binary = fstype->fs_flags & FS_BINARY_MOUNTDATA; + requires_dev = fstype->fs_flags & FS_REQUIRES_DEV; + put_filesystem(fstype); + + if (requires_dev) { + if (!dev_name || !*dev_name) + return -ENOENT; + + error = kern_path(dev_name, LOOKUP_FOLLOW, &tmp_path); + if (error) + return error; + dev_path = &tmp_path; + } + } + + get_buffers(buffer, dev_buffer); + if (dev_path) { + error = fn_for_each_confined(label, profile, + match_mnt(profile, path, buffer, dev_path, dev_buffer, + type, flags, data, binary)); + } else { + error = fn_for_each_confined(label, profile, + match_mnt_path_str(profile, path, buffer, dev_name, + type, flags, data, binary, NULL)); + } + put_buffers(buffer, dev_buffer); + if (dev_path) + path_put(dev_path); + + return error; +} + +static int profile_umount(struct aa_profile *profile, struct path *path, + char *buffer) +{ + struct aa_perms perms = { }; + const char *name = NULL, *info = NULL; + unsigned int state; + int error; + + AA_BUG(!profile); + AA_BUG(!path); + + error = aa_path_name(path, path_flags(profile, path), buffer, &name, + &info, profile->disconnected); + if (error) + goto audit; + + state = aa_dfa_match(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + name); + perms = compute_mnt_perms(profile->policy.dfa, state); + if (AA_MAY_UMOUNT & ~perms.allow) + error = -EACCES; + +audit: + return audit_mount(profile, OP_UMOUNT, name, NULL, NULL, NULL, 0, NULL, + AA_MAY_UMOUNT, &perms, info, error); +} + +int aa_umount(struct aa_label *label, struct vfsmount *mnt, int flags) +{ + struct aa_profile *profile; + char *buffer = NULL; + int error; + struct path path = { .mnt = mnt, 
.dentry = mnt->mnt_root }; + + AA_BUG(!label); + AA_BUG(!mnt); + + get_buffers(buffer); + error = fn_for_each_confined(label, profile, + profile_umount(profile, &path, buffer)); + put_buffers(buffer); + + return error; +} + +/* helper fn for transition on pivotroot + * + * Returns: label for transition or ERR_PTR. Does not return NULL + */ +static struct aa_label *build_pivotroot(struct aa_profile *profile, + const struct path *new_path, + char *new_buffer, + const struct path *old_path, + char *old_buffer) +{ + const char *old_name, *new_name = NULL, *info = NULL; + const char *trans_name = NULL; + struct aa_perms perms = { }; + unsigned int state; + int error; + + AA_BUG(!profile); + AA_BUG(!new_path); + AA_BUG(!old_path); + + if (profile_unconfined(profile)) + return aa_get_newest_label(&profile->label); + + error = aa_path_name(old_path, path_flags(profile, old_path), + old_buffer, &old_name, &info, + profile->disconnected); + if (error) + goto audit; + error = aa_path_name(new_path, path_flags(profile, new_path), + new_buffer, &new_name, &info, + profile->disconnected); + if (error) + goto audit; + + error = -EACCES; + state = aa_dfa_match(profile->policy.dfa, + profile->policy.start[AA_CLASS_MOUNT], + new_name); + state = aa_dfa_null_transition(profile->policy.dfa, state); + state = aa_dfa_match(profile->policy.dfa, state, old_name); + perms = compute_mnt_perms(profile->policy.dfa, state); + + if (AA_MAY_PIVOTROOT & perms.allow) + error = 0; + +audit: + error = audit_mount(profile, OP_PIVOTROOT, new_name, old_name, + NULL, trans_name, 0, NULL, AA_MAY_PIVOTROOT, + &perms, info, error); + if (error) + return ERR_PTR(error); + + return aa_get_newest_label(&profile->label); +} + +int aa_pivotroot(struct aa_label *label, const struct path *old_path, + const struct path *new_path) +{ + struct aa_profile *profile; + struct aa_label *target = NULL; + char *old_buffer = NULL, *new_buffer = NULL, *info = NULL; + int error; + + AA_BUG(!label); + AA_BUG(!old_path); + AA_BUG(!new_path); + + get_buffers(old_buffer, new_buffer); + target = fn_label_build(label, profile, GFP_ATOMIC, + build_pivotroot(profile, new_path, new_buffer, + old_path, old_buffer)); + if (!target) { + info = "label build failed"; + error = -ENOMEM; + goto fail; + } else if (!IS_ERR(target)) { + error = aa_replace_current_label(target); + if (error) { + /* TODO: audit target */ + aa_put_label(target); + goto out; + } + } else + /* already audited error */ + error = PTR_ERR(target); +out: + put_buffers(old_buffer, new_buffer); + + return error; + +fail: + /* TODO: add back in auditing of new_name and old_name */ + error = fn_for_each(label, profile, + audit_mount(profile, OP_PIVOTROOT, NULL /*new_name */, + NULL /* old_name */, + NULL, NULL, + 0, NULL, AA_MAY_PIVOTROOT, &nullperms, info, + error)); + goto out; +} From f872af75d325cc449b6621a0d30a4f2ba77dd092 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 6 Aug 2017 05:36:40 -0700 Subject: [PATCH 0600/1322] apparmor: cleanup conditional check for label in label_print Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/label.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index e324f4df3e34..38be7a89cc31 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1450,9 +1450,11 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) * cached label name is present and visible * @label->hname only exists if label 
is namespace hierachical */ -static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label) +static inline bool use_label_hname(struct aa_ns *ns, struct aa_label *label, + int flags) { - if (label->hname && labels_ns(label) == ns) + if (label->hname && (!ns || labels_ns(label) == ns) && + !(flags & ~FLAG_SHOW_MODE)) return true; return false; @@ -1710,10 +1712,8 @@ void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, AA_BUG(!ab); AA_BUG(!label); - if (!ns) - ns = labels_ns(label); - - if (!use_label_hname(ns, label) || display_mode(ns, label, flags)) { + if (!use_label_hname(ns, label, flags) || + display_mode(ns, label, flags)) { len = aa_label_asxprint(&name, ns, label, flags, gfp); if (len == -1) { AA_DEBUG("label print error"); @@ -1738,10 +1738,7 @@ void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, AA_BUG(!f); AA_BUG(!label); - if (!ns) - ns = labels_ns(label); - - if (!use_label_hname(ns, label)) { + if (!use_label_hname(ns, label, flags)) { char *str; int len; @@ -1764,10 +1761,7 @@ void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, { AA_BUG(!label); - if (!ns) - ns = labels_ns(label); - - if (!use_label_hname(ns, label)) { + if (!use_label_hname(ns, label, flags)) { char *str; int len; From 26b7899510ae243e392960704ebdba52d05fbb13 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Sun, 6 Aug 2017 05:39:08 -0700 Subject: [PATCH 0601/1322] apparmor: add support for absolute root view based labels With apparmor policy virtualization based on policy namespace View's we don't generally want/need absolute root based views, however there are cases like debugging and some secid based conversions where using a root based view is important. Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/include/label.h | 1 + security/apparmor/label.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/security/apparmor/include/label.h b/security/apparmor/include/label.h index 9a283b722755..af22dcbbcb8a 100644 --- a/security/apparmor/include/label.h +++ b/security/apparmor/include/label.h @@ -310,6 +310,7 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp); #define FLAG_SHOW_MODE 1 #define FLAG_VIEW_SUBNS 2 #define FLAG_HIDDEN_UNCONFINED 4 +#define FLAG_ABS_ROOT 8 int aa_label_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_label *label, int flags); int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 38be7a89cc31..52b4ef14840d 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1607,8 +1607,13 @@ int aa_label_snxprint(char *str, size_t size, struct aa_ns *ns, AA_BUG(!str && size != 0); AA_BUG(!label); - if (!ns) + if (flags & FLAG_ABS_ROOT) { + ns = root_ns; + len = snprintf(str, size, "="); + update_for_len(total, len, size, str); + } else if (!ns) { ns = labels_ns(label); + } label_for_each(i, label, profile) { if (aa_ns_visible(ns, profile->ns, flags & FLAG_VIEW_SUBNS)) { @@ -1868,6 +1873,9 @@ struct aa_label *aa_label_parse(struct aa_label *base, const char *str, if (*str == '&') str++; } + if (*str == '=') + base = &root_ns->unconfined->label; + error = vec_setup(profile, vec, len, gfp); if (error) return ERR_PTR(error); From 2410aa96d6b4930ed25fd02c3d173f14b962e0f4 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:37:18 -0700 Subject: [PATCH 0602/1322] apparmor: make policy_unpack able to audit different 
info messages Switch unpack auditing to using the generic name field in the audit struct and make it so we can start adding new info messages about why an unpack failed. Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/include/audit.h | 4 +-- security/apparmor/policy_unpack.c | 52 ++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index c3fe1c5ef3bc..620e81169659 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -127,9 +127,9 @@ struct apparmor_audit_data { } fs; }; struct { - const char *name; - long pos; + struct aa_profile *profile; const char *ns; + long pos; } iface; int signal; struct { diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index bda0dce3b582..4ede87c30f8b 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -85,9 +85,9 @@ static void audit_cb(struct audit_buffer *ab, void *va) audit_log_format(ab, " ns="); audit_log_untrustedstring(ab, aad(sa)->iface.ns); } - if (aad(sa)->iface.name) { + if (aad(sa)->name) { audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, aad(sa)->iface.name); + audit_log_untrustedstring(ab, aad(sa)->name); } if (aad(sa)->iface.pos) audit_log_format(ab, " offset=%ld", aad(sa)->iface.pos); @@ -114,9 +114,9 @@ static int audit_iface(struct aa_profile *new, const char *ns_name, aad(&sa)->iface.pos = e->pos - e->start; aad(&sa)->iface.ns = ns_name; if (new) - aad(&sa)->iface.name = new->base.hname; + aad(&sa)->name = new->base.hname; else - aad(&sa)->iface.name = name; + aad(&sa)->name = name; aad(&sa)->info = info; aad(&sa)->error = error; @@ -583,6 +583,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) { struct aa_profile *profile = NULL; const char *tmpname, *tmpns = NULL, *name = NULL; + const char *info = "failed to unpack profile"; size_t ns_len; struct rhashtable_params params = { 0 }; char *key = NULL; @@ -604,8 +605,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) tmpname = aa_splitn_fqname(name, strlen(name), &tmpns, &ns_len); if (tmpns) { *ns_name = kstrndup(tmpns, ns_len, GFP_KERNEL); - if (!*ns_name) + if (!*ns_name) { + info = "out of memory"; goto fail; + } name = tmpname; } @@ -624,12 +627,15 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (IS_ERR(profile->xmatch)) { error = PTR_ERR(profile->xmatch); profile->xmatch = NULL; + info = "bad xmatch"; goto fail; } /* xmatch_len is not optional if xmatch is set */ if (profile->xmatch) { - if (!unpack_u32(e, &tmp, NULL)) + if (!unpack_u32(e, &tmp, NULL)) { + info = "missing xmatch len"; goto fail; + } profile->xmatch_len = tmp; } @@ -637,8 +643,11 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) (void) unpack_str(e, &profile->disconnected, "disconnected"); /* per profile debug flags (complain, audit) */ - if (!unpack_nameX(e, AA_STRUCT, "flags")) + if (!unpack_nameX(e, AA_STRUCT, "flags")) { + info = "profile missing flags"; goto fail; + } + info = "failed to unpack profile flags"; if (!unpack_u32(e, &tmp, NULL)) goto fail; if (tmp & PACKED_FLAG_HAT) @@ -667,6 +676,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) /* set a default value if path_flags field is not present */ profile->path_flags = PATH_MEDIATE_DELETED; + info = "failed to unpack profile capabilities"; if (!unpack_u32(e, 
&(profile->caps.allow.cap[0]), NULL)) goto fail; if (!unpack_u32(e, &(profile->caps.audit.cap[0]), NULL)) @@ -676,6 +686,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (!unpack_u32(e, &tmpcap.cap[0], NULL)) goto fail; + info = "failed to unpack upper profile capabilities"; if (unpack_nameX(e, AA_STRUCT, "caps64")) { /* optional upper half of 64 bit caps */ if (!unpack_u32(e, &(profile->caps.allow.cap[1]), NULL)) @@ -690,6 +701,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + info = "failed to unpack extended profile capabilities"; if (unpack_nameX(e, AA_STRUCT, "capsx")) { /* optional extended caps mediation mask */ if (!unpack_u32(e, &(profile->caps.extended.cap[0]), NULL)) @@ -700,11 +712,14 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } - if (!unpack_rlimits(e, profile)) + if (!unpack_rlimits(e, profile)) { + info = "failed to unpack profile rlimits"; goto fail; + } if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ + info = "failed to unpack policydb"; profile->policy.dfa = unpack_dfa(e); if (IS_ERR(profile->policy.dfa)) { error = PTR_ERR(profile->policy.dfa); @@ -734,6 +749,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (IS_ERR(profile->file.dfa)) { error = PTR_ERR(profile->file.dfa); profile->file.dfa = NULL; + info = "failed to unpack profile file rules"; goto fail; } else if (profile->file.dfa) { if (!unpack_u32(e, &profile->file.start, "dfa_start")) @@ -746,10 +762,13 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) } else profile->file.dfa = aa_get_dfa(nulldfa); - if (!unpack_trans_table(e, profile)) + if (!unpack_trans_table(e, profile)) { + info = "failed to unpack profile transition table"; goto fail; + } if (unpack_nameX(e, AA_STRUCT, "data")) { + info = "out of memory"; profile->data = kzalloc(sizeof(*profile->data), GFP_KERNEL); if (!profile->data) goto fail; @@ -761,8 +780,10 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) params.hashfn = strhash; params.obj_cmpfn = datacmp; - if (rhashtable_init(profile->data, &params)) + if (rhashtable_init(profile->data, &params)) { + info = "failed to init key, value hash table"; goto fail; + } while (unpack_strdup(e, &key, NULL)) { data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -784,12 +805,16 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) profile->data->p); } - if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) { + info = "failed to unpack end of key, value data table"; goto fail; + } } - if (!unpack_nameX(e, AA_STRUCTEND, NULL)) + if (!unpack_nameX(e, AA_STRUCTEND, NULL)) { + info = "failed to unpack end of profile"; goto fail; + } return profile; @@ -798,8 +823,7 @@ fail: name = NULL; else if (!name) name = "unknown"; - audit_iface(profile, NULL, name, "failed to unpack profile", e, - error); + audit_iface(profile, NULL, name, info, e, error); aa_free_profile(profile); return ERR_PTR(error); From cbf2d0e1a9e4876046a628e0e036a7545a3a4c40 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:41:13 -0700 Subject: [PATCH 0603/1322] apparmor: add more debug asserts to apparmorfs Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/apparmorfs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 
8fa6c898c44b..7acea14c850b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1446,6 +1446,10 @@ void __aafs_profile_migrate_dents(struct aa_profile *old, { int i; + AA_BUG(!old); + AA_BUG(!new); + AA_BUG(!mutex_is_locked(&profiles_ns(old)->lock)); + for (i = 0; i < AAFS_PROF_SIZEOF; i++) { new->dents[i] = old->dents[i]; if (new->dents[i]) @@ -1509,6 +1513,9 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) struct dentry *dent = NULL, *dir; int error; + AA_BUG(!profile); + AA_BUG(!mutex_is_locked(&profiles_ns(profile)->lock)); + if (!parent) { struct aa_profile *p; p = aa_deref_parent(profile); @@ -1734,6 +1741,7 @@ void __aafs_ns_rmdir(struct aa_ns *ns) if (!ns) return; + AA_BUG(!mutex_is_locked(&ns->lock)); list_for_each_entry(child, &ns->base.profiles, base.list) __aafs_profile_rmdir(child); @@ -1906,6 +1914,10 @@ static struct aa_ns *__next_ns(struct aa_ns *root, struct aa_ns *ns) { struct aa_ns *parent, *next; + AA_BUG(!root); + AA_BUG(!ns); + AA_BUG(ns != root && !mutex_is_locked(&ns->parent->lock)); + /* is next namespace a child */ if (!list_empty(&ns->sub_ns)) { next = list_first_entry(&ns->sub_ns, typeof(*ns), base.list); @@ -1940,6 +1952,9 @@ static struct aa_ns *__next_ns(struct aa_ns *root, struct aa_ns *ns) static struct aa_profile *__first_profile(struct aa_ns *root, struct aa_ns *ns) { + AA_BUG(!root); + AA_BUG(ns && !mutex_is_locked(&ns->lock)); + for (; ns; ns = __next_ns(root, ns)) { if (!list_empty(&ns->base.profiles)) return list_first_entry(&ns->base.profiles, @@ -1962,6 +1977,8 @@ static struct aa_profile *__next_profile(struct aa_profile *p) struct aa_profile *parent; struct aa_ns *ns = p->ns; + AA_BUG(!mutex_is_locked(&profiles_ns(p)->lock)); + /* is next profile a child */ if (!list_empty(&p->base.profiles)) return list_first_entry(&p->base.profiles, typeof(*p), From 651e28c5537abb39076d3949fb7618536f1d242e Mon Sep 17 00:00:00 2001 From: John Johansen Date: Tue, 18 Jul 2017 23:18:33 -0700 Subject: [PATCH 0604/1322] apparmor: add base infrastructure for socket mediation Provide a basic mediation of sockets. This is not a full net mediation but just whether a specific family of socket can be used by an application, along with setting up some basic infrastructure for network mediation to follow. The user space rules have the basic form of NETWORK RULE = [ QUALIFIERS ] 'network' [ DOMAIN ] [ TYPE | PROTOCOL ] DOMAIN = ( 'inet' | 'ax25' | 'ipx' | 'appletalk' | 'netrom' | 'bridge' | 'atmpvc' | 'x25' | 'inet6' | 'rose' | 'netbeui' | 'security' | 'key' | 'packet' | 'ash' | 'econet' | 'atmsvc' | 'sna' | 'irda' | 'pppox' | 'wanpipe' | 'bluetooth' | 'netlink' | 'unix' | 'rds' | 'llc' | 'can' | 'tipc' | 'iucv' | 'rxrpc' | 'isdn' | 'phonet' | 'ieee802154' | 'caif' | 'alg' | 'nfc' | 'vsock' | 'mpls' | 'ib' | 'kcm' ) ',' TYPE = ( 'stream' | 'dgram' | 'seqpacket' | 'rdm' | 'raw' | 'packet' ) PROTOCOL = ( 'tcp' | 'udp' | 'icmp' ) e.g. 
network, network inet, Signed-off-by: John Johansen Acked-by: Seth Arnold --- security/apparmor/.gitignore | 1 + security/apparmor/Makefile | 43 +++- security/apparmor/apparmorfs.c | 1 + security/apparmor/file.c | 30 +++ security/apparmor/include/audit.h | 26 +- security/apparmor/include/net.h | 114 +++++++++ security/apparmor/include/perms.h | 5 +- security/apparmor/include/policy.h | 13 + security/apparmor/lib.c | 5 +- security/apparmor/lsm.c | 387 +++++++++++++++++++++++++++++ security/apparmor/net.c | 184 ++++++++++++++ security/apparmor/policy_unpack.c | 47 +++- 12 files changed, 840 insertions(+), 16 deletions(-) create mode 100644 security/apparmor/include/net.h create mode 100644 security/apparmor/net.c diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore index 9cdec70d72b8..d5b291e94264 100644 --- a/security/apparmor/.gitignore +++ b/security/apparmor/.gitignore @@ -1,5 +1,6 @@ # # Generated include files # +net_names.h capability_names.h rlim_names.h diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile index 81a34426d024..dafdd387d42b 100644 --- a/security/apparmor/Makefile +++ b/security/apparmor/Makefile @@ -4,11 +4,44 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \ path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \ - resource.o secid.o file.o policy_ns.o label.o mount.o + resource.o secid.o file.o policy_ns.o label.o mount.o net.o apparmor-$(CONFIG_SECURITY_APPARMOR_HASH) += crypto.o -clean-files := capability_names.h rlim_names.h +clean-files := capability_names.h rlim_names.h net_names.h +# Build a lower case string table of address family names +# Transform lines from +# #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ +# #define AF_INET 2 /* Internet IP Protocol */ +# to +# [1] = "local", +# [2] = "inet", +# +# and build the securityfs entries for the mapping. 
+# Transforms lines from +# #define AF_INET 2 /* Internet IP Protocol */ +# to +# #define AA_SFS_AF_MASK "local inet" +quiet_cmd_make-af = GEN $@ +cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ;\ + sed $< >>$@ -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e "/AF_ROUTE/d" -e \ + 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ + echo "};" >> $@ ;\ + printf '%s' '\#define AA_SFS_AF_MASK "' >> $@ ;\ + sed -r -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e "/AF_ROUTE/d" -e \ + 's/^\#define[ \t]+AF_([A-Z0-9_]+)[ \t]+([0-9]+)(.*)/\L\1/p'\ + $< | tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@ + +# Build a lower case string table of sock type names +# Transform lines from +# SOCK_STREAM = 1, +# to +# [1] = "stream", +quiet_cmd_make-sock = GEN $@ +cmd_make-sock = echo "static const char *sock_type_names[] = {" >> $@ ;\ + sed $^ >>$@ -r -n \ + -e 's/^\tSOCK_([A-Z0-9_]+)[\t]+=[ \t]+([0-9]+)(.*)/[\2] = "\L\1",/p';\ + echo "};" >> $@ # Build a lower case string table of capability names # Transforms lines from @@ -61,6 +94,7 @@ cmd_make-rlim = echo "static const char *const rlim_names[RLIM_NLIMITS] = {" \ tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@ $(obj)/capability.o : $(obj)/capability_names.h +$(obj)/net.o : $(obj)/net_names.h $(obj)/resource.o : $(obj)/rlim_names.h $(obj)/capability_names.h : $(srctree)/include/uapi/linux/capability.h \ $(src)/Makefile @@ -68,3 +102,8 @@ $(obj)/capability_names.h : $(srctree)/include/uapi/linux/capability.h \ $(obj)/rlim_names.h : $(srctree)/include/uapi/asm-generic/resource.h \ $(src)/Makefile $(call cmd,make-rlim) +$(obj)/net_names.h : $(srctree)/include/linux/socket.h \ + $(srctree)/include/linux/net.h \ + $(src)/Makefile + $(call cmd,make-af) + $(call cmd,make-sock) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7acea14c850b..125dad5c3fde 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2202,6 +2202,7 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { AA_SFS_DIR("policy", aa_sfs_entry_policy), AA_SFS_DIR("domain", aa_sfs_entry_domain), AA_SFS_DIR("file", aa_sfs_entry_file), + AA_SFS_DIR("network", aa_sfs_entry_network), AA_SFS_DIR("mount", aa_sfs_entry_mount), AA_SFS_DIR("namespaces", aa_sfs_entry_ns), AA_SFS_FILE_U64("capability", VFS_CAP_FLAGS_MASK), diff --git a/security/apparmor/file.c b/security/apparmor/file.c index 3382518b87fa..db80221891c6 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -21,6 +21,7 @@ #include "include/context.h" #include "include/file.h" #include "include/match.h" +#include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/label.h" @@ -566,6 +567,32 @@ static int __file_path_perm(const char *op, struct aa_label *label, return error; } +static int __file_sock_perm(const char *op, struct aa_label *label, + struct aa_label *flabel, struct file *file, + u32 request, u32 denied) +{ + struct socket *sock = (struct socket *) file->private_data; + int error; + + AA_BUG(!sock); + + /* revalidation due to label out of date. 
No revocation at this time */ + if (!denied && aa_label_is_subset(flabel, label)) + return 0; + + /* TODO: improve to skip profiles cached in flabel */ + error = aa_sock_file_perm(label, op, request, sock); + if (denied) { + /* TODO: improve to skip profiles checked above */ + /* check every profile in file label to is cached */ + last_error(error, aa_sock_file_perm(flabel, op, request, sock)); + } + if (!error) + update_file_ctx(file_ctx(file), label, request); + + return error; +} + /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked @@ -610,6 +637,9 @@ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, error = __file_path_perm(op, label, flabel, file, request, denied); + else if (S_ISSOCK(file_inode(file)->i_mode)) + error = __file_sock_perm(op, label, flabel, file, request, + denied); done: rcu_read_unlock(); diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index 620e81169659..ff4316e1068d 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -121,21 +121,29 @@ struct apparmor_audit_data { /* these entries require a custom callback fn */ struct { struct aa_label *peer; - struct { - const char *target; - kuid_t ouid; - } fs; + union { + struct { + kuid_t ouid; + const char *target; + } fs; + struct { + int type, protocol; + struct sock *peer_sk; + void *addr; + int addrlen; + } net; + int signal; + struct { + int rlim; + unsigned long max; + } rlim; + }; }; struct { struct aa_profile *profile; const char *ns; long pos; } iface; - int signal; - struct { - int rlim; - unsigned long max; - } rlim; struct { const char *src_name; const char *type; diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h new file mode 100644 index 000000000000..140c8efcf364 --- /dev/null +++ b/security/apparmor/include/net.h @@ -0,0 +1,114 @@ +/* + * AppArmor security module + * + * This file contains AppArmor network mediation definitions. + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#ifndef __AA_NET_H +#define __AA_NET_H + +#include +#include + +#include "apparmorfs.h" +#include "label.h" +#include "perms.h" +#include "policy.h" + +#define AA_MAY_SEND AA_MAY_WRITE +#define AA_MAY_RECEIVE AA_MAY_READ + +#define AA_MAY_SHUTDOWN AA_MAY_DELETE + +#define AA_MAY_CONNECT AA_MAY_OPEN +#define AA_MAY_ACCEPT 0x00100000 + +#define AA_MAY_BIND 0x00200000 +#define AA_MAY_LISTEN 0x00400000 + +#define AA_MAY_SETOPT 0x01000000 +#define AA_MAY_GETOPT 0x02000000 + +#define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ + AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN | \ + AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \ + AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT) + +#define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ + AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\ + AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD | \ + AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK | \ + AA_MAY_MPROT) + +#define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \ + AA_MAY_ACCEPT) +struct aa_sk_ctx { + struct aa_label *label; + struct aa_label *peer; + struct path path; +}; + +#define SK_CTX(X) ((X)->sk_security) +#define SOCK_ctx(X) SOCK_INODE(X)->i_security +#define DEFINE_AUDIT_NET(NAME, OP, SK, F, T, P) \ + struct lsm_network_audit NAME ## _net = { .sk = (SK), \ + .family = (F)}; \ + DEFINE_AUDIT_DATA(NAME, \ + ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \ + LSM_AUDIT_DATA_NONE, \ + OP); \ + NAME.u.net = &(NAME ## _net); \ + aad(&NAME)->net.type = (T); \ + aad(&NAME)->net.protocol = (P) + +#define DEFINE_AUDIT_SK(NAME, OP, SK) \ + DEFINE_AUDIT_NET(NAME, OP, SK, (SK)->sk_family, (SK)->sk_type, \ + (SK)->sk_protocol) + +/* struct aa_net - network confinement data + * @allow: basic network families permissions + * @audit: which network permissions to force audit + * @quiet: which network permissions to quiet rejects + */ +struct aa_net { + u16 allow[AF_MAX]; + u16 audit[AF_MAX]; + u16 quiet[AF_MAX]; +}; + + +extern struct aa_sfs_entry aa_sfs_entry_network[]; + +void audit_net_cb(struct audit_buffer *ab, void *va); +int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, + u32 request, u16 family, int type); +int aa_af_perm(struct aa_label *label, const char *op, u32 request, u16 family, + int type, int protocol); +static inline int aa_profile_af_sk_perm(struct aa_profile *profile, + struct common_audit_data *sa, + u32 request, + struct sock *sk) +{ + return aa_profile_af_perm(profile, sa, request, sk->sk_family, + sk->sk_type); +} +int aa_sk_perm(const char *op, u32 request, struct sock *sk); + +int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, + struct socket *sock); + + +static inline void aa_free_net_rules(struct aa_net *new) +{ + /* NOP */ +} + +#endif /* __AA_NET_H */ diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 2b27bb79aec4..af04d5a7d73d 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -135,9 +135,10 @@ extern struct aa_perms allperms; void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask); -void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask); +void aa_audit_perm_names(struct audit_buffer *ab, const char * const *names, + u32 mask); void aa_audit_perm_mask(struct audit_buffer *ab, u32 mask, const char *chrs, - u32 chrsmask, const char **names, u32 namesmask); + u32 chrsmask, const char * const *names, u32 namesmask); void aa_apply_modes_to_perms(struct 
aa_profile *profile, struct aa_perms *perms); void aa_compute_perms(struct aa_dfa *dfa, unsigned int state, diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h index 17fe41a9cac3..4364088a0b9e 100644 --- a/security/apparmor/include/policy.h +++ b/security/apparmor/include/policy.h @@ -30,6 +30,7 @@ #include "file.h" #include "lib.h" #include "label.h" +#include "net.h" #include "perms.h" #include "resource.h" @@ -111,6 +112,7 @@ struct aa_data { * @policy: general match rules governing policy * @file: The set of rules governing basic file access and domain transitions * @caps: capabilities for the profile + * @net: network controls for the profile * @rlimits: rlimits for the profile * * @dents: dentries for the profiles file entries in apparmorfs @@ -148,6 +150,7 @@ struct aa_profile { struct aa_policydb policy; struct aa_file_rules file; struct aa_caps caps; + struct aa_net net; struct aa_rlimit rlimits; struct aa_loaddata *rawdata; @@ -220,6 +223,16 @@ static inline unsigned int PROFILE_MEDIATES_SAFE(struct aa_profile *profile, return 0; } +static inline unsigned int PROFILE_MEDIATES_AF(struct aa_profile *profile, + u16 AF) { + unsigned int state = PROFILE_MEDIATES(profile, AA_CLASS_NET); + u16 be_af = cpu_to_be16(AF); + + if (!state) + return 0; + return aa_dfa_match_len(profile->policy.dfa, state, (char *) &be_af, 2); +} + /** * aa_get_profile - increment refcount on profile @p * @p: profile (MAYBE NULL) diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 08ca26bcca77..8818621b5d95 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -211,7 +211,8 @@ void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask) *str = '\0'; } -void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask) +void aa_audit_perm_names(struct audit_buffer *ab, const char * const *names, + u32 mask) { const char *fmt = "%s"; unsigned int i, perm = 1; @@ -229,7 +230,7 @@ void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask) } void aa_audit_perm_mask(struct audit_buffer *ab, u32 mask, const char *chrs, - u32 chrsmask, const char **names, u32 namesmask) + u32 chrsmask, const char * const *names, u32 namesmask) { char str[33]; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 4ad0b3a45142..cc5ab23a2d84 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -33,6 +33,7 @@ #include "include/context.h" #include "include/file.h" #include "include/ipc.h" +#include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" @@ -736,6 +737,368 @@ static int apparmor_task_kill(struct task_struct *target, struct siginfo *info, return error; } +/** + * apparmor_sk_alloc_security - allocate and attach the sk_security field + */ +static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags) +{ + struct aa_sk_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), flags); + if (!ctx) + return -ENOMEM; + + SK_CTX(sk) = ctx; + + return 0; +} + +/** + * apparmor_sk_free_security - free the sk_security field + */ +static void apparmor_sk_free_security(struct sock *sk) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + SK_CTX(sk) = NULL; + aa_put_label(ctx->label); + aa_put_label(ctx->peer); + path_put(&ctx->path); + kfree(ctx); +} + +/** + * apparmor_clone_security - clone the sk_security field + */ +static void apparmor_sk_clone_security(const struct sock *sk, + struct sock *newsk) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + struct aa_sk_ctx *new = 
SK_CTX(newsk); + + new->label = aa_get_label(ctx->label); + new->peer = aa_get_label(ctx->peer); + new->path = ctx->path; + path_get(&new->path); +} + +static int aa_sock_create_perm(struct aa_label *label, int family, int type, + int protocol) +{ + AA_BUG(!label); + AA_BUG(in_interrupt()); + + return aa_af_perm(label, OP_CREATE, AA_MAY_CREATE, family, type, + protocol); +} + + +/** + * apparmor_socket_create - check perms before creating a new socket + */ +static int apparmor_socket_create(int family, int type, int protocol, int kern) +{ + struct aa_label *label; + int error = 0; + + label = begin_current_label_crit_section(); + if (!(kern || unconfined(label))) + error = aa_sock_create_perm(label, family, type, protocol); + end_current_label_crit_section(label); + + return error; +} + +/** + * apparmor_socket_post_create - setup the per-socket security struct + * + * Note: + * - kernel sockets currently labeled unconfined but we may want to + * move to a special kernel label + * - socket may not have sk here if created with sock_create_lite or + * sock_alloc. These should be accept cases which will be handled in + * sock_graft. + */ +static int apparmor_socket_post_create(struct socket *sock, int family, + int type, int protocol, int kern) +{ + struct aa_label *label; + + if (kern) { + struct aa_ns *ns = aa_get_current_ns(); + + label = aa_get_label(ns_unconfined(ns)); + aa_put_ns(ns); + } else + label = aa_get_current_label(); + + if (sock->sk) { + struct aa_sk_ctx *ctx = SK_CTX(sock->sk); + + aa_put_label(ctx->label); + ctx->label = aa_get_label(label); + } + aa_put_label(label); + + return 0; +} + +/** + * apparmor_socket_bind - check perms before bind addr to socket + */ +static int apparmor_socket_bind(struct socket *sock, + struct sockaddr *address, int addrlen) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!address); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk); +} + +/** + * apparmor_socket_connect - check perms before connecting @sock to @address + */ +static int apparmor_socket_connect(struct socket *sock, + struct sockaddr *address, int addrlen) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!address); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk); +} + +/** + * apparmor_socket_list - check perms before allowing listen + */ +static int apparmor_socket_listen(struct socket *sock, int backlog) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk); +} + +/** + * apparmor_socket_accept - check perms before accepting a new connection. + * + * Note: while @newsock is created and has some information, the accept + * has not been done. 
+ */ +static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!newsock); + AA_BUG(in_interrupt()); + + return aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk); +} + +static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, + struct msghdr *msg, int size) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(!msg); + AA_BUG(in_interrupt()); + + return aa_sk_perm(op, request, sock->sk); +} + +/** + * apparmor_socket_sendmsg - check perms before sending msg to another socket + */ +static int apparmor_socket_sendmsg(struct socket *sock, + struct msghdr *msg, int size) +{ + return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); +} + +/** + * apparmor_socket_recvmsg - check perms before receiving a message + */ +static int apparmor_socket_recvmsg(struct socket *sock, + struct msghdr *msg, int size, int flags) +{ + return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); +} + +/* revaliation, get/set attr, shutdown */ +static int aa_sock_perm(const char *op, u32 request, struct socket *sock) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(in_interrupt()); + + return aa_sk_perm(op, request, sock->sk); +} + +/** + * apparmor_socket_getsockname - check perms before getting the local address + */ +static int apparmor_socket_getsockname(struct socket *sock) +{ + return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); +} + +/** + * apparmor_socket_getpeername - check perms before getting remote address + */ +static int apparmor_socket_getpeername(struct socket *sock) +{ + return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); +} + +/* revaliation, get/set attr, opt */ +static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, + int level, int optname) +{ + AA_BUG(!sock); + AA_BUG(!sock->sk); + AA_BUG(in_interrupt()); + + return aa_sk_perm(op, request, sock->sk); +} + +/** + * apparmor_getsockopt - check perms before getting socket options + */ +static int apparmor_socket_getsockopt(struct socket *sock, int level, + int optname) +{ + return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, + level, optname); +} + +/** + * apparmor_setsockopt - check perms before setting socket options + */ +static int apparmor_socket_setsockopt(struct socket *sock, int level, + int optname) +{ + return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, + level, optname); +} + +/** + * apparmor_socket_shutdown - check perms before shutting down @sock conn + */ +static int apparmor_socket_shutdown(struct socket *sock, int how) +{ + return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); +} + +/** + * apparmor_socket_sock_recv_skb - check perms before associating skb to sk + * + * Note: can not sleep may be called with locks held + * + * dont want protocol specific in __skb_recv_datagram() + * to deny an incoming connection socket_sock_rcv_skb() + */ +static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + + +static struct aa_label *sk_peer_label(struct sock *sk) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (ctx->peer) + return ctx->peer; + + return ERR_PTR(-ENOPROTOOPT); +} + +/** + * apparmor_socket_getpeersec_stream - get security context of peer + * + * Note: for tcp only valid if using ipsec or cipso on lan + */ +static int apparmor_socket_getpeersec_stream(struct socket *sock, + char __user *optval, + int __user *optlen, + unsigned int len) +{ + char *name; + int slen, error = 0; + struct aa_label *label; + struct aa_label 
*peer; + + label = begin_current_label_crit_section(); + peer = sk_peer_label(sock->sk); + if (IS_ERR(peer)) { + error = PTR_ERR(peer); + goto done; + } + slen = aa_label_asxprint(&name, labels_ns(label), peer, + FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | + FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); + /* don't include terminating \0 in slen, it breaks some apps */ + if (slen < 0) { + error = -ENOMEM; + } else { + if (slen > len) { + error = -ERANGE; + } else if (copy_to_user(optval, name, slen)) { + error = -EFAULT; + goto out; + } + if (put_user(slen, optlen)) + error = -EFAULT; +out: + kfree(name); + + } + +done: + end_current_label_crit_section(label); + + return error; +} + +/** + * apparmor_socket_getpeersec_dgram - get security label of packet + * @sock: the peer socket + * @skb: packet data + * @secid: pointer to where to put the secid of the packet + * + * Sets the netlabel socket state on sk from parent + */ +static int apparmor_socket_getpeersec_dgram(struct socket *sock, + struct sk_buff *skb, u32 *secid) + +{ + /* TODO: requires secid support */ + return -ENOPROTOOPT; +} + +/** + * apparmor_sock_graft - Initialize newly created socket + * @sk: child sock + * @parent: parent socket + * + * Note: could set off of SOCK_CTX(parent) but need to track inode and we can + * just set sk security information off of current creating process label + * Labeling of sk for accept case - probably should be sock based + * instead of task, because of the case where an implicitly labeled + * socket is shared by different tasks. + */ +static void apparmor_sock_graft(struct sock *sk, struct socket *parent) +{ + struct aa_sk_ctx *ctx = SK_CTX(sk); + + if (!ctx->label) + ctx->label = aa_get_current_label(); +} + static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), @@ -770,6 +1133,30 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), + LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), + LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), + LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), + + LSM_HOOK_INIT(socket_create, apparmor_socket_create), + LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), + LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), + LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), + LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), + LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), + LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), + LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), + LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), + LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), + LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), + LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), + LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), + LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), + LSM_HOOK_INIT(socket_getpeersec_stream, + apparmor_socket_getpeersec_stream), + LSM_HOOK_INIT(socket_getpeersec_dgram, + apparmor_socket_getpeersec_dgram), + LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), + LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), diff --git a/security/apparmor/net.c 
b/security/apparmor/net.c new file mode 100644 index 000000000000..33d54435f8d6 --- /dev/null +++ b/security/apparmor/net.c @@ -0,0 +1,184 @@ +/* + * AppArmor security module + * + * This file contains AppArmor network mediation + * + * Copyright (C) 1998-2008 Novell/SUSE + * Copyright 2009-2017 Canonical Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include "include/apparmor.h" +#include "include/audit.h" +#include "include/context.h" +#include "include/label.h" +#include "include/net.h" +#include "include/policy.h" + +#include "net_names.h" + + +struct aa_sfs_entry aa_sfs_entry_network[] = { + AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), + { } +}; + +static const char * const net_mask_names[] = { + "unknown", + "send", + "receive", + "unknown", + + "create", + "shutdown", + "connect", + "unknown", + + "setattr", + "getattr", + "setcred", + "getcred", + + "chmod", + "chown", + "chgrp", + "lock", + + "mmap", + "mprot", + "unknown", + "unknown", + + "accept", + "bind", + "listen", + "unknown", + + "setopt", + "getopt", + "unknown", + "unknown", + + "unknown", + "unknown", + "unknown", + "unknown", +}; + + +/* audit callback for net specific fields */ +void audit_net_cb(struct audit_buffer *ab, void *va) +{ + struct common_audit_data *sa = va; + + audit_log_format(ab, " family="); + if (address_family_names[sa->u.net->family]) + audit_log_string(ab, address_family_names[sa->u.net->family]); + else + audit_log_format(ab, "\"unknown(%d)\"", sa->u.net->family); + audit_log_format(ab, " sock_type="); + if (sock_type_names[aad(sa)->net.type]) + audit_log_string(ab, sock_type_names[aad(sa)->net.type]); + else + audit_log_format(ab, "\"unknown(%d)\"", aad(sa)->net.type); + audit_log_format(ab, " protocol=%d", aad(sa)->net.protocol); + + if (aad(sa)->request & NET_PERMS_MASK) { + audit_log_format(ab, " requested_mask="); + aa_audit_perm_mask(ab, aad(sa)->request, NULL, 0, + net_mask_names, NET_PERMS_MASK); + + if (aad(sa)->denied & NET_PERMS_MASK) { + audit_log_format(ab, " denied_mask="); + aa_audit_perm_mask(ab, aad(sa)->denied, NULL, 0, + net_mask_names, NET_PERMS_MASK); + } + } + if (aad(sa)->peer) { + audit_log_format(ab, " peer="); + aa_label_xaudit(ab, labels_ns(aad(sa)->label), aad(sa)->peer, + FLAGS_NONE, GFP_ATOMIC); + } +} + + +/* Generic af perm */ +int aa_profile_af_perm(struct aa_profile *profile, struct common_audit_data *sa, + u32 request, u16 family, int type) +{ + struct aa_perms perms = { }; + + AA_BUG(family >= AF_MAX); + AA_BUG(type < 0 || type >= SOCK_MAX); + + if (profile_unconfined(profile)) + return 0; + + perms.allow = (profile->net.allow[family] & (1 << type)) ? + ALL_PERMS_MASK : 0; + perms.audit = (profile->net.audit[family] & (1 << type)) ? + ALL_PERMS_MASK : 0; + perms.quiet = (profile->net.quiet[family] & (1 << type)) ? 
+ ALL_PERMS_MASK : 0; + aa_apply_modes_to_perms(profile, &perms); + + return aa_check_perms(profile, &perms, request, sa, audit_net_cb); +} + +int aa_af_perm(struct aa_label *label, const char *op, u32 request, u16 family, + int type, int protocol) +{ + struct aa_profile *profile; + DEFINE_AUDIT_NET(sa, op, NULL, family, type, protocol); + + return fn_for_each_confined(label, profile, + aa_profile_af_perm(profile, &sa, request, family, + type)); +} + +static int aa_label_sk_perm(struct aa_label *label, const char *op, u32 request, + struct sock *sk) +{ + struct aa_profile *profile; + DEFINE_AUDIT_SK(sa, op, sk); + + AA_BUG(!label); + AA_BUG(!sk); + + if (unconfined(label)) + return 0; + + return fn_for_each_confined(label, profile, + aa_profile_af_sk_perm(profile, &sa, request, sk)); +} + +int aa_sk_perm(const char *op, u32 request, struct sock *sk) +{ + struct aa_label *label; + int error; + + AA_BUG(!sk); + AA_BUG(in_interrupt()); + + /* TODO: switch to begin_current_label ???? */ + label = begin_current_label_crit_section(); + error = aa_label_sk_perm(label, op, request, sk); + end_current_label_crit_section(label); + + return error; +} + + +int aa_sock_file_perm(struct aa_label *label, const char *op, u32 request, + struct socket *sock) +{ + AA_BUG(!label); + AA_BUG(!sock); + AA_BUG(!sock->sk); + + return aa_label_sk_perm(label, op, request, sock->sk); +} diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 4ede87c30f8b..5a2aec358322 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -275,6 +275,19 @@ fail: return 0; } +static bool unpack_u16(struct aa_ext *e, u16 *data, const char *name) +{ + if (unpack_nameX(e, AA_U16, name)) { + if (!inbounds(e, sizeof(u16))) + return 0; + if (data) + *data = le16_to_cpu(get_unaligned((__le16 *) e->pos)); + e->pos += sizeof(u16); + return 1; + } + return 0; +} + static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name) { if (unpack_nameX(e, AA_U32, name)) { @@ -584,7 +597,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) struct aa_profile *profile = NULL; const char *tmpname, *tmpns = NULL, *name = NULL; const char *info = "failed to unpack profile"; - size_t ns_len; + size_t size = 0, ns_len; struct rhashtable_params params = { 0 }; char *key = NULL; struct aa_data *data; @@ -717,6 +730,38 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) goto fail; } + size = unpack_array(e, "net_allowed_af"); + if (size) { + + for (i = 0; i < size; i++) { + /* discard extraneous rules that this kernel will + * never request + */ + if (i >= AF_MAX) { + u16 tmp; + + if (!unpack_u16(e, &tmp, NULL) || + !unpack_u16(e, &tmp, NULL) || + !unpack_u16(e, &tmp, NULL)) + goto fail; + continue; + } + if (!unpack_u16(e, &profile->net.allow[i], NULL)) + goto fail; + if (!unpack_u16(e, &profile->net.audit[i], NULL)) + goto fail; + if (!unpack_u16(e, &profile->net.quiet[i], NULL)) + goto fail; + } + if (!unpack_nameX(e, AA_ARRAYEND, NULL)) + goto fail; + } + if (VERSION_LT(e->version, v7)) { + /* pre v7 policy always allowed these */ + profile->net.allow[AF_UNIX] = 0xffff; + profile->net.allow[AF_NETLINK] = 0xffff; + } + if (unpack_nameX(e, AA_STRUCT, "policydb")) { /* generic policy dfa - optional and may be NULL */ info = "failed to unpack policydb"; From d07881d2edb0ab783846730629ac2faeaafdf4f1 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 08:59:57 -0700 Subject: [PATCH 0605/1322] apparmor: move new_null_profile 
to after profile lookup fns() new_null_profile will need to use some of the profile lookup fns() so move instead of doing forward fn declarations. Signed-off-by: John Johansen --- security/apparmor/policy.c | 158 ++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 244ea4a4a8f0..a81a384a63b1 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -289,85 +289,6 @@ fail: return NULL; } -/** - * aa_new_null_profile - create or find a null-X learning profile - * @parent: profile that caused this profile to be created (NOT NULL) - * @hat: true if the null- learning profile is a hat - * @base: name to base the null profile off of - * @gfp: type of allocation - * - * Find/Create a null- complain mode profile used in learning mode. The - * name of the profile is unique and follows the format of parent//null-XXX. - * where XXX is based on the @name or if that fails or is not supplied - * a unique number - * - * null profiles are added to the profile list but the list does not - * hold a count on them so that they are automatically released when - * not in use. - * - * Returns: new refcounted profile else NULL on failure - */ -struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, - const char *base, gfp_t gfp) -{ - struct aa_profile *profile; - char *name; - - AA_BUG(!parent); - - if (base) { - name = kmalloc(strlen(parent->base.hname) + 8 + strlen(base), - gfp); - if (name) { - sprintf(name, "%s//null-%s", parent->base.hname, base); - goto name; - } - /* fall through to try shorter uniq */ - } - - name = kmalloc(strlen(parent->base.hname) + 2 + 7 + 8, gfp); - if (!name) - return NULL; - sprintf(name, "%s//null-%x", parent->base.hname, - atomic_inc_return(&parent->ns->uniq_null)); - -name: - /* lookup to see if this is a dup creation */ - profile = aa_find_child(parent, basename(name)); - if (profile) - goto out; - - profile = aa_alloc_profile(name, NULL, gfp); - if (!profile) - goto fail; - - profile->mode = APPARMOR_COMPLAIN; - profile->label.flags |= FLAG_NULL; - if (hat) - profile->label.flags |= FLAG_HAT; - profile->path_flags = parent->path_flags; - - /* released on free_profile */ - rcu_assign_pointer(profile->parent, aa_get_profile(parent)); - profile->ns = aa_get_ns(parent->ns); - profile->file.dfa = aa_get_dfa(nulldfa); - profile->policy.dfa = aa_get_dfa(nulldfa); - - mutex_lock(&profile->ns->lock); - __add_profile(&parent->base.profiles, profile); - mutex_unlock(&profile->ns->lock); - - /* refcount released by caller */ -out: - kfree(name); - - return profile; - -fail: - aa_free_profile(profile); - return NULL; -} - /* TODO: profile accounting - setup in remove */ /** @@ -558,6 +479,85 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, return profile; } +/** + * aa_new_null_profile - create or find a null-X learning profile + * @parent: profile that caused this profile to be created (NOT NULL) + * @hat: true if the null- learning profile is a hat + * @base: name to base the null profile off of + * @gfp: type of allocation + * + * Find/Create a null- complain mode profile used in learning mode. The + * name of the profile is unique and follows the format of parent//null-XXX. + * where XXX is based on the @name or if that fails or is not supplied + * a unique number + * + * null profiles are added to the profile list but the list does not + * hold a count on them so that they are automatically released when + * not in use. 
+ * + * Returns: new refcounted profile else NULL on failure + */ +struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, + const char *base, gfp_t gfp) +{ + struct aa_profile *profile; + char *name; + + AA_BUG(!parent); + + if (base) { + name = kmalloc(strlen(parent->base.hname) + 8 + strlen(base), + gfp); + if (name) { + sprintf(name, "%s//null-%s", parent->base.hname, base); + goto name; + } + /* fall through to try shorter uniq */ + } + + name = kmalloc(strlen(parent->base.hname) + 2 + 7 + 8, gfp); + if (!name) + return NULL; + sprintf(name, "%s//null-%x", parent->base.hname, + atomic_inc_return(&parent->ns->uniq_null)); + +name: + /* lookup to see if this is a dup creation */ + profile = aa_find_child(parent, basename(name)); + if (profile) + goto out; + + profile = aa_alloc_profile(name, NULL, gfp); + if (!profile) + goto fail; + + profile->mode = APPARMOR_COMPLAIN; + profile->label.flags |= FLAG_NULL; + if (hat) + profile->label.flags |= FLAG_HAT; + profile->path_flags = parent->path_flags; + + /* released on free_profile */ + rcu_assign_pointer(profile->parent, aa_get_profile(parent)); + profile->ns = aa_get_ns(parent->ns); + profile->file.dfa = aa_get_dfa(nulldfa); + profile->policy.dfa = aa_get_dfa(nulldfa); + + mutex_lock(&profile->ns->lock); + __add_profile(&parent->base.profiles, profile); + mutex_unlock(&profile->ns->lock); + + /* refcount released by caller */ +out: + kfree(name); + + return profile; + +fail: + aa_free_profile(profile); + return NULL; +} + /** * replacement_allowed - test to see if replacement is allowed * @profile: profile to test if it can be replaced (MAYBE NULL) From 290638a52a808d658bd04b746b3ca46886c157e0 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 05:40:49 -0700 Subject: [PATCH 0606/1322] apparmor: fix race condition in null profile creation There is a race when a null- profile is being created, between the initial lookup/creation of the profile and the lock/addition of the profile. This could result in multiple versions of a profile being added to the list, which then need to be removed/replaced. Since these are learning profiles there is no effect on mediation. 
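The fix below closes the race by repeating the child lookup after taking the namespace lock and reusing an already-added profile instead of inserting a second copy. A minimal sketch of that lookup/recheck-under-lock pattern, using hypothetical names (find_child(), add_child(), get_profile(), free_profile() and parent->children are stand-ins for illustration, not the actual AppArmor helpers):

	/* 'profile' was allocated after an unlocked lookup found no child */
	mutex_lock(&ns->lock);
	p = find_child(&parent->children, name);	/* recheck under the lock */
	if (p) {
		/* lost the race: drop our copy and take a ref on the winner */
		free_profile(profile);
		profile = get_profile(p);
	} else {
		add_child(&parent->children, profile);
	}
	mutex_unlock(&ns->lock);
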
Signed-off-by: John Johansen --- security/apparmor/policy.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index a81a384a63b1..4243b0c3f0e4 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -500,7 +500,8 @@ struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, const char *base, gfp_t gfp) { - struct aa_profile *profile; + struct aa_profile *p, *profile; + const char *bname; char *name; AA_BUG(!parent); @@ -523,7 +524,8 @@ struct aa_profile *aa_new_null_profile(struct aa_profile *parent, bool hat, name: /* lookup to see if this is a dup creation */ - profile = aa_find_child(parent, basename(name)); + bname = basename(name); + profile = aa_find_child(parent, bname); if (profile) goto out; @@ -544,7 +546,13 @@ name: profile->policy.dfa = aa_get_dfa(nulldfa); mutex_lock(&profile->ns->lock); - __add_profile(&parent->base.profiles, profile); + p = __find_child(&parent->base.profiles, bname); + if (p) { + aa_free_profile(profile); + profile = aa_get_profile(p); + } else { + __add_profile(&parent->base.profiles, profile); + } mutex_unlock(&profile->ns->lock); /* refcount released by caller */ From 15372b97aa7593c6f5bc1afe69f42fd403c40685 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 05:48:06 -0700 Subject: [PATCH 0607/1322] apparmor: ensure unconfined profiles have dfas initialized Generally unconfined has early bailout tests and does not need the dfas initialized, however if an early bailout test is ever missed it will result in an oops. Be defensive and initialize the unconfined profile to have null dfas (no permission) so if an early bailout test is missed we fail closed (no perms granted) instead of oopsing. Signed-off-by: John Johansen --- security/apparmor/policy_ns.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/apparmor/policy_ns.c b/security/apparmor/policy_ns.c index 351d3bab3a3d..62a3589c62ab 100644 --- a/security/apparmor/policy_ns.c +++ b/security/apparmor/policy_ns.c @@ -112,6 +112,8 @@ static struct aa_ns *alloc_ns(const char *prefix, const char *name) ns->unconfined->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; ns->unconfined->mode = APPARMOR_UNCONFINED; + ns->unconfined->file.dfa = aa_get_dfa(nulldfa); + ns->unconfined->policy.dfa = aa_get_dfa(nulldfa); /* ns and ns->unconfined share ns->unconfined refcount */ ns->unconfined->ns = ns; From bc4d82fb946e7b471eab2a80e384227c4eb15652 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 16 Aug 2017 09:33:48 -0700 Subject: [PATCH 0608/1322] apparmor: fix incorrect type assignment when freeing proxies sparse reports poisoning the proxy->label before freeing the struct is resulting in a sparse build warning. ../security/apparmor/label.c:52:30: warning: incorrect type in assignment (different address spaces) ../security/apparmor/label.c:52:30: expected struct aa_label [noderef] *label ../security/apparmor/label.c:52:30: got struct aa_label * fix with RCU_INIT_POINTER as this is one of those cases where rcu_assign_pointer() is not needed. 
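The distinction, roughly (new_label below is only an illustrative variable; semantics as in <linux/rcupdate.h>):

    /* Publishing a pointer that concurrent readers may follow:
     * rcu_assign_pointer() includes the release barrier that orders the
     * object's initialization before the store becomes visible. */
    rcu_assign_pointer(proxy->label, new_label);

    /* In free_proxy() the proxy is already unreachable, so there are no
     * readers and no ordering requirement; RCU_INIT_POINTER() is a plain
     * store that also satisfies sparse's __rcu address-space checking. */
    RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON);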
Signed-off-by: John Johansen --- security/apparmor/label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 52b4ef14840d..c5b99b954580 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -49,7 +49,7 @@ static void free_proxy(struct aa_proxy *proxy) /* p->label will not updated any more as p is dead */ aa_put_label(rcu_dereference_protected(proxy->label, true)); memset(proxy, 0, sizeof(*proxy)); - proxy->label = (struct aa_label *) PROXY_POISON; + RCU_INIT_POINTER(proxy->label, (struct aa_label *)PROXY_POISON); kfree(proxy); } } From b1545dba092ba543eab1f7b5ed757a4988e267c8 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Wed, 23 Aug 2017 12:10:39 -0700 Subject: [PATCH 0609/1322] apparmor: fix build failure on sparc caused by undeclared signals In file included from security/apparmor/ipc.c:23:0: security/apparmor/include/sig_names.h:26:3: error: 'SIGSTKFLT' undeclared here (not in a function) [SIGSTKFLT] = 16, /* -, 16, - */ ^ security/apparmor/include/sig_names.h:26:3: error: array index in initializer not of integer type security/apparmor/include/sig_names.h:26:3: note: (near initialization for 'sig_map') security/apparmor/include/sig_names.h:51:3: error: 'SIGUNUSED' undeclared here (not in a function) [SIGUNUSED] = 34, /* -, 31, - */ ^ security/apparmor/include/sig_names.h:51:3: error: array index in initializer not of integer type security/apparmor/include/sig_names.h:51:3: note: (near initialization for 'sig_map') Reported-by: Stephen Rothwell Fixes: c6bf1adaecaa ("apparmor: add the ability to mediate signals") Signed-off-by: John Johansen --- security/apparmor/include/sig_names.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/security/apparmor/include/sig_names.h b/security/apparmor/include/sig_names.h index 0d4395f231ca..92e62fe95292 100644 --- a/security/apparmor/include/sig_names.h +++ b/security/apparmor/include/sig_names.h @@ -23,7 +23,9 @@ static const int sig_map[MAXMAPPED_SIG] = { [SIGPIPE] = 13, [SIGALRM] = 14, [SIGTERM] = 15, +#ifdef SIGSTKFLT [SIGSTKFLT] = 16, /* -, 16, - */ +#endif [SIGCHLD] = 17, /* 20, 17, 18. SIGCHLD -, -, 18 */ [SIGCONT] = 18, /* 19, 18, 25 */ [SIGSTOP] = 19, /* 17, 19, 23 */ @@ -47,7 +49,8 @@ static const int sig_map[MAXMAPPED_SIG] = { #if defined(SIGLOST) && SIGPWR != SIGLOST /* sparc */ [SIGLOST] = 33, /* unused on Linux */ #endif -#if defined(SIGLOST) && defined(SIGSYS) && SIGLOST != SIGSYS +#if defined(SIGUNUSED) && \ + defined(SIGLOST) && defined(SIGSYS) && SIGLOST != SIGSYS [SIGUNUSED] = 34, /* -, 31, - */ #endif }; From bf81100f63db7ea243d17b9d5008ba3af2fdf6b2 Mon Sep 17 00:00:00 2001 From: John Johansen Date: Thu, 31 Aug 2017 09:54:43 -0700 Subject: [PATCH 0610/1322] apparmor: fix apparmorfs DAC access permissions The DAC access permissions for several apparmorfs files are wrong. .access - needs to be writable by all tasks to perform queries the others in the set only provide a read fn so should be read only. With policy namespace virtualization all apparmor needs to control the permission and visibility checks directly which means DAC access has to be allowed for all user, group, and other. 
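For reference, the octal modes in the hunk below spelled out (the S_I* forms are only the conventional equivalents):

    /* 0666  rw-rw-rw-  (S_IRUGO | S_IWUGO)  .access: any task may write a query      */
    /* 0444  r--r--r--  (S_IRUGO)            read-only files; visibility and access
     *                                       are enforced by apparmor itself           */
    /* 0640  rw-r-----                       the old, overly restrictive setting       */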
BugLink: http://bugs.launchpad.net/bugs/1713103 Fixes: c97204baf840b ("apparmor: rename apparmor file fns and data to indicate use") Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 125dad5c3fde..518d5928661b 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -2215,12 +2215,12 @@ static struct aa_sfs_entry aa_sfs_entry_features[] = { }; static struct aa_sfs_entry aa_sfs_entry_apparmor[] = { - AA_SFS_FILE_FOPS(".access", 0640, &aa_sfs_access), + AA_SFS_FILE_FOPS(".access", 0666, &aa_sfs_access), AA_SFS_FILE_FOPS(".stacked", 0444, &seq_ns_stacked_fops), AA_SFS_FILE_FOPS(".ns_stacked", 0444, &seq_ns_nsstacked_fops), - AA_SFS_FILE_FOPS(".ns_level", 0666, &seq_ns_level_fops), - AA_SFS_FILE_FOPS(".ns_name", 0640, &seq_ns_name_fops), - AA_SFS_FILE_FOPS("profiles", 0440, &aa_sfs_profiles_fops), + AA_SFS_FILE_FOPS(".ns_level", 0444, &seq_ns_level_fops), + AA_SFS_FILE_FOPS(".ns_name", 0444, &seq_ns_name_fops), + AA_SFS_FILE_FOPS("profiles", 0444, &aa_sfs_profiles_fops), AA_SFS_DIR("features", aa_sfs_entry_features), { } }; From af21b01d1166248f282fc02d0f459c94de06615e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 22 Sep 2017 22:24:02 +0200 Subject: [PATCH 0611/1322] parisc: Reintroduce option to gzip-compress the kernel By adding the feature to build the kernel as self-extracting executeable, the possibility to simply compress the kernel with gzip was lost. This patch now reintroduces this possibilty again and leaves it up to the user to decide how the kernel should be built. The palo bootloader is able to natively load both formats. Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 12 ++++++++++++ arch/parisc/Makefile | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index ba7b7ddc3844..a57dedbfc7b7 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -257,6 +257,18 @@ config PARISC_PAGE_SIZE_64KB endchoice +config PARISC_SELF_EXTRACT + bool "Build kernel as self-extracting executable" + default y + help + Say Y if you want to build the parisc kernel as a kind of + self-extracting executable. + + If you say N here, the kernel will be compressed with gzip + which can be loaded by the palo bootloader directly too. + + If you don't know what to do here, say Y. + config SMP bool "Symmetric multi-processing support" ---help--- diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 58fae5d2449d..01946ebaff72 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -129,8 +129,13 @@ Image: vmlinux bzImage: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ +ifdef CONFIG_PARISC_SELF_EXTRACT vmlinuz: bzImage $(OBJCOPY) $(boot)/bzImage $@ +else +vmlinuz: vmlinux + @gzip -cf -9 $< > $@ +endif install: $(CONFIG_SHELL) $(src)/arch/parisc/install.sh \ From 8c031ba63f8f2a9efc471cb45b2ff18271556544 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 22 Sep 2017 21:57:11 +0200 Subject: [PATCH 0612/1322] parisc: Unbreak bootloader due to gcc-7 optimizations gcc-7 optimizes the byte-wise accesses of get_unaligned_le32() into word-wise accesses if the 32-bit integer output_len is declared as external. This panics then the bootloader since we don't have the unaligned access fault trap handler installed during boot time. Avoid this optimization by declaring output_len as byte-aligned and thus unbreak the bootloader code. 
Additionally, compile the boot code optimized for size. Signed-off-by: Helge Deller --- arch/parisc/boot/compressed/Makefile | 2 +- arch/parisc/boot/compressed/misc.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index 5450a11c9d10..7d7e594bda36 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -15,7 +15,7 @@ targets += misc.o piggy.o sizes.h head.o real2.o firmware.o KBUILD_CFLAGS := -D__KERNEL__ -O2 -DBOOTLOADER KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs +KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os ifndef CONFIG_64BIT KBUILD_CFLAGS += -mfast-indirect-calls endif diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 13a4bf9ac4da..9345b44b86f0 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -24,7 +24,8 @@ /* Symbols defined by linker scripts */ extern char input_data[]; extern int input_len; -extern __le32 output_len; /* at unaligned address, little-endian */ +/* output_len is inserted by the linker possibly at an unaligned address */ +extern __le32 output_len __aligned(1); extern char _text, _end; extern char _bss, _ebss; extern char _startcode_end; From c17c02040bf0d186cebd3e66ff349f955575bf38 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 22 Sep 2017 09:42:42 +0200 Subject: [PATCH 0613/1322] arch: remove unused *_segments() macros/functions Some architectures define the no-op macros/functions copy_segments, release_segments and forget_segments. These are used nowhere in the tree, so removed them. Signed-off-by: Tobias Klauser Acked-by: Vineet Gupta [for arch/arc] Signed-off-by: Linus Torvalds --- arch/arc/include/asm/processor.h | 3 --- arch/c6x/include/asm/processor.h | 3 --- arch/frv/include/asm/processor.h | 4 ---- arch/m32r/include/asm/processor.h | 8 -------- arch/metag/include/asm/processor.h | 3 --- arch/mn10300/kernel/process.c | 12 ------------ arch/sh/include/asm/processor_32.h | 4 ---- arch/sh/include/asm/processor_64.h | 4 ---- arch/um/include/asm/processor-generic.h | 5 ----- arch/xtensa/include/asm/processor.h | 5 ----- 10 files changed, 51 deletions(-) diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index d400a2161935..8ee41e988169 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -78,9 +78,6 @@ struct task_struct; #endif -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - #define KSTK_EIP(tsk) (task_pt_regs(tsk)->ret) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp) diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h index 7c87b5be53b5..8f7cce829f8e 100644 --- a/arch/c6x/include/asm/processor.h +++ b/arch/c6x/include/asm/processor.h @@ -92,9 +92,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - /* * saved kernel SP and DP of a blocked thread. 
*/ diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index e4d08d74ed9f..021cce78b401 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -92,10 +92,6 @@ static inline void release_thread(struct task_struct *dead_task) extern asmlinkage void save_user_regs(struct user_context *target); extern asmlinkage void *restore_user_regs(const struct user_context *target, ...); -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) -#define forget_segments() do { } while (0) - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.frame0->pc) diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h index 657874eeeccc..c70fa9ac7169 100644 --- a/arch/m32r/include/asm/processor.h +++ b/arch/m32r/include/asm/processor.h @@ -118,14 +118,6 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Copy and release all segment info associated with a VM */ -extern void copy_segments(struct task_struct *p, struct mm_struct * mm); -extern void release_segments(struct mm_struct * mm); - -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.lr) #define KSTK_ESP(tsk) ((tsk)->thread.sp) diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h index ec6a49076980..8ae92d6abfd2 100644 --- a/arch/metag/include/asm/processor.h +++ b/arch/metag/include/asm/processor.h @@ -131,9 +131,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -#define copy_segments(tsk, mm) do { } while (0) -#define release_segments(mm) do { } while (0) - /* * Return saved PC of a blocked thread. */ diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 89e8027e07fb..7c475fd99c46 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -59,10 +59,6 @@ void arch_cpu_idle(void) } #endif -void release_segments(struct mm_struct *mm) -{ -} - void machine_restart(char *cmd) { #ifdef CONFIG_KERNEL_DEBUGGER @@ -112,14 +108,6 @@ void release_thread(struct task_struct *dead_task) { } -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -void copy_segments(struct task_struct *p, struct mm_struct *new_mm) -{ -} - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 18e0377f72bb..88ce1e22237b 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -136,10 +136,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned lo /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while(0) -#define release_segments(mm) do { } while(0) - /* * FPU lazy state save handling. */ diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h index eedd4f625d07..777a16318aff 100644 --- a/arch/sh/include/asm/processor_64.h +++ b/arch/sh/include/asm/processor_64.h @@ -170,10 +170,6 @@ struct mm_struct; /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while (0) -#define release_segments(mm) do { } while (0) -#define forget_segments() do { } while (0) /* * FPU lazy state save handling. */ diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index f6d1a3f747a9..86942a492454 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -58,11 +58,6 @@ static inline void release_thread(struct task_struct *task) { } -static inline void mm_copy_segments(struct mm_struct *from_mm, - struct mm_struct *new_mm) -{ -} - #define init_stack (init_thread_union.stack) /* diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 30ee8c608853..5b0027d4ecc0 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -208,11 +208,6 @@ struct mm_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) -/* Copy and release all segment info associated with a VM */ -#define copy_segments(p, mm) do { } while(0) -#define release_segments(mm) do { } while(0) -#define forget_segments() do { } while (0) - extern unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) From 6e70e26dc52be62c1f39f81b5f71fa5e643677aa Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 21 Sep 2017 21:32:29 -0500 Subject: [PATCH 0614/1322] SMB3: handle new statx fields We weren't returning the creation time or the two easily supported attributes (ENCRYPTED or COMPRESSED) for the getattr call to allow statx to return these fields. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg \ Acked-by: Jeff Layton CC: Stable Reviewed-by: Pavel Shilovsky --- fs/cifs/inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8693632235f..7c732cb44164 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -234,6 +234,8 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime); fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange); + /* old POSIX extensions don't get create time */ + fattr->cf_mode = le64_to_cpu(info->Permissions); /* @@ -2024,6 +2026,19 @@ int cifs_getattr(const struct path *path, struct kstat *stat, stat->blksize = CIFS_MAX_MSGSIZE; stat->ino = CIFS_I(inode)->uniqueid; + /* old CIFS Unix Extensions doesn't return create time */ + if (CIFS_I(inode)->createtime) { + stat->result_mask |= STATX_BTIME; + stat->btime = + cifs_NTtimeToUnix(cpu_to_le64(CIFS_I(inode)->createtime)); + } + + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED); + if (CIFS_I(inode)->cifsAttrs & FILE_ATTRIBUTE_COMPRESSED) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (CIFS_I(inode)->cifsAttrs & FILE_ATTRIBUTE_ENCRYPTED) + stat->attributes |= STATX_ATTR_ENCRYPTED; + /* * If on a multiuser mount without unix extensions or cifsacl being * enabled, and the admin hasn't overridden them, set the ownership From 1013e760d10e614dc10b5624ce9fc41563ba2e65 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Sep 2017 01:40:27 -0500 Subject: [PATCH 0615/1322] SMB3: Don't ignore O_SYNC/O_DSYNC and O_DIRECT flags Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky --- 
fs/cifs/file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 822311919163..92fdf9c35de2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -224,6 +224,13 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; From dd5437974964c759570d68e50ce13c313808f79a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 22 Sep 2017 14:38:58 +0800 Subject: [PATCH 0616/1322] virtio-net: correctly set xdp_xmit for mergeable buffer We should set xdp_xmit only when xdp_do_redirect() succeed. Cc: John Fastabend Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f6c1f135a024..dd14a4547932 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -721,7 +721,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto xdp_xmit; case XDP_REDIRECT: err = xdp_do_redirect(dev, &xdp, xdp_prog); - if (err) + if (!err) *xdp_xmit = true; rcu_read_unlock(); goto xdp_xmit; From b9b95da92de9d498ece7e6a82e2d6dcfc76fd9d8 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Fri, 22 Sep 2017 14:28:46 +0200 Subject: [PATCH 0617/1322] MAINTAINERS: update git tree locations for ieee802154 subsystem Patches for ieee802154 will go through my new trees towards netdev from now on. The 6LoWPAN subsystem will stay as is (shared between ieee802154 and bluetooth) and go through the bluetooth tree as usual. Signed-off-by: Stefan Schmidt Signed-off-by: David S. Miller --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 955f034fd523..61ee134cf6a8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6642,8 +6642,8 @@ M: Alexander Aring M: Stefan Schmidt L: linux-wpan@vger.kernel.org W: http://wpan.cakelab.org/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sschmidt/wpan-next.git S: Maintained F: net/ieee802154/ F: net/mac802154/ From 581fe0ea61584d88072527ae9fb9dcb9d1f2783e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Sep 2017 19:42:37 -0400 Subject: [PATCH 0618/1322] net: orphan frags on stand-alone ptype in dev_queue_xmit_nit Zerocopy skbs frags are copied when the skb is looped to a local sock. Commit 1080e512d44d ("net: orphan frags on receive") introduced calls to skb_orphan_frags to deliver_skb and __netif_receive_skb for this. With msg_zerocopy, these skbs can also exist in the tx path and thus loop from dev_queue_xmit_nit. This already calls deliver_skb in its loop. But it does not orphan before a separate pt_prev->func(). Add the missing skb_orphan_frags_rx. Changes v1->v2: handle skb_orphan_frags_rx failure Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9a2254f9802f..588b473194a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,8 +1948,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); From cbb2fb5c72f48d3029c144be0f0e61da1c7bccf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:06 -0400 Subject: [PATCH 0619/1322] net: set tb->fast_sk_family We need to set the tb->fast_sk_family properly so we can use the proper comparison function for all subsequent reuseport bind requests. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b9c64b40a83a..f87f4805e244 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -328,6 +328,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif From 7a56673b58f2414679e926bba80309a037a4fd35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:07 -0400 Subject: [PATCH 0620/1322] net: use inet6_rcv_saddr to compare sockets In ipv6_rcv_saddr_equal() we need to use inet6_rcv_saddr(sk) for the ipv6 compare with the fast socket information to make sure we're doing the proper comparisons. Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk") Reported-and-tested-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f87f4805e244..a1bf30438bc5 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, From fbed24bcc69d3e48c5402c371f19f5c7688871e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 22 Sep 2017 20:20:08 -0400 Subject: [PATCH 0621/1322] inet: fix improper empty comparison When doing my reuseport rework I screwed up and changed a if (hlist_empty(&tb->owners)) to if (!hlist_empty(&tb->owners)) This is obviously bad as all of the reuseport/reuse logic was reversed, which caused weird problems like allowing an ipv4 bind conflict if we opened an ipv4 only socket on a port followed by an ipv6 only socket on the same port. 
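The sequence described above can be exercised from userspace with something like the following sketch (port number and error handling are illustrative); with the corrected test the second bind is expected to succeed, since a v6-only socket does not overlap the v4 address space:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        int on = 1;
        int v4 = socket(AF_INET, SOCK_STREAM, 0);
        int v6 = socket(AF_INET6, SOCK_STREAM, 0);
        struct sockaddr_in a4 = { .sin_family = AF_INET,
                                  .sin_port = htons(5555) };
        struct sockaddr_in6 a6 = { .sin6_family = AF_INET6,
                                   .sin6_port = htons(5555) };

        /* make the second socket IPv6-only so the address spaces are disjoint */
        setsockopt(v6, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));

        if (bind(v4, (struct sockaddr *)&a4, sizeof(a4)))
            perror("bind v4");
        if (bind(v6, (struct sockaddr *)&a6, sizeof(a6)))
            perror("bind v6");      /* no EADDRINUSE expected here */

        close(v4);
        close(v6);
        return 0;
    }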
Fixes: b9470c27607b ("inet: kill smallest_size and smallest_port") Reported-by: Cole Robinson Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a1bf30438bc5..c039c937ba90 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -321,7 +321,7 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; From 242c1a28eb61cb34974e8aa05235d84355940a8a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 22 Sep 2017 10:25:22 +0800 Subject: [PATCH 0622/1322] net: Remove useless function skb_header_release There is no one which would invokes the function skb_header_release. So just remove it now. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 2 +- include/linux/skbuff.h | 19 ------------------- net/batman-adv/soft-interface.c | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 522d2900cd1d..f4d7362eb325 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -245,7 +245,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, * - We are allowed to put 4 bytes at tail if skb_cloned() * is false (and if we have 4 bytes of tailroom) * - * TCP packets for example are cloned, but skb_header_release() + * TCP packets for example are cloned, but __skb_header_release() * was called in tcp stack, allowing us to use headroom for our needs. */ if (!skb_header_cloned(skb) && diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 492828801acb..f9db5539a6fb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1456,28 +1456,9 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/** - * skb_header_release - release reference to header - * @skb: buffer to operate on - * - * Drop a reference to the header part of the buffer. This is done - * by acquiring a payload reference. You must not read from the header - * part of skb->data after this. - * Note : Check if you can use __skb_header_release() instead. - */ -static inline void skb_header_release(struct sk_buff *skb) -{ - BUG_ON(skb->nohdr); - skb->nohdr = 1; - atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); -} - /** * __skb_header_release - release reference to header * @skb: buffer to operate on - * - * Variant of skb_header_release() assuming skb is private to caller. - * We can avoid one atomic operation. */ static inline void __skb_header_release(struct sk_buff *skb) { diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 10f7edfb176e..c2c986746d0b 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using skb_header_release in our skbs to allow skb_cow_header to + * data using __skb_header_release in our skbs to allow skb_cow_header to * work optimally. 
This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer From 52a59bd509e3dc458be99dcf333b778e6e3b3749 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:33:29 +0300 Subject: [PATCH 0623/1322] net: use 32-bit arithmetic while allocating net device Private part of allocation is never big enough to warrant size_t. Space savings: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-10 (-10) function old new delta alloc_netdev_mqs 1120 1110 -10 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..37d6a3c59e69 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7989,7 +7989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - size_t alloc_size; + unsigned int alloc_size; struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); From 6a345b3dbd1ed83a7877993c6e23c977a84bb483 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Thu, 21 Sep 2017 23:41:13 +0530 Subject: [PATCH 0624/1322] cxgb4: add tc flower offload skeleton Add basic skeleton to prepare for offloading tc-flower flows. Signed-off-by: Kumar Sanghvi Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/Makefile | 4 +- .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 22 +++++++ .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 57 +++++++++++++++++++ .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.h | 46 +++++++++++++++ 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile index 817212702f0a..fecd7aab673b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/Makefile +++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile @@ -4,7 +4,9 @@ obj-$(CONFIG_CHELSIO_T4) += cxgb4.o -cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o cxgb4_ptp.o +cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o \ + cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o \ + cxgb4_ptp.o cxgb4_tc_flower.o cxgb4-$(CONFIG_CHELSIO_T4_DCB) += cxgb4_dcb.o cxgb4-$(CONFIG_CHELSIO_T4_FCOE) += cxgb4_fcoe.o cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 5fe81a4e26a6..5079246aaf2c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -79,6 +79,7 @@ #include "l2t.h" #include "sched.h" #include "cxgb4_tc_u32.h" +#include "cxgb4_tc_flower.h" #include "cxgb4_ptp.h" char cxgb4_driver_name[] = KBUILD_MODNAME; @@ -2873,6 +2874,25 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) return err; } +static int cxgb_setup_tc_flower(struct net_device *dev, + struct tc_cls_flower_offload *cls_flower) +{ + if (!is_classid_clsact_ingress(cls_flower->common.classid) || + cls_flower->common.chain_index) + return -EOPNOTSUPP; + + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: + return cxgb4_tc_flower_replace(dev, 
cls_flower); + case TC_CLSFLOWER_DESTROY: + return cxgb4_tc_flower_destroy(dev, cls_flower); + case TC_CLSFLOWER_STATS: + return cxgb4_tc_flower_stats(dev, cls_flower); + default: + return -EOPNOTSUPP; + } +} + static int cxgb_setup_tc_cls_u32(struct net_device *dev, struct tc_cls_u32_offload *cls_u32) { @@ -2907,6 +2927,8 @@ static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_CLSU32: return cxgb_setup_tc_cls_u32(dev, type_data); + case TC_SETUP_CLSFLOWER: + return cxgb_setup_tc_flower(dev, type_data); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c new file mode 100644 index 000000000000..16dff71e4d02 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -0,0 +1,57 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "cxgb4.h" +#include "cxgb4_tc_flower.h" + +int cxgb4_tc_flower_replace(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + return -EOPNOTSUPP; +} + +int cxgb4_tc_flower_destroy(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + return -EOPNOTSUPP; +} + +int cxgb4_tc_flower_stats(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + return -EOPNOTSUPP; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h new file mode 100644 index 000000000000..b321fc205b5a --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -0,0 +1,46 @@ +/* + * This file is part of the Chelsio T4/T5/T6 Ethernet driver for Linux. + * + * Copyright (c) 2017 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __CXGB4_TC_FLOWER_H +#define __CXGB4_TC_FLOWER_H + +#include + +int cxgb4_tc_flower_replace(struct net_device *dev, + struct tc_cls_flower_offload *cls); +int cxgb4_tc_flower_destroy(struct net_device *dev, + struct tc_cls_flower_offload *cls); +int cxgb4_tc_flower_stats(struct net_device *dev, + struct tc_cls_flower_offload *cls); +#endif /* __CXGB4_TC_FLOWER_H */ From 62488e4b53ae02d82ac000f91ec82b5cfb41d6f2 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Thu, 21 Sep 2017 23:41:14 +0530 Subject: [PATCH 0625/1322] cxgb4: add basic tc flower offload support Add support to add/remove flows for offload. Following match and action are supported for offloading a flow: Match: ether-protocol, IPv4/IPv6 addresses, L4 ports (TCP/UDP) Action: drop, redirect to another port on the device. The qualifying flows can have accompanying mask information. Signed-off-by: Kumar Sanghvi Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 3 + .../net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 24 ++ .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 + .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 280 +++++++++++++++++- .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.h | 17 ++ .../net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 1 + 6 files changed, 325 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index c4e997fdff64..d05721b06178 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -905,6 +905,9 @@ struct adapter { /* TC u32 offload */ struct cxgb4_tc_u32_table *tc_u32; struct chcr_stats_debug chcr_stats; + + /* TC flower offload */ + DECLARE_HASHTABLE(flower_anymatch_tbl, 9); }; /* Support for "sched-class" command to allow a TX Scheduling Class to be diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 97ead2c66751..f3de9cdd4181 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -148,6 +148,30 @@ static int get_filter_steerq(struct net_device *dev, return iq; } +int cxgb4_get_free_ftid(struct net_device *dev, int family) +{ + struct adapter *adap = netdev2adap(dev); + struct tid_info *t = &adap->tids; + int ftid; + + spin_lock_bh(&t->ftid_lock); + if (family == PF_INET) { + ftid = find_first_zero_bit(t->ftid_bmap, t->nftids); + if (ftid >= t->nftids) + ftid = -1; + } else { + ftid = bitmap_find_free_region(t->ftid_bmap, t->nftids, 2); + if (ftid < 0) + goto out_unlock; + + /* this is only a lookup, keep the found region unallocated */ + bitmap_release_region(t->ftid_bmap, ftid, 2); + } +out_unlock: + spin_unlock_bh(&t->ftid_lock); + return ftid; +} + static int cxgb4_set_ftid(struct tid_info *t, int fidx, int family) { spin_lock_bh(&t->ftid_lock); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 5079246aaf2c..ce33c3addc2b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5105,6 +5105,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!adapter->tc_u32) dev_warn(&pdev->dev, "could not offload tc u32, continuing\n"); + + cxgb4_init_tc_flower(adapter); } if (is_offload(adapter)) { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index 16dff71e4d02..dda34d5a52fb 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -38,16 +38,287 @@ #include "cxgb4.h" #include "cxgb4_tc_flower.h" +static struct ch_tc_flower_entry *allocate_flower_entry(void) +{ + struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL); + return new; +} + +/* Must be called with either RTNL or rcu_read_lock */ +static struct ch_tc_flower_entry *ch_flower_lookup(struct adapter *adap, + unsigned long flower_cookie) +{ + struct ch_tc_flower_entry *flower_entry; + + hash_for_each_possible_rcu(adap->flower_anymatch_tbl, flower_entry, + link, flower_cookie) + if (flower_entry->tc_flower_cookie == flower_cookie) + return flower_entry; + return NULL; +} + +static void cxgb4_process_flow_match(struct net_device *dev, + struct tc_cls_flower_offload *cls, + struct ch_filter_specification *fs) +{ + u16 addr_type = 0; + + if 
(dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_dissector_key_control *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_CONTROL, + cls->key); + + addr_type = key->addr_type; + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_dissector_key_basic *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_BASIC, + cls->key); + struct flow_dissector_key_basic *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_BASIC, + cls->mask); + u16 ethtype_key = ntohs(key->n_proto); + u16 ethtype_mask = ntohs(mask->n_proto); + + if (ethtype_key == ETH_P_ALL) { + ethtype_key = 0; + ethtype_mask = 0; + } + + fs->val.ethtype = ethtype_key; + fs->mask.ethtype = ethtype_mask; + fs->val.proto = key->ip_proto; + fs->mask.proto = mask->ip_proto; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_dissector_key_ipv4_addrs *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + cls->key); + struct flow_dissector_key_ipv4_addrs *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + cls->mask); + fs->type = 0; + memcpy(&fs->val.lip[0], &key->dst, sizeof(key->dst)); + memcpy(&fs->val.fip[0], &key->src, sizeof(key->src)); + memcpy(&fs->mask.lip[0], &mask->dst, sizeof(mask->dst)); + memcpy(&fs->mask.fip[0], &mask->src, sizeof(mask->src)); + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_dissector_key_ipv6_addrs *key = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + cls->key); + struct flow_dissector_key_ipv6_addrs *mask = + skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + cls->mask); + + fs->type = 1; + memcpy(&fs->val.lip[0], key->dst.s6_addr, sizeof(key->dst)); + memcpy(&fs->val.fip[0], key->src.s6_addr, sizeof(key->src)); + memcpy(&fs->mask.lip[0], mask->dst.s6_addr, sizeof(mask->dst)); + memcpy(&fs->mask.fip[0], mask->src.s6_addr, sizeof(mask->src)); + } + + if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_dissector_key_ports *key, *mask; + + key = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_PORTS, + cls->key); + mask = skb_flow_dissector_target(cls->dissector, + FLOW_DISSECTOR_KEY_PORTS, + cls->mask); + fs->val.lport = cpu_to_be16(key->dst); + fs->mask.lport = cpu_to_be16(mask->dst); + fs->val.fport = cpu_to_be16(key->src); + fs->mask.fport = cpu_to_be16(mask->src); + } + + /* Match only packets coming from the ingress port where this + * filter will be created. 
+ */ + fs->val.iport = netdev2pinfo(dev)->port_id; + fs->mask.iport = ~0; +} + +static int cxgb4_validate_flow_match(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + if (cls->dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS))) { + netdev_warn(dev, "Unsupported key used: 0x%x\n", + cls->dissector->used_keys); + return -EOPNOTSUPP; + } + return 0; +} + +static void cxgb4_process_flow_actions(struct net_device *in, + struct tc_cls_flower_offload *cls, + struct ch_filter_specification *fs) +{ + const struct tc_action *a; + LIST_HEAD(actions); + + tcf_exts_to_list(cls->exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_shot(a)) { + fs->action = FILTER_DROP; + } else if (is_tcf_mirred_egress_redirect(a)) { + int ifindex = tcf_mirred_ifindex(a); + struct net_device *out = __dev_get_by_index(dev_net(in), + ifindex); + struct port_info *pi = netdev_priv(out); + + fs->action = FILTER_SWITCH; + fs->eport = pi->port_id; + } + } +} + +static int cxgb4_validate_flow_actions(struct net_device *dev, + struct tc_cls_flower_offload *cls) +{ + const struct tc_action *a; + LIST_HEAD(actions); + + tcf_exts_to_list(cls->exts, &actions); + list_for_each_entry(a, &actions, list) { + if (is_tcf_gact_shot(a)) { + /* Do nothing */ + } else if (is_tcf_mirred_egress_redirect(a)) { + struct adapter *adap = netdev2adap(dev); + struct net_device *n_dev; + unsigned int i, ifindex; + bool found = false; + + ifindex = tcf_mirred_ifindex(a); + for_each_port(adap, i) { + n_dev = adap->port[i]; + if (ifindex == n_dev->ifindex) { + found = true; + break; + } + } + + /* If interface doesn't belong to our hw, then + * the provided output port is not valid + */ + if (!found) { + netdev_err(dev, "%s: Out port invalid\n", + __func__); + return -EINVAL; + } + } else { + netdev_err(dev, "%s: Unsupported action\n", __func__); + return -EOPNOTSUPP; + } + } + return 0; +} + int cxgb4_tc_flower_replace(struct net_device *dev, struct tc_cls_flower_offload *cls) { - return -EOPNOTSUPP; + struct adapter *adap = netdev2adap(dev); + struct ch_tc_flower_entry *ch_flower; + struct ch_filter_specification *fs; + struct filter_ctx ctx; + int fidx; + int ret; + + if (cxgb4_validate_flow_actions(dev, cls)) + return -EOPNOTSUPP; + + if (cxgb4_validate_flow_match(dev, cls)) + return -EOPNOTSUPP; + + ch_flower = allocate_flower_entry(); + if (!ch_flower) { + netdev_err(dev, "%s: ch_flower alloc failed.\n", __func__); + return -ENOMEM; + } + + fs = &ch_flower->fs; + fs->hitcnts = 1; + cxgb4_process_flow_actions(dev, cls, fs); + cxgb4_process_flow_match(dev, cls, fs); + + fidx = cxgb4_get_free_ftid(dev, fs->type ? 
PF_INET6 : PF_INET); + if (fidx < 0) { + netdev_err(dev, "%s: No fidx for offload.\n", __func__); + ret = -ENOMEM; + goto free_entry; + } + + init_completion(&ctx.completion); + ret = __cxgb4_set_filter(dev, fidx, fs, &ctx); + if (ret) { + netdev_err(dev, "%s: filter creation err %d\n", + __func__, ret); + goto free_entry; + } + + /* Wait for reply */ + ret = wait_for_completion_timeout(&ctx.completion, 10 * HZ); + if (!ret) { + ret = -ETIMEDOUT; + goto free_entry; + } + + ret = ctx.result; + /* Check if hw returned error for filter creation */ + if (ret) { + netdev_err(dev, "%s: filter creation err %d\n", + __func__, ret); + goto free_entry; + } + + INIT_HLIST_NODE(&ch_flower->link); + ch_flower->tc_flower_cookie = cls->cookie; + ch_flower->filter_id = ctx.tid; + hash_add_rcu(adap->flower_anymatch_tbl, &ch_flower->link, cls->cookie); + + return ret; + +free_entry: + kfree(ch_flower); + return ret; } int cxgb4_tc_flower_destroy(struct net_device *dev, struct tc_cls_flower_offload *cls) { - return -EOPNOTSUPP; + struct adapter *adap = netdev2adap(dev); + struct ch_tc_flower_entry *ch_flower; + int ret; + + ch_flower = ch_flower_lookup(adap, cls->cookie); + if (!ch_flower) + return -ENOENT; + + ret = cxgb4_del_filter(dev, ch_flower->filter_id); + if (ret) + goto err; + + hash_del_rcu(&ch_flower->link); + kfree_rcu(ch_flower, rcu); + +err: + return ret; } int cxgb4_tc_flower_stats(struct net_device *dev, @@ -55,3 +326,8 @@ int cxgb4_tc_flower_stats(struct net_device *dev, { return -EOPNOTSUPP; } + +void cxgb4_init_tc_flower(struct adapter *adap) +{ + hash_init(adap->flower_anymatch_tbl); +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h index b321fc205b5a..6145a9e056eb 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -37,10 +37,27 @@ #include +struct ch_tc_flower_stats { + u64 packet_count; + u64 byte_count; + u64 last_used; +}; + +struct ch_tc_flower_entry { + struct ch_filter_specification fs; + struct ch_tc_flower_stats stats; + unsigned long tc_flower_cookie; + struct hlist_node link; + struct rcu_head rcu; + u32 filter_id; +}; + int cxgb4_tc_flower_replace(struct net_device *dev, struct tc_cls_flower_offload *cls); int cxgb4_tc_flower_destroy(struct net_device *dev, struct tc_cls_flower_offload *cls); int cxgb4_tc_flower_stats(struct net_device *dev, struct tc_cls_flower_offload *cls); + +void cxgb4_init_tc_flower(struct adapter *adap); #endif /* __CXGB4_TC_FLOWER_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 84541fce94c5..88487095d14f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -212,6 +212,7 @@ struct filter_ctx { struct ch_filter_specification; +int cxgb4_get_free_ftid(struct net_device *dev, int family); int __cxgb4_set_filter(struct net_device *dev, int filter_id, struct ch_filter_specification *fs, struct filter_ctx *ctx); From cf2885a70fc71d5f6b434b86eedfc18ad66ba6f6 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Thu, 21 Sep 2017 23:41:15 +0530 Subject: [PATCH 0626/1322] cxgb4: add support to offload action vlan Add support for offloading tc-flower flows having vlan actions: pop, push and modify. Signed-off-by: Kumar Sanghvi Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index dda34d5a52fb..e42d2efc9ea2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -34,6 +34,7 @@ #include #include +#include #include "cxgb4.h" #include "cxgb4_tc_flower.h" @@ -185,6 +186,27 @@ static void cxgb4_process_flow_actions(struct net_device *in, fs->action = FILTER_SWITCH; fs->eport = pi->port_id; + } else if (is_tcf_vlan(a)) { + u32 vlan_action = tcf_vlan_action(a); + u8 prio = tcf_vlan_push_prio(a); + u16 vid = tcf_vlan_push_vid(a); + u16 vlan_tci = (prio << VLAN_PRIO_SHIFT) | vid; + + switch (vlan_action) { + case TCA_VLAN_ACT_POP: + fs->newvlan |= VLAN_REMOVE; + break; + case TCA_VLAN_ACT_PUSH: + fs->newvlan |= VLAN_INSERT; + fs->vlan = vlan_tci; + break; + case TCA_VLAN_ACT_MODIFY: + fs->newvlan |= VLAN_REWRITE; + fs->vlan = vlan_tci; + break; + default: + break; + } } } } @@ -222,6 +244,26 @@ static int cxgb4_validate_flow_actions(struct net_device *dev, __func__); return -EINVAL; } + } else if (is_tcf_vlan(a)) { + u16 proto = be16_to_cpu(tcf_vlan_push_proto(a)); + u32 vlan_action = tcf_vlan_action(a); + + switch (vlan_action) { + case TCA_VLAN_ACT_POP: + break; + case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: + if (proto != ETH_P_8021Q) { + netdev_err(dev, "%s: Unsupported vlan proto\n", + __func__); + return -EOPNOTSUPP; + } + break; + default: + netdev_err(dev, "%s: Unsupported vlan action\n", + __func__); + return -EOPNOTSUPP; + } } else { netdev_err(dev, "%s: Unsupported action\n", __func__); return -EOPNOTSUPP; From e0f911c81e93fc23fe1a4fb0318ff1c3b1c9027f Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Thu, 21 Sep 2017 23:41:16 +0530 Subject: [PATCH 0627/1322] cxgb4: fetch stats for offloaded tc flower flows Add support to retrieve stats from hardware for offloaded tc flower flows. Also, poll for the stats of offloaded flows via timer callback. Signed-off-by: Kumar Sanghvi Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + .../net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 76 ++++++++++++++++++ .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 1 + .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 79 ++++++++++++++++++- .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.h | 3 + .../net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 2 + 6 files changed, 161 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index d05721b06178..0db3ab6ad094 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -908,6 +908,7 @@ struct adapter { /* TC flower offload */ DECLARE_HASHTABLE(flower_anymatch_tbl, 9); + struct timer_list flower_stats_timer; }; /* Support for "sched-class" command to allow a TX Scheduling Class to be diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index f3de9cdd4181..15361ca2857c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -148,6 +148,82 @@ static int get_filter_steerq(struct net_device *dev, return iq; } +static int get_filter_count(struct adapter *adapter, unsigned int fidx, + u64 *pkts, u64 *bytes) +{ + unsigned int tcb_base, tcbaddr; + unsigned int word_offset; + struct filter_entry *f; + __be64 be64_byte_count; + int ret; + + tcb_base = t4_read_reg(adapter, TP_CMM_TCB_BASE_A); + if ((fidx != (adapter->tids.nftids + adapter->tids.nsftids - 1)) && + fidx >= adapter->tids.nftids) + return -E2BIG; + + f = &adapter->tids.ftid_tab[fidx]; + if (!f->valid) + return -EINVAL; + + tcbaddr = tcb_base + f->tid * TCB_SIZE; + + spin_lock(&adapter->win0_lock); + if (is_t4(adapter->params.chip)) { + __be64 be64_count; + + /* T4 doesn't maintain byte counts in hw */ + *bytes = 0; + + /* Get pkts */ + word_offset = 4; + ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0, + tcbaddr + (word_offset * sizeof(__be32)), + sizeof(be64_count), + (__be32 *)&be64_count, + T4_MEMORY_READ); + if (ret < 0) + goto out; + *pkts = be64_to_cpu(be64_count); + } else { + __be32 be32_count; + + /* Get bytes */ + word_offset = 4; + ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0, + tcbaddr + (word_offset * sizeof(__be32)), + sizeof(be64_byte_count), + &be64_byte_count, + T4_MEMORY_READ); + if (ret < 0) + goto out; + *bytes = be64_to_cpu(be64_byte_count); + + /* Get pkts */ + word_offset = 6; + ret = t4_memory_rw(adapter, MEMWIN_NIC, MEM_EDC0, + tcbaddr + (word_offset * sizeof(__be32)), + sizeof(be32_count), + &be32_count, + T4_MEMORY_READ); + if (ret < 0) + goto out; + *pkts = (u64)be32_to_cpu(be32_count); + } + +out: + spin_unlock(&adapter->win0_lock); + return ret; +} + +int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx, + u64 *hitcnt, u64 *bytecnt) +{ + struct adapter *adapter = netdev2adap(dev); + + return get_filter_count(adapter, fidx, hitcnt, bytecnt); +} + int cxgb4_get_free_ftid(struct net_device *dev, int family) { struct adapter *adap = netdev2adap(dev); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index ce33c3addc2b..aa93ae95d3b9 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4637,6 +4637,7 @@ static void free_some_resources(struct adapter *adapter) kvfree(adapter->l2t); t4_cleanup_sched(adapter); kvfree(adapter->tids.tid_tab); + cxgb4_cleanup_tc_flower(adapter); 
cxgb4_cleanup_tc_u32(adapter); kfree(adapter->sge.egr_map); kfree(adapter->sge.ingr_map); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index e42d2efc9ea2..a36bd66d2834 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -39,9 +39,12 @@ #include "cxgb4.h" #include "cxgb4_tc_flower.h" +#define STATS_CHECK_PERIOD (HZ / 2) + static struct ch_tc_flower_entry *allocate_flower_entry(void) { struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL); + spin_lock_init(&new->lock); return new; } @@ -363,13 +366,87 @@ err: return ret; } +void ch_flower_stats_cb(unsigned long data) +{ + struct adapter *adap = (struct adapter *)data; + struct ch_tc_flower_entry *flower_entry; + struct ch_tc_flower_stats *ofld_stats; + unsigned int i; + u64 packets; + u64 bytes; + int ret; + + rcu_read_lock(); + hash_for_each_rcu(adap->flower_anymatch_tbl, i, flower_entry, link) { + ret = cxgb4_get_filter_counters(adap->port[0], + flower_entry->filter_id, + &packets, &bytes); + if (!ret) { + spin_lock(&flower_entry->lock); + ofld_stats = &flower_entry->stats; + + if (ofld_stats->prev_packet_count != packets) { + ofld_stats->prev_packet_count = packets; + ofld_stats->last_used = jiffies; + } + spin_unlock(&flower_entry->lock); + } + } + rcu_read_unlock(); + mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD); +} + int cxgb4_tc_flower_stats(struct net_device *dev, struct tc_cls_flower_offload *cls) { - return -EOPNOTSUPP; + struct adapter *adap = netdev2adap(dev); + struct ch_tc_flower_stats *ofld_stats; + struct ch_tc_flower_entry *ch_flower; + u64 packets; + u64 bytes; + int ret; + + ch_flower = ch_flower_lookup(adap, cls->cookie); + if (!ch_flower) { + ret = -ENOENT; + goto err; + } + + ret = cxgb4_get_filter_counters(dev, ch_flower->filter_id, + &packets, &bytes); + if (ret < 0) + goto err; + + spin_lock_bh(&ch_flower->lock); + ofld_stats = &ch_flower->stats; + if (ofld_stats->packet_count != packets) { + if (ofld_stats->prev_packet_count != packets) + ofld_stats->last_used = jiffies; + tcf_exts_stats_update(cls->exts, bytes - ofld_stats->byte_count, + packets - ofld_stats->packet_count, + ofld_stats->last_used); + + ofld_stats->packet_count = packets; + ofld_stats->byte_count = bytes; + ofld_stats->prev_packet_count = packets; + } + spin_unlock_bh(&ch_flower->lock); + return 0; + +err: + return ret; } void cxgb4_init_tc_flower(struct adapter *adap) { hash_init(adap->flower_anymatch_tbl); + setup_timer(&adap->flower_stats_timer, ch_flower_stats_cb, + (unsigned long)adap); + mod_timer(&adap->flower_stats_timer, jiffies + STATS_CHECK_PERIOD); +} + +void cxgb4_cleanup_tc_flower(struct adapter *adap) +{ + if (adap->flower_stats_timer.function) + del_timer_sync(&adap->flower_stats_timer); } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h index 6145a9e056eb..604feffc752e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -38,6 +38,7 @@ #include struct ch_tc_flower_stats { + u64 prev_packet_count; u64 packet_count; u64 byte_count; u64 last_used; @@ -49,6 +50,7 @@ struct ch_tc_flower_entry { unsigned long tc_flower_cookie; struct hlist_node link; struct rcu_head rcu; + spinlock_t lock; /* lock for stats */ u32 filter_id; }; @@ -60,4 +62,5 @@ int cxgb4_tc_flower_stats(struct net_device *dev, struct 
tc_cls_flower_offload *cls); void cxgb4_init_tc_flower(struct adapter *adap); +void cxgb4_cleanup_tc_flower(struct adapter *adap); #endif /* __CXGB4_TC_FLOWER_H */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 88487095d14f..52324c77a4fe 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -221,6 +221,8 @@ int __cxgb4_del_filter(struct net_device *dev, int filter_id, int cxgb4_set_filter(struct net_device *dev, int filter_id, struct ch_filter_specification *fs); int cxgb4_del_filter(struct net_device *dev, int filter_id); +int cxgb4_get_filter_counters(struct net_device *dev, unsigned int fidx, + u64 *hitcnt, u64 *bytecnt); static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue) { From 0d0970eef3b03ef08b19da5bc3044410731cf38f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Sep 2017 16:24:32 -0500 Subject: [PATCH 0628/1322] objtool: Handle another GCC stack pointer adjustment bug The kbuild bot reported the following warning with GCC 4.4 and a randconfig: net/socket.o: warning: objtool: compat_sock_ioctl()+0x1083: stack state mismatch: cfa1=7+160 cfa2=-1+0 This is caused by another GCC non-optimization, where it backs up and restores the stack pointer for no apparent reason: 2f91: 48 89 e0 mov %rsp,%rax 2f94: 4c 89 e7 mov %r12,%rdi 2f97: 4c 89 f6 mov %r14,%rsi 2f9a: ba 20 00 00 00 mov $0x20,%edx 2f9f: 48 89 c4 mov %rax,%rsp This issue would have been happily ignored before the following commit: dd88a0a0c861 ("objtool: Handle GCC stack pointer adjustment bug") But now that objtool is paying attention to such stack pointer writes to/from a register, it needs to understand them properly. In this case that means recognizing that the "mov %rsp, %rax" instruction is potentially a backup of the stack pointer. 
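As an illustration of the bookkeeping this takes, a stand-alone toy model (not objtool's actual code, and with made-up names) only has to remember, per register, whether that register currently mirrors the stack pointer, so that the later restore is accepted instead of being flagged:

#include <stdbool.h>
#include <stdio.h>

/* Toy model only: objtool's real state machine lives in check.c and tracks
 * CFA-relative values.  Here a register either mirrors the stack pointer
 * (with the stack size recorded at the time of the copy) or it does not. */
#define NREGS 16

static bool reg_mirrors_sp[NREGS];
static int  sp_at_copy[NREGS];
static int  stack_size = 160;              /* bytes accumulated so far */

static void mov_sp_to_reg(int reg)         /* e.g. mov %rsp,%rax */
{
        reg_mirrors_sp[reg] = true;
        sp_at_copy[reg] = stack_size;
}

static int mov_reg_to_sp(int reg)          /* e.g. mov %rax,%rsp */
{
        if (!reg_mirrors_sp[reg])
                return -1;                 /* unknown value: stack state mismatch */
        stack_size = sp_at_copy[reg];      /* plain restore, nothing changed */
        return 0;
}

int main(void)
{
        mov_sp_to_reg(0);                  /* backup of the stack pointer */
        printf("restore %s\n", mov_reg_to_sp(0) ? "rejected" : "accepted");
        return 0;
}

The actual hunks below record the same fact in update_insn_state(), noting a CFA-relative value in state->vals[] for any register loaded from %rsp.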
Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: dd88a0a0c861 ("objtool: Handle GCC stack pointer adjustment bug") Link: http://lkml.kernel.org/r/8c7aa8e9a36fbbb6655d9d8e7cea58958c912da8.1505942196.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/arch/x86/decode.c | 6 ++--- tools/objtool/check.c | 43 ++++++++++++++++++++++----------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0e8c8ec4fd4e..0f22768c0d4d 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -208,14 +208,14 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, break; case 0x89: - if (rex == 0x48 && modrm == 0xe5) { + if (rex_w && !rex_r && modrm_mod == 3 && modrm_reg == 4) { - /* mov %rsp, %rbp */ + /* mov %rsp, reg */ *type = INSN_STACK; op->src.type = OP_SRC_REG; op->src.reg = CFI_SP; op->dest.type = OP_DEST_REG; - op->dest.reg = CFI_BP; + op->dest.reg = op_to_cfi_reg[modrm_rm][rex_b]; break; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f744617c9946..a0c518ecf085 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1203,24 +1203,39 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state) switch (op->src.type) { case OP_SRC_REG: - if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP) { + if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP && + cfa->base == CFI_SP && + regs[CFI_BP].base == CFI_CFA && + regs[CFI_BP].offset == -cfa->offset) { - if (cfa->base == CFI_SP && - regs[CFI_BP].base == CFI_CFA && - regs[CFI_BP].offset == -cfa->offset) { + /* mov %rsp, %rbp */ + cfa->base = op->dest.reg; + state->bp_scratch = false; + } - /* mov %rsp, %rbp */ - cfa->base = op->dest.reg; - state->bp_scratch = false; - } + else if (op->src.reg == CFI_SP && + op->dest.reg == CFI_BP && state->drap) { - else if (state->drap) { + /* drap: mov %rsp, %rbp */ + regs[CFI_BP].base = CFI_BP; + regs[CFI_BP].offset = -state->stack_size; + state->bp_scratch = false; + } - /* drap: mov %rsp, %rbp */ - regs[CFI_BP].base = CFI_BP; - regs[CFI_BP].offset = -state->stack_size; - state->bp_scratch = false; - } + else if (op->src.reg == CFI_SP && cfa->base == CFI_SP) { + + /* + * mov %rsp, %reg + * + * This is needed for the rare case where GCC + * does: + * + * mov %rsp, %rax + * ... + * mov %rax, %rsp + */ + state->vals[op->dest.reg].base = CFI_CFA; + state->vals[op->dest.reg].offset = -state->stack_size; } else if (op->dest.reg == cfa->base) { From f5caf621ee357279e759c0911daf6d55c7d36f03 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Sep 2017 16:24:33 -0500 Subject: [PATCH 0629/1322] x86/asm: Fix inline asm call constraints for Clang For inline asm statements which have a CALL instruction, we list the stack pointer as a constraint to convince GCC to ensure the frame pointer is set up first: static inline void foo() { register void *__sp asm(_ASM_SP); asm("call bar" : "+r" (__sp)) } Unfortunately, that pattern causes Clang to corrupt the stack pointer. The fix is easy: convert the stack pointer register variable to a global variable. It should be noted that the end result is different based on the GCC version. 
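At each converted call site the change itself follows one pattern; a minimal sketch (function and callee names here are made up, the real hunks are below):

/* old style: a per-function register variable pins the stack pointer */
static inline void call_bar_old(void)
{
        register void *__sp asm(_ASM_SP);
        asm volatile("call bar" : "+r" (__sp));
}

/* new style: the shared output constraint from <asm/asm.h> */
static inline void call_bar_new(void)
{
        asm volatile("call bar" : ASM_CALL_CONSTRAINT);
}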
With GCC 6.4, this patch has exactly the same result as before: defconfig defconfig-nofp distro distro-nofp before 9820389 9491555 8816046 8516940 after 9820389 9491555 8816046 8516940 With GCC 7.2, however, GCC's behavior has changed. It now changes its behavior based on the conversion of the register variable to a global. That somehow convinces it to *always* set up the frame pointer before inserting *any* inline asm. (Therefore, listing the variable as an output constraint is a no-op and is no longer necessary.) It's a bit overkill, but the performance impact should be negligible. And in fact, there's a nice improvement with frame pointers disabled: defconfig defconfig-nofp distro distro-nofp before 9796316 9468236 9076191 8790305 after 9796957 9464267 9076381 8785949 So in summary, while listing the stack pointer as an output constraint is no longer necessary for newer versions of GCC, it's still needed for older versions. Suggested-by: Andrey Ryabinin Reported-by: Matthias Kaehlcke Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Linus Torvalds Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3db862e970c432ae823cf515c52b54fec8270e0e.1505942196.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 3 +-- arch/x86/include/asm/asm.h | 11 +++++++++++ arch/x86/include/asm/mshyperv.h | 10 ++++------ arch/x86/include/asm/paravirt_types.h | 14 +++++++------- arch/x86/include/asm/preempt.h | 15 +++++---------- arch/x86/include/asm/processor.h | 6 ++---- arch/x86/include/asm/rwsem.h | 4 ++-- arch/x86/include/asm/uaccess.h | 4 ++-- arch/x86/include/asm/xen/hypercall.h | 5 ++--- arch/x86/kvm/emulate.c | 3 +-- arch/x86/kvm/vmx.c | 3 +-- arch/x86/mm/fault.c | 3 +-- tools/objtool/Documentation/stack-validation.txt | 6 +++--- 13 files changed, 42 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1b020381ab38..c096624137ae 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) \ { \ - register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output, "+r" (__sp) \ + : output, ASM_CALL_CONSTRAINT \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ [new2] "i" (newfunc2), ## input); \ } diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 676ee5807d86..c1eadbaf1115 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -132,4 +132,15 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. 
+ */ +register unsigned int __asm_call_sp asm("esp"); +#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) +#endif + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 63cc96f064dc..738503e1f80c 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -179,7 +179,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) u64 input_address = input ? virt_to_phys(input) : 0; u64 output_address = output ? virt_to_phys(output) : 0; u64 hv_status; - register void *__sp asm(_ASM_SP); #ifdef CONFIG_X86_64 if (!hv_hypercall_pg) @@ -187,7 +186,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) __asm__ __volatile__("mov %4, %%r8\n" "call *%5" - : "=a" (hv_status), "+r" (__sp), + : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) : "r" (output_address), "m" (hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); @@ -202,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) __asm__ __volatile__("call *%7" : "=A" (hv_status), - "+c" (input_address_lo), "+r" (__sp) + "+c" (input_address_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input_address_hi), "D"(output_address_hi), "S"(output_address_lo), @@ -224,12 +223,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) { u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; - register void *__sp asm(_ASM_SP); #ifdef CONFIG_X86_64 { __asm__ __volatile__("call *%4" - : "=a" (hv_status), "+r" (__sp), + : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) : "m" (hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); @@ -242,7 +240,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) __asm__ __volatile__ ("call *%5" : "=A"(hv_status), "+c"(input1_lo), - "+r"(__sp) + ASM_CALL_CONSTRAINT : "A" (control), "b" (input1_hi), "m" (hv_hypercall_pg) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 42873edd9f9d..280d94c36dad 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -459,8 +459,8 @@ int paravirt_disable_iospace(void); */ #ifdef CONFIG_X86_32 #define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \ - register void *__sp asm("esp") + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) @@ -480,8 +480,8 @@ int paravirt_disable_iospace(void); /* [re]ax isn't an arg, but the return val */ #define PVOP_VCALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx, __eax = __eax; \ - register void *__sp asm("rsp") + __edx = __edx, __ecx = __ecx, __eax = __eax; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) @@ -532,7 +532,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -542,7 +542,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -569,7 +569,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ 
paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index ec1f3c651150..4f44505dbf87 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -100,19 +100,14 @@ static __always_inline bool should_resched(int preempt_offset) #ifdef CONFIG_PREEMPT extern asmlinkage void ___preempt_schedule(void); -# define __preempt_schedule() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \ -}) +# define __preempt_schedule() \ + asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); extern asmlinkage void ___preempt_schedule_notrace(void); -# define __preempt_schedule_notrace() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \ -}) +# define __preempt_schedule_notrace() \ + asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT) + extern asmlinkage void preempt_schedule_notrace(void); #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3fa26a61eabc..b390ff76e58f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -677,8 +677,6 @@ static inline void sync_core(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ - register void *__sp asm(_ASM_SP); - #ifdef CONFIG_X86_32 asm volatile ( "pushfl\n\t" @@ -686,7 +684,7 @@ static inline void sync_core(void) "pushl $1f\n\t" "iret\n\t" "1:" - : "+r" (__sp) : : "memory"); + : ASM_CALL_CONSTRAINT : : "memory"); #else unsigned int tmp; @@ -703,7 +701,7 @@ static inline void sync_core(void) "iretq\n\t" UNWIND_HINT_RESTORE "1:" - : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif } diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index a34e0d4b957d..7116b7931c7b 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) ({ \ long tmp; \ struct rw_semaphore* ret; \ - register void *__sp asm(_ASM_SP); \ \ asm volatile("# beginning down_write\n\t" \ LOCK_PREFIX " xadd %1,(%4)\n\t" \ @@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \ + : "+m" (sem->count), "=d" (tmp), \ + "=a" (ret), ASM_CALL_CONSTRAINT \ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 184eb9894dae..78e8fcc87d4c 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -166,11 +166,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ - register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P4" \ - : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ + : "=a" (__ret_gu), "=r" (__val_gu), \ + ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ diff --git 
a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9606688caa4b..128a1a0b1450 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -113,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ - register void *__sp asm(_ASM_SP); + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; -#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) +#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 16bf6655aa85..f23f13403f33 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5296,7 +5296,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) @@ -5304,7 +5303,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) + [fastop]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06c0c6d0541e..6ee237f509dc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9036,7 +9036,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - register void *__sp asm(_ASM_SP); if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { @@ -9065,7 +9064,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 [sp]"=&r"(tmp), #endif - "+r"(__sp) + ASM_CALL_CONSTRAINT : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b836a7274e12..39567b5c33da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -806,7 +806,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - register void *__sp asm("rsp"); unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -821,7 +820,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, asm volatile ("movq %[stack], %%rsp\n\t" "call handle_stack_overflow\n\t" "1: jmp 1b" - : "+r" (__sp) + : ASM_CALL_CONSTRAINT : "D" ("kernel stack overflow (page fault)"), "S" (regs), "d" (address), [stack] "rm" (stack)); diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt index 6a1af43862df..3995735a878f 100644 --- 
a/tools/objtool/Documentation/stack-validation.txt +++ b/tools/objtool/Documentation/stack-validation.txt @@ -194,10 +194,10 @@ they mean, and suggestions for how to fix them. If it's a GCC-compiled .c file, the error may be because the function uses an inline asm() statement which has a "call" instruction. An asm() statement with a call instruction must declare the use of the - stack pointer in its output operand. For example, on x86_64: + stack pointer in its output operand. On x86_64, this means adding + the ASM_CALL_CONSTRAINT as an output constraint: - register void *__sp asm("rsp"); - asm volatile("call func" : "+r" (__sp)); + asm volatile("call func" : ASM_CALL_CONSTRAINT); Otherwise the stack frame may not get created before the call. From af2e658fc08a397b10352265e50b83f27e25d73e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:35 +0300 Subject: [PATCH 0630/1322] as3645a: Use ams,input-max-microamp as documented in DT bindings DT bindings document the property "ams,input-max-microamp" that limits the chip's maximum input current. The driver and the DTS however used "peak-current-limit" property. Fix this by using the property documented in DT binding documentation. Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 2 +- drivers/leds/leds-as3645a.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index cb47ae79a5f9..b86fc83a5a65 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -273,7 +273,7 @@ flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; - peak-current-limit = <1750000>; + ams,input-max-microamp = <1750000>; }; indicator { led-max-microamp = <10000>; diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index bbbbe0898233..e3f89c6130d2 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -534,7 +534,7 @@ static int as3645a_parse_node(struct as3645a *flash, of_property_read_u32(flash->flash_node, "voltage-reference", &cfg->voltage_reference); - of_property_read_u32(flash->flash_node, "peak-current-limit", + of_property_read_u32(flash->flash_node, "ams,input-max-microamp", &cfg->peak); cfg->peak = AS_PEAK_mA_TO_REG(cfg->peak); From 75f9f7279e874ff95d1abe4613abc0826c9a8dcc Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:36 +0300 Subject: [PATCH 0631/1322] dt: bindings: as3645a: Use LED number to refer to LEDs Use integers (reg property) to tell the number of the LED to the driver instead of the node name. While both of these approaches are currently used by the LED bindings, using integers will require less driver changes for ACPI support. Additionally, it will make possible LED naming using chip and LED node names, effectively making the label property most useful for human-readable names only. 
Signed-off-by: Sakari Ailus Acked-by: Rob Herring Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/ams,as3645a.txt | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/leds/ams,as3645a.txt b/Documentation/devicetree/bindings/leds/ams,as3645a.txt index 12c5ef26ec73..fdc40e354a64 100644 --- a/Documentation/devicetree/bindings/leds/ams,as3645a.txt +++ b/Documentation/devicetree/bindings/leds/ams,as3645a.txt @@ -15,11 +15,14 @@ Required properties compatible : Must be "ams,as3645a". reg : The I2C address of the device. Typically 0x30. +#address-cells : 1 +#size-cells : 0 -Required properties of the "flash" child node -============================================= +Required properties of the flash child node (0) +=============================================== +reg: 0 flash-timeout-us: Flash timeout in microseconds. The value must be in the range [100000, 850000] and divisible by 50000. flash-max-microamp: Maximum flash current in microamperes. Has to be @@ -33,20 +36,21 @@ ams,input-max-microamp: Maximum flash controller input current. The and divisible by 50000. -Optional properties of the "flash" child node -============================================= +Optional properties of the flash child node +=========================================== label : The label of the flash LED. -Required properties of the "indicator" child node -================================================= +Required properties of the indicator child node (1) +=================================================== +reg: 1 led-max-microamp: Maximum indicator current. The allowed values are 2500, 5000, 7500 and 10000. -Optional properties of the "indicator" child node -================================================= +Optional properties of the indicator child node +=============================================== label : The label of the indicator LED. @@ -55,16 +59,20 @@ Example ======= as3645a@30 { + #address-cells: 1 + #size-cells: 0 reg = <0x30>; compatible = "ams,as3645a"; - flash { + flash@0 { + reg = <0x0>; flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; ams,input-max-microamp = <1750000>; label = "as3645a:flash"; }; - indicator { + indicator@1 { + reg = <0x1>; led-max-microamp = <10000>; label = "as3645a:indicator"; }; From e626c325277531db15314b80610d1f5a1c2637b2 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:37 +0300 Subject: [PATCH 0632/1322] as3645a: Use integer numbers for parsing LEDs Use integer numbers for LEDs, 0 is the flash and 1 is the indicator. 
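The lookup this amounts to, matching a child node on its "reg" value rather than on its name, can be sketched as a small helper (illustrative only; no such helper exists in the kernel, and the patch below open-codes the walk inside as3645a_parse_node()):

#include <linux/of.h>

/* Illustrative helper, not an existing kernel API and not part of this
 * patch: return the child of @parent whose "reg" property equals @reg,
 * with a reference held (the caller must of_node_put() it). */
static struct device_node *child_by_reg(struct device_node *parent, u32 reg)
{
        struct device_node *child;
        u32 val;

        for_each_child_of_node(parent, child) {
                if (!of_property_read_u32(child, "reg", &val) && val == reg)
                        return child;   /* iterator already holds a reference */
        }
        return NULL;
}

The driver change instead walks the children once and stashes both the flash and the indicator node in the same pass.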
Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 8 ++++++-- drivers/leds/leds-as3645a.c | 26 ++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index b86fc83a5a65..1b0bd72945f2 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -267,15 +267,19 @@ clock-frequency = <400000>; as3645a@30 { + #address-cells = <1>; + #size-cells = <0>; reg = <0x30>; compatible = "ams,as3645a"; - flash { + flash@0 { + reg = <0x0>; flash-timeout-us = <150000>; flash-max-microamp = <320000>; led-max-microamp = <60000>; ams,input-max-microamp = <1750000>; }; - indicator { + indicator@1 { + reg = <0x1>; led-max-microamp = <10000>; }; }; diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index e3f89c6130d2..605e0c64e974 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -112,6 +112,10 @@ #define AS_PEAK_mA_TO_REG(a) \ ((min_t(u32, AS_PEAK_mA_MAX, a) - 1250) / 250) +/* LED numbers for Devicetree */ +#define AS_LED_FLASH 0 +#define AS_LED_INDICATOR 1 + enum as_mode { AS_MODE_EXT_TORCH = 0 << AS_CONTROL_MODE_SETTING_SHIFT, AS_MODE_INDICATOR = 1 << AS_CONTROL_MODE_SETTING_SHIFT, @@ -491,10 +495,29 @@ static int as3645a_parse_node(struct as3645a *flash, struct device_node *node) { struct as3645a_config *cfg = &flash->cfg; + struct device_node *child; const char *name; int rval; - flash->flash_node = of_get_child_by_name(node, "flash"); + for_each_child_of_node(node, child) { + u32 id = 0; + + of_property_read_u32(child, "reg", &id); + + switch (id) { + case AS_LED_FLASH: + flash->flash_node = of_node_get(child); + break; + case AS_LED_INDICATOR: + flash->indicator_node = of_node_get(child); + break; + default: + dev_warn(&flash->client->dev, + "unknown LED %u encountered, ignoring\n", id); + break; + } + } + if (!flash->flash_node) { dev_err(&flash->client->dev, "can't find flash node\n"); return -ENODEV; @@ -538,7 +561,6 @@ static int as3645a_parse_node(struct as3645a *flash, &cfg->peak); cfg->peak = AS_PEAK_mA_TO_REG(cfg->peak); - flash->indicator_node = of_get_child_by_name(node, "indicator"); if (!flash->indicator_node) { dev_warn(&flash->client->dev, "can't find indicator node\n"); From 12c4b878e71fa8b65bc479b2460765c7d1d81a26 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 22 Sep 2017 12:32:38 +0300 Subject: [PATCH 0633/1322] as3645a: Unregister indicator LED on device unbind The indicator LED was registered in probe but was not removed in driver remove callback. Fix this. Signed-off-by: Sakari Ailus Signed-off-by: Jacek Anaszewski --- drivers/leds/leds-as3645a.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/leds-as3645a.c b/drivers/leds/leds-as3645a.c index 605e0c64e974..9a257f969300 100644 --- a/drivers/leds/leds-as3645a.c +++ b/drivers/leds/leds-as3645a.c @@ -743,6 +743,7 @@ static int as3645a_remove(struct i2c_client *client) as3645a_set_control(flash, AS_MODE_EXT_TORCH, false); v4l2_flash_release(flash->vf); + v4l2_flash_release(flash->vfind); led_classdev_flash_unregister(&flash->fled); led_classdev_unregister(&flash->iled_cdev); From 28585a832602747cbfa88ad8934013177a3aae38 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 22 Sep 2017 14:10:22 -0700 Subject: [PATCH 0634/1322] rcu: Allow for page faults in NMI handlers A number of architecture invoke rcu_irq_enter() on exception entry in order to allow RCU read-side critical sections in the exception handler when the exception is from an idle or nohz_full CPU. This works, at least unless the exception happens in an NMI handler. In that case, rcu_nmi_enter() would already have exited the extended quiescent state, which would mean that rcu_irq_enter() would (incorrectly) cause RCU to think that it is again in an extended quiescent state. This will in turn result in lockdep splats in response to later RCU read-side critical sections. This commit therefore causes rcu_irq_enter() and rcu_irq_exit() to take no action if there is an rcu_nmi_enter() in effect, thus avoiding the unscheduled return to RCU quiescent state. This in turn should make the kernel safe for on-demand RCU voyeurism. Link: http://lkml.kernel.org/r/20170922211022.GA18084@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..63bee8e1b193 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -888,6 +888,11 @@ void rcu_irq_exit(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdtp->dynticks_nesting < 1); if (rdtp->dynticks_nesting <= 1) { @@ -1020,6 +1025,11 @@ void rcu_irq_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && From 9aadde91b3c035413c806619beb3e3ef6e697953 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:22:19 -0400 Subject: [PATCH 0635/1322] extable: Consolidate *kernel_text_address() functions The functionality between kernel_text_address() and _kernel_text_address() is the same except that _kernel_text_address() does a little more (that function needs a rename, but that can be done another time). Instead of having duplicate code in both, simply have _kernel_text_address() calls kernel_text_address() instead. This is marked for stable because there's an RCU bug that can happen if one of these functions gets called while RCU is not watching. That fix depends on this fix to keep from having to write the fix twice. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. 
McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..a7024a494faf 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr) int __kernel_text_address(unsigned long addr) { - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - if (is_ftrace_trampoline(addr)) - return 1; - if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; - if (is_bpf_text_address(addr)) + if (kernel_text_address(addr)) return 1; /* * There might be init symbols in saved stacktraces. From e8cac8b1d10589be45671a5ade0926a639b543b7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 17:36:32 -0400 Subject: [PATCH 0636/1322] extable: Enable RCU if it is not watching in kernel_text_address() If kernel_text_address() is called when RCU is not watching, it can cause an RCU bug because is_module_text_address(), the is_kprobe_*insn_slot() and is_bpf_text_address() functions require the use of RCU. Only enable RCU if it is not currently watching before it calls is_module_text_address(). The use of rcu_nmi_enter() is used to enable RCU because kernel_text_address() can happen pretty much anywhere (like an NMI), and even from within an NMI. It is called via save_stack_trace() that can be called by any WARN() or tracing function, which can happen while RCU is not watching (for example, going to or coming from idle, or during CPU take down or bring up). Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/extable.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index a7024a494faf..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -119,17 +119,42 @@ int __kernel_text_address(unsigned long addr) int kernel_text_address(unsigned long addr) { + bool no_rcu; + int ret = 1; + if (core_kernel_text(addr)) return 1; + + /* + * If a stack dump happens while RCU is not watching, then + * RCU needs to be notified that it requires to start + * watching again. This can happen either by tracing that + * triggers a stack trace, or a WARN() that happens during + * coming back from idle, or cpu on or offlining. + * + * is_module_text_address() as well as the kprobe slots + * and is_bpf_text_address() require RCU to be watching. + */ + no_rcu = !rcu_is_watching(); + + /* Treat this like an NMI as it can happen anywhere */ + if (no_rcu) + rcu_nmi_enter(); + if (is_module_text_address(addr)) - return 1; + goto out; if (is_ftrace_trampoline(addr)) - return 1; + goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; + goto out; if (is_bpf_text_address(addr)) - return 1; - return 0; + goto out; + ret = 0; +out: + if (no_rcu) + rcu_nmi_exit(); + + return ret; } /* From 15516c89acce948debc4c598e03c3fee53045797 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Sep 2017 13:00:21 -0400 Subject: [PATCH 0637/1322] tracing: Remove RCU work arounds from stack tracer Currently the stack tracer calls rcu_irq_enter() to make sure RCU is watching when it records a stack trace. But if the stack tracer is triggered while tracing inside of a rcu_irq_enter(), calling rcu_irq_enter() unconditionally can be problematic. 
The reason for having rcu_irq_enter() in the first place has been fixed from within the saving of the stack trace code, and there's no reason for doing it in the stack tracer itself. Just remove it. Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Acked-by: Paul E. McKenney Suggested-by: "Paul E. McKenney" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. - */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } From 05cf97e7a619fc7ede81ee6bb8ebfa7531b633f5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 22 Sep 2017 01:01:11 +0200 Subject: [PATCH 0638/1322] cnic: Fix an error handling path in 'cnic_alloc_bnx2x_resc()' All the error handling paths 'goto error', except this one. We should also go to error in this case, or some resources will be leaking. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/cnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index cec94bbb2ea5..8bc126a156e8 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -1278,7 +1278,7 @@ static int cnic_alloc_bnx2x_resc(struct cnic_dev *dev) ret = cnic_alloc_dma(dev, kwq_16_dma, pages, 0); if (ret) - return -ENOMEM; + goto error; n = CNIC_PAGE_SIZE / CNIC_KWQ16_DATA_SIZE; for (i = 0, j = 0; i < cp->max_cid_space; i++) { From 5c346525d3591cb032eca86d0f904cc01f1069ff Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 21 Sep 2017 18:00:36 -0600 Subject: [PATCH 0639/1322] net: qualcomm: rmnet: Fix rcu splat in rmnet_is_real_dev_registered Xiaolong reported a suspicious rcu_dereference_check in the device unregister notifier callback. Since we do not dereference the rx_handler_data, it's ok to just check for the value of the pointer. Note that this section is already protected by rtnl_lock. [ 101.364846] WARNING: suspicious RCU usage [ 101.365654] 4.13.0-rc6-01701-gceed73a #1 Not tainted [ 101.370873] ----------------------------- [ 101.372472] drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c:57 suspicious rcu_dereference_check() usage! 
[ 101.374427] [ 101.374427] other info that might help us debug this: [ 101.374427] [ 101.387491] [ 101.387491] rcu_scheduler_active = 2, debug_locks = 1 [ 101.389368] 1 lock held by trinity-main/2809: [ 101.390736] #0: (rtnl_mutex){+.+.+.}, at: [<8146085b>] rtnl_lock+0xf/0x11 [ 101.395482] [ 101.395482] stack backtrace: [ 101.396948] CPU: 0 PID: 2809 Comm: trinity-main Not tainted 4.13.0-rc6-01701-gceed73a #1 [ 101.398857] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014 [ 101.401079] Call Trace: [ 101.401656] dump_stack+0xa1/0xeb [ 101.402871] lockdep_rcu_suspicious+0xc7/0xd0 [ 101.403665] rmnet_is_real_dev_registered+0x40/0x4e [ 101.405199] rmnet_config_notify_cb+0x2c/0x142 [ 101.406344] ? wireless_nlevent_flush+0x47/0x71 [ 101.407385] notifier_call_chain+0x2d/0x47 [ 101.408645] raw_notifier_call_chain+0xc/0xe [ 101.409882] call_netdevice_notifiers_info+0x41/0x49 [ 101.411402] call_netdevice_notifiers+0xc/0xe [ 101.412713] rollback_registered_many+0x268/0x36e [ 101.413702] rollback_registered+0x39/0x56 [ 101.414965] unregister_netdevice_queue+0x79/0x88 [ 101.415908] unregister_netdev+0x16/0x1d Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Signed-off-by: Subash Abhinov Kasiviswanathan Reported-by: kernel test robot Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 98f22551eb45..1e33aea59f50 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -51,10 +51,7 @@ struct rmnet_walk_data { static int rmnet_is_real_dev_registered(const struct net_device *real_dev) { - rx_handler_func_t *rx_handler; - - rx_handler = rcu_dereference(real_dev->rx_handler); - return (rx_handler == rmnet_rx_handler); + return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler; } /* Needs rtnl lock */ From 39e50d9637f9a31967ac9e956b829ee8b50a750f Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 22 Sep 2017 10:20:21 -0400 Subject: [PATCH 0640/1322] forcedeth: optimize the xmit/rx with unlikely In the xmit/rx fastpath, the function dma_map_single rarely fails. Therefore, add an unlikely() optimization to this error check conditional. Signed-off-by: Zhu Yanjun Signed-off-by: David S. 
Miller --- drivers/net/ethernet/nvidia/forcedeth.c | 26 +++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index b605b94f4567..a235e8881af9 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -1817,8 +1817,8 @@ static int nv_alloc_rx(struct net_device *dev) skb->data, skb_tailroom(skb), DMA_FROM_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, - np->put_rx_ctx->dma)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + np->put_rx_ctx->dma))) { kfree_skb(skb); goto packet_dropped; } @@ -1858,8 +1858,8 @@ static int nv_alloc_rx_optimized(struct net_device *dev) skb->data, skb_tailroom(skb), DMA_FROM_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, - np->put_rx_ctx->dma)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + np->put_rx_ctx->dma))) { kfree_skb(skb); goto packet_dropped; } @@ -2227,8 +2227,8 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev) np->put_tx_ctx->dma = dma_map_single(&np->pci_dev->dev, skb->data + offset, bcnt, DMA_TO_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, - np->put_tx_ctx->dma)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + np->put_tx_ctx->dma))) { /* on DMA mapping error - drop the packet */ dev_kfree_skb_any(skb); u64_stats_update_begin(&np->swstats_tx_syncp); @@ -2268,7 +2268,8 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev) frag, offset, bcnt, DMA_TO_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, np->put_tx_ctx->dma)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + np->put_tx_ctx->dma))) { /* Unwind the mapped fragments */ do { @@ -2377,8 +2378,8 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb, np->put_tx_ctx->dma = dma_map_single(&np->pci_dev->dev, skb->data + offset, bcnt, DMA_TO_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, - np->put_tx_ctx->dma)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + np->put_tx_ctx->dma))) { /* on DMA mapping error - drop the packet */ dev_kfree_skb_any(skb); u64_stats_update_begin(&np->swstats_tx_syncp); @@ -2419,7 +2420,8 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb, bcnt, DMA_TO_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, np->put_tx_ctx->dma)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + np->put_tx_ctx->dma))) { /* Unwind the mapped fragments */ do { @@ -5075,8 +5077,8 @@ static int nv_loopback_test(struct net_device *dev) test_dma_addr = dma_map_single(&np->pci_dev->dev, tx_skb->data, skb_tailroom(tx_skb), DMA_FROM_DEVICE); - if (dma_mapping_error(&np->pci_dev->dev, - test_dma_addr)) { + if (unlikely(dma_mapping_error(&np->pci_dev->dev, + test_dma_addr))) { dev_kfree_skb_any(tx_skb); goto out; } From ab5348c9c23cd253f5902980d2d8fe067dc24c82 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Wed, 26 Jul 2017 22:27:05 -0400 Subject: [PATCH 0641/1322] security: fix description of values returned by cap_inode_need_killpriv cap_inode_need_killpriv returns 1 if security.capability exists and has a value and inode_killpriv() is required, 0 otherwise. Fix the description of the return value to reflect this. 
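For callers the corrected contract is a plain boolean; a hedged sketch of how such a value is consumed (illustrative only, not the actual VFS call site) is:

#include <linux/security.h>

/* Illustrative only: before applying an ATTR_KILL_PRIV change, drop the
 * file's security.capability xattr when the hook reports one is present. */
static int example_kill_priv(struct dentry *dentry)
{
        if (cap_inode_need_killpriv(dentry))    /* 1: xattr present */
                return cap_inode_killpriv(dentry);
        return 0;                               /* 0: nothing to strip */
}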
Signed-off-by: Stefan Berger Reviewed-by: Serge Hallyn Signed-off-by: James Morris --- security/commoncap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index 7abebd782d5e..123426900d25 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -300,10 +300,10 @@ static inline void bprm_clear_caps(struct linux_binprm *bprm) * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should - * inode_killpriv() be invoked or the change rejected? + * inode_killpriv() be invoked or the change rejected. * - * Returns 0 if granted; +ve if granted, but inode_killpriv() is required; and - * -ve to deny the change. + * Returns 1 if security.capability has a value, meaning inode_killpriv() + * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { From c2a9c4bf034ab7415ec272c4c86f1c0b796f80e6 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 17 Aug 2017 23:04:21 +0530 Subject: [PATCH 0642/1322] tpm: vtpm: constify vio_device_id vio_device_id are not supposed to change at runtime. All functions working with vio_device_id provided by work with const vio_device_id. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Reviewed-by: Jason Gunthorpe Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm_ibmvtpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index f01d083eced2..d2ce46bd2471 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -32,7 +32,7 @@ static const char tpm_ibmvtpm_driver_name[] = "tpm_ibmvtpm"; -static struct vio_device_id tpm_ibmvtpm_device_table[] = { +static const struct vio_device_id tpm_ibmvtpm_device_table[] = { { "IBM,vtpm", "IBM,vtpm"}, { "", "" } }; From e1ec650f9ae7d1e66f77bf14c86e4e9dc629a4d5 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 6 Jul 2017 23:18:39 +0530 Subject: [PATCH 0643/1322] tpm: tpm_crb: constify acpi_device_id. acpi_device_id are not supposed to change at runtime. All functions working with acpi_device_id provided by work with const acpi_device_id. So mark the non-const structs as const. 
File size before: text data bss dec hex filename 4198 608 0 4806 12c6 drivers/char/tpm/tpm_crb.o File size After adding 'const': text data bss dec hex filename 4262 520 0 4782 12ae drivers/char/tpm/tpm_crb.o Signed-off-by: Arvind Yadav Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm_crb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index a4ac63a21d8a..8f0a98dea327 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -665,7 +665,7 @@ static const struct dev_pm_ops crb_pm = { SET_RUNTIME_PM_OPS(crb_pm_runtime_suspend, crb_pm_runtime_resume, NULL) }; -static struct acpi_device_id crb_device_ids[] = { +static const struct acpi_device_id crb_device_ids[] = { {"MSFT0101", 0}, {"", 0}, }; From 5d0e4d78149b9745ea34848e950e734f7db3b95f Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Tue, 27 Jun 2017 12:27:23 +0200 Subject: [PATCH 0644/1322] Documentation: tpm: add powered-while-suspended binding documentation Add a new powered-while-suspended property to control the behavior of the TPM suspend/resume. Signed-off-by: Enric Balletbo i Serra Signed-off-by: Sonny Rao Reviewed-by: Jason Gunthorpe Reviewed-by: Jarkko Sakkinen Acked-by: Rob Herring Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt index 8cb638b7e89c..85c8216fc335 100644 --- a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt +++ b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt @@ -8,6 +8,12 @@ Required properties: the firmware event log - linux,sml-size : size of the memory allocated for the firmware event log +Optional properties: + +- powered-while-suspended: present when the TPM is left powered on between + suspend and resume (makes the suspend/resume + callbacks do nothing). + Example (for OpenPower Systems with Nuvoton TPM 2.0 on I2C) ---------------------------------------------------------- From 9f3fc7bcddcb51234e23494531f93ab60475e1c3 Mon Sep 17 00:00:00 2001 From: Hamza Attak Date: Mon, 14 Aug 2017 19:09:16 +0100 Subject: [PATCH 0645/1322] tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers The patch simply replaces all msleep function calls with usleep_range calls in the generic drivers. Tested with an Infineon TPM 1.2, using the generic tpm-tis module, for a thousand PCR extends, we see results going from 1m57s unpatched to 40s with the new patch. We obtain similar results when using the original and patched tpm_infineon driver, which is also part of the patch. Similarly with a STM TPM 2.0, using the CRB driver, it takes about 20ms per extend unpatched and around 7ms with the new patch. Note that the PCR consistency is untouched with this patch, each TPM has been tested with 10 million extends and the aggregated PCR value is continuously verified to be correct. As an extension of this work, this could potentially and easily be applied to other vendor's drivers. Still, these changes are not included in the proposed patch as they are untested. 
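The gain comes from how the two primitives sleep: msleep() is jiffy-granular and rounds short delays up, while usleep_range() uses hrtimers with an explicit window, so a 5 ms pause stays close to 5 ms. A rough contrast (illustrative, assuming HZ=250; not driver code or a measurement from this patch):

#include <linux/delay.h>

/* Illustrative contrast only: with HZ=250 one jiffy is 4 ms, so the
 * jiffy-granular msleep(5) rounds up to a later tick, while the
 * hrtimer-based range sleep stays close to the requested 5 ms. */
static void example_short_pause(void)
{
        msleep(5);                              /* can stretch to a few jiffies */
        usleep_range(5 * 1000, 5 * 1000 + 300); /* ~5.0 to 5.3 ms window */
}

The 300 us of slack mirrors the TPM_TIMEOUT_RANGE_US constant introduced below and gives the scheduler room to coalesce nearby wakeups.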
Signed-off-by: Hamza Attak Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm-interface.c | 10 +++++----- drivers/char/tpm/tpm.h | 9 ++++++++- drivers/char/tpm/tpm2-cmd.c | 2 +- drivers/char/tpm/tpm_infineon.c | 6 +++--- drivers/char/tpm/tpm_tis_core.c | 8 ++++---- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index fe597e6c55c4..1d6729be4cd6 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -455,7 +455,7 @@ ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, goto out; } - msleep(TPM_TIMEOUT); /* CHECK */ + tpm_msleep(TPM_TIMEOUT); rmb(); } while (time_before(jiffies, stop)); @@ -970,7 +970,7 @@ int tpm_do_selftest(struct tpm_chip *chip) dev_info( &chip->dev, HW_ERR "TPM command timed out during continue self test"); - msleep(delay_msec); + tpm_msleep(delay_msec); continue; } @@ -985,7 +985,7 @@ int tpm_do_selftest(struct tpm_chip *chip) } if (rc != TPM_WARN_DOING_SELFTEST) return rc; - msleep(delay_msec); + tpm_msleep(delay_msec); } while (--loops > 0); return rc; @@ -1085,7 +1085,7 @@ again: } } else { do { - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); status = chip->ops->status(chip); if ((status & mask) == mask) return 0; @@ -1150,7 +1150,7 @@ int tpm_pm_suspend(struct device *dev) */ if (rc != TPM_WARN_RETRY) break; - msleep(TPM_TIMEOUT_RETRY); + tpm_msleep(TPM_TIMEOUT_RETRY); } if (rc) diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 04fbff2edbf3..2d5466a72e40 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -50,7 +50,8 @@ enum tpm_const { enum tpm_timeout { TPM_TIMEOUT = 5, /* msecs */ - TPM_TIMEOUT_RETRY = 100 /* msecs */ + TPM_TIMEOUT_RETRY = 100, /* msecs */ + TPM_TIMEOUT_RANGE_US = 300 /* usecs */ }; /* TPM addresses */ @@ -527,6 +528,12 @@ int tpm_pm_resume(struct device *dev); int wait_for_tpm_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout, wait_queue_head_t *queue, bool check_cancel); +static inline void tpm_msleep(unsigned int delay_msec) +{ + usleep_range(delay_msec * 1000, + (delay_msec * 1000) + TPM_TIMEOUT_RANGE_US); +}; + struct tpm_chip *tpm_chip_find_get(int chip_num); __must_check int tpm_try_get_ops(struct tpm_chip *chip); void tpm_put_ops(struct tpm_chip *chip); diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index f7f34b2aa981..e1a41b788f08 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -899,7 +899,7 @@ static int tpm2_do_selftest(struct tpm_chip *chip) if (rc != TPM2_RC_TESTING) break; - msleep(delay_msec); + tpm_msleep(delay_msec); } return rc; diff --git a/drivers/char/tpm/tpm_infineon.c b/drivers/char/tpm/tpm_infineon.c index 3b1b9f9322d5..d8f10047fbba 100644 --- a/drivers/char/tpm/tpm_infineon.c +++ b/drivers/char/tpm/tpm_infineon.c @@ -191,7 +191,7 @@ static int wait(struct tpm_chip *chip, int wait_for_bit) /* check the status-register if wait_for_bit is set */ if (status & 1 << wait_for_bit) break; - msleep(TPM_MSLEEP_TIME); + tpm_msleep(TPM_MSLEEP_TIME); } if (i == TPM_MAX_TRIES) { /* timeout occurs */ if (wait_for_bit == STAT_XFE) @@ -226,7 +226,7 @@ static void tpm_wtx(struct tpm_chip *chip) wait_and_send(chip, TPM_CTRL_WTX); wait_and_send(chip, 0x00); wait_and_send(chip, 0x00); - msleep(TPM_WTX_MSLEEP_TIME); + tpm_msleep(TPM_WTX_MSLEEP_TIME); } static void tpm_wtx_abort(struct tpm_chip *chip) @@ -237,7 +237,7 @@ static 
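Because the hypervisor takes the crq by value in a register, the packed word only has to reproduce the big-endian byte layout shown in the new comment. A stand-alone user-space check (the field values below are made up, not the driver's IBMVTPM_VALID_CMD or VTPM_TPM_COMMAND constants) confirms the shifts match that layout:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>          /* htons()/htonl() give the big-endian view */

int main(void)
{
        uint8_t  valid = 0x80, msg = 0x01;      /* made-up example values */
        uint16_t len   = 0x0400;
        uint32_t data  = 0x12345678;

        uint64_t w1 = ((uint64_t)valid << 56) | ((uint64_t)msg << 48) |
                      ((uint64_t)len << 32) | (uint64_t)data;

        /* Bytes a big-endian host would store for the packed struct
         * { u8 valid; u8 msg; __be16 len; __be32 data; }. */
        uint8_t be[8] = { valid, msg };
        uint16_t nlen  = htons(len);
        uint32_t ndata = htonl(data);
        memcpy(be + 2, &nlen, sizeof(nlen));
        memcpy(be + 4, &ndata, sizeof(ndata));

        for (int i = 0; i < 8; i++)
                assert(((w1 >> (56 - 8 * i)) & 0xff) == be[i]);
        return 0;
}

Word 1 of the crq is reserved, which is why ibmvtpm_send_crq_word() in the patch always passes 0 for it.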
void tpm_wtx_abort(struct tpm_chip *chip) wait_and_send(chip, 0x00); wait_and_send(chip, 0x00); number_of_wtx = 0; - msleep(TPM_WTX_MSLEEP_TIME); + tpm_msleep(TPM_WTX_MSLEEP_TIME); } static int tpm_inf_recv(struct tpm_chip *chip, u8 * buf, size_t count) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index b617b2eeb080..63bc6c3b949e 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -51,7 +51,7 @@ static int wait_startup(struct tpm_chip *chip, int l) if (access & TPM_ACCESS_VALID) return 0; - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); } while (time_before(jiffies, stop)); return -1; } @@ -117,7 +117,7 @@ again: do { if (check_locality(chip, l)) return l; - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); } while (time_before(jiffies, stop)); } return -1; @@ -164,7 +164,7 @@ static int get_burstcount(struct tpm_chip *chip) burstcnt = (value >> 8) & 0xFFFF; if (burstcnt) return burstcnt; - msleep(TPM_TIMEOUT); + tpm_msleep(TPM_TIMEOUT); } while (time_before(jiffies, stop)); return -EBUSY; } @@ -396,7 +396,7 @@ static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len) priv->irq = irq; chip->flags |= TPM_CHIP_FLAG_IRQ; if (!priv->irq_tested) - msleep(1); + tpm_msleep(1); if (!priv->irq_tested) disable_interrupts(chip); priv->irq_tested = true; From fb154e0e0a95249459df054241a9e8f4cca56062 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 24 Feb 2017 20:35:16 +0100 Subject: [PATCH 0646/1322] tpm: ibmvtpm: simplify crq initialization and document crq format The crq is passed in registers and is the same on BE and LE hosts. However, current implementation allocates a structure on-stack to represent the crq, initializes the members swapping them to BE, and loads the structure swapping it from BE. This is pointless and causes GCC warnings about ununitialized members. Get rid of the structure and the warnings. 
Signed-off-by: Michal Suchanek Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: James Morris --- drivers/char/tpm/tpm_ibmvtpm.c | 96 +++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index d2ce46bd2471..25f6e2665385 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -39,19 +39,63 @@ static const struct vio_device_id tpm_ibmvtpm_device_table[] = { MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table); /** + * + * ibmvtpm_send_crq_word - Send a CRQ request + * @vdev: vio device struct + * @w1: pre-constructed first word of tpm crq (second word is reserved) + * + * Return: + * 0 - Success + * Non-zero - Failure + */ +static int ibmvtpm_send_crq_word(struct vio_dev *vdev, u64 w1) +{ + return plpar_hcall_norets(H_SEND_CRQ, vdev->unit_address, w1, 0); +} + +/** + * * ibmvtpm_send_crq - Send a CRQ request * * @vdev: vio device struct - * @w1: first word - * @w2: second word + * @valid: Valid field + * @msg: Type field + * @len: Length field + * @data: Data field + * + * The ibmvtpm crq is defined as follows: + * + * Byte | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 + * ----------------------------------------------------------------------- + * Word0 | Valid | Type | Length | Data + * ----------------------------------------------------------------------- + * Word1 | Reserved + * ----------------------------------------------------------------------- + * + * Which matches the following structure (on bigendian host): + * + * struct ibmvtpm_crq { + * u8 valid; + * u8 msg; + * __be16 len; + * __be32 data; + * __be64 reserved; + * } __attribute__((packed, aligned(8))); + * + * However, the value is passed in a register so just compute the numeric value + * to load into the register avoiding byteswap altogether. Endian only affects + * memory loads and stores - registers are internally represented the same. 
* * Return: - * 0 -Sucess + * 0 (H_SUCCESS) - Success * Non-zero - Failure */ -static int ibmvtpm_send_crq(struct vio_dev *vdev, u64 w1, u64 w2) +static int ibmvtpm_send_crq(struct vio_dev *vdev, + u8 valid, u8 msg, u16 len, u32 data) { - return plpar_hcall_norets(H_SEND_CRQ, vdev->unit_address, w1, w2); + u64 w1 = ((u64)valid << 56) | ((u64)msg << 48) | ((u64)len << 32) | + (u64)data; + return ibmvtpm_send_crq_word(vdev, w1); } /** @@ -109,8 +153,6 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) { struct ibmvtpm_dev *ibmvtpm = dev_get_drvdata(&chip->dev); - struct ibmvtpm_crq crq; - __be64 *word = (__be64 *)&crq; int rc, sig; if (!ibmvtpm->rtce_buf) { @@ -137,10 +179,6 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) spin_lock(&ibmvtpm->rtce_lock); ibmvtpm->res_len = 0; memcpy((void *)ibmvtpm->rtce_buf, (void *)buf, count); - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_TPM_COMMAND; - crq.len = cpu_to_be16(count); - crq.data = cpu_to_be32(ibmvtpm->rtce_dma_handle); /* * set the processing flag before the Hcall, since we may get the @@ -148,8 +186,9 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) */ ibmvtpm->tpm_processing_cmd = true; - rc = ibmvtpm_send_crq(ibmvtpm->vdev, be64_to_cpu(word[0]), - be64_to_cpu(word[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_TPM_COMMAND, + count, ibmvtpm->rtce_dma_handle); if (rc != H_SUCCESS) { dev_err(ibmvtpm->dev, "tpm_ibmvtpm_send failed rc=%d\n", rc); rc = 0; @@ -182,15 +221,10 @@ static u8 tpm_ibmvtpm_status(struct tpm_chip *chip) */ static int ibmvtpm_crq_get_rtce_size(struct ibmvtpm_dev *ibmvtpm) { - struct ibmvtpm_crq crq; - u64 *buf = (u64 *) &crq; int rc; - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_GET_RTCE_BUFFER_SIZE; - - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(buf[0]), - cpu_to_be64(buf[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_GET_RTCE_BUFFER_SIZE, 0, 0); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_get_rtce_size failed rc=%d\n", rc); @@ -210,15 +244,10 @@ static int ibmvtpm_crq_get_rtce_size(struct ibmvtpm_dev *ibmvtpm) */ static int ibmvtpm_crq_get_version(struct ibmvtpm_dev *ibmvtpm) { - struct ibmvtpm_crq crq; - u64 *buf = (u64 *) &crq; int rc; - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_GET_VERSION; - - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(buf[0]), - cpu_to_be64(buf[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_GET_VERSION, 0, 0); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_get_version failed rc=%d\n", rc); @@ -238,7 +267,7 @@ static int ibmvtpm_crq_send_init_complete(struct ibmvtpm_dev *ibmvtpm) { int rc; - rc = ibmvtpm_send_crq(ibmvtpm->vdev, INIT_CRQ_COMP_CMD, 0); + rc = ibmvtpm_send_crq_word(ibmvtpm->vdev, INIT_CRQ_COMP_CMD); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_send_init_complete failed rc=%d\n", rc); @@ -258,7 +287,7 @@ static int ibmvtpm_crq_send_init(struct ibmvtpm_dev *ibmvtpm) { int rc; - rc = ibmvtpm_send_crq(ibmvtpm->vdev, INIT_CRQ_CMD, 0); + rc = ibmvtpm_send_crq_word(ibmvtpm->vdev, INIT_CRQ_CMD); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "ibmvtpm_crq_send_init failed rc=%d\n", rc); @@ -340,15 +369,10 @@ static int tpm_ibmvtpm_suspend(struct device *dev) { struct tpm_chip *chip = dev_get_drvdata(dev); struct ibmvtpm_dev *ibmvtpm = dev_get_drvdata(&chip->dev); - struct ibmvtpm_crq crq; - 
u64 *buf = (u64 *) &crq; int rc = 0; - crq.valid = (u8)IBMVTPM_VALID_CMD; - crq.msg = (u8)VTPM_PREPARE_TO_SUSPEND; - - rc = ibmvtpm_send_crq(ibmvtpm->vdev, cpu_to_be64(buf[0]), - cpu_to_be64(buf[1])); + rc = ibmvtpm_send_crq(ibmvtpm->vdev, + IBMVTPM_VALID_CMD, VTPM_PREPARE_TO_SUSPEND, 0, 0); if (rc != H_SUCCESS) dev_err(ibmvtpm->dev, "tpm_ibmvtpm_suspend failed rc=%d\n", rc); From 656f083116a4799d8c0194976b8a2d66bf306538 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:44 +0200 Subject: [PATCH 0647/1322] x86/fpu: Rename copyin_to_xsaves()/copyout_from_xsaves() to copy_user_to_xstate()/copy_xstate_to_user() The 'copyin/copyout' nomenclature needlessly departs from what the modern FPU code uses, which is: copy_fpregs_to_fpstate() copy_fpstate_to_sigframe() copy_fregs_to_user() copy_fxregs_to_kernel() copy_fxregs_to_user() copy_kernel_to_fpregs() copy_kernel_to_fregs() copy_kernel_to_fxregs() copy_kernel_to_xregs() copy_user_to_fregs() copy_user_to_fxregs() copy_user_to_xregs() copy_xregs_to_kernel() copy_xregs_to_user() I.e. according to this pattern, the following rename should be done: copyin_to_xsaves() -> copy_user_to_xstate() copyout_from_xsaves() -> copy_xstate_to_user() or, if we want to be pedantic, denote that that the user-space format is ptrace: copyin_to_xsaves() -> copy_user_ptrace_to_xstate() copyout_from_xsaves() -> copy_xstate_to_user_ptrace() But I'd suggest the shorter, non-pedantic name. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-2-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 1b2799e0699a..a1baa17e9748 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b188b16841e3..165d0545c924 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -92,7 +92,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); if (using_compacted_format()) { - ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* @@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copyin_to_xsaves(kbuf, ubuf, xsave); + ret = copy_user_to_xstate(kbuf, ubuf, xsave); else ret = 
user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 83c23c230b4c..b1fe9a1fc4e0 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (using_compacted_format()) { - err = copyin_to_xsaves(NULL, buf_fx, + err = copy_user_to_xstate(NULL, buf_fx, &fpu->state.xsave); } else { err = __copy_from_user(&fpu->state.xsave, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c24ac1efb12d..e7bb41723eaa 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -951,7 +951,7 @@ static inline int xstate_copyout(unsigned int pos, unsigned int count, * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; @@ -1023,7 +1023,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; From f0d4f30a7fd299587840a028655285a87f334904 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:45 +0200 Subject: [PATCH 0648/1322] x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() copy_xstate_to_user() is a weird API - in part due to a bad API inherited from the regset APIs. But don't propagate that bad API choice into the FPU code - so as a first step split the API into kernel and user buffer handling routines. (Also split the xstate_copyout() internal helper.) The split API is a dumb duplication that should be obviously correct, the real splitting will be done in the next patch. 
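A rough userspace analogue of the dispatch this split enables, assuming (as at the regset call sites) that exactly one of the two destinations is non-NULL; this is a sketch, not the kernel code:

#include <stddef.h>
#include <string.h>

/* Stand-ins for the two copy primitives: in the kernel the user-buffer
 * variant is __copy_to_user() and may fault, the kernel variant cannot. */
static int copy_chunk_to_kernel(void *kbuf, const void *src, size_t n)
{
	memcpy(kbuf, src, n);
	return 0;
}

static int copy_chunk_to_user(void *ubuf, const void *src, size_t n)
{
	memcpy(ubuf, src, n);	/* faults are ignored in this toy version */
	return 0;
}

/* The caller (xstateregs_get() in the patch) picks the routine that
 * matches the destination buffer it was handed. */
static int export_state(void *kbuf, void *ubuf, const void *state, size_t n)
{
	return kbuf ? copy_chunk_to_kernel(kbuf, state, n)
		    : copy_chunk_to_user(ubuf, state, n);
}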
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +- arch/x86/kernel/fpu/regset.c | 5 +- arch/x86/kernel/fpu/xstate.c | 110 ++++++++++++++++++++++++++++-- 3 files changed, 109 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index a1baa17e9748..92dc8ca14124 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 165d0545c924..b6d12d66d04b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -92,7 +92,10 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); if (using_compacted_format()) { - ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); + if (kbuf) + ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave); + else + ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e7bb41723eaa..38561539cb99 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -924,10 +924,106 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int xstate_copyout(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +static inline int +__copy_xstate_to_kernel(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) +{ + if ((count == 0) || (pos < start_pos)) + return 0; + + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } + } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a kernel-space ptrace buffer. + * + * It supports partial copy but pos always starts from zero. This is called + * from xstateregs_get() and there we check the CPU has XSAVES. 
+ */ +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(pos != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} + +static inline int +__copy_xstate_to_user(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -977,7 +1073,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count); if (ret) return ret; @@ -992,7 +1088,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count); if (ret) return ret; @@ -1009,7 +1105,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; From 4d981cf2d96f29cdfa7d4972c8b377fe7baa9c4c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:46 +0200 Subject: [PATCH 0649/1322] x86/fpu: Remove 'ubuf' parameter from the copy_xstate_to_kernel() APIs The 'ubuf' parameter is unused in the _kernel() side of the API, remove it. This simplifies the code and makes it easier to think about. 
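For orientation between these diffs, a self-contained toy model of the export loop that copy_xstate_to_kernel() performs: walk the feature bitmap and place each in-use component at its standard-format offset. The layout tables are assumed values, not the real xstate_offsets[]/xstate_sizes[], and the bounds handling follows the form the series converges on:

#include <stdint.h>
#include <string.h>

#define TOY_NFEAT 4

/* Hypothetical per-component layout; the header is assumed to occupy
 * bytes 0-15 of the standard-format buffer. */
static const unsigned int toy_off[TOY_NFEAT]  = { 16, 32, 48, 64 };
static const unsigned int toy_size[TOY_NFEAT] = { 16, 16, 16, 16 };

/* Copy every component whose bit is set in xfeatures into the
 * standard-format buffer, stopping at anything that would not fit. */
static void toy_export(char *kbuf, unsigned int size_total,
		       const void *component[TOY_NFEAT], uint64_t xfeatures)
{
	for (int i = 0; i < TOY_NFEAT; i++) {
		if (!((xfeatures >> i) & 1))
			continue;		/* component not in use */
		if (toy_off[i] + toy_size[i] > size_total)
			break;			/* output buffer too small */
		memcpy(kbuf + toy_off[i], component[i], toy_size[i]);
	}
}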
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 21 ++++++--------------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 92dc8ca14124..c762574a245f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,7 +48,7 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b6d12d66d04b..34e74adf9d5d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,7 +93,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (using_compacted_format()) { if (kbuf) - ret = copy_xstate_to_kernel(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); else ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 38561539cb99..71d3bda2b898 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -926,7 +926,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ static inline int __copy_xstate_to_kernel(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, + void *kbuf, const void *data, const int start_pos, const int end_pos) { @@ -936,12 +936,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, if (end_pos < 0 || pos < end_pos) { unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; - } + memcpy(kbuf + pos, data, copy); } return 0; } @@ -953,8 +948,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. 
*/ -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave) { unsigned int offset, size; int ret, i; @@ -979,8 +973,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count); if (ret) return ret; @@ -994,8 +987,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, src, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count); if (ret) return ret; @@ -1011,8 +1003,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; From a69c158fb3e7a91220f55029bf222a4e678d16e9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:47 +0200 Subject: [PATCH 0650/1322] x86/fpu: Remove 'kbuf' parameter from the copy_xstate_to_user() APIs The 'kbuf' parameter is unused in the _user() side of the API, remove it. This simplifies the code and makes it easier to think about. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-5-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 25 +++++++------------------ 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c762574a245f..65bd68c30cd0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -49,7 +49,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 34e74adf9d5d..fd6dbdd8fde6 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -95,7 +95,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (kbuf) ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); else - ret = copy_xstate_to_user(pos, count, kbuf, ubuf, xsave); + ret = copy_xstate_to_user(pos, count, ubuf, xsave); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c 
b/arch/x86/kernel/fpu/xstate.c index 71d3bda2b898..2d8f3344875d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1011,10 +1011,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru } static inline int -__copy_xstate_to_user(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -1022,12 +1019,8 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, if (end_pos < 0 || pos < end_pos) { unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; - } + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; } return 0; } @@ -1038,8 +1031,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; int ret, i; @@ -1064,8 +1056,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count); if (ret) return ret; @@ -1079,8 +1070,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, src, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count); if (ret) return ret; @@ -1096,8 +1086,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count); if (ret) return ret; From d7eda6c99cc75f1c41d67abf988f37a10045a370 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:48 +0200 Subject: [PATCH 0651/1322] x86/fpu: Clean up parameter order in the copy_xstate_to_*() APIs Parameter ordering is weird: int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); 'pos' and 'count', which are attributes of the destination buffer, are listed before the destination buffer itself ... List them after the primary arguments instead. This makes the code more similar to regular memcpy() variant APIs. No change in functionality. 
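A small sketch (toy function, not the kernel prototype) of the convention the message appeals to: like memcpy(), the destination comes first and the size-related arguments come last.

#include <string.h>

/* Destination first, then source, then the range arguments, the same
 * shape as memcpy(dst, src, n). The offset/count semantics of the real
 * helpers are unchanged by the reordering; only argument positions move. */
static void toy_copy_to_kernel(void *kbuf, const void *src,
			       unsigned int offset, unsigned int count)
{
	/* toy behaviour only: place 'count' bytes of 'src' at 'offset' */
	memcpy((char *)kbuf + offset, src, count);
}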
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-6-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 65bd68c30cd0..e4430b84939d 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave); -int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index fd6dbdd8fde6..ec1404194b65 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,9 +93,9 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (using_compacted_format()) { if (kbuf) - ret = copy_xstate_to_kernel(pos, count, kbuf, xsave); + ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); else - ret = copy_xstate_to_user(pos, count, ubuf, xsave); + ret = copy_xstate_to_user(ubuf, xsave, pos, count); } else { fpstate_sanitize_xstate(fpu); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2d8f3344875d..0a299468510f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -925,10 +925,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * the source data pointer or increment pos, count, kbuf, and ubuf. */ static inline int -__copy_xstate_to_kernel(unsigned int pos, unsigned int count, - void *kbuf, - const void *data, const int start_pos, - const int end_pos) +__copy_xstate_to_kernel(void *kbuf, + const void *data, + unsigned int pos, unsigned int count, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -948,7 +947,7 @@ __copy_xstate_to_kernel(unsigned int pos, unsigned int count, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. 
*/ -int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, struct xregs_state *xsave) +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) { unsigned int offset, size; int ret, i; @@ -973,7 +972,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(offset, size, kbuf, &header, 0, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count); if (ret) return ret; @@ -987,7 +986,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(offset, size, kbuf, src, 0, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count); if (ret) return ret; @@ -1003,7 +1002,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(offset, size, kbuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count); if (ret) return ret; @@ -1011,7 +1010,7 @@ int copy_xstate_to_kernel(unsigned int pos, unsigned int count, void *kbuf, stru } static inline int -__copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, const void *data, const int start_pos, const int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; @@ -1031,7 +1030,7 @@ __copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, c * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ -int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) { unsigned int offset, size; int ret, i; @@ -1056,7 +1055,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(offset, size, ubuf, &header, 0, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count); if (ret) return ret; @@ -1070,7 +1069,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(offset, size, ubuf, src, 0, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count); if (ret) return ret; @@ -1086,7 +1085,7 @@ int copy_xstate_to_user(unsigned int pos, unsigned int count, void __user *ubuf, offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(offset, size, ubuf, xstate_fx_sw_bytes, 0, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count); if (ret) return ret; From becb2bb72ff906cc0d2bac3ee9574f694364823b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:49 +0200 Subject: [PATCH 0652/1322] x86/fpu: Clean up the parameter definitions of copy_xstate_to_*() Remove pointless 'const' of non-pointer input parameter. 
Remove unnecessary parenthesis that shows uncertainty about arithmetic operator precedence. Clarify copy_xstate_to_user() description. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-7-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0a299468510f..9647e7256179 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,13 +927,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, const int start_pos, const int end_pos) + unsigned int pos, unsigned int count, int start_pos, int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); memcpy(kbuf + pos, data, copy); } @@ -1010,13 +1010,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, const int start_pos, const int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos) { if ((count == 0) || (pos < start_pos)) return 0; if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); if (__copy_to_user(ubuf + pos, data, copy)) return -EFAULT; @@ -1026,7 +1026,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns /* * Convert from kernel XSAVES compacted format to standard format and copy - * to a ptrace buffer. It supports partial copy but pos always starts from + * to a user-space buffer. It supports partial copy but pos always starts from * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. */ From 8a5b731889cbf004b406d988dc591c8a7aac773e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:50 +0200 Subject: [PATCH 0653/1322] x86/fpu: Remove the 'start_pos' parameter from the __copy_xstate_to_*() functions 'start_pos' is always 0, so remove it and remove the pointless check of 'pos < 0' which can not ever be true as 'pos' is unsigned ... No change in functionality. 
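A standalone sketch of the check that remains once 'start_pos' is gone, assuming (as in the code above) that a negative 'end_pos' means "no limit":

#include <assert.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Copy 'count' bytes of 'data' to kbuf + pos, clipped so nothing lands at
 * or beyond end_pos; end_pos < 0 disables the clipping entirely. */
static void toy_copy(char *kbuf, unsigned int pos, unsigned int count,
		     const char *data, int end_pos)
{
	if (!count)
		return;
	if (end_pos < 0 || pos < (unsigned int)end_pos) {
		unsigned int copy = end_pos < 0 ? count
				: MIN(count, (unsigned int)end_pos - pos);

		memcpy(kbuf + pos, data, copy);
	}
}

int main(void)
{
	char buf[8] = { 0 };

	toy_copy(buf, 6, 4, "abcd", 8);	/* clipped from 4 bytes to 2 */
	assert(buf[6] == 'a' && buf[7] == 'b' && buf[5] == 0);
	return 0;
}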
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-8-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9647e7256179..1f50fdaf4c5a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,9 +927,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, int start_pos, int end_pos) + unsigned int pos, unsigned int count, int end_pos) { - if ((count == 0) || (pos < start_pos)) + if (!count) return 0; if (end_pos < 0 || pos < end_pos) { @@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count); if (ret) return ret; @@ -986,7 +986,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count); if (ret) return ret; @@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, 0, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count); if (ret) return ret; @@ -1010,9 +1010,9 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int start_pos, int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos) { - if ((count == 0) || (pos < start_pos)) + if (!count) return 0; if (end_pos < 0 || pos < end_pos) { @@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, count); if (ret) return ret; @@ -1069,7 +1069,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(ubuf, src, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, count); if (ret) return ret; @@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, 0, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count); if (ret) return ret; From 56583c9a1400fe1935edd55b24b4fbbc779b59cb Mon Sep 17 
00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:51 +0200 Subject: [PATCH 0654/1322] x86/fpu: Clarify parameter names in the copy_xstate_to_*() methods Right now there's a confusing mixture of 'offset' and 'size' parameters: - __copy_xstate_to_*() input parameter 'end_pos' not not really an offset, but the full size of the copy to be performed. - input parameter 'count' to copy_xstate_to_*() shadows that of __copy_xstate_to_*()'s 'count' parameter name - but the roles are different: the first one is the total number of bytes to be copied, while the second one is a partial copy size. To unconfuse all this, use a consistent set of parameter names: - 'size' is the partial copy size within a single xstate component - 'size_total' is the total copy requested - 'offset_start' is the requested starting offset. - 'offset' is the offset within an xstate component. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-9-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +-- arch/x86/kernel/fpu/xstate.c | 44 +++++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index e4430b84939d..fed6617a1079 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -48,8 +48,8 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1f50fdaf4c5a..c13083579655 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -927,15 +927,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline int __copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int pos, unsigned int count, int end_pos) + unsigned int offset, unsigned int size, int size_total) { - if (!count) + if (!size) return 0; - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); + if (size_total < 0 || offset < size_total) { + unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); - memcpy(kbuf + pos, data, copy); + memcpy(kbuf + offset, data, copy); } return 0; } @@ -947,7 +947,7 @@ __copy_xstate_to_kernel(void *kbuf, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. 
*/ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -956,7 +956,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -972,7 +972,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); if (ret) return ret; @@ -986,11 +986,11 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); if (ret) return ret; - if (offset + size >= count) + if (offset + size >= size_total) break; } @@ -1002,7 +1002,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, count); + ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; @@ -1010,15 +1010,15 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int po } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, unsigned int count, int end_pos) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total) { - if (!count) + if (!size) return 0; - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = end_pos < 0 ? count : min(count, end_pos - pos); + if (size_total < 0 || offset < size_total) { + unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); - if (__copy_to_user(ubuf + pos, data, copy)) + if (__copy_to_user(ubuf + offset, data, copy)) return -EFAULT; } return 0; @@ -1030,7 +1030,7 @@ __copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int pos, uns * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. 
*/ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int pos, unsigned int count) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -1039,7 +1039,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -1055,7 +1055,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_user(ubuf, &header, offset, size, count); + ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); if (ret) return ret; @@ -1069,11 +1069,11 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = __copy_xstate_to_user(ubuf, src, offset, size, count); + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - if (offset + size >= count) + if (offset + size >= size_total) break; } @@ -1085,7 +1085,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, count); + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; From 6ff15f8db7eaf29ef5ead6afbec9b25485fe8703 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:52 +0200 Subject: [PATCH 0655/1322] x86/fpu: Change 'size_total' parameter to unsigned and standardize the size checks in copy_xstate_to_*() 'size_total' is derived from an unsigned input parameter - and then converted to 'int' and checked for negative ranges: if (size_total < 0 || offset < size_total) { This conversion and the checks are unnecessary obfuscation, reject overly large requested copy sizes outright and simplify the underlying code. Reported-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-10-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c13083579655..b18c5457065a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -925,15 +925,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * the source data pointer or increment pos, count, kbuf, and ubuf. */ static inline int -__copy_xstate_to_kernel(void *kbuf, - const void *data, - unsigned int offset, unsigned int size, int size_total) +__copy_xstate_to_kernel(void *kbuf, const void *data, + unsigned int offset, unsigned int size, unsigned int size_total) { - if (!size) - return 0; - - if (size_total < 0 || offset < size_total) { - unsigned int copy = size_total < 0 ? 
size : min(size, size_total - offset); + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); memcpy(kbuf + offset, data, copy); } @@ -986,12 +982,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = xstate_offsets[i]; size = xstate_sizes[i]; + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= size_total) - break; } } @@ -1010,13 +1007,13 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, int size_total) +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) { if (!size) return 0; - if (size_total < 0 || offset < size_total) { - unsigned int copy = size_total < 0 ? size : min(size, size_total - offset); + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); if (__copy_to_user(ubuf + offset, data, copy)) return -EFAULT; @@ -1069,12 +1066,13 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i offset = xstate_offsets[i]; size = xstate_sizes[i]; + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= size_total) - break; } } From 8c0817f4a3188ac5485ce14f96f12a175800b881 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:53 +0200 Subject: [PATCH 0656/1322] x86/fpu: Simplify __copy_xstate_to_kernel() return values __copy_xstate_to_kernel() can only return 0 (because kernel copies cannot fail), simplify the code throughout. No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-11-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b18c5457065a..00c3b41c3cf1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -924,7 +924,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. 
*/ -static inline int +static inline void __copy_xstate_to_kernel(void *kbuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) { @@ -933,7 +933,6 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, memcpy(kbuf + offset, data, copy); } - return 0; } /* @@ -946,8 +945,8 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; - int ret, i; struct xstate_header header; + int i; /* * Currently copy_regset_to_user() starts from pos 0: @@ -968,9 +967,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); for (i = 0; i < XFEATURE_MAX; i++) { /* @@ -986,9 +983,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of if (offset + size > size_total) break; - ret = __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); } } @@ -999,9 +994,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; + __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); return 0; } From 79fecc2b7506f29fb91becc65e8788e5ae7eba9f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:54 +0200 Subject: [PATCH 0657/1322] x86/fpu: Split copy_user_to_xstate() into copy_kernel_to_xstate() & copy_user_to_xstate() Similar to: x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() No change in functionality. 
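To summarize the restore direction this split covers, a toy model under assumed layout tables; the real code, shown in the diff below, also masks out supervisor bits and patches the xfeatures header afterwards:

#include <stdint.h>
#include <string.h>

#define TOY_NFEAT 4

/* Hypothetical standard-format layout: the 8-byte feature mask sits at
 * offset 0 here, the components follow. */
static const unsigned int toy_off[TOY_NFEAT]  = { 8, 24, 40, 56 };
static const unsigned int toy_size[TOY_NFEAT] = { 16, 16, 16, 16 };

/* Import from a standard-format kernel buffer: read the feature mask,
 * reject bits the caller may not set, then pull in each selected component. */
static int toy_import(void *component[TOY_NFEAT], const char *kbuf,
		      uint64_t allowed)
{
	uint64_t xfeatures;

	memcpy(&xfeatures, kbuf, sizeof(xfeatures));
	if (xfeatures & ~allowed)
		return -1;			/* -EINVAL in the kernel */

	for (int i = 0; i < TOY_NFEAT; i++)
		if (xfeatures & ((uint64_t)1 << i))
			memcpy(component[i], kbuf + toy_off[i], toy_size[i]);

	return 0;
}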
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-12-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 +- arch/x86/kernel/fpu/regset.c | 10 +++-- arch/x86/kernel/fpu/xstate.c | 66 ++++++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fed6617a1079..79af79dbcab6 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave); +int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ec1404194b65..cb45dd81d617 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -134,10 +134,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copy_user_to_xstate(kbuf, ubuf, xsave); - else + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (kbuf) + ret = copy_kernel_to_xstate(kbuf, ubuf, xsave); + else + ret = copy_user_to_xstate(kbuf, ubuf, xsave); + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } /* * In case of failure, mark all states as init: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 00c3b41c3cf1..1ad25d1b8056 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1084,7 +1084,71 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } /* - * Convert from a ptrace standard-format buffer to kernel XSAVES format + * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. 
+ */ +int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) +{ + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } + + /* + * Reject if the user sets any disabled or supervisor features: + */ + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } + + /* + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= xfeatures; + + return 0; +} + +/* + * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format * and copy to the target thread. This is called from xstateregs_set() and * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. From 59dffa4edba1f15b2bfdbe608aca1efe664c674c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:55 +0200 Subject: [PATCH 0658/1322] x86/fpu: Remove 'ubuf' parameter from the copy_kernel_to_xstate() API No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-13-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 17 +++-------------- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 79af79dbcab6..f10889bc0c88 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index cb45dd81d617..785302c75f38 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -136,7 +136,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) - ret = copy_kernel_to_xstate(kbuf, ubuf, xsave); + ret = copy_kernel_to_xstate(kbuf, xsave); else ret = copy_user_to_xstate(kbuf, ubuf, xsave); } 
else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1ad25d1b8056..71cc8d367fdd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1089,8 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) { unsigned int offset, size; int i; @@ -1100,12 +1099,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(xfeatures); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; - } + memcpy(&xfeatures, kbuf + offset, size); /* * Reject if the user sets any disabled or supervisor features: @@ -1124,12 +1118,7 @@ int copy_kernel_to_xstate(const void *kbuf, const void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + memcpy(dst, kbuf + offset, size); } } From 7b9094c688f807c110a2dab6f6edc5876bfa7b0b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:56 +0200 Subject: [PATCH 0659/1322] x86/fpu: Remove 'kbuf' parameter from the copy_user_to_xstate() API No change in functionality. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-14-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 11 ++++------- arch/x86/kernel/fpu/xstate.c | 19 +++++-------------- 4 files changed, 11 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index f10889bc0c88..4ceb90740d80 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -51,5 +51,5 @@ int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, struct xregs_state *xsave); +int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 785302c75f38..caf723f31737 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -138,7 +138,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (kbuf) ret = copy_kernel_to_xstate(kbuf, xsave); else - ret = copy_user_to_xstate(kbuf, ubuf, xsave); + ret = copy_user_to_xstate(ubuf, xsave); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b1fe9a1fc4e0..2c685b492fd6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c 
@@ -323,13 +323,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (using_compacted_format()) { - err = copy_user_to_xstate(NULL, buf_fx, - &fpu->state.xsave); - } else { - err = __copy_from_user(&fpu->state.xsave, - buf_fx, state_size); - } + if (using_compacted_format()) + err = copy_user_to_xstate(buf_fx, &fpu->state.xsave); + else + err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 71cc8d367fdd..b1f3e4dae2e3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1142,8 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave) { unsigned int offset, size; int i; @@ -1153,12 +1152,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, offset = offsetof(struct xregs_state, header); size = sizeof(xfeatures); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; /* * Reject if the user sets any disabled or supervisor features: @@ -1177,12 +1172,8 @@ int copy_user_to_xstate(const void *kbuf, const void __user *ubuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } } From 6d7f7da5533a3f841eeb1d9657257c9367924274 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:57 +0200 Subject: [PATCH 0660/1322] x86/fpu: Flip the parameter order in copy_*_to_xstate() Make it more consistent with regular memcpy() semantics, where the destination argument comes first. No change in functionality. 
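In prototype form the change is simply (sketch, taken from the header hunk below):

    /* before: source argument first */
    int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave);

    /* after: destination first, matching memcpy(dst, src) */
    int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);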
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-15-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 4 ++-- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 4ceb90740d80..579ac2358e63 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,6 +50,6 @@ const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave); -int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave); +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index caf723f31737..19a7385a912c 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -136,9 +136,9 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) - ret = copy_kernel_to_xstate(kbuf, xsave); + ret = copy_kernel_to_xstate(xsave, kbuf); else - ret = copy_user_to_xstate(ubuf, xsave); + ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2c685b492fd6..2d682dac35d4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,7 +324,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (using_compacted_format()) - err = copy_user_to_xstate(buf_fx, &fpu->state.xsave); + err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); else err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b1f3e4dae2e3..0ef35040d0ad 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1089,7 +1089,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. */ -int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; @@ -1142,7 +1142,7 @@ int copy_kernel_to_xstate(const void *kbuf, struct xregs_state *xsave) * there we check the CPU has XSAVES and a whole standard-sized buffer * exists. 
*/ -int copy_user_to_xstate(const void __user *ubuf, struct xregs_state *xsave) +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; From b3a163081c28d1a4d1ad76259a9d93b34a82f1da Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:58 +0200 Subject: [PATCH 0661/1322] x86/fpu: Simplify fpu->fpregs_active use The fpregs_active() inline function is pretty pointless - in almost all the callsites it can be replaced with a direct fpu->fpregs_active access. Do so and eliminate the extra layer of obfuscation. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-16-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 17 +---------------- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 9 +++++---- arch/x86/mm/pkeys.c | 2 +- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 554cdb205d17..b223c57dd5dc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -542,21 +542,6 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int fpregs_active(void) -{ - return current->thread.fpu.fpregs_active; -} - /* * FPU state switching for scheduling. * @@ -617,7 +602,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!fpregs_active()) + if (!fpu->fpregs_active) fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e1114f070c2d..bad57248e5a0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -367,7 +367,7 @@ void fpu__current_fpstate_write_end(void) * registers may still be out of date. Update them with * an XRSTOR if they are active. */ - if (fpregs_active()) + if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2d682dac35d4..684025654d0c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -155,7 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct fpu *fpu = ¤t->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -170,13 +171,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active() || using_compacted_format()) { + if (fpu->fpregs_active || using_compacted_format()) { /* Save the live register state to the user directly. 
*/ if (copy_fpregs_to_sigframe(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) - copy_fxregs_to_kernel(&tsk->thread.fpu); + copy_fxregs_to_kernel(fpu); } else { /* * It is a *bug* if kernel uses compacted-format for xsave @@ -189,7 +190,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return -1; } - fpstate_sanitize_xstate(&tsk->thread.fpu); + fpstate_sanitize_xstate(fpu); if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2dab69a706ec..e2c23472233e 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -45,7 +45,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - fpregs_active() && + current->thread.fpu.fpregs_active && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; From a10b6a16cdad88170f546d008c77453cddf918e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 14:59:59 +0200 Subject: [PATCH 0662/1322] x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic Do this temporarily only, to make it easier to change the FPU state machine, in particular this change couples the fpu->fpregs_active and fpu->fpstate_active states: they are only set/cleared together (as far as the scheduler sees them). This will be removed by later patches. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-17-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bad57248e5a0..b7dc3833d41a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -462,9 +462,11 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { + preempt_disable(); fpu__activate_curr(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); + preempt_enable(); } } From b6aa85558d7e7b18fc3470d2bc1731d2205dd275 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:00 +0200 Subject: [PATCH 0663/1322] x86/fpu: Split the state handling in fpu__drop() Prepare fpu__drop() to use fpu->fpregs_active. There are two distinct usecases for fpu__drop() in this context: exit_thread() when called for 'current' in exit(), and when called for another task in fork(). This patch does not change behavior, it only adds a couple of debug checks and structures the code to make the ->fpregs_active change more obviously correct. All the complications will be removed later on. 
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-18-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b7dc3833d41a..815dfba7781a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -414,12 +414,19 @@ void fpu__drop(struct fpu *fpu) { preempt_disable(); - if (fpu->fpregs_active) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); + if (fpu == ¤t->thread.fpu) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + + if (fpu->fpregs_active) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + if (fpu->fpregs_active) + fpregs_deactivate(fpu); + } + } else { + WARN_ON_FPU(fpu->fpregs_active); } fpu->fpstate_active = 0; From f1c8cd0176078c7bcafdc89cac447cab672a0b5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:01 +0200 Subject: [PATCH 0664/1322] x86/fpu: Change fpu->fpregs_active users to fpu->fpstate_active We want to simplify the FPU state machine by eliminating fpu->fpregs_active, and we can do that because the two state flags (::fpregs_active and ::fpstate_active) are set essentially together. The old lazy FPU switching code used to make a distinction - but there's no lazy switching code anymore, we always switch in an 'eager' fashion. Do this by first changing all substantial uses of fpu->fpregs_active to fpu->fpstate_active and adding a few debug checks to double check our assumption is correct. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-19-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 +++- arch/x86/kernel/fpu/core.c | 16 ++++++++++------ arch/x86/kernel/fpu/signal.c | 4 +++- arch/x86/mm/pkeys.c | 3 +-- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b223c57dd5dc..7fa676f93ac1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -556,7 +556,9 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpregs_active) { + WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active); + + if (old_fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 815dfba7781a..eab244622402 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { /* * Ignore return value -- we don't care if reg state * is clobbered. 
@@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpregs_active) + if (fpu->fpstate_active) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -147,8 +147,10 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + trace_x86_fpu_before_save(fpu); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -262,11 +264,12 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_fpstate_read(struct fpu *fpu) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * If fpregs are active (in the current CPU), then * copy them to the fpstate: */ - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { fpu__save(fpu); } else { if (!fpu->fpstate_active) { @@ -362,12 +365,13 @@ void fpu__current_fpstate_write_end(void) { struct fpu *fpu = ¤t->thread.fpu; + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * 'fpu' now has an updated copy of the state, but the * registers may still be out of date. Update them with * an XRSTOR if they are active. */ - if (fpu->fpregs_active) + if (fpu->fpstate_active) copy_kernel_to_fpregs(&fpu->state); /* @@ -417,7 +421,7 @@ void fpu__drop(struct fpu *fpu) if (fpu == ¤t->thread.fpu) { WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpregs_active) { + if (fpu->fpstate_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 684025654d0c..a88083ba7f8b 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,9 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->fpregs_active || using_compacted_format()) { + WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); + + if (fpu->fpstate_active || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index e2c23472233e..4d24269c071f 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,7 +18,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* fpregs_active() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.fpregs_active && + current->thread.fpu.fpstate_active && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; From 6cf4edbe0526db311a28734609da888fdfcb3604 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:02 +0200 Subject: [PATCH 0665/1322] x86/fpu: Decouple fpregs_activate()/fpregs_deactivate() from fpu->fpregs_active The fpregs_activate()/fpregs_deactivate() are currently called in such a pattern: if (!fpu->fpregs_active) fpregs_activate(fpu); ... if (fpu->fpregs_active) fpregs_deactivate(fpu); But note that it's actually safe to call them without checking the flag first. This further decouples the fpu->fpregs_active flag from actual FPU logic. 
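After the change a caller such as user_fpu_begin() can simply do (sketch, matching the hunk below):

    preempt_disable();
    fpregs_activate(fpu);   /* safe even if the regs were already active */
    preempt_enable();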
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-20-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 7 +------ arch/x86/kernel/fpu/core.c | 3 +-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7fa676f93ac1..42a601673c09 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -526,8 +526,6 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - WARN_ON_FPU(!fpu->fpregs_active); - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); @@ -535,8 +533,6 @@ static inline void fpregs_deactivate(struct fpu *fpu) static inline void fpregs_activate(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpregs_active); - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); @@ -604,8 +600,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = ¤t->thread.fpu; preempt_disable(); - if (!fpu->fpregs_active) - fpregs_activate(fpu); + fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index eab244622402..01a47e9edfb4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,8 +426,7 @@ void fpu__drop(struct fpu *fpu) asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - if (fpu->fpregs_active) - fpregs_deactivate(fpu); + fpregs_deactivate(fpu); } } else { WARN_ON_FPU(fpu->fpregs_active); From 99dc26bda233ee722bbd370bddf20beece3ffb93 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:03 +0200 Subject: [PATCH 0666/1322] x86/fpu: Remove struct fpu::fpregs_active The previous changes paved the way for the removal of the fpu::fpregs_active state flag - we now only have the fpu::fpstate_active and fpu::last_cpu fields left. 
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-21-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 5 ----- arch/x86/include/asm/fpu/types.h | 23 ----------------------- arch/x86/include/asm/trace/fpu.h | 5 +---- arch/x86/kernel/fpu/core.c | 9 --------- arch/x86/kernel/fpu/signal.c | 2 -- 5 files changed, 1 insertion(+), 43 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 42a601673c09..629e7abcd6c9 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -526,14 +526,12 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } @@ -552,8 +550,6 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - WARN_ON_FPU(old_fpu->fpregs_active != old_fpu->fpstate_active); - if (old_fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -561,7 +557,6 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ - old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); } else old_fpu->last_cpu = -1; diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3c80f5b9c09d..0c314a397cf5 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -298,29 +298,6 @@ struct fpu { */ unsigned char fpstate_active; - /* - * @fpregs_active: - * - * This flag determines whether a given context is actively - * loaded into the FPU's registers and that those registers - * represent the task's current FPU state. - * - * Note the interaction with fpstate_active: - * - * # task does not use the FPU: - * fpstate_active == 0 - * - * # task uses the FPU and regs are active: - * fpstate_active == 1 && fpregs_active == 1 - * - * # the regs are inactive but still match fpstate: - * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu - * - * The third state is what we use for the lazy restore optimization - * on lazy-switching CPUs. 
- */ - unsigned char fpregs_active; - /* * @state: * diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 342e59789fcd..da565aae9fd2 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,7 +12,6 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpregs_active) __field(bool, fpstate_active) __field(u64, xfeatures) __field(u64, xcomp_bv) @@ -20,16 +19,14 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_fast_assign( __entry->fpu = fpu; - __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpregs_active, __entry->fpstate_active, __entry->xfeatures, __entry->xcomp_bv diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 01a47e9edfb4..93103a909c47 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -147,8 +147,6 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - trace_x86_fpu_before_save(fpu); if (fpu->fpstate_active) { if (!copy_fpregs_to_fpstate(fpu)) { @@ -191,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) @@ -264,7 +261,6 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); */ void fpu__activate_fpstate_read(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * If fpregs are active (in the current CPU), then * copy them to the fpstate: @@ -365,7 +361,6 @@ void fpu__current_fpstate_write_end(void) { struct fpu *fpu = ¤t->thread.fpu; - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); /* * 'fpu' now has an updated copy of the state, but the * registers may still be out of date. Update them with @@ -419,8 +414,6 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpstate_active) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" @@ -428,8 +421,6 @@ void fpu__drop(struct fpu *fpu) _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } - } else { - WARN_ON_FPU(fpu->fpregs_active); } fpu->fpstate_active = 0; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a88083ba7f8b..629106e51a29 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,8 +171,6 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - WARN_ON_FPU(fpu->fpstate_active != fpu->fpregs_active); - if (fpu->fpstate_active || using_compacted_format()) { /* Save the live register state to the user directly. 
*/ if (copy_fpregs_to_sigframe(buf_fx)) From 0852b374173bb57f870d78e6c6839c77b339be5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 23 Sep 2017 15:00:04 +0200 Subject: [PATCH 0667/1322] x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs On Skylake CPUs I noticed that XRSTOR is unable to deal with states created by copyout_from_xsaves() if the xstate has only SSE/YMM state, and no FP state. That is, xfeatures had XFEATURE_MASK_SSE set, but not XFEATURE_MASK_FP. The reason is that part of the SSE/YMM state lives in the MXCSR and MXCSR_FLAGS fields of the FP state. Ensure that whenever we copy SSE or YMM state around, the MXCSR and MXCSR_FLAGS fields are also copied around. Signed-off-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170210085445.0f1cc708@annuminas.surriel.com Link: http://lkml.kernel.org/r/20170923130016.21448-22-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 3 +++ arch/x86/kernel/fpu/xstate.c | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0c314a397cf5..71db45ca8870 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -68,6 +68,9 @@ struct fxregs_state { /* Default value for fxregs_state.mxcsr: */ #define MXCSR_DEFAULT 0x1f80 +/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */ +#define MXCSR_AND_FLAGS_SIZE sizeof(u64) + /* * Software based FPU emulation state. This is arbitrary really, * it matches the x87 format to make it easier to understand: diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0ef35040d0ad..41c52256bdce 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -920,6 +920,23 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ +/* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return 0; + + if (xfeatures & XFEATURE_MASK_FP) + return 0; + + return 1; +} + /* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. 
@@ -988,6 +1005,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1070,6 +1093,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ @@ -1122,6 +1151,12 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1177,6 +1212,13 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } + if (xfeatures_mxcsr_quirk(xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': From 4f8cef59bad29344aca0e2e6b0ad18dadd078fd0 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Sat, 23 Sep 2017 15:00:05 +0200 Subject: [PATCH 0668/1322] x86/fpu: Fix boolreturn.cocci warnings arch/x86/kernel/fpu/xstate.c:931:9-10: WARNING: return of 0/1 in function 'xfeatures_mxcsr_quirk' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Signed-off-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Yu-cheng Yu Cc: kbuild-all@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20170306004553.GA25764@lkp-wsm-ep1 Link: http://lkml.kernel.org/r/20170923130016.21448-23-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 41c52256bdce..fda1109cc355 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -929,12 +929,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) { if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) - return 0; + return false; if (xfeatures & XFEATURE_MASK_FP) - return 0; + return false; - return 1; + return true; } /* From 03eaec81ac09814817e9f0307d572ffe8365f980 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 23 Sep 2017 15:00:06 +0200 Subject: [PATCH 0669/1322] x86/fpu: Turn WARN_ON() in context switch into WARN_ON_FPU() copy_xregs_to_kernel checks if the alternatives have been already patched. This WARN_ON() is always executed in every context switch. All the other checks in fpu internal.h are WARN_ON_FPU(), but this one is plain WARN_ON(). 
I assume it was forgotten to switch it. So switch it to WARN_ON_FPU() too to avoid some unnecessary code in the context switch, and a potentially expensive cache line miss for the global variable. Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170329062605.4970-1-andi@firstfloor.org Link: http://lkml.kernel.org/r/20170923130016.21448-24-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 629e7abcd6c9..2dca7c65319c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -350,7 +350,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) u32 hmask = mask >> 32; int err; - WARN_ON(!alternatives_patched); + WARN_ON_FPU(!alternatives_patched); XSTATE_XSAVE(xstate, lmask, hmask, err); From 245a396a9b1a67ac5c3228737c261b3e48708a2a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:18 +0200 Subject: [PATCH 0670/1322] iio: adc: twl4030: Fix an error handling path in 'twl4030_madc_probe()' If 'devm_regulator_get()' fails, we should go through the existing error handling path instead of returning directly, as done is all the other error handling paths in this function. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/twl4030-madc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index bd3d37fc2144..252f5890311e 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -887,8 +887,10 @@ static int twl4030_madc_probe(struct platform_device *pdev) /* Enable 3v1 bias regulator for MADC[3:6] */ madc->usb3v1 = devm_regulator_get(madc->dev, "vusb3v1"); - if (IS_ERR(madc->usb3v1)) - return -ENODEV; + if (IS_ERR(madc->usb3v1)) { + ret = -ENODEV; + goto err_i2c; + } ret = regulator_enable(madc->usb3v1); if (ret) From 7f70be6e4025db0551e6863e7eb9cca07122695c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:19 +0200 Subject: [PATCH 0671/1322] iio: adc: twl4030: Disable the vusb3v1 rugulator in the error handling path of 'twl4030_madc_probe()' Commit 7cc97d77ee8a has introduced a call to 'regulator_disable()' in the .remove function. So we should also have such a call in the .probe function in case of error after a successful 'regulator_enable()' call. Add a new label for that and use it. 
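The resulting unwind order in the probe error path is roughly (sketch based on the hunk below; the labels after err_usb3v1 already existed):

    err_usb3v1:
            regulator_disable(madc->usb3v1);
    err_i2c:
            twl4030_madc_set_current_generator(madc, 0, 0);
    err_current_generator:
            ...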
Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/twl4030-madc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 252f5890311e..0c86fbb3033e 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -899,11 +899,13 @@ static int twl4030_madc_probe(struct platform_device *pdev) ret = iio_device_register(iio_dev); if (ret) { dev_err(&pdev->dev, "could not register iio device\n"); - goto err_i2c; + goto err_usb3v1; } return 0; +err_usb3v1: + regulator_disable(madc->usb3v1); err_i2c: twl4030_madc_set_current_generator(madc, 0, 0); err_current_generator: From 53063846affd27def6f96e13a9fb80b9a3c2d126 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:20 +0200 Subject: [PATCH 0672/1322] iio: adc: twl4030: Return an error if we can not enable the vusb3v1 regulator in 'twl4030_madc_probe()' If we can not enable the regulator, go through the error handling path instead of silently continuing. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Signed-off-by: Jonathan Cameron --- drivers/iio/adc/twl4030-madc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 0c86fbb3033e..28df096e84ec 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -893,8 +893,10 @@ static int twl4030_madc_probe(struct platform_device *pdev) } ret = regulator_enable(madc->usb3v1); - if (ret) + if (ret) { dev_err(madc->dev, "could not enable 3v1 bias regulator\n"); + goto err_i2c; + } ret = iio_device_register(iio_dev); if (ret) { From 0a56eabc4e3f730782e4a9f3af4f60aa03a8a849 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 18 Sep 2017 12:05:30 +0200 Subject: [PATCH 0673/1322] iio: trigger: stm32-timer: preset shouldn't be buffered Currently, setting preset value (ARR) will update directly 'Auto reload value' only on 1st write access. But then, ARPE is set. This makes ARR a shadow register. Preset value should be updated upon each write request: ensure ARPE is 0. This fixes successive writes to preset attribute. Fixes: 4adec7da0536 ("iio: stm32 trigger: Add quadrature encoder device") Signed-off-by: Fabrice Gasnier Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/trigger/stm32-timer-trigger.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index a9bc5b603b86..4cec28af3ecf 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -681,8 +681,9 @@ static ssize_t stm32_count_set_preset(struct iio_dev *indio_dev, if (ret) return ret; + /* TIMx_ARR register shouldn't be buffered (ARPE=0) */ + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, 0); regmap_write(priv->regmap, TIM_ARR, preset); - regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, TIM_CR1_ARPE); return len; } From b7a9776c1f9443326632486fcbd82dca82f8511e Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 18 Sep 2017 12:05:31 +0200 Subject: [PATCH 0674/1322] iio: trigger: stm32-timer: fix a corner case to write preset Balance timer start routine that sets ARPE: clear it in stop routine. 
This fixes a corner case, when timer is used successively as trigger (with sampling_frequency start/stop routines), then as a counter (with preset). Fixes: 93fbe91b5521 ("iio: Add STM32 timer trigger driver") Signed-off-by: Fabrice Gasnier Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/trigger/stm32-timer-trigger.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index 4cec28af3ecf..a30ba6e1dfec 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -174,6 +174,7 @@ static void stm32_timer_stop(struct stm32_timer_trigger *priv) clk_disable(priv->clk); /* Stop timer */ + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, 0); regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_CEN, 0); regmap_write(priv->regmap, TIM_PSC, 0); regmap_write(priv->regmap, TIM_ARR, 0); From 4fb840c95f82652cece7352be9080884cafb92a0 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 18 Sep 2017 18:24:15 +0200 Subject: [PATCH 0675/1322] iio: adc: stm32: fix bad error check on max_channels Fix a bad error check when counting 'st,adc-channels' array elements. This is seen when all channels are in use simultaneously. Fixes: 64ad7f643 ("iio: adc: stm32: introduce compatible data cfg") Signed-off-by: Fabrice Gasnier Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/stm32-adc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/stm32-adc.c b/drivers/iio/adc/stm32-adc.c index 6bc602891f2f..e93244bc3edd 100644 --- a/drivers/iio/adc/stm32-adc.c +++ b/drivers/iio/adc/stm32-adc.c @@ -1656,7 +1656,7 @@ static int stm32_adc_chan_of_init(struct iio_dev *indio_dev) num_channels = of_property_count_u32_elems(node, "st,adc-channels"); if (num_channels < 0 || - num_channels >= adc_info->max_channels) { + num_channels > adc_info->max_channels) { dev_err(&indio_dev->dev, "Bad st,adc-channels?\n"); return num_channels < 0 ? num_channels : -EINVAL; } From 0964e40947a630a2a6f724e968246992f97bcf1c Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: [PATCH 0676/1322] iio: adc: mcp320x: Fix oops on module unload The driver calls spi_get_drvdata() in its ->remove hook even though it has never called spi_set_drvdata(). 
Stack trace for posterity: Unable to handle kernel NULL pointer dereference at virtual address 00000220 Internal error: Oops: 5 [#1] SMP ARM [<8072f564>] (mutex_lock) from [<7f1400d0>] (iio_device_unregister+0x24/0x7c [industrialio]) [<7f1400d0>] (iio_device_unregister [industrialio]) from [<7f15e020>] (mcp320x_remove+0x20/0x30 [mcp320x]) [<7f15e020>] (mcp320x_remove [mcp320x]) from [<8055a8cc>] (spi_drv_remove+0x2c/0x44) [<8055a8cc>] (spi_drv_remove) from [<805087bc>] (__device_release_driver+0x98/0x134) [<805087bc>] (__device_release_driver) from [<80509180>] (driver_detach+0xdc/0xe0) [<80509180>] (driver_detach) from [<8050823c>] (bus_remove_driver+0x5c/0xb0) [<8050823c>] (bus_remove_driver) from [<80509ab0>] (driver_unregister+0x38/0x58) [<80509ab0>] (driver_unregister) from [<7f15e69c>] (mcp320x_driver_exit+0x14/0x1c [mcp320x]) [<7f15e69c>] (mcp320x_driver_exit [mcp320x]) from [<801a78d0>] (SyS_delete_module+0x184/0x1d0) [<801a78d0>] (SyS_delete_module) from [<80108100>] (ret_fast_syscall+0x0/0x1c) Fixes: f5ce4a7a9291 ("iio: adc: add driver for MCP3204/08 12-bit ADC") Cc: Oskar Andero Signed-off-by: Lukas Wunner Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/mcp320x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 634717ae12f3..45d043c9a888 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -312,6 +312,7 @@ static int mcp320x_probe(struct spi_device *spi) indio_dev->name = spi_get_device_id(spi)->name; indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &mcp320x_info; + spi_set_drvdata(spi, indio_dev); chip_info = &mcp320x_chip_infos[spi_get_device_id(spi)->driver_data]; indio_dev->channels = chip_info->channels; From e6f4794371ee7cce1339e7ca9542f1e703c5f84a Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: [PATCH 0677/1322] iio: adc: mcp320x: Fix readout of negative voltages Commit f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") returns a signed voltage from mcp320x_adc_conversion() but neglects that the caller interprets a negative return value as failure. Only mcp3301 (and the upcoming mcp3550/1/3) is affected as the other chips are incapable of measuring negative voltages. Fix and while at it, add mcp3301 to the list of supported chips at the top of the file. 
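The conversion result is therefore handed back through a pointer, so a negative sample can no longer be mistaken for an error (sketch of the fixed call site, as in the hunk below):

    ret = mcp320x_adc_conversion(adc, channel->address,
                                 channel->differential, device_index, val);
    if (ret < 0)        /* only genuine errors are negative now */
            goto out;
    ret = IIO_VAL_INT;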
Fixes: f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") Cc: Andrea Galbusera Signed-off-by: Lukas Wunner Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/mcp320x.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 45d043c9a888..071dd23a33d9 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -17,6 +17,8 @@ * MCP3204 * MCP3208 * ------------ + * 13 bit converter + * MCP3301 * * Datasheet can be found here: * http://ww1.microchip.com/downloads/en/DeviceDoc/21293C.pdf mcp3001 @@ -96,7 +98,7 @@ static int mcp320x_channel_to_tx_data(int device_index, } static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, - bool differential, int device_index) + bool differential, int device_index, int *val) { int ret; @@ -117,19 +119,25 @@ static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, switch (device_index) { case mcp3001: - return (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + *val = (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + return 0; case mcp3002: case mcp3004: case mcp3008: - return (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + *val = (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + return 0; case mcp3201: - return (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + *val = (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + return 0; case mcp3202: case mcp3204: case mcp3208: - return (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + *val = (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + return 0; case mcp3301: - return sign_extend32((adc->rx_buf[0] & 0x1f) << 8 | adc->rx_buf[1], 12); + *val = sign_extend32((adc->rx_buf[0] & 0x1f) << 8 + | adc->rx_buf[1], 12); + return 0; default: return -EINVAL; } @@ -150,12 +158,10 @@ static int mcp320x_read_raw(struct iio_dev *indio_dev, switch (mask) { case IIO_CHAN_INFO_RAW: ret = mcp320x_adc_conversion(adc, channel->address, - channel->differential, device_index); - + channel->differential, device_index, val); if (ret < 0) goto out; - *val = ret; ret = IIO_VAL_INT; break; From 4b1f0c31f96c45e8521dd84aae50f2aa4aecfb7b Mon Sep 17 00:00:00 2001 From: Colin Parker Date: Mon, 28 Aug 2017 16:21:39 -0700 Subject: [PATCH 0678/1322] IIO: BME280: Updates to Humidity readings need ctrl_reg write! The ctrl_reg register needs to be written after any write to the humidity registers. The value written to the ctrl_reg register does not necessarily need to change, but a write operation must occur. The regmap_update_bits functions will not write to a register if the register value matches the value to be written. This saves unnecessary bus operations. The change in this patch forces a bus write during the chip_config operation by switching to regmap_write_bits. This will fix issues where the Humidity Sensor Oversampling bits are not updated after initialization. 
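The difference, in short (sketch only; 'mask' and 'osrs' stand in for the oversampling mask and value used in the hunk below):

    /* may skip the bus write when the register already holds 'osrs': */
    regmap_update_bits(data->regmap, BMP280_REG_CTRL_MEAS, mask, osrs);

    /* always issues the write, which the humidity register update requires: */
    regmap_write_bits(data->regmap, BMP280_REG_CTRL_MEAS, mask, osrs);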
Signed-off-by: Colin Parker Acked-by: Andreas Klinger Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/pressure/bmp280-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c index d82b788374b6..e442c5248427 100644 --- a/drivers/iio/pressure/bmp280-core.c +++ b/drivers/iio/pressure/bmp280-core.c @@ -558,7 +558,7 @@ static int bmp280_chip_config(struct bmp280_data *data) u8 osrs = BMP280_OSRS_TEMP_X(data->oversampling_temp + 1) | BMP280_OSRS_PRESS_X(data->oversampling_press + 1); - ret = regmap_update_bits(data->regmap, BMP280_REG_CTRL_MEAS, + ret = regmap_write_bits(data->regmap, BMP280_REG_CTRL_MEAS, BMP280_OSRS_TEMP_MASK | BMP280_OSRS_PRESS_MASK | BMP280_MODE_MASK, From 7fc10de8d49a748c476532c9d8e8fe19e548dd67 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:14:45 +0300 Subject: [PATCH 0679/1322] iio: ad_sigma_delta: Implement a dedicated reset function Since most of the SD ADCs have the option of reseting the serial interface by sending a number of SCLKs with CS = 0 and DIN = 1, a dedicated function that can do this is usefull. Needed for the patch: iio: ad7793: Fix the serial interface reset Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad_sigma_delta.c | 28 ++++++++++++++++++++++++++ include/linux/iio/adc/ad_sigma_delta.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index d10bd0c97233..22c4c17cd996 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -177,6 +177,34 @@ out: } EXPORT_SYMBOL_GPL(ad_sd_read_reg); +/** + * ad_sd_reset() - Reset the serial interface + * + * @sigma_delta: The sigma delta device + * @reset_length: Number of SCLKs with DIN = 1 + * + * Returns 0 on success, an error code otherwise. + **/ +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length) +{ + uint8_t *buf; + unsigned int size; + int ret; + + size = DIV_ROUND_UP(reset_length, 8); + buf = kcalloc(size, sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memset(buf, 0xff, size); + ret = spi_write(sigma_delta->spi, buf, size); + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(ad_sd_reset); + static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta, unsigned int mode, unsigned int channel) { diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 5ba430cc9a87..1fc7abd28b0b 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -111,6 +111,9 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length); + int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); int ad_sd_calibrate_all(struct ad_sigma_delta *sigma_delta, From 7ee3b7ebcb74714df6d94c8f500f307e1ee5dda5 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:16:13 +0300 Subject: [PATCH 0680/1322] iio: ad7793: Fix the serial interface reset The serial interface can be reset by writing 32 consecutive 1s to the device. 'ret' was initialized correctly but its value was overwritten when ad7793_check_platform_data() was called. 
Since a dedicated reset function is present now, it should be used instead. Fixes: 2edb769d246e ("iio:ad7793: Add support for the ad7798 and ad7799") Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ad7793.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ad7793.c b/drivers/iio/adc/ad7793.c index e6706a09e100..47c3d7f32900 100644 --- a/drivers/iio/adc/ad7793.c +++ b/drivers/iio/adc/ad7793.c @@ -257,7 +257,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, unsigned int vref_mv) { struct ad7793_state *st = iio_priv(indio_dev); - int i, ret = -1; + int i, ret; unsigned long long scale_uv; u32 id; @@ -266,7 +266,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, return ret; /* reset the serial interface */ - ret = spi_write(st->sd.spi, (u8 *)&ret, sizeof(ret)); + ret = ad_sd_reset(&st->sd, 32); if (ret < 0) goto out; usleep_range(500, 2000); /* Wait for at least 500us */ From 3d62c78a6eb9a7d67bace9622b66ad51e81c5f9b Mon Sep 17 00:00:00 2001 From: Matt Fornero Date: Tue, 5 Sep 2017 16:34:10 +0200 Subject: [PATCH 0681/1322] iio: core: Return error for failed read_reg If an IIO device returns an error code for a read access via debugfs, it is currently ignored by the IIO core (other than emitting an error message). Instead, return this error code to user space, so upper layers can detect it correctly. Signed-off-by: Matt Fornero Signed-off-by: Lars-Peter Clausen Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 17ec4cee51dc..a47428b4d31b 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -310,8 +310,10 @@ static ssize_t iio_debugfs_read_reg(struct file *file, char __user *userbuf, ret = indio_dev->info->debugfs_reg_access(indio_dev, indio_dev->cached_reg_addr, 0, &val); - if (ret) + if (ret) { dev_err(indio_dev->dev.parent, "%s: read failed\n", __func__); + return ret; + } len = snprintf(buf, sizeof(buf), "0x%X\n", val); From f790923f146140a261ad211e5baf75d169f16fb2 Mon Sep 17 00:00:00 2001 From: Stefan Popa Date: Thu, 14 Sep 2017 16:50:28 +0300 Subject: [PATCH 0682/1322] staging: iio: ad7192: Fix - use the dedicated reset function avoiding dma from stack. Depends on: 691c4b95d1 ("iio: ad_sigma_delta: Implement a dedicated reset function") SPI host drivers can use DMA to transfer data, so the buffer should be properly allocated. Keeping it on the stack could cause an undefined behavior. The dedicated reset function solves this issue. 
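For illustration, a minimal sketch of the calling side, assuming only the ad_sd_reset() prototype added above (the wrapper below is hypothetical):

#include <linux/iio/adc/ad_sigma_delta.h>

static int dma_safe_reset_sketch(struct ad_sigma_delta *sd)
{
	/*
	 * A stack buffer such as "u8 ones[6]" handed to spi_write() is not
	 * guaranteed to be DMA-safe. ad_sd_reset() builds the all-ones
	 * pattern in a kcalloc()'d buffer and clocks out the requested
	 * number of SCLKs with DIN = 1, so callers need no buffer at all.
	 */
	return ad_sd_reset(sd, 48);	/* 48 clocks, as ad7192_setup() now uses */
}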
Signed-off-by: Stefan Popa Acked-by: Lars-Peter Clausen Acked-by: Michael Hennerich Cc: Signed-off-by: Jonathan Cameron --- drivers/staging/iio/adc/ad7192.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c index d11c6de9c777..6150d2780e22 100644 --- a/drivers/staging/iio/adc/ad7192.c +++ b/drivers/staging/iio/adc/ad7192.c @@ -223,11 +223,9 @@ static int ad7192_setup(struct ad7192_state *st, struct iio_dev *indio_dev = spi_get_drvdata(st->sd.spi); unsigned long long scale_uv; int i, ret, id; - u8 ones[6]; /* reset the serial interface */ - memset(&ones, 0xFF, 6); - ret = spi_write(st->sd.spi, &ones, 6); + ret = ad_sd_reset(&st->sd, 48); if (ret < 0) goto out; usleep_range(500, 1000); /* Wait for at least 500us */ From e19b205be43d11bff638cad4487008c48d21c103 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Sep 2017 16:38:56 -0700 Subject: [PATCH 0683/1322] Linux 4.14-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 64cbc66cebca..d1119941261c 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Fearless Coyote # *DOCUMENTATION* From 5f3d862a736398e7068fa67142133f1713fdee8c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 18 Sep 2017 09:41:45 +0200 Subject: [PATCH 0684/1322] qxl: fix framebuffer unpinning qxl_plane_cleanup_fb() unpins the just activated framebuffer instead of the old one. Oops. Fix it. Cc: Gabriel Krisman Bertazi Fixes: 1277eed5fecb8830c8cc414ad70c1ef640464bc0 Signed-off-by: Gerd Hoffmann Reviewed-by: Gabriel Krisman Bertazi Link: http://patchwork.freedesktop.org/patch/msgid/20170918074145.2257-1-kraxel@redhat.com --- drivers/gpu/drm/qxl/qxl_display.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index e1dd05423e86..afbf50d0c08f 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -702,14 +702,15 @@ static void qxl_plane_cleanup_fb(struct drm_plane *plane, struct drm_gem_object *obj; struct qxl_bo *user_bo; - if (!plane->state->fb) { - /* we never executed prepare_fb, so there's nothing to + if (!old_state->fb) { + /* + * we never executed prepare_fb, so there's nothing to * unpin. */ return; } - obj = to_qxl_framebuffer(plane->state->fb)->obj; + obj = to_qxl_framebuffer(old_state->fb)->obj; user_bo = gem_to_qxl_bo(obj); qxl_bo_unpin(user_bo); } From 814fb7bb7db5433757d76f4c4502c96fc53b0b5e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Sep 2017 15:00:07 +0200 Subject: [PATCH 0685/1322] x86/fpu: Don't let userspace set bogus xcomp_bv On x86, userspace can use the ptrace() or rt_sigreturn() system calls to set a task's extended state (xstate) or "FPU" registers. ptrace() can set them for another task using the PTRACE_SETREGSET request with NT_X86_XSTATE, while rt_sigreturn() can set them for the current task. In either case, registers can be set to any value, but the kernel assumes that the XSAVE area itself remains valid in the sense that the CPU can restore it. However, in the case where the kernel is using the uncompacted xstate format (which it does whenever the XSAVES instruction is unavailable), it was possible for userspace to set the xcomp_bv field in the xstate_header to an arbitrary value. 
However, all bits in that field are reserved in the uncompacted case, so when switching to a task with nonzero xcomp_bv, the XRSTOR instruction failed with a #GP fault. This caused the WARN_ON_FPU(err) in copy_kernel_to_xregs() to be hit. In addition, since the error is otherwise ignored, the FPU registers from the task previously executing on the CPU were leaked. Fix the bug by checking that the user-supplied value of xcomp_bv is 0 in the uncompacted case, and returning an error otherwise. The reason for validating xcomp_bv rather than simply overwriting it with 0 is that we want userspace to see an error if it (incorrectly) provides an XSAVE area in compacted format rather than in uncompacted format. Note that as before, in case of error we clear the task's FPU state. This is perhaps non-ideal, especially for PTRACE_SETREGSET; it might be better to return an error before changing anything. But it seems the "clear on error" behavior is fine for now, and it's a little tricky to do otherwise because it would mean we couldn't simply copy the full userspace state into kernel memory in one __copy_from_user(). This bug was found by syzkaller, which hit the above-mentioned WARN_ON_FPU(): WARNING: CPU: 1 PID: 0 at ./arch/x86/include/asm/fpu/internal.h:373 __switch_to+0x5b5/0x5d0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.13.0 #453 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff9ba2bc8e42c0 task.stack: ffffa78cc036c000 RIP: 0010:__switch_to+0x5b5/0x5d0 RSP: 0000:ffffa78cc08bbb88 EFLAGS: 00010082 RAX: 00000000fffffffe RBX: ffff9ba2b8bf2180 RCX: 00000000c0000100 RDX: 00000000ffffffff RSI: 000000005cb10700 RDI: ffff9ba2b8bf36c0 RBP: ffffa78cc08bbbd0 R08: 00000000929fdf46 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ba2bc8e42c0 R13: 0000000000000000 R14: ffff9ba2b8bf3680 R15: ffff9ba2bf5d7b40 FS: 00007f7e5cb10700(0000) GS:ffff9ba2bf400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004005cc CR3: 0000000079fd5000 CR4: 00000000001406e0 Call Trace: Code: 84 00 00 00 00 00 e9 11 fd ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 e7 fa ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 c2 fa ff ff <0f> ff 66 0f 1f 84 00 00 00 00 00 e9 d4 fc ff ff 66 66 2e 0f 1f Here is a C reproducer. The expected behavior is that the program spin forever with no output. However, on a buggy kernel running on a processor with the "xsave" feature but without the "xsaves" feature (e.g. Sandy Bridge through Broadwell for Intel), within a second or two the program reports that the xmm registers were corrupted, i.e. were not restored correctly. With CONFIG_X86_DEBUG_FPU=y it also hits the above kernel warning. #define _GNU_SOURCE #include #include #include #include #include #include #include #include int main(void) { int pid = fork(); uint64_t xstate[512]; struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) }; if (pid == 0) { bool tracee = true; for (int i = 0; i < sysconf(_SC_NPROCESSORS_ONLN) && tracee; i++) tracee = (fork() != 0); uint32_t xmm0[4] = { [0 ... 3] = tracee ? 0x00000000 : 0xDEADBEEF }; asm volatile(" movdqu %0, %%xmm0\n" " mov %0, %%rbx\n" "1: movdqu %%xmm0, %0\n" " mov %0, %%rax\n" " cmp %%rax, %%rbx\n" " je 1b\n" : "+m" (xmm0) : : "rax", "rbx", "xmm0"); printf("BUG: xmm registers corrupted! 
tracee=%d, xmm0=%08X%08X%08X%08X\n", tracee, xmm0[0], xmm0[1], xmm0[2], xmm0[3]); } else { usleep(100000); ptrace(PTRACE_ATTACH, pid, 0, 0); wait(NULL); ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); xstate[65] = -1; ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); ptrace(PTRACE_CONT, pid, 0, 0); wait(NULL); } return 1; } Note: the program only tests for the bug using the ptrace() system call. The bug can also be reproduced using the rt_sigreturn() system call, but only when called from a 32-bit program, since for 64-bit programs the kernel restores the FPU state from the signal frame by doing XRSTOR directly from userspace memory (with proper error checking). Reported-by: Dmitry Vyukov Signed-off-by: Eric Biggers Reviewed-by: Kees Cook Reviewed-by: Rik van Riel Acked-by: Dave Hansen Cc: [v3.17+] Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Fixes: 0b29643a5843 ("x86/xsaves: Change compacted format xsave area header") Link: http://lkml.kernel.org/r/20170922174156.16780-2-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-25-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 4 ++++ arch/x86/kernel/fpu/signal.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 19a7385a912c..c764f7405322 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -141,6 +141,10 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 629106e51a29..da68ea1c3a44 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -324,11 +324,16 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (using_compacted_format()) + if (using_compacted_format()) { err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); - else + } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; + } + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); From d5c8028b4788f62b31fb79a331b3ad3e041fa366 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Sep 2017 15:00:09 +0200 Subject: [PATCH 0686/1322] x86/fpu: Reinitialize FPU registers if restoring FPU state fails Userspace can change the FPU state of a task using the ptrace() or rt_sigreturn() system calls. Because reserved bits in the FPU state can cause the XRSTOR instruction to fail, the kernel has to carefully validate that no reserved bits or other invalid values are being set. Unfortunately, there have been bugs in this validation code. For example, we were not checking that the 'xcomp_bv' field in the xstate_header was 0. As-is, such bugs are exploitable to read the FPU registers of other processes on the system. 
To do so, an attacker can create a task, assign to it an invalid FPU state, then spin in a loop and monitor the values of the FPU registers. Because the task's FPU registers are not being restored, sometimes the FPU registers will have the values from another process. This is likely to continue to be a problem in the future because the validation done by the CPU instructions like XRSTOR is not immediately visible to kernel developers. Nor will invalid FPU states ever be encountered during ordinary use --- they will only be seen during fuzzing or exploits. There can even be reserved bits outside the xstate_header which are easy to forget about. For example, the MXCSR register contains reserved bits, which were not validated by the KVM_SET_XSAVE ioctl until commit a575813bfe4b ("KVM: x86: Fix load damaged SSEx MXCSR register"). Therefore, mitigate this class of vulnerability by restoring the FPU registers from init_fpstate if restoring from the task's state fails. We actually used to do this, but it was (perhaps unwisely) removed by commit 9ccc27a5d297 ("x86/fpu: Remove error return values from copy_kernel_to_*regs() functions"). This new patch is also a bit different. First, it only clears the registers, not also the bad in-memory state; this is simpler and makes it easier to make the mitigation cover all callers of __copy_kernel_to_fpregs(). Second, it does the register clearing in an exception handler so that no extra instructions are added to context switches. In fact, we *remove* instructions, since previously we were always zeroing the register containing 'err' even if CONFIG_X86_DEBUG_FPU was disabled. Signed-off-by: Eric Biggers Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170922174156.16780-4-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-27-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 51 +++++++++-------------------- arch/x86/mm/extable.c | 24 ++++++++++++++ 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2dca7c65319c..cf290d424e48 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -120,20 +120,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) -#define check_insn(insn, output, input...) \ -({ \ - int err; \ +#define kernel_insn(insn, output, input...) 
\ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ + : output : input) static inline int copy_fregs_to_user(struct fregs_state __user *fx) { @@ -153,20 +144,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { - int err; - if (IS_ENABLED(CONFIG_X86_32)) { - err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { - err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ - err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } } - /* Copying from a kernel buffer to FPU registers should never fail: */ - WARN_ON_FPU(err); } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) @@ -183,9 +170,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) static inline void copy_kernel_to_fregs(struct fregs_state *fx) { - int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - - WARN_ON_FPU(err); + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fregs(struct fregs_state __user *fx) @@ -281,18 +266,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ -#define XSTATE_XRESTORE(st, lmask, hmask, err) \ +#define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile(ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ - "xor %[err], %[err]\n" \ "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ + _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ + : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -336,7 +316,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ WARN_ON_FPU(err); } @@ -365,12 +348,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - int err; - XSTATE_XRESTORE(xstate, lmask, hmask, err); - - /* We should never fault when copying from a kernel buffer: */ - WARN_ON_FPU(err); + XSTATE_XRESTORE(xstate, lmask, hmask); } /* diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index c076f710de4c..c3521e2be396 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_refcount); +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. 
However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { From 10430364ebb562311ba6a6efa74e0c2298007912 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Tue, 29 Aug 2017 23:47:11 +0530 Subject: [PATCH 0687/1322] x86/numachip: Add const and __initconst to numachip2_clockevent Make this const as it is only used during a copy operation and add __initconst as this usage is during the initialization phase. Signed-off-by: Bhumika Goyal Signed-off-by: Thomas Gleixner Cc: julia.lawall@lip6.fr Cc: daniel.lezcano@linaro.org Link: http://lkml.kernel.org/r/1504030631-10812-1-git-send-email-bhumirks@gmail.com --- drivers/clocksource/numachip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/numachip.c b/drivers/clocksource/numachip.c index 6a20dc8b253f..9a7d7f0f23fe 100644 --- a/drivers/clocksource/numachip.c +++ b/drivers/clocksource/numachip.c @@ -43,7 +43,7 @@ static int numachip2_set_next_event(unsigned long delta, struct clock_event_devi return 0; } -static struct clock_event_device numachip2_clockevent = { +static const struct clock_event_device numachip2_clockevent __initconst = { .name = "numachip2", .rating = 400, .set_next_event = numachip2_set_next_event, From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 4 Sep 2017 10:32:15 +0200 Subject: [PATCH 0688/1322] x86/mm: Fix fault error path using unsafe vma pointer commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") passes down a vma pointer to the error path, but that is done once the mmap_sem is released when calling mm_fault_error() from __do_page_fault(). This is dangerous as the vma structure is no more safe to be used once the mmap_sem has been released. As only the protection key value is required in the error processing, we could just pass down this value. Fix it by passing a pointer to a protection key value down to the fault signal generation code. The use of a pointer allows to keep the check generating a warning message in fill_sig_info_pkey() when the vma was not known. If the pointer is valid, the protection value can be accessed by deferencing the pointer. 
[ tglx: Made *pkey u32 as that's the type which is passed in siginfo ] Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") Signed-off-by: Laurent Dufour Signed-off-by: Thomas Gleixner Cc: linux-mm@kvack.org Cc: Dave Hansen Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com --- arch/x86/mm/fault.c | 47 +++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 39567b5c33da..e2baeaa053a5 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. */ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -762,8 +760,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -788,7 +784,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. 
*/ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -896,8 +892,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -945,7 +940,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -958,9 +953,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -968,6 +963,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -975,7 +974,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); } static noinline void @@ -1018,7 +1018,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1045,13 +1045,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1075,9 +1074,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1267,6 +1266,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1467,9 +1467,10 @@ good_area: return; } + pkey = vma_pkey(vma); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, 
&pkey, fault); return; } From 7d7099433d9eaaa5a989a55f1fa354c16a3ad297 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Mon, 11 Sep 2017 08:33:21 +0800 Subject: [PATCH 0689/1322] x86/sysfs: Fix off-by-one error in loop termination An off-by-one error in loop termination conditions in create_setup_data_nodes() will lead to a memory leak when create_setup_data_node() failed. Signed-off-by: Sean Fu Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1505090001-1157-1-git-send-email-fxinrong@gmail.com --- arch/x86/kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4b0592ca9e47..8c1cc08f514f 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent) return 0; out_clean_nodes: - for (j = i - 1; j > 0; j--) + for (j = i - 1; j >= 0; j--) cleanup_setup_data_node(*(kobjp + j)); kfree(kobjp); out_setup_data_kobj: From 5ac751d9e6b187c4a0000879d6598eb2292db949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Tue, 12 Sep 2017 19:40:00 +0300 Subject: [PATCH 0690/1322] x86: Don't cast away the __user in __get_user_asm_u64() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't cast away the __user in __get_user_asm_u64() on x86-32. Prevents sparse getting upset. Signed-off-by: Ville Syrjälä Signed-off-by: Thomas Gleixner Cc: Benjamin LaHaise Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170912164000.13745-1-ville.syrjala@linux.intel.com --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 78e8fcc87d4c..4b892917edeb 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -337,7 +337,7 @@ do { \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ : "=r" (retval), "=&A"(x) \ - : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) From b09c146f8f63c0e03adba74df76bf9c2be466fec Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:47 -0400 Subject: [PATCH 0691/1322] perf/x86/intel/cstate: Add missing CPU IDs Skylake server uses the same C-state residency events as Sandy Bridge. Denverton and Gemini Lake use the same C-state residency events as Apollo Lake.
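For illustration, a minimal sketch (as if inside cstate.c, reusing its X86_CSTATES_MODEL macro and snb_cstates table) of how such a model table is consulted at init time; the probe function itself is hypothetical:

static const struct x86_cpu_id cstates_match_sketch[] __initconst = {
	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates),	/* newly listed model */
	{ },
};

static int __init cstate_probe_sketch(void)
{
	const struct x86_cpu_id *id = x86_match_cpu(cstates_match_sketch);

	if (!id)
		return -ENODEV;	/* model not listed: no C-state PMU is registered */

	/* id->driver_data points at the per-model event description */
	return 0;
}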
Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-1-kan.liang@intel.com --- arch/x86/events/intel/cstate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4cf100ff2a37..72db0664a53d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -552,6 +552,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), @@ -560,6 +561,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From 1aaccc40a1864053da26605b0297be16dd52641e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:48 -0400 Subject: [PATCH 0692/1322] perf/x86/msr: Add missing CPU IDs Goldmont, Goldmont Plus and Xeon Phi have MSR_SMI_COUNT as well. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-2-kan.liang@intel.com --- arch/x86/events/msr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4bb3ec69e8ea..06723671ae4e 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -63,6 +63,14 @@ static bool test_intel(int idx) case INTEL_FAM6_ATOM_SILVERMONT1: case INTEL_FAM6_ATOM_SILVERMONT2: case INTEL_FAM6_ATOM_AIRMONT: + + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_DENVERTON: + + case INTEL_FAM6_ATOM_GEMINI_LAKE: + + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: if (idx == PERF_MSR_SMI) return true; break; From 450a97893559354b927c935f39ee11126f01f520 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 8 Sep 2017 17:34:49 -0400 Subject: [PATCH 0693/1322] perf/x86/intel/rapl: Add missing CPU IDs DENVERTON and GEMINI_LAKE support the same RAPL counters as Apollo Lake.
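For illustration, a minimal sketch of what a RAPL package-energy read amounts to on these parts (the helper below is hypothetical; the diff only adds the model entries that route to hsw_rapl_init):

#include <asm/msr.h>

static u64 pkg_energy_uj_sketch(void)
{
	u64 unit, raw;

	rdmsrl(MSR_RAPL_POWER_UNIT, unit);	/* bits 12:8 = energy status unit (ESU) */
	rdmsrl(MSR_PKG_ENERGY_STATUS, raw);	/* free-running 32-bit energy counter */

	/* one count is 1/2^ESU joules; scale the raw count to microjoules */
	return ((raw & 0xffffffffULL) * 1000000ULL) >> ((unit >> 8) & 0x1f);
}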
Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: piotr.luc@intel.com Cc: harry.pan@intel.com Cc: srinivas.pandruvada@linux.intel.com Link: http://lkml.kernel.org/r/20170908213449.6224-3-kan.liang@intel.com --- arch/x86/events/intel/rapl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 8e2457cb6b4a..005908ee9333 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), {}, }; From 57999d1107c1e60c2ca7088f2ac0f819e2f554b3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:43:25 +0300 Subject: [PATCH 0694/1322] USB: devio: Prevent integer overflow in proc_do_submiturb() There used to be an integer overflow check in proc_do_submiturb() but we removed it. It turns out that it's still required. The uurb->buffer_length variable is a signed integer and it's controlled by the user. It can lead to an integer overflow when we do: num_sgs = DIV_ROUND_UP(uurb->buffer_length, USB_SG_SIZE); If we strip away the macro then that line looks like this: num_sgs = (uurb->buffer_length + USB_SG_SIZE - 1) / USB_SG_SIZE; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It's the first addition which can overflow. Fixes: 1129d270cbfb ("USB: Increase usbfs transfer limit") Cc: stable Signed-off-by: Dan Carpenter Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 318bb3b96687..e9326f31db8d 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -140,6 +140,9 @@ module_param(usbfs_memory_mb, uint, 0644); MODULE_PARM_DESC(usbfs_memory_mb, "maximum MB allowed for usbfs buffers (0 = no limit)"); +/* Hard limit, necessary to avoid arithmetic overflow */ +#define USBFS_XFER_MAX (UINT_MAX / 2 - 1000000) + static atomic64_t usbfs_memory_usage; /* Total memory currently allocated */ /* Check whether it's okay to allocate more memory for a transfer */ @@ -1460,6 +1463,8 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb USBDEVFS_URB_ZERO_PACKET | USBDEVFS_URB_NO_INTERRUPT)) return -EINVAL; + if ((unsigned int)uurb->buffer_length >= USBFS_XFER_MAX) + return -EINVAL; if (uurb->buffer_length > 0 && !uurb->buffer) return -EINVAL; if (!(uurb->type == USBDEVFS_URB_TYPE_CONTROL && From fa1ed74eb1c233be6131ec92df21ab46499a15b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:43:46 +0300 Subject: [PATCH 0695/1322] USB: devio: Don't corrupt user memory The user buffer has "uurb->buffer_length" bytes. If the kernel has more information than that, we should truncate it instead of writing past the end of the user's buffer. I added a WARN_ONCE() to help the user debug the issue. 
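For illustration, a minimal sketch combining the two guards described in this pair of devio fixes (the helper is hypothetical; the real hunks follow):

static int user_len_guard_sketch(struct usbdevfs_urb *uurb, u32 totlen)
{
	/*
	 * buffer_length is a user-controlled signed int and DIV_ROUND_UP()
	 * adds USB_SG_SIZE - 1 before dividing, so values near INT_MAX would
	 * overflow. Reject them up front.
	 */
	if ((unsigned int)uurb->buffer_length >= USBFS_XFER_MAX)
		return -EINVAL;

	/* Never report more data back than the user's buffer can hold. */
	if (totlen <= uurb->buffer_length)
		uurb->buffer_length = totlen;
	else
		WARN_ONCE(1, "kernel computed %u bytes for a %d byte user buffer",
			  totlen, uurb->buffer_length);

	return 0;
}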
Reported-by: Alan Stern Cc: stable Signed-off-by: Dan Carpenter Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index e9326f31db8d..4664e543cf2f 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1576,7 +1576,11 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); - uurb->buffer_length = totlen; + if (totlen <= uurb->buffer_length) + uurb->buffer_length = totlen; + else + WARN_ONCE(1, "uurb->buffer_length is too short %d vs %d", + totlen, uurb->buffer_length); break; default: From 8fec9355a968ad240f3a2e9ad55b823cf1cc52ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 22 Sep 2017 22:18:18 +0200 Subject: [PATCH 0696/1322] USB: cdc-wdm: ignore -EPIPE from GetEncapsulatedResponse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver will forward errors to userspace after turning most of them into -EIO. But all status codes are not equal. The -EPIPE (stall) in particular can be seen more as a result of normal USB signaling than an actual error. The state is automatically cleared by the USB core without intervention from either driver or userspace. And most devices and firmwares will never trigger a stall as a result of GetEncapsulatedResponse. This is in fact a requirement for CDC WDM devices. Quoting from section 7.1 of the CDC WMC spec revision 1.1: The function shall not return STALL in response to GetEncapsulatedResponse. But this driver is also handling GetEncapsulatedResponse on behalf of the qmi_wwan and cdc_mbim drivers. Unfortunately the relevant specs are not as clear wrt stall. So some QMI and MBIM devices *will* occasionally stall, causing the GetEncapsulatedResponse to return an -EPIPE status. Translating this into -EIO for userspace has proven to be harmful. Treating it as an empty read is safer, making the driver behave as if the device was conforming to the CDC WDM spec. There have been numerous reports of issues related to -EPIPE errors from some newer CDC MBIM devices in particular, like for example the Fibocom L831-EAU. Testing on this device has shown that the issues go away if we simply ignore the -EPIPE status. Similar handling of -EPIPE is already known from e.g. usb_get_string() The -EPIPE log message is still kept to let us track devices with this unexpected behaviour, hoping that it attracts attention from firmware developers. Cc: Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100938 Reported-and-tested-by: Christian Ehrig Reported-and-tested-by: Patrick Chilton Reported-and-tested-by: Andreas Böhler Signed-off-by: Bjørn Mork Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 5aacea1978a5..3e865dbf878c 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -190,8 +190,10 @@ static void wdm_in_callback(struct urb *urb) /* * only set a new error if there is no previous error. 
* Errors are only cleared during read/open + * Avoid propagating -EPIPE (stall) to userspace since it is + * better handled as an empty read */ - if (desc->rerr == 0) + if (desc->rerr == 0 && status != -EPIPE) desc->rerr = status; if (length + desc->length > desc->wMaxCommand) { From 29b46dfb136cdbeece542b3f01115237e43f2855 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 11 Sep 2017 10:10:15 -0700 Subject: [PATCH 0697/1322] perf/x86/intel/uncore: Correct num_boxes for IIO and IRP There are 6 IIO/IRP boxes for CBDMA, PCIe0-2, MCP 0 and MCP 1 separately. Correct the num_boxes. Signed-off-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@kernel.org Link: http://lkml.kernel.org/r/1505149816-12580-1-git-send-email-kan.liang@intel.com --- arch/x86/events/intel/uncore_snbep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index db1fe377e6dd..a7196818416a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3462,7 +3462,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IIO0_MSR_PMON_CTL0, .perf_ctr = SKX_IIO0_MSR_PMON_CTR0, @@ -3492,7 +3492,7 @@ static const struct attribute_group skx_uncore_format_group = { static struct intel_uncore_type skx_uncore_irp = { .name = "irp", .num_counters = 2, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IRP0_MSR_PMON_CTL0, .perf_ctr = SKX_IRP0_MSR_PMON_CTR0, From 0add53713b1c07a1c71e27a20e21eb7c180b4e7b Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 19 Sep 2017 16:40:21 +0200 Subject: [PATCH 0698/1322] microblaze: Add missing kvm_para.h to Kbuild Running make allmodconfig;make is throwing compilation error: CC kernel/watchdog.o In file included from ./include/linux/kvm_para.h:4:0, from kernel/watchdog.c:29: ./include/uapi/linux/kvm_para.h:32:26: fatal error: asm/kvm_para.h: No such file or directory #include ^ compilation terminated. make[1]: *** [kernel/watchdog.o] Error 1 make: *** [kernel/watchdog.o] Error 2 Reported-by: Michal Hocko Suggested-by: Geert Uytterhoeven Signed-off-by: Michal Simek Fixes: 83f0124ad81e87b ("microblaze: remove asm-generic wrapper headers") Reviewed-by: Tobias Klauser Tested-by: Michal Hocko --- arch/microblaze/include/uapi/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/microblaze/include/uapi/asm/Kbuild b/arch/microblaze/include/uapi/asm/Kbuild index e77a596f3f1e..06609ca36115 100644 --- a/arch/microblaze/include/uapi/asm/Kbuild +++ b/arch/microblaze/include/uapi/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += fcntl.h generic-y += ioctl.h generic-y += ioctls.h generic-y += ipcbuf.h +generic-y += kvm_para.h generic-y += mman.h generic-y += msgbuf.h generic-y += param.h From 64c99853baca40e2f06038c4a926009edd14c7c3 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 21 Sep 2017 00:29:36 +0200 Subject: [PATCH 0699/1322] microblaze: Cocci spatch "vma_pages" Use vma_pages function on vma object instead of explicit computation. 
Found by coccinelle spatch "api/vma_pages.cocci" Signed-off-by: Thomas Meyer Signed-off-by: Michal Simek --- arch/microblaze/kernel/dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index e45ada8fb006..94700c5270a9 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -165,7 +165,7 @@ int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma, unsigned long attrs) { #ifdef CONFIG_MMU - unsigned long user_count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; unsigned long pfn; From 428dbf156cc5a8f9994d1f1a5c79373d15476f3c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 18 Sep 2017 10:53:29 -0600 Subject: [PATCH 0700/1322] arch: change default endian for microblaze Fix the default for microblaze. Michal Simek mentioned default for microblaze should be CPU_LITTLE_ENDIAN. Fixes : commit 206d3642d8ee ("arch/microblaze: add choice for endianness and update Makefile") Signed-off-by: Babu Moger Cc: Michal Simek Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 9d26abdf0dc1..4f798aa671dd 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -39,7 +39,7 @@ config MICROBLAZE # Endianness selection choice prompt "Endianness selection" - default CPU_BIG_ENDIAN + default CPU_LITTLE_ENDIAN help microblaze architectures can be configured for either little or big endian formats. Be sure to select the appropriate mode. From 89975bd335f37b96ffd3cc24b9effb1fa25e7788 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 20 Sep 2017 16:41:34 -0300 Subject: [PATCH 0701/1322] perf tools: Get all of tools/{arch,include}/ in the MANIFEST Now that I'm switching the container builds from using a local volume pointing to the kernel repository with the perf sources, instead getting a detached tarball to be able to use a container cluster, some places broke because I forgot to put some of the required files in tools/perf/MANIFEST, namely some bitsperlong.h files. So, to fix it do the same as for tools/build/ and pack the whole tools/arch/ directory. 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wmenpjfjsobwdnfde30qqncj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 87 ++------------------------------------------- 1 file changed, 2 insertions(+), 85 deletions(-) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 62072822dc85..627b7cada144 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,34 +1,8 @@ tools/perf -tools/arch/alpha/include/asm/barrier.h -tools/arch/arm/include/asm/barrier.h -tools/arch/arm64/include/asm/barrier.h -tools/arch/ia64/include/asm/barrier.h -tools/arch/mips/include/asm/barrier.h -tools/arch/powerpc/include/asm/barrier.h -tools/arch/s390/include/asm/barrier.h -tools/arch/sh/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier.h -tools/arch/sparc/include/asm/barrier_32.h -tools/arch/sparc/include/asm/barrier_64.h -tools/arch/tile/include/asm/barrier.h -tools/arch/x86/include/asm/barrier.h -tools/arch/x86/include/asm/cmpxchg.h -tools/arch/x86/include/asm/cpufeatures.h -tools/arch/x86/include/asm/disabled-features.h -tools/arch/x86/include/asm/required-features.h -tools/arch/x86/include/uapi/asm/svm.h -tools/arch/x86/include/uapi/asm/vmx.h -tools/arch/x86/include/uapi/asm/kvm.h -tools/arch/x86/include/uapi/asm/kvm_perf.h -tools/arch/x86/lib/memcpy_64.S -tools/arch/x86/lib/memset_64.S -tools/arch/s390/include/uapi/asm/kvm_perf.h -tools/arch/s390/include/uapi/asm/sie.h -tools/arch/xtensa/include/asm/barrier.h +tools/arch tools/scripts tools/build -tools/arch/x86/include/asm/atomic.h -tools/arch/x86/include/asm/rmwcc.h +tools/include tools/lib/traceevent tools/lib/api tools/lib/bpf @@ -42,60 +16,3 @@ tools/lib/find_bit.c tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c -tools/include/asm/alternative-asm.h -tools/include/asm/atomic.h -tools/include/asm/barrier.h -tools/include/asm/bug.h -tools/include/asm-generic/atomic-gcc.h -tools/include/asm-generic/barrier.h -tools/include/asm-generic/bitops/arch_hweight.h -tools/include/asm-generic/bitops/atomic.h -tools/include/asm-generic/bitops/const_hweight.h -tools/include/asm-generic/bitops/__ffs.h -tools/include/asm-generic/bitops/__ffz.h -tools/include/asm-generic/bitops/__fls.h -tools/include/asm-generic/bitops/find.h -tools/include/asm-generic/bitops/fls64.h -tools/include/asm-generic/bitops/fls.h -tools/include/asm-generic/bitops/hweight.h -tools/include/asm-generic/bitops.h -tools/include/linux/atomic.h -tools/include/linux/bitops.h -tools/include/linux/compiler.h -tools/include/linux/compiler-gcc.h -tools/include/linux/coresight-pmu.h -tools/include/linux/bug.h -tools/include/linux/filter.h -tools/include/linux/hash.h -tools/include/linux/kernel.h -tools/include/linux/list.h -tools/include/linux/log2.h -tools/include/uapi/asm-generic/fcntl.h -tools/include/uapi/asm-generic/ioctls.h -tools/include/uapi/asm-generic/mman-common.h -tools/include/uapi/asm-generic/mman.h -tools/include/uapi/drm/drm.h -tools/include/uapi/drm/i915_drm.h -tools/include/uapi/linux/bpf.h -tools/include/uapi/linux/bpf_common.h -tools/include/uapi/linux/fcntl.h -tools/include/uapi/linux/hw_breakpoint.h -tools/include/uapi/linux/kvm.h -tools/include/uapi/linux/mman.h -tools/include/uapi/linux/perf_event.h -tools/include/uapi/linux/sched.h -tools/include/uapi/linux/stat.h -tools/include/uapi/linux/vhost.h -tools/include/uapi/sound/asound.h -tools/include/linux/poison.h -tools/include/linux/rbtree.h -tools/include/linux/rbtree_augmented.h 
-tools/include/linux/refcount.h -tools/include/linux/string.h -tools/include/linux/stringify.h -tools/include/linux/types.h -tools/include/linux/err.h -tools/include/linux/bitmap.h -tools/include/linux/time64.h -tools/arch/*/include/uapi/asm/mman.h -tools/arch/*/include/uapi/asm/perf_regs.h From 549a3976523c69a0245c0a310210c824a0b26e35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Sep 2017 09:38:23 +0200 Subject: [PATCH 0702/1322] tools include: Sync kernel ABI headers with tooling headers Time for a sync with ABI/uapi headers with the upcoming v4.14 kernel. None of the ABI changes require any source code level changes to our existing in-kernel tooling code: - tools/arch/s390/include/uapi/asm/kvm.h: New KVM_S390_VM_TOD_EXT ABI, not used by in-kernel tooling. - tools/arch/x86/include/asm/cpufeatures.h: tools/arch/x86/include/asm/disabled-features.h: New PCID, SME and VGIF x86 CPU feature bits defined. - tools/include/asm-generic/hugetlb_encode.h: tools/include/uapi/asm-generic/mman-common.h: tools/include/uapi/linux/mman.h: Two new madvise() flags, plus a hugetlb system call mmap flags restructuring/extension changes. - tools/include/uapi/drm/drm.h: tools/include/uapi/drm/i915_drm.h: New drm_syncobj_create flags definitions, new drm_syncobj_wait and drm_syncobj_array ABIs. DRM_I915_PERF_* calls and a new I915_PARAM_HAS_EXEC_FENCE_ARRAY ABI for the Intel driver. - tools/include/uapi/linux/bpf.h: New bpf_sock fields (::mark and ::priority), new XDP_REDIRECT action, new kvm_ppc_smmu_info fields (::data_keys, instr_keys) Signed-off-by: Ingo Molnar Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Taeung Song Cc: Wang Nan Cc: Yao Jin Link: http://lkml.kernel.org/r/20170913073823.lxmi4c7ejqlfabjx@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/s390/include/uapi/asm/kvm.h | 6 +++ tools/arch/x86/include/asm/cpufeatures.h | 2 + .../arch/x86/include/asm/disabled-features.h | 4 +- tools/include/asm-generic/hugetlb_encode.h | 34 +++++++++++++ tools/include/uapi/asm-generic/mman-common.h | 14 ++--- tools/include/uapi/drm/drm.h | 22 ++++++++ tools/include/uapi/drm/i915_drm.h | 51 ++++++++++++++++++- tools/include/uapi/linux/bpf.h | 32 +++++++----- tools/include/uapi/linux/kvm.h | 3 +- tools/include/uapi/linux/mman.h | 24 ++++++++- 10 files changed, 164 insertions(+), 28 deletions(-) create mode 100644 tools/include/asm-generic/hugetlb_encode.h diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index 69d09c39bbcd..cd7359e23d86 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -88,6 +88,12 @@ struct kvm_s390_io_adapter_req { /* kvm attributes for KVM_S390_VM_TOD */ #define KVM_S390_VM_TOD_LOW 0 #define KVM_S390_VM_TOD_HIGH 1 +#define KVM_S390_VM_TOD_EXT 2 + +struct kvm_s390_vm_tod_clock { + __u8 epoch_idx; + __u64 tod; +}; /* kvm attributes for KVM_S390_VM_CPU_MODEL */ /* processor related attributes are r/w */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 8ea315a11fe0..2519c6c801c9 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory 
Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ @@ -287,6 +288,7 @@ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h new file mode 100644 index 000000000000..e4732d3c2998 --- /dev/null +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -0,0 +1,34 @@ +#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_ +#define _ASM_GENERIC_HUGETLB_ENCODE_H_ + +/* + * Several system calls take a flag to request "hugetlb" huge pages. + * Without further specification, these system calls will use the + * system's default huge page size. If a system supports multiple + * huge page sizes, the desired huge page size can be specified in + * bits [26:31] of the flag arguments. The value in these 6 bits + * will encode the log2 of the huge page size. + * + * The following definitions are associated with this huge page size + * encoding in flag arguments. System call specific header files + * that use this encoding should include this file. They can then + * provide definitions based on these with their own specific prefix. 
+ * for example: + * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT + */ + +#define HUGETLB_FLAG_ENCODE_SHIFT 26 +#define HUGETLB_FLAG_ENCODE_MASK 0x3f + +#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) + +#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 8c27db0c5c08..203268f9231e 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -58,20 +58,12 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 101593ab10ac..97677cd6964d 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -700,6 +700,7 @@ struct drm_prime_handle { struct drm_syncobj_create { __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) __u32 flags; }; @@ -718,6 +719,24 @@ struct drm_syncobj_handle { __u32 pad; }; +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; +}; + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + #if defined(__cplusplus) } #endif @@ -840,6 +859,9 @@ extern "C" { #define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) #define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) #define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) /** * Device specific ioctls should only be in their respective headers diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 7ccbd6a2bbe0..6598fb76d2c2 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -260,6 +260,8 @@ typedef struct _drm_i915_sarea { #define DRM_I915_GEM_CONTEXT_GETPARAM 0x34 #define 
DRM_I915_GEM_CONTEXT_SETPARAM 0x35 #define DRM_I915_PERF_OPEN 0x36 +#define DRM_I915_PERF_ADD_CONFIG 0x37 +#define DRM_I915_PERF_REMOVE_CONFIG 0x38 #define DRM_IOCTL_I915_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t) #define DRM_IOCTL_I915_FLUSH DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH) @@ -315,6 +317,8 @@ typedef struct _drm_i915_sarea { #define DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_GETPARAM, struct drm_i915_gem_context_param) #define DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_SETPARAM, struct drm_i915_gem_context_param) #define DRM_IOCTL_I915_PERF_OPEN DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_OPEN, struct drm_i915_perf_open_param) +#define DRM_IOCTL_I915_PERF_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config) +#define DRM_IOCTL_I915_PERF_REMOVE_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64) /* Allow drivers to submit batchbuffers directly to hardware, relying * on the security mechanisms provided by hardware. @@ -431,6 +435,11 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_HAS_EXEC_BATCH_FIRST 48 +/* Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying an array of + * drm_i915_gem_exec_fence structures. See I915_EXEC_FENCE_ARRAY. + */ +#define I915_PARAM_HAS_EXEC_FENCE_ARRAY 49 + typedef struct drm_i915_getparam { __s32 param; /* @@ -812,6 +821,17 @@ struct drm_i915_gem_exec_object2 { __u64 rsvd2; }; +struct drm_i915_gem_exec_fence { + /** + * User's handle for a drm_syncobj to wait on or signal. + */ + __u32 handle; + +#define I915_EXEC_FENCE_WAIT (1<<0) +#define I915_EXEC_FENCE_SIGNAL (1<<1) + __u32 flags; +}; + struct drm_i915_gem_execbuffer2 { /** * List of gem_exec_object2 structs @@ -826,7 +846,11 @@ struct drm_i915_gem_execbuffer2 { __u32 DR1; __u32 DR4; __u32 num_cliprects; - /** This is a struct drm_clip_rect *cliprects */ + /** + * This is a struct drm_clip_rect *cliprects if I915_EXEC_FENCE_ARRAY + * is not set. If I915_EXEC_FENCE_ARRAY is set, then this is a + * struct drm_i915_gem_exec_fence *fences. + */ __u64 cliprects_ptr; #define I915_EXEC_RING_MASK (7<<0) #define I915_EXEC_DEFAULT (0<<0) @@ -927,7 +951,14 @@ struct drm_i915_gem_execbuffer2 { * element). */ #define I915_EXEC_BATCH_FIRST (1<<18) -#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1)) + +/* Setting I915_FENCE_ARRAY implies that num_cliprects and cliprects_ptr + * define an array of i915_gem_exec_fence structures which specify a set of + * dma fences to wait upon or signal. + */ +#define I915_EXEC_FENCE_ARRAY (1<<19) + +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_ARRAY<<1)) #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff) #define i915_execbuffer2_set_context_id(eb2, context) \ @@ -1467,6 +1498,22 @@ enum drm_i915_perf_record_type { DRM_I915_PERF_RECORD_MAX /* non-ABI */ }; +/** + * Structure to upload perf dynamic configuration into the kernel. 
+ */ +struct drm_i915_perf_oa_config { + /** String formatted like "%08x-%04x-%04x-%04x-%012x" */ + char uuid[36]; + + __u32 n_mux_regs; + __u32 n_boolean_regs; + __u32 n_flex_regs; + + __u64 __user mux_regs_ptr; + __u64 __user boolean_regs_ptr; + __u64 __user flex_regs_ptr; +}; + #if defined(__cplusplus) } #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 461811e57140..43ab5c402f98 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -143,12 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -enum bpf_sockmap_flags { - BPF_SOCKMAP_UNSPEC, - BPF_SOCKMAP_STRPARSER, - __MAX_BPF_SOCKMAP_FLAG -}; - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -368,9 +362,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) + * redirect to endpoint in map + * @map: pointer to dev map + * @key: index in map to lookup + * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -632,7 +637,7 @@ union bpf_attr { FN(skb_adjust_room), \ FN(redirect_map), \ FN(sk_redirect_map), \ - FN(sock_map_update), + FN(sock_map_update), \ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -753,20 +758,23 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). 
*/ enum xdp_action { XDP_ABORTED = 0, XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 6cd63c18708a..838887587411 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size { struct kvm_ppc_smmu_info { __u64 flags; __u32 slb_size; - __u32 pad; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index 81d8edf11789..a937480d7cd3 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -1,7 +1,8 @@ #ifndef _UAPI_LINUX_MMAN_H #define _UAPI_LINUX_MMAN_H -#include +#include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -10,4 +11,25 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +/* + * Huge page size encoding when MAP_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + #endif /* _UAPI_LINUX_MMAN_H */ From f1e52f14a69386ac460a8d700df0647a631cf595 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 22 Sep 2017 15:41:44 -0300 Subject: [PATCH 0703/1322] perf evsel: Fix attr.exclude_kernel setting for default cycles:p Yet another fix for probing the max attr.precise_ip setting: it is not enough setting attr.exclude_kernel for !root users, as they _can_ profile the kernel if the kernel.perf_event_paranoid sysctl is set to -1, so check that as well. Testing it: As non-root: $ sysctl kernel.perf_event_paranoid kernel.perf_event_paranoid = 2 $ perf record sleep 1 $ perf evlist -v cycles:uppp: ..., exclude_kernel: 1, ... precise_ip: 3, ... Now as non-root, but with kernel.perf_event_paranoid set to the most permissive value, -1: $ sysctl kernel.perf_event_paranoid kernel.perf_event_paranoid = -1 $ perf record sleep 1 $ perf evlist -v cycles:ppp: ..., exclude_kernel: 0, ... precise_ip: 3, ... $ I.e. non-root, default kernel.perf_event_paranoid: :uppp modifier = not allowed to sample the kernel, non-root, most permissive kernel.perf_event_paranoid: :ppp = allowed to sample the kernel. In both cases, use the highest available precision: attr.precise_ip = 3.
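For illustration, a minimal userspace sketch of the same policy (not part of this patch, it only mirrors the check the tool now performs): kernel sampling is permitted when running as root or when kernel.perf_event_paranoid is -1.

  /* Sketch only: mirror perf_event_can_profile_kernel() from userspace by
   * reading the sysctl through procfs. */
  #include <stdio.h>
  #include <unistd.h>

  static int perf_event_paranoid(void)
  {
          FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
          int val = 2;    /* assume the restrictive default if unreadable */

          if (f) {
                  if (fscanf(f, "%d", &val) != 1)
                          val = 2;
                  fclose(f);
          }
          return val;
  }

  int main(void)
  {
          int can_profile_kernel = geteuid() == 0 ||
                                   perf_event_paranoid() == -1;

          printf("attr.exclude_kernel would be %d\n", !can_profile_kernel);
          return 0;
  }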
Reported-and-Tested-by: Ingo Molnar Cc: Adrian Hunter Cc: Andy Lutomirski Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: d37a36979077 ("perf evsel: Fix attr.exclude_kernel setting for default cycles:p") Link: http://lkml.kernel.org/n/tip-nj2qkf75xsd6pw6hhjzfqqdx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4bb89373eb52..0dccdb89572c 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -271,12 +271,17 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) return evsel; } +static bool perf_event_can_profile_kernel(void) +{ + return geteuid() == 0 || perf_event_paranoid() == -1; +} + struct perf_evsel *perf_evsel__new_cycles(bool precise) { struct perf_event_attr attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .exclude_kernel = geteuid() != 0, + .exclude_kernel = !perf_event_can_profile_kernel(), }; struct perf_evsel *evsel; From 44d8143340a99b167c74365e844516b73523c087 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:40 -0700 Subject: [PATCH 0704/1322] KEYS: fix cred refcount leak in request_key_auth_new() In request_key_auth_new(), if key_alloc() or key_instantiate_and_link() were to fail, we would leak a reference to the 'struct cred'. Currently this can only happen if key_alloc() fails to allocate memory. But it still should be fixed, as it is a more severe bug waiting to happen. Fix it by cleaning things up to use a helper function which frees a 'struct request_key_auth' correctly. Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 68 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index afe9d22ab361..69d6b3b35470 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -120,6 +120,18 @@ static void request_key_auth_revoke(struct key *key) } } +static void free_request_key_auth(struct request_key_auth *rka) +{ + if (!rka) + return; + key_put(rka->target_key); + key_put(rka->dest_keyring); + if (rka->cred) + put_cred(rka->cred); + kfree(rka->callout_info); + kfree(rka); +} + /* * Destroy an instantiation authorisation token key. 
*/ @@ -129,15 +141,7 @@ static void request_key_auth_destroy(struct key *key) kenter("{%d}", key->serial); - if (rka->cred) { - put_cred(rka->cred); - rka->cred = NULL; - } - - key_put(rka->target_key); - key_put(rka->dest_keyring); - kfree(rka->callout_info); - kfree(rka); + free_request_key_auth(rka); } /* @@ -151,22 +155,17 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, const struct cred *cred = current->cred; struct key *authkey = NULL; char desc[20]; - int ret; + int ret = -ENOMEM; kenter("%d,", target->serial); /* allocate a auth record */ - rka = kmalloc(sizeof(*rka), GFP_KERNEL); - if (!rka) { - kleave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); - } + rka = kzalloc(sizeof(*rka), GFP_KERNEL); + if (!rka) + goto error; rka->callout_info = kmalloc(callout_len, GFP_KERNEL); - if (!rka->callout_info) { - kleave(" = -ENOMEM"); - kfree(rka); - return ERR_PTR(-ENOMEM); - } + if (!rka->callout_info) + goto error_free_rka; /* see if the calling process is already servicing the key request of * another process */ @@ -176,8 +175,12 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, /* if the auth key has been revoked, then the key we're * servicing is already instantiated */ - if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags)) - goto auth_key_revoked; + if (test_bit(KEY_FLAG_REVOKED, + &cred->request_key_auth->flags)) { + up_read(&cred->request_key_auth->sem); + ret = -EKEYREVOKED; + goto error_free_rka; + } irka = cred->request_key_auth->payload.data[0]; rka->cred = get_cred(irka->cred); @@ -205,32 +208,23 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(authkey)) { ret = PTR_ERR(authkey); - goto error_alloc; + goto error_free_rka; } /* construct the auth key */ ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL); if (ret < 0) - goto error_inst; + goto error_put_authkey; kleave(" = {%d,%d}", authkey->serial, refcount_read(&authkey->usage)); return authkey; -auth_key_revoked: - up_read(&cred->request_key_auth->sem); - kfree(rka->callout_info); - kfree(rka); - kleave("= -EKEYREVOKED"); - return ERR_PTR(-EKEYREVOKED); - -error_inst: +error_put_authkey: key_revoke(authkey); key_put(authkey); -error_alloc: - key_put(rka->target_key); - key_put(rka->dest_keyring); - kfree(rka->callout_info); - kfree(rka); +error_free_rka: + free_request_key_auth(rka); +error: kleave("= %d", ret); return ERR_PTR(ret); } From f7b48cf08fa63a68b59c2894806ee478216d7f91 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:41 -0700 Subject: [PATCH 0705/1322] KEYS: don't revoke uninstantiated key in request_key_auth_new() If key_instantiate_and_link() were to fail (which fortunately isn't possible currently), the call to key_revoke(authkey) would crash with a NULL pointer dereference in request_key_auth_revoke() because the key has not yet been instantiated. Fix this by removing the call to key_revoke(). key_put() is sufficient, as it's not possible for an uninstantiated authkey to have been used for anything yet. 
Fixes: b5f545c880a2 ("[PATCH] keys: Permit running process to instantiate keys") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 69d6b3b35470..e356075ed2f8 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -220,7 +220,6 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, return authkey; error_put_authkey: - key_revoke(authkey); key_put(authkey); error_free_rka: free_request_key_auth(rka); From 884bee0215fcc239b30c062c37ca29077005e064 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:12 -0700 Subject: [PATCH 0706/1322] KEYS: fix key refcount leak in keyctl_assume_authority() In keyctl_assume_authority(), if keyctl_change_reqkey_auth() were to fail, we would leak the reference to the 'authkey'. Currently this can only happen if prepare_creds() fails to allocate memory. But it still should be fixed, as it is a more severe bug waiting to happen. This patch also moves the read of 'authkey->serial' to before the reference to the authkey is dropped. Doing the read after dropping the reference is very fragile because it assumes we still hold another reference to the key. (Which we do, in current->cred->request_key_auth, but there's no reason not to write it in the "obviously correct" way.) Fixes: d84f4f992cbd ("CRED: Inaugurate COW credentials") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index ab0b337c84b4..562f7fe287a0 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1406,11 +1406,9 @@ long keyctl_assume_authority(key_serial_t id) } ret = keyctl_change_reqkey_auth(authkey); - if (ret < 0) - goto error; + if (ret == 0) + ret = authkey->serial; key_put(authkey); - - ret = authkey->serial; error: return ret; } From 7fc0786d956d9e59b68d282be9b156179846ea3d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:31 -0700 Subject: [PATCH 0707/1322] KEYS: fix key refcount leak in keyctl_read_key() In keyctl_read_key(), if key_permission() were to return an error code other than EACCES, we would leak the reference to the key. This can't actually happen currently because key_permission() can only return an error code other than EACCES if security_key_permission() does, only SELinux and Smack implement that hook, and neither can return an error code other than EACCES. But it should still be fixed, as it is a bug waiting to happen.
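The pattern both of these fixes enforce - every exit taken after the key reference is obtained must drop that reference - reduces to a small standalone sketch (the object, obj_get()/obj_put() and the labels below are illustrative stand-ins, not kernel APIs):

  #include <errno.h>
  #include <stdio.h>

  struct obj {
          int refs;
  };

  static void obj_get(struct obj *o) { o->refs++; }
  static void obj_put(struct obj *o) { o->refs--; }

  /* Sketch of keyctl_read_key()-style error handling: the lookup takes a
   * reference, so every error exit must pass through the put. */
  static long read_obj(struct obj *o, int permission_error)
  {
          long ret;

          obj_get(o);                     /* lookup takes a reference */

          if (permission_error) {
                  ret = permission_error; /* e.g. -EPERM rather than -EACCES */
                  goto error2;            /* jumping past the put would leak */
          }
          ret = 0;
  error2:
          obj_put(o);                     /* reference dropped on all paths */
          return ret;
  }

  int main(void)
  {
          struct obj o = { .refs = 0 };

          read_obj(&o, -EPERM);
          printf("refs after failed read: %d\n", o.refs); /* 0: no leak */
          return 0;
  }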
Fixes: 29db91906340 ("[PATCH] Keys: Add LSM hooks for key management [try #3]") Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 562f7fe287a0..aa1d11a29136 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -771,7 +771,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) if (ret == 0) goto can_read_key; if (ret != -EACCES) - goto error; + goto error2; /* we can't; see if it's searchable from this process's keyrings * - we automatically take account of the fact that it may be From e645016abc803dafc75e4b8f6e4118f088900ffb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:45 -0700 Subject: [PATCH 0708/1322] KEYS: fix writing past end of user-supplied buffer in keyring_read() Userspace can call keyctl_read() on a keyring to get the list of IDs of keys in the keyring. But if the user-supplied buffer is too small, the kernel would write the full list anyway --- which will corrupt whatever userspace memory happened to be past the end of the buffer. Fix it by only filling the space that is available. Fixes: b2a4df200d57 ("KEYS: Expand the capacity of a keyring") Cc: [v3.13+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index de81793f9920..94f038967c17 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -423,7 +423,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) } struct keyring_read_iterator_context { - size_t qty; + size_t buflen; size_t count; key_serial_t __user *buffer; }; @@ -435,9 +435,9 @@ static int keyring_read_iterator(const void *object, void *data) int ret; kenter("{%s,%d},,{%zu/%zu}", - key->type->name, key->serial, ctx->count, ctx->qty); + key->type->name, key->serial, ctx->count, ctx->buflen); - if (ctx->count >= ctx->qty) + if (ctx->count >= ctx->buflen) return 1; ret = put_user(key->serial, ctx->buffer); @@ -472,16 +472,12 @@ static long keyring_read(const struct key *keyring, return 0; /* Calculate how much data we could return */ - ctx.qty = nr_keys * sizeof(key_serial_t); - if (!buffer || !buflen) - return ctx.qty; - - if (buflen > ctx.qty) - ctx.qty = buflen; + return nr_keys * sizeof(key_serial_t); /* Copy the IDs of the subscribed keys into the buffer */ ctx.buffer = (key_serial_t __user *)buffer; + ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { From 237bbd29f7a049d310d907f4b2716a7feef9abf3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: [PATCH 0709/1322] KEYS: prevent creating a different user's keyrings It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. 
In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Cc: [v2.6.26+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- include/linux/key.h | 2 ++ security/keys/internal.h | 2 +- security/keys/key.c | 2 ++ security/keys/keyring.c | 23 ++++++++++++++--------- security/keys/process_keys.c | 6 ++++-- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index 044114185120..e315e16b6ff8 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -187,6 +187,7 @@ struct key { #define KEY_FLAG_BUILTIN 8 /* set if key is built in to the kernel */ #define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 10 /* set if key should not be removed */ +#define KEY_FLAG_UID_KEYRING 11 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -243,6 +244,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_BUILT_IN 0x0004 /* Key is built into kernel */ #define KEY_ALLOC_BYPASS_RESTRICTION 0x0008 /* Override the check on restricted keyrings */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 1c02c6547038..503adbae7b0d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -141,7 +141,7 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); -extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); +extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); diff --git a/security/keys/key.c b/security/keys/key.c index 83da68d98b40..e5c0896c3a8f 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -302,6 +302,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; + if (flags & KEY_ALLOC_UID_KEYRING) + key->flags |= 1 << KEY_FLAG_UID_KEYRING; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 94f038967c17..4fa82a8a9c0e 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1097,15 +1097,15 @@ found: /* * Find a keyring with the specified name. * - * All named keyrings in the current user namespace are searched, provided they - * grant Search permission directly to the caller (unless this check is - * skipped). 
Keyrings whose usage points have reached zero or who have been - * revoked are skipped. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ -struct key *find_keyring_by_name(const char *name, bool skip_perm_check) +struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct key *keyring; int bucket; @@ -1133,10 +1133,15 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) if (strcmp(keyring->description, name) != 0) continue; - if (!skip_perm_check && - key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) + continue; + } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 86bced9fdbdf..293d3598153b 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -77,7 +77,8 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); @@ -94,7 +95,8 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); From 37863c43b2c6464f252862bf2e9768264e961678 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:23 -0700 Subject: [PATCH 0710/1322] KEYS: prevent KEYCTL_READ on negative key Because keyctl_read_key() looks up the key with no permissions requested, it may find a negatively instantiated key. If the key is also possessed, we went ahead and called ->read() on the key. But the key payload will actually contain the ->reject_error rather than the normal payload. Thus, the kernel oopses trying to read the user_key_payload from memory address (int)-ENOKEY = 0x00000000ffffff82. Fortunately the payload data is stored inline, so it shouldn't be possible to abuse this as an arbitrary memory read primitive... 
Reproducer: keyctl new_session keyctl request2 user desc '' @s keyctl read $(keyctl show | awk '/user: desc/ {print $1}') It causes a crash like the following: BUG: unable to handle kernel paging request at 00000000ffffff92 IP: user_read+0x33/0xa0 PGD 36a54067 P4D 36a54067 PUD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 211 Comm: keyctl Not tainted 4.14.0-rc1 #337 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 task: ffff90aa3b74c3c0 task.stack: ffff9878c0478000 RIP: 0010:user_read+0x33/0xa0 RSP: 0018:ffff9878c047bee8 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff90aa3d7da340 RCX: 0000000000000017 RDX: 0000000000000000 RSI: 00000000ffffff82 RDI: ffff90aa3d7da340 RBP: ffff9878c047bf00 R08: 00000024f95da94f R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f58ece69740(0000) GS:ffff90aa3e200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000ffffff92 CR3: 0000000036adc001 CR4: 00000000003606f0 Call Trace: keyctl_read_key+0xac/0xe0 SyS_keyctl+0x99/0x120 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x7f58ec787bb9 RSP: 002b:00007ffc8d401678 EFLAGS: 00000206 ORIG_RAX: 00000000000000fa RAX: ffffffffffffffda RBX: 00007ffc8d402800 RCX: 00007f58ec787bb9 RDX: 0000000000000000 RSI: 00000000174a63ac RDI: 000000000000000b RBP: 0000000000000004 R08: 00007ffc8d402809 R09: 0000000000000020 R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffc8d402800 R13: 00007ffc8d4016e0 R14: 0000000000000000 R15: 0000000000000000 Code: e5 41 55 49 89 f5 41 54 49 89 d4 53 48 89 fb e8 a4 b4 ad ff 85 c0 74 09 80 3d b9 4c 96 00 00 74 43 48 8b b3 20 01 00 00 4d 85 ed <0f> b7 5e 10 74 29 4d 85 e4 74 24 4c 39 e3 4c 89 e2 4c 89 ef 48 RIP: user_read+0x33/0xa0 RSP: ffff9878c047bee8 CR2: 00000000ffffff92 Fixes: 61ea0c0ba904 ("KEYS: Skip key state checks when checking for possession") Cc: [v3.13+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/keyctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index aa1d11a29136..365ff85d7e27 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -766,6 +766,11 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); + if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { + ret = -ENOKEY; + goto error2; + } + /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) From 8f674565d405a8c0b36ee531849df87f43e72ed5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:39 -0700 Subject: [PATCH 0711/1322] KEYS: reset parent each time before searching key_user_tree In key_user_lookup(), if there is no key_user for the given uid, we drop key_user_lock, allocate a new key_user, and search the tree again. But we failed to set 'parent' to NULL at the beginning of the second search. If the tree were to be empty for the second search, the insertion would be done with an invalid 'parent', scribbling over freed memory. Fortunately this can't actually happen currently because the tree always contains at least the root_key_user. But it still should be fixed to make the code more robust. 
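The pitfall generalises to any search-then-insert retry loop: the parent found during the first pass is meaningless once the search restarts, so it has to be re-derived on every pass. A standalone sketch with a plain binary search tree standing in for the kernel's rbtree (names illustrative only):

  #include <stdio.h>
  #include <stdlib.h>

  struct node {
          int key;
          struct node *left, *right, *parent;
  };

  static struct node *root;

  static struct node *lookup_or_insert(int key)
  {
          struct node *candidate = NULL;
          struct node **p, *parent;

  try_again:
          parent = NULL;                  /* must be reset on every pass */
          p = &root;

          while (*p) {
                  parent = *p;
                  if (key < parent->key)
                          p = &parent->left;
                  else if (key > parent->key)
                          p = &parent->right;
                  else
                          return parent;  /* already present */
          }

          if (!candidate) {
                  /* allocate outside the search, then search again, as
                   * key_user_lookup() does after dropping its lock */
                  candidate = calloc(1, sizeof(*candidate));
                  if (!candidate)
                          return NULL;
                  candidate->key = key;
                  goto try_again;
          }

          *p = candidate;
          candidate->parent = parent;     /* a stale parent here is the bug */
          return candidate;
  }

  int main(void)
  {
          lookup_or_insert(42);
          printf("root key: %d\n", root->key);
          return 0;
  }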
Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/key.c b/security/keys/key.c index e5c0896c3a8f..eb914a838840 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -54,10 +54,10 @@ void __key_check(const struct key *key) struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; - struct rb_node *parent = NULL; - struct rb_node **p; + struct rb_node *parent, **p; try_again: + parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); From 4aa68e07d845562561f5e73c04aa521376e95252 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:38:29 -0700 Subject: [PATCH 0712/1322] KEYS: restrict /proc/keys by credentials at open time When checking for permission to view keys whilst reading from /proc/keys, we should use the credentials with which the /proc/keys file was opened. This is because, in a classic type of exploit, it can be possible to bypass checks for the *current* credentials by passing the file descriptor to a suid program. Following commit 34dbbcdbf633 ("Make file credentials available to the seqfile interfaces") we can finally fix it. So let's do it. Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/proc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/security/keys/proc.c b/security/keys/proc.c index bf08d02b6646..de834309d100 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -187,7 +187,7 @@ static int proc_keys_show(struct seq_file *m, void *v) struct keyring_search_context ctx = { .index_key.type = key->type, .index_key.description = key->description, - .cred = current_cred(), + .cred = m->file->f_cred, .match_data.cmp = lookup_user_key_possessed, .match_data.raw_data = key, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, @@ -207,11 +207,7 @@ static int proc_keys_show(struct seq_file *m, void *v) } } - /* check whether the current task is allowed to view the key (assuming - * non-possession) - * - the caller holds a spinlock, and thus the RCU read lock, making our - * access to __current_cred() safe - */ + /* check whether the current task is allowed to view the key */ rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); if (rc < 0) return 0; From e007ce9c59bddd1e67b94bc29036d920f5c5428a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Sep 2017 13:57:42 -0700 Subject: [PATCH 0713/1322] KEYS: use kmemdup() in request_key_auth_new() kmemdup() is preferred to kmalloc() followed by memcpy(). 
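In standalone form the idiom is just an allocate-and-copy helper; memdup() below is a local stand-in for kmemdup(), not a libc function:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  /* Userspace analogue of kmemdup(): allocate and copy in one step. */
  static void *memdup(const void *src, size_t len)
  {
          void *p = malloc(len);

          if (p)
                  memcpy(p, src, len);
          return p;
  }

  int main(void)
  {
          const char callout[] = "example callout data";
          char *copy = memdup(callout, sizeof(callout));

          if (!copy)
                  return 1;
          printf("%s\n", copy);
          free(copy);
          return 0;
  }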
Signed-off-by: Eric Biggers Signed-off-by: David Howells --- security/keys/request_key_auth.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index e356075ed2f8..6ebf1af8fce9 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -163,9 +163,10 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, rka = kzalloc(sizeof(*rka), GFP_KERNEL); if (!rka) goto error; - rka->callout_info = kmalloc(callout_len, GFP_KERNEL); + rka->callout_info = kmemdup(callout_info, callout_len, GFP_KERNEL); if (!rka->callout_info) goto error_free_rka; + rka->callout_len = callout_len; /* see if the calling process is already servicing the key request of * another process */ @@ -196,8 +197,6 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, rka->target_key = key_get(target); rka->dest_keyring = key_get(dest_keyring); - memcpy(rka->callout_info, callout_info, callout_len); - rka->callout_len = callout_len; /* allocate the auth key */ sprintf(desc, "%x", target->serial); From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 17:48:06 +0200 Subject: [PATCH 0714/1322] futex: Fix pi_state->owner serialization There was a reported suspicion about a race between exit_pi_state_list() and put_pi_state(). The same report mentioned the comment with put_pi_state() said it should be called with hb->lock held, and it no longer is in all places. As it turns out, the pi_state->owner serialization is indeed broken. As per the new rules: 734009e96d19 ("futex: Change locking rules") pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock. For the sites setting pi_state->owner we already hold wait_lock (where required) but exit_pi_state_list() and put_pi_state() were not and raced on clearing it. Fixes: 734009e96d19 ("futex: Change locking rules") Reported-by: Gratian Crisan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: dvhart@infradead.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net --- kernel/futex.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..0518a0bfc746 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state) /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. - * - * Must be called with the hb lock held. 
*/ static void put_pi_state(struct futex_pi_state *pi_state) { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + struct task_struct *owner; - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) + if (current->pi_state_cache) { kfree(pi_state); - else { + } else { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); continue; } @@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock(&curr->pi_lock); get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ pi_state->owner = p; raw_spin_unlock_irq(&p->pi_lock); @@ -2878,6 +2888,7 @@ retry: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); put_pi_state(pi_state); From 2827a418ca1b23e432e62c9b3d0e7cf3255dfe88 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 19 Sep 2017 22:04:12 +0200 Subject: [PATCH 0715/1322] genirq: Check __free_irq() return value for NULL __free_irq() can return a NULL irqaction for example when trying to free already-free IRQ, but the callsite unconditionally dereferences the returned pointer. Fix this by adding a check and return NULL. 
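Reduced to a standalone sketch, the fix is the usual "check the lookup result before dereferencing it" pattern (types and names below are illustrative, not the genirq API):

  #include <stdio.h>
  #include <stdlib.h>

  struct action {
          const char *name;
  };

  /* Stand-in for __free_irq(): may return NULL for an already-freed IRQ. */
  static struct action *release_action(struct action *a)
  {
          return a;
  }

  static const char *free_action(struct action *a)
  {
          struct action *act = release_action(a);
          const char *name;

          if (!act)
                  return NULL;    /* the added check: never touch act->name */

          name = act->name;
          free(act);
          return name;
  }

  int main(void)
  {
          const char *name = free_action(NULL);

          printf("%s\n", name ? name : "nothing to free");
          return 0;
  }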
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170919200412.GA29985@gmail.com --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) #endif action = __free_irq(irq, dev_id); + + if (!action) + return NULL; + devname = action->name; kfree(action); return devname; From 02a4843618fb35f847cf8c31cd3893873aa0edde Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 13 Sep 2017 09:17:57 -0400 Subject: [PATCH 0716/1322] brd: fix overflow in __brd_direct_access The code in __brd_direct_access multiplies the pgoff variable by page size and divides it by 512. It can cause overflow on 32-bit architectures. The overflow happens if we create ramdisk larger than 4G and use it as a sparse device. This patch replaces multiplication and division with multiplication by the number of sectors per page. Reviewed-by: Dan Williams Signed-off-by: Mikulas Patocka Fixes: 1647b9b959c7 ("brd: add dax_operations support") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bbd0d186cfc0..2d7178f7754e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -342,7 +342,7 @@ static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, if (!brd) return -ENODEV; - page = brd_insert_page(brd, PFN_PHYS(pgoff) / 512); + page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT); if (!page) return -ENOSPC; *kaddr = page_address(page); From f507b54dccfd8000c517d740bc45f20c74532d18 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:35 +0200 Subject: [PATCH 0717/1322] bsg-lib: don't free job in bsg_prepare_job The job structure is allocated as part of the request, so we should not free it in the error path of bsg_prepare_job. Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bsg-lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c82408c7cc3c..dbddff8174e5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -154,7 +154,6 @@ static int bsg_prepare_job(struct device *dev, struct request *req) failjob_rls_rqst_payload: kfree(job->request_payload.sg_list); failjob_rls_job: - kfree(job); return -ENOMEM; } From 1dae69bedeeca0b57e441eae491fbd38049c0b47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 May 2017 22:25:18 -0400 Subject: [PATCH 0718/1322] nbd: ignore non-nbd ioctl's In testing we noticed that nbd would spew if you ran a fio job against the raw device itself. This is because fio calls a block device specific ioctl, however the block layer will first pass this back to the driver ioctl handler in case the driver wants to do something special. Since the device was setup using netlink this caused us to spew every time fio called this ioctl. Since we don't have special handling, just error out for any non-nbd specific ioctl's that come in. This fixes the spew. 
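The check works because every ioctl command number encodes its magic ("type") byte, which _IOC_TYPE() extracts; nbd's magic is 0xab. A small userspace illustration (requires the Linux UAPI headers):

  #include <stdio.h>
  #include <linux/ioctl.h>
  #include <linux/fs.h>   /* BLKROSET, a block-layer ioctl */
  #include <linux/nbd.h>  /* NBD_SET_SIZE, an nbd ioctl */

  int main(void)
  {
          /* nbd commands carry type 0xab and are still handled */
          printf("NBD_SET_SIZE type: 0x%x\n", _IOC_TYPE(NBD_SET_SIZE));
          /* anything else is now rejected with -EINVAL */
          printf("BLKROSET type:     0x%x\n", _IOC_TYPE(BLKROSET));
          return 0;
  }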
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2aa87cbdede0..3684e21d543f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1194,6 +1194,12 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* The block layer will pass back some non-nbd ioctls in case we have + * special handling for them, but we don't so just return an error. + */ + if (_IOC_TYPE(cmd) != 0xab) + return -EINVAL; + mutex_lock(&nbd->config_lock); /* Don't allow ioctl operations on a nbd device that was created with From 5acb3cc2c2e9d3020a4fee43763c6463767f1572 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 20 Sep 2017 13:12:20 -0600 Subject: [PATCH 0719/1322] blktrace: Fix potential deadlock between delete & sysfs ops The lockdep code had reported the following unsafe locking scenario: CPU0 CPU1 ---- ---- lock(s_active#228); lock(&bdev->bd_mutex/1); lock(s_active#228); lock(&bdev->bd_mutex); *** DEADLOCK *** The deadlock may happen when one task (CPU1) is trying to delete a partition in a block device and another task (CPU0) is accessing tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that partition. The s_active isn't an actual lock. It is a reference count (kn->count) on the sysfs (kernfs) file. Removal of a sysfs file, however, require a wait until all the references are gone. The reference count is treated like a rwsem using lockdep instrumentation code. The fact that a thread is in the sysfs callback method or in the ioctl call means there is a reference to the opended sysfs or device file. That should prevent the underlying block structure from being removed. Instead of using bd_mutex in the block_device structure, a new blk_trace_mutex is now added to the request_queue structure to protect access to the blk_trace structure. Suggested-by: Christoph Hellwig Signed-off-by: Waiman Long Acked-by: Steven Rostedt (VMware) Fix typo in patch subject line, and prune a comment detailing how the code used to work. Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ include/linux/blkdev.h | 1 + kernel/trace/blktrace.c | 18 ++++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index aebe676225e6..048be4aa6024 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -854,6 +854,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) kobject_init(&q->kobj, &blk_queue_ktype); +#ifdef CONFIG_BLK_DEV_IO_TRACE + mutex_init(&q->blk_trace_mutex); +#endif mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 460294bb0fa5..02fa42d24b52 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace *blk_trace; + struct mutex blk_trace_mutex; #endif /* * for flush operations diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. 
So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: From e5313c141b49c1b1af43d1ca81398185d66ad1a6 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 20 Sep 2017 14:24:34 -0700 Subject: [PATCH 0720/1322] loop: remove union of use_aio and ref in struct loop_cmd When the request is completed, lo_complete_rq() checks cmd->use_aio. However, if this is in fact an aio request, cmd->use_aio will have already been reused as cmd->ref by lo_rw_aio*. Fix it by not using a union. On x86_64, there's a hole after the union anyways, so this doesn't make struct loop_cmd any bigger. Fixes: 92d773324b7e ("block/loop: fix use after free") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index f68c1d50802f..1f3956702993 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -67,10 +67,8 @@ struct loop_device { struct loop_cmd { struct kthread_work work; struct request *rq; - union { - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ - }; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ long ret; struct kiocb iocb; struct bio_vec *bvec; From 56b7103a06083b8ce1160f8289460ba2f584e182 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:26 -0700 Subject: [PATCH 0721/1322] nvme-fc: remove use of FC-specific error codes The FC-NVME transport used the FC-specific error codes in cases where it had to fabricate an error to go back up stack. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). 
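In the driver the fabricated status is stored as cpu_to_le16(NVME_SC_INTERNAL << 1) because the CQE status word keeps the phase tag in bit 0 and the status code in bits 15:1. A tiny standalone illustration (0x6 for NVME_SC_INTERNAL matches the definition in include/linux/nvme.h):

  #include <stdio.h>
  #include <stdint.h>

  #define NVME_SC_INTERNAL 0x6

  int main(void)
  {
          /* status code shifted past the phase-tag bit, as in the driver */
          uint16_t cqe_status = NVME_SC_INTERNAL << 1;

          printf("status field: 0x%x, phase tag: %u\n",
                 cqe_status >> 1, cqe_status & 1);
          return 0;
  }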
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d2e882c0f496..9100779b58c9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1376,7 +1376,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); else if (freq->status) - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); /* * For the linux implementation, if we have an unsuccesful @@ -1404,7 +1404,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) */ if (freq->transferred_length != be32_to_cpu(op->cmd_iu.data_len)) { - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } result.u64 = 0; @@ -1421,7 +1421,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || sqe->common.command_id != cqe->command_id)) { - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } result = cqe->result; @@ -1429,7 +1429,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1); + status = cpu_to_le16(NVME_SC_INTERNAL << 1); goto done; } From 29b3d26ecc8046838de88205b7c4b182ac27ff65 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:27 -0700 Subject: [PATCH 0722/1322] nvmet-fc: remove use of FC-specific error codes The FC-NVME target transport used the FC-specific error codes in return codes when the transport or lldd failed. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 421e43bf1dd7..088f07250d76 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1910,8 +1910,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, spin_lock_irqsave(&fod->flock, flags); fod->writedataactive = false; spin_unlock_irqrestore(&fod->flock, flags); - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { fcpreq->fcp_error = ret; fcpreq->transferred_length = 0; @@ -1929,8 +1928,7 @@ __nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort) /* if in the middle of an io and we need to tear down */ if (abort) { if (fcpreq->op == NVMET_FCOP_WRITEDATA) { - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return true; } @@ -1968,8 +1966,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) fod->abort = true; spin_unlock(&fod->flock); - nvmet_req_complete(&fod->req, - NVME_SC_FC_TRANSPORT_ERROR); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return; } From fc9608e8b4dc3c2545fa0bc5d3eef158ca56ccf8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:28 -0700 Subject: [PATCH 0723/1322] nvmet-fcloop: remove use of FC-specific error codes The FC-NVME transport loopback test module used the FC-specific error codes in cases where it emulated a transport abort case. Instead of using the FC-specific values, now use a generic value (NVME_SC_INTERNAL). Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1cb9847ec261..1fd1afbb8b2a 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -576,7 +576,7 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, tfcp_req->aborted = true; spin_unlock(&tfcp_req->reqlock); - tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED; + tfcp_req->status = NVME_SC_INTERNAL; /* * nothing more to do. If io wasn't active, the transport should From 8e009ce84683fa124b23ff5cb7fd87c48b331b88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:29 -0700 Subject: [PATCH 0724/1322] lpfc: remove use of FC-specific error codes The lpfc driver uses the FC-specific error when it needed to return an error to the FC-NVME transport. Convert to use a generic value instead. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/lpfc/lpfc_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 79ba3ce063a4..23bdb1ca106e 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -884,7 +884,7 @@ out_err: wcqe->total_data_placed); nCmd->transferred_length = 0; nCmd->rcv_rsplen = 0; - nCmd->status = NVME_SC_FC_TRANSPORT_ERROR; + nCmd->status = NVME_SC_INTERNAL; } } From 39a550d2d9eaee8b618084e6011441eac6a2a3b7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 11:30:15 -0700 Subject: [PATCH 0725/1322] qla2xxx: remove use of FC-specific error codes The qla2xxx driver uses the FC-specific error when it needed to return an error to the FC-NVME transport. Convert to use a generic value instead. Signed-off-by: James Smart Acked-by: Himanshu Madhani Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/qla2xxx/qla_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 1f59e7a74c7b..6b33a1f24f56 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -180,7 +180,7 @@ static void qla_nvme_sp_done(void *ptr, int res) goto rel; if (unlikely(res == QLA_FUNCTION_FAILED)) - fd->status = NVME_SC_FC_TRANSPORT_ERROR; + fd->status = NVME_SC_INTERNAL; else fd->status = 0; From c98cb3bd882119e7e1a7c8df2f1eacfcc701450b Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 16:27:25 -0700 Subject: [PATCH 0726/1322] nvme.h: remove FC transport-specific error values The NVM express group recinded the reserved range for the transport. Remove the FC-centric values that had been defined. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 87723c86f136..2440be32be1d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1127,19 +1127,6 @@ enum { NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, - - - /* - * FC Transport-specific error status values for NVME commands - * - * Transport-specific status code values must be in the range 0xB0..0xBF - */ - - /* Generic FC failure - catchall */ - NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, - - /* I/O failure due to FC ABTS'd */ - NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { From d85cf207499e6740ab9c490ff4f360af5c432d23 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:23 -0700 Subject: [PATCH 0727/1322] nvme: add transport SGL definitions Add transport SGL defintions from NVMe TP 4008, required for the final NVMe-FC standard. 
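These two values end up packed into a single byte of the SGL descriptor - descriptor type in the high nibble, subtype in the low nibble - which is how the follow-on FC patch builds sqe->rw.dptr.sgl.type. A standalone illustration using the constants added here:

  #include <stdio.h>
  #include <stdint.h>

  #define NVME_SGL_FMT_TRANSPORT_A        0x0A    /* subtype */
  #define NVME_TRANSPORT_SGL_DATA_DESC    0x05    /* descriptor type */

  int main(void)
  {
          uint8_t type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
                          NVME_SGL_FMT_TRANSPORT_A;

          printf("sgl.type byte = 0x%02x\n", type);       /* 0x5a */
          return 0;
  }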
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2440be32be1d..9310ce77d8e1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -471,12 +471,14 @@ enum nvme_opcode { * * @NVME_SGL_FMT_ADDRESS: absolute address of the data block * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation * request subtype */ enum { NVME_SGL_FMT_ADDRESS = 0x00, NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, NVME_SGL_FMT_INVALIDATE = 0x0f, }; @@ -490,12 +492,16 @@ enum { * * For struct nvme_keyed_sgl_desc: * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data dlock descriptor */ enum { NVME_SGL_FMT_DATA_DESC = 0x00, NVME_SGL_FMT_SEG_DESC = 0x02, NVME_SGL_FMT_LAST_SEG_DESC = 0x03, NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, }; struct nvme_sgl_desc { From d9d34c0b2327e85da0ad1476575264fe957fc6ef Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:20:24 -0700 Subject: [PATCH 0728/1322] nvme-fc: use transport-specific sgl format Sync with NVM Express spec change and FC-NVME 1.18. FC transport sets SGL type to Transport SGL Data Block Descriptor and subtype to transport-specific value 0x0A. Removed the warn-on's on the PRP fields. They are unneeded. They were to check for values from the upper layer that weren't set right, and for the most part were fine. But, with Async events, which reuse the same structure and 2nd time issued the SGL overlay converted them to the Transport SGL values - the warn-on's were errantly firing. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9100779b58c9..af075e998944 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1989,16 +1989,17 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * as well as those by FC-NVME spec. */ WARN_ON_ONCE(sqe->common.metadata); - WARN_ON_ONCE(sqe->common.dptr.prp1); - WARN_ON_ONCE(sqe->common.dptr.prp2); sqe->common.flags |= NVME_CMD_SGL_METABUF; /* - * format SQE DPTR field per FC-NVME rules - * type=data block descr; subtype=offset; - * offset is currently 0. + * format SQE DPTR field per FC-NVME rules: + * type=0x5 Transport SGL Data Block Descriptor + * subtype=0xA Transport-specific value + * address=0 + * length=length of the data series */ - sqe->rw.dptr.sgl.type = NVME_SGL_FMT_OFFSET; + sqe->rw.dptr.sgl.type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; sqe->rw.dptr.sgl.length = cpu_to_le32(data_len); sqe->rw.dptr.sgl.addr = 0; From deb61742e060d4447712598bc11bb50f8b2e51dd Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 11 Sep 2017 16:16:53 -0700 Subject: [PATCH 0729/1322] nvmet-fc: fix failing max io queue connections fc transport is treating NVMET_NR_QUEUES as maximum queue count, e.g. admin queue plus NVMET_NR_QUEUES-1 io queues. But NVMET_NR_QUEUES is the number of io queues, so maximum queue count is really NVMET_NR_QUEUES+1. 
Fix the handling in the target fc transport Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 088f07250d76..c48c83d97e30 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -148,7 +148,7 @@ struct nvmet_fc_tgt_assoc { u32 a_id; struct nvmet_fc_tgtport *tgtport; struct list_head a_list; - struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES]; + struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; }; @@ -608,7 +608,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, unsigned long flags; int ret; - if (qid >= NVMET_NR_QUEUES) + if (qid > NVMET_NR_QUEUES) return NULL; queue = kzalloc((sizeof(*queue) + @@ -888,7 +888,7 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) int i; spin_lock_irqsave(&tgtport->lock, flags); - for (i = NVMET_NR_QUEUES - 1; i >= 0; i--) { + for (i = NVMET_NR_QUEUES; i >= 0; i--) { queue = assoc->queues[i]; if (queue) { if (!nvmet_fc_tgt_q_get(queue)) From 161b8be2bd6abad250d4b3f674bdd5480f15beeb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 14 Sep 2017 13:54:39 -0400 Subject: [PATCH 0730/1322] nvme-pci: initialize queue memory before interrupts A spurious interrupt before the nvme driver has initialized the completion queue may inadvertently cause the driver to believe it has a completion to process. This may result in a NULL dereference since the nvmeq's tags are not set at this point. The patch initializes the host's CQ memory so that a spurious interrupt isn't mistaken for a real completion. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4a2121335f48..004018c5dccc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1313,11 +1313,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_cq; + nvme_init_queue(nvmeq, qid); result = queue_request_irq(nvmeq); if (result < 0) goto release_sq; - nvme_init_queue(nvmeq, qid); return result; release_sq: @@ -1464,6 +1464,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) return result; nvmeq->cq_vector = 0; + nvme_init_queue(nvmeq, 0); result = queue_request_irq(nvmeq); if (result) { nvmeq->cq_vector = -1; @@ -2156,7 +2157,6 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto out; From d08774738446e77734777adcf5d1045237b4475a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 15 Sep 2017 13:05:38 -0400 Subject: [PATCH 0731/1322] nvme-pci: Print invalid SGL only once The WARN_ONCE macro returns true if the condition is true, not if the warn was raised, so we're printing the scatter list every time it's invalid. This is excessive and makes debugging harder, so this patch prints it just once. Signed-off-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 004018c5dccc..cb73bc8cad3b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -540,6 +541,20 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif +static void nvme_print_sgl(struct scatterlist *sgl, int nents) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + dma_addr_t phys = sg_phys(sg); + pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " + "dma_address:%pad dma_length:%d\n", + i, &phys, sg->offset, sg->length, &sg_dma_address(sg), + sg_dma_len(sg)); + } +} + static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -622,19 +637,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; bad_sgl: - if (WARN_ONCE(1, "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->nents)) { - for_each_sg(iod->sg, sg, iod->nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", i, &phys, - sg->offset, sg->length, - &sg_dma_address(sg), - sg_dma_len(sg)); - } - } + WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), + "Invalid SGL for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), iod->nents); return BLK_STS_IOERR; - } static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, From cd48282cc736377d5abf7c04de8c6ba864ba3794 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 11:03:09 -0700 Subject: [PATCH 0732/1322] nvme: stop aer posting if controller state not live If an nvme async_event command completes, in most cases, a new async event is posted. However, if the controller enters a resetting or reconnecting state, there is nothing to block the scheduled work element from posting the async event again. Nor are there calls from the transport to stop async events when an association dies. In the case of FC, where the association is torn down, the aer must be aborted on the FC link and completes through the normal job completion path. Thus the terminated async event ends up being rescheduled even though the controller isn't in a valid state for the aer, and the reposting gets the transport into a partially torn down data structure. It's possible to hit the scenario on rdma, although much less likely due to an aer completing right as the association is terminated and as the association teardown reclaims the blk requests via nvme_cancel_request() so its immediate, not a link-related action like on FC. Fix by putting controller state checks in both the async event completion routine where it schedules the async event and in the async event work routine before it calls into the transport. It's effectively a "stop_async_events()" behavior. The transport, when it creates a new association with the subsystem will transition the state back to live and is already restarting the async event posting. 
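In both spots the guard reduces to a plain state test before the aer is (re)posted, along the lines of:

    /* sketch of the check added in both paths, not the exact hunk */
    if (ctrl->state != NVME_CTRL_LIVE)
            return;    /* controller resetting/reconnecting: don't repost the AER */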
Signed-off-by: James Smart [hch: remove taking a lock over reading the controller state] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index acc816b67582..d470f031e27f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2590,7 +2590,7 @@ static void nvme_async_event_work(struct work_struct *work) container_of(work, struct nvme_ctrl, async_event_work); spin_lock_irq(&ctrl->lock); - while (ctrl->event_limit > 0) { + while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { int aer_idx = --ctrl->event_limit; spin_unlock_irq(&ctrl->lock); @@ -2677,7 +2677,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, /*FALLTHRU*/ case NVME_SC_ABORT_REQ: ++ctrl->event_limit; - queue_work(nvme_wq, &ctrl->async_event_work); + if (ctrl->state == NVME_CTRL_LIVE) + schedule_work(&ctrl->async_event_work); break; default: break; From 0951338d9677f546e230685d68631dfd3f81cca5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 7 Sep 2017 13:18:04 -0700 Subject: [PATCH 0733/1322] nvme: allow timed-out ios to retry Currently the nvme_req_needs_retry() applies several checks to see if a retry is allowed. On of those is whether the current time has exceeded the start time of the io plus the timeout length. This check, if an io times out, means there is never a retry allowed for the io. Which means applications see the io failure. Remove this check and allow the io to timeout, like it does on other protocols, and retries to be made. On the FC transport, a frame can be lost for an individual io, and there may be no other errors that escalate for the connection/association. The io will timeout, which causes the transport to escalate into creating a new association, but the io that timed out, due to this retry logic, has already failed back to the application and things are hosed. Signed-off-by: James Smart Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d470f031e27f..5589f67d2cd8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -134,8 +134,6 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->status & NVME_SC_DNR) return false; - if (jiffies - req->start_time >= req->timeout) - return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; return true; From 8edd11c9ad3a6205eea6de9d02eaf64c681a0658 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Thu, 14 Sep 2017 13:59:28 -0300 Subject: [PATCH 0734/1322] nvme-fabrics: Allow 0 as KATO value Currently, driver code allows user to set 0 as KATO (Keep Alive TimeOut), but this is not being respected. This patch enforces the expected behavior. Signed-off-by: Guilherme G. 
Piccoli Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fabrics.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 47307752dc65..555c976cc2ee 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -565,6 +565,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->queue_size = NVMF_DEF_QUEUE_SIZE; opts->nr_io_queues = num_online_cpus(); opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + opts->kato = NVME_DEFAULT_KATO; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -655,21 +656,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } - if (opts->discovery_nqn) { - pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n"); - ret = -EINVAL; - goto out; - } - if (token < 0) { pr_err("Invalid keep_alive_tmo %d\n", token); ret = -EINVAL; goto out; - } else if (token == 0) { + } else if (token == 0 && !opts->discovery_nqn) { /* Allowed for debug */ pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); } opts->kato = token; + + if (opts->discovery_nqn && opts->kato) { + pr_err("Discovery controllers cannot accept KATO != 0\n"); + ret = -EINVAL; + goto out; + } + break; case NVMF_OPT_CTRL_LOSS_TMO: if (match_int(args, &token)) { @@ -762,8 +764,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, uuid_copy(&opts->host->id, &hostid); out: - if (!opts->discovery_nqn && !opts->kato) - opts->kato = NVME_DEFAULT_KATO; kfree(options); return ret; } From bb1cc74790eb51f52d23c6e5fd9a3bb16030c3d8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 18 Sep 2017 09:08:29 -0700 Subject: [PATCH 0735/1322] nvmet: implement valid sqhd values in completions To support sqhd, for initiators that are following the spec and paying attention to sqhd vs their sqtail values: - add sqhd to struct nvmet_sq - initialize sqhd to 0 in nvmet_sq_setup - rather than propagate the 0's-based qsize value from the connect message which requires a +1 in every sqhd update, and as nothing else references it, convert to 1's-based value in nvmt_sq/cq_setup() calls. - validate connect message sqsize being non-zero per spec. - updated assign sqhd for every completion that goes back. Also remove handling the NULL sq case in __nvmet_req_complete, as it can't happen with the current code. 
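As a worked example of the 0's-based to 1's-based conversion: a connect command carrying sqsize 31 describes a 32-entry queue, so nvmet_sq_setup() stores 32 in sq->size and every completion can then advance the head with a plain

    sq->sqhd = (sq->sqhd + 1) % sq->size;

instead of carrying a +1 correction through every update.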
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 8 ++++---- drivers/nvme/target/fabrics-cmd.c | 9 +++++++-- drivers/nvme/target/nvmet.h | 1 + 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7c23eaf8e563..c2a768a94235 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -390,10 +390,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (status) nvmet_set_status(req, status); - /* XXX: need to fill in something useful for sq_head */ - req->rsp->sq_head = 0; - if (likely(req->sq)) /* may happen during early failure */ - req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; + req->rsp->sq_head = cpu_to_le16(req->sq->sqhd); + req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; if (req->ns) @@ -420,6 +419,7 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size) { + sq->sqhd = 0; sq->qid = qid; sq->size = size; diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 859a66725291..db3bf6b8bf9e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,9 +109,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) pr_warn("queue already connected!\n"); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } + if (!sqsize) { + pr_warn("queue size zero!\n"); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } - nvmet_cq_setup(ctrl, req->cq, qid, sqsize); - nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + /* note: convert queue size from 0's-based value to 1's-based value */ + nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); return 0; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7d261ab894f4..7b8e20adf760 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -74,6 +74,7 @@ struct nvmet_sq { struct percpu_ref ref; u16 qid; u16 size; + u16 sqhd; struct completion free_done; struct completion confirm_done; }; From 332391a9935da939319e473b4680e173df75afcf Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 21 Sep 2017 08:16:29 -0600 Subject: [PATCH 0736/1322] fs: Fix page cache inconsistency when mixing buffered and AIO DIO Currently when mixing buffered reads and asynchronous direct writes it is possible to end up with the situation where we have stale data in the page cache while the new data is already written to disk. This is permanent until the affected pages are flushed away. Despite the fact that mixing buffered and direct IO is ill-advised it does pose a thread for a data integrity, is unexpected and should be fixed. Fix this by deferring completion of asynchronous direct writes to a process context in the case that there are mapped pages to be found in the inode. Later before the completion in dio_complete() invalidate the pages in question. This ensures that after the completion the pages in the written area are either unmapped, or populated with up-to-date data. Also do the same for the iomap case which uses iomap_dio_complete() instead. 
This has a side effect of deferring the completion to a process context for every AIO DIO that happens on inode that has pages mapped. However since the consensus is that this is ill-advised practice the performance implication should not be a problem. This was based on proposal from Jeff Moyer, thanks! Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Reviewed-by: Jeff Moyer Signed-off-by: Lukas Czerner Signed-off-by: Jens Axboe --- fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ fs/iomap.c | 29 ++++++++++++++++------------- mm/filemap.c | 10 ++++++++-- 3 files changed, 67 insertions(+), 21 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 5fa2211e49ae..62cf812ed0e5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; + int err; /* * AIO submission can race with bio completion to get here while @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (ret > 0 && dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages) { + err = invalidate_inode_pages2_range(dio->inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + ret - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + if (dio->end_io) { - int err; // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio) struct dio *dio = bio->bi_private; unsigned long remaining; unsigned long flags; + bool defer_completion = false; /* cleanup the bio */ dio_bio_complete(dio, bio); @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - if (dio->result && dio->defer_completion) { + /* + * Defer completion when defer_completion is set or + * when the inode has pages mapped and this is AIO write. + * We need to invalidate those pages because there is a + * chance they contain stale data in the case buffered IO + * went in between AIO submission and completion into the + * same region. + */ + if (dio->result) + defer_completion = dio->defer_completion || + (dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages); + if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * For AIO O_(D)SYNC writes we need to defer completions to a workqueue * so that we can call ->fsync. */ - if (dio->is_async && iov_iter_rw(iter) == WRITE && - ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host))) { - retval = dio_set_defer_completion(dio); + if (dio->is_async && iov_iter_rw(iter) == WRITE) { + retval = 0; + if ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host)) + retval = dio_set_defer_completion(dio); + else if (!dio->inode->i_sb->s_dio_done_wq) { + /* + * In case of AIO write racing with buffered read we + * need to defer completion. 
We can't decide this now, + * however the workqueue needs to be initialized here. + */ + retval = sb_init_dio_done_wq(dio->inode->i_sb); + } if (retval) { /* * We grab i_mutex only for reads so we don't have diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..8194d30bdca0 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -713,8 +713,24 @@ struct iomap_dio { static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + if (dio->end_io) { ret = dio->end_io(iocb, dio->error ? dio->error : dio->size, @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ret = iomap_dio_complete(dio); - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - */ - if (iov_iter_rw(iter) == WRITE) { - int err = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(err); - } - return ret; out_free_dio: diff --git a/mm/filemap.c b/mm/filemap.c index 870971e20967..db250d0e0565 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... + * + * Most of the time we do not need this since dio_complete() will do + * the invalidation for us. However there are some file systems that + * do not end up with dio_complete() being called, so let's not break + * them by removing it completely */ - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (mapping->nrpages) + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); if (written > 0) { pos += written; From f5c156c4c29a3d87176dd6e5c099388e187ec29b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 12:17:16 -0700 Subject: [PATCH 0737/1322] block: fix a crash caused by wrong API part_stat_show takes a part device not a disk, so we should use part_to_disk. 
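The sysfs attribute hangs off the partition's struct device, so the queue has to be reached through the hd_struct rather than by treating the device as a gendisk:

    struct hd_struct *p = dev_to_part(dev);
    struct request_queue *q = part_to_disk(p)->queue;   /* not dev_to_disk(dev)->queue */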
Fixes: d62e26b3ffd2("block: pass in queue to inflight accounting") Cc: Bart Van Assche Cc: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/partition-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 86e8fe1adcdb..88c555db4e5d 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -112,7 +112,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = dev_to_disk(dev)->queue; + struct request_queue *q = part_to_disk(p)->queue; unsigned int inflight[2]; int cpu; From 62e082430ea4bb5b28909ca4375bb683931e22aa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 20 Sep 2017 07:29:49 -0400 Subject: [PATCH 0738/1322] dm ioctl: fix alignment of event number in the device list The size of struct dm_name_list is different on 32-bit and 64-bit kernels (so "(nl + 1)" differs between 32-bit and 64-bit kernels). This mismatch caused some harmless difference in padding when using 32-bit or 64-bit kernel. Commit 23d70c5e52dd ("dm ioctl: report event number in DM_LIST_DEVICES") added reporting event number in the output of DM_LIST_DEVICES_CMD. This difference in padding makes it impossible for userspace to determine the location of the event number (the location would be different when running on 32-bit and 64-bit kernels). Fix the padding by using offsetof(struct dm_name_list, name) instead of sizeof(struct dm_name_list) to determine the location of entries. Also, the ioctl version number is incremented to 37 so that userspace can use the version number to determine that the event number is present and correctly located. In addition, a global event is now raised when a DM device is created, removed, renamed or when table is swapped, so that the user can monitor for device changes. Reported-by: Eugene Syromiatnikov Fixes: 23d70c5e52dd ("dm ioctl: report event number in DM_LIST_DEVICES") Cc: stable@vger.kernel.org # 4.13 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 1 + drivers/md/dm-ioctl.c | 37 +++++++++++++++++++++++------------ drivers/md/dm.c | 10 ++++++++-- include/uapi/linux/dm-ioctl.h | 4 ++-- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 24eddbdf2ab4..203144762f36 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -149,5 +149,6 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen extern atomic_t dm_global_event_nr; extern wait_queue_head_t dm_global_eventq; +void dm_issue_global_event(void); #endif diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 8756a6850431..e52676fa9832 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -477,9 +477,13 @@ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_si * Round up the ptr to an 8-byte boundary. 
*/ #define ALIGN_MASK 7 +static inline size_t align_val(size_t val) +{ + return (val + ALIGN_MASK) & ~ALIGN_MASK; +} static inline void *align_ptr(void *ptr) { - return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); + return (void *)align_val((size_t)ptr); } /* @@ -505,7 +509,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ struct hash_cell *hc; size_t len, needed = 0; struct gendisk *disk; - struct dm_name_list *nl, *old_nl = NULL; + struct dm_name_list *orig_nl, *nl, *old_nl = NULL; uint32_t *event_nr; down_write(&_hash_lock); @@ -516,17 +520,15 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ */ for (i = 0; i < NUM_BUCKETS; i++) { list_for_each_entry (hc, _name_buckets + i, name_list) { - needed += sizeof(struct dm_name_list); - needed += strlen(hc->name) + 1; - needed += ALIGN_MASK; - needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK; + needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1); + needed += align_val(sizeof(uint32_t)); } } /* * Grab our output buffer. */ - nl = get_result_buffer(param, param_size, &len); + nl = orig_nl = get_result_buffer(param, param_size, &len); if (len < needed) { param->flags |= DM_BUFFER_FULL_FLAG; goto out; @@ -549,11 +551,16 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ strcpy(nl->name, hc->name); old_nl = nl; - event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1); + event_nr = align_ptr(nl->name + strlen(hc->name) + 1); *event_nr = dm_get_event_nr(hc->md); nl = align_ptr(event_nr + 1); } } + /* + * If mismatch happens, security may be compromised due to buffer + * overflow, so it's better to crash. + */ + BUG_ON((char *)nl - (char *)orig_nl != needed); out: up_write(&_hash_lock); @@ -1621,7 +1628,8 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para * which has a variable size, is not used by the function processing * the ioctl. */ -#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_NO_PARAMS 1 +#define IOCTL_FLAGS_ISSUE_GLOBAL_EVENT 2 /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char @@ -1635,12 +1643,12 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) ioctl_fn fn; } _ioctls[] = { {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, remove_all}, {DM_LIST_DEVICES_CMD, 0, list_devices}, - {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, - {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, - {DM_DEV_RENAME_CMD, 0, dev_rename}, + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_remove}, + {DM_DEV_RENAME_CMD, IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_rename}, {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, {DM_DEV_WAIT_CMD, 0, dev_wait}, @@ -1869,6 +1877,9 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); + if (!r && ioctl_flags & IOCTL_FLAGS_ISSUE_GLOBAL_EVENT) + dm_issue_global_event(); + /* * Copy the results back to userland. 
*/ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..4be85324f44d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -52,6 +52,12 @@ static struct workqueue_struct *deferred_remove_workqueue; atomic_t dm_global_event_nr = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); +void dm_issue_global_event(void) +{ + atomic_inc(&dm_global_event_nr); + wake_up(&dm_global_eventq); +} + /* * One of these is allocated per bio. */ @@ -1865,9 +1871,8 @@ static void event_callback(void *context) dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); - atomic_inc(&dm_global_event_nr); wake_up(&md->eventq); - wake_up(&dm_global_eventq); + dm_issue_global_event(); } /* @@ -2283,6 +2288,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) } map = __bind(md, table, &limits); + dm_issue_global_event(); out: mutex_unlock(&md->suspend_lock); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 412c06a624c8..ccaea525340b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -269,9 +269,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 36 +#define DM_VERSION_MINOR 37 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2017-06-09)" +#define DM_VERSION_EXTRA "-ioctl (2017-09-20)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ From 9789e7e93f2b892098d7684ac8131092aa617814 Mon Sep 17 00:00:00 2001 From: Mengting Zhang Date: Sat, 23 Sep 2017 16:18:14 +0800 Subject: [PATCH 0739/1322] perf report: Fix debug messages with --call-graph option With --call-graph option, perf report can display call chains using type, min percent threshold, optional print limit and order. And the default call-graph parameter is 'graph,0.5,caller,function,percent'. Before this patch, 'perf report --call-graph' shows incorrect debug messages as below: # perf report --call-graph Invalid callchain mode: 0.5 Invalid callchain order: 0.5 Invalid callchain sort key: 0.5 Invalid callchain config key: 0.5 Invalid callchain mode: caller Invalid callchain mode: function Invalid callchain order: function Invalid callchain mode: percent Invalid callchain order: percent Invalid callchain sort key: percent That is because in function __parse_callchain_report_opt(),each field of the call-graph parameter is passed to parse_callchain_{mode,order, sort_key,value} in turn until it meets the matching value. For example, the order field "caller" is passed to parse_callchain_mode() firstly and obviously it doesn't match any mode field. Therefore parse_callchain_mode() will shows the debug message "Invalid callchain mode: caller", which could confuse users. The patch fixes this issue by moving the warning out of the function parse_callchain_{mode,order,sort_key,value}. 
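With the warning hoisted to the callers, the default invocation (equivalent to spelling out the documented default parameter) parses cleanly and no longer emits the spurious messages:

    # perf report --call-graph graph,0.5,caller,function,percent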
Signed-off-by: Mengting Zhang Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Krister Johansen Cc: Li Bin Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Cc: Yao Jin Link: http://lkml.kernel.org/r/1506154694-39691-1-git-send-email-zhangmengting@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 510b513e0f01..be09d77cade0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -65,8 +65,6 @@ static int parse_callchain_mode(const char *value) callchain_param.mode = CHAIN_FOLDED; return 0; } - - pr_err("Invalid callchain mode: %s\n", value); return -1; } @@ -82,8 +80,6 @@ static int parse_callchain_order(const char *value) callchain_param.order_set = true; return 0; } - - pr_err("Invalid callchain order: %s\n", value); return -1; } @@ -105,8 +101,6 @@ static int parse_callchain_sort_key(const char *value) callchain_param.branch_callstack = 1; return 0; } - - pr_err("Invalid callchain sort key: %s\n", value); return -1; } @@ -124,8 +118,6 @@ static int parse_callchain_value(const char *value) callchain_param.value = CCVAL_COUNT; return 0; } - - pr_err("Invalid callchain config key: %s\n", value); return -1; } @@ -319,12 +311,27 @@ int perf_callchain_config(const char *var, const char *value) return ret; } - if (!strcmp(var, "print-type")) - return parse_callchain_mode(value); - if (!strcmp(var, "order")) - return parse_callchain_order(value); - if (!strcmp(var, "sort-key")) - return parse_callchain_sort_key(value); + if (!strcmp(var, "print-type")){ + int ret; + ret = parse_callchain_mode(value); + if (ret == -1) + pr_err("Invalid callchain mode: %s\n", value); + return ret; + } + if (!strcmp(var, "order")){ + int ret; + ret = parse_callchain_order(value); + if (ret == -1) + pr_err("Invalid callchain order: %s\n", value); + return ret; + } + if (!strcmp(var, "sort-key")){ + int ret; + ret = parse_callchain_sort_key(value); + if (ret == -1) + pr_err("Invalid callchain sort key: %s\n", value); + return ret; + } if (!strcmp(var, "threshold")) { callchain_param.min_percent = strtod(value, &endptr); if (value == endptr) { From 090657c9fb7094e4c1b05c1713d6c2a12ef43dea Mon Sep 17 00:00:00 2001 From: Akemi Yagi Date: Fri, 22 Sep 2017 22:11:53 +0000 Subject: [PATCH 0740/1322] perf tools: Fix syscalltbl build failure The build of kernel v4.14-rc1 for i686 fails on RHEL 6 with the error in tools/perf: util/syscalltbl.c:157: error: expected ';', ',' or ')' before '__maybe_unused' mv: cannot stat `util/.syscalltbl.o.tmp': No such file or directory Fix it by placing/moving: #include outside of #ifdef HAVE_SYSCALL_TABLE block. 
Signed-off-by: Akemi Yagi Cc: Alan Bartlett Link: http://lkml.kernel.org/r/oq41r8$1v9$1@blaine.gmane.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/syscalltbl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index 19e5db90394c..6eea7cff3d4e 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -15,9 +15,9 @@ #include "syscalltbl.h" #include +#include #ifdef HAVE_SYSCALL_TABLE -#include #include #include "string2.h" #include "util.h" From 78b1beb0998437107ed144b341fbe1252188916b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 24 Sep 2017 21:46:29 +0300 Subject: [PATCH 0741/1322] IB/core: Fix typo in the name of the tag-matching cap struct The tag matching functionality is implemented by mlx5 driver by extending XRQ, however this internal kernel information was exposed to user space applications with *xrq* name instead of *tm*. This patch renames *xrq* to *tm* to handle that. Fixes: 8d50505ada72 ("IB/uverbs: Expose XRQ capabilities") Signed-off-by: Leon Romanovsky Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 14 +++++++------- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- include/rdma/ib_verbs.h | 4 ++-- include/uapi/rdma/ib_user_verbs.h | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4ab30d832ac5..52a2cf2d83aa 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3869,15 +3869,15 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); - if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + if (ucore->outlen < resp.response_length + sizeof(resp.tm_caps)) goto end; - resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; - resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; - resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; - resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; - resp.xrq_caps.flags = attr.xrq_caps.flags; - resp.response_length += sizeof(resp.xrq_caps); + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.response_length += sizeof(resp.tm_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 05fb4bdff6a0..d6fbad8f34aa 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -778,13 +778,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(mdev, tag_matching)) { - props->xrq_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; - props->xrq_caps.max_num_tags = + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + props->tm_caps.max_num_tags = (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; - props->xrq_caps.flags = IB_TM_CAP_RC; - props->xrq_caps.max_ops = + props->tm_caps.flags = IB_TM_CAP_RC; + props->tm_caps.max_ops = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); - props->xrq_caps.max_sge = MLX5_TM_MAX_SGE; + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; } if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { diff 
--git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bdb1279a415b..bbb5f54db882 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -285,7 +285,7 @@ enum ib_tm_cap_flags { IB_TM_CAP_RC = 1 << 0, }; -struct ib_xrq_caps { +struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ @@ -358,7 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ - struct ib_xrq_caps xrq_caps; + struct ib_tm_caps tm_caps; }; enum ib_mtu { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 9a0b6479fe0c..d4e0b53bfc75 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -261,7 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; - struct ib_uverbs_tm_caps xrq_caps; + struct ib_uverbs_tm_caps tm_caps; }; struct ib_uverbs_query_port { From 73827a605bbd7cebef4cfd1261e497246a82a0e7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:30 +0300 Subject: [PATCH 0742/1322] IB/core: Fix qp_sec use after free access When security_ib_alloc_security fails, qp->qp_sec memory is freed. However ib_destroy_qp still tries to access this memory which result in kernel crash. So its initialized to NULL to avoid such access. Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/security.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 70ad19c4c73e..88bdafb297f5 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -432,8 +432,10 @@ int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) atomic_set(&qp->qp_sec->error_list_count, 0); init_completion(&qp->qp_sec->error_complete); ret = security_ib_alloc_security(&qp->qp_sec->security); - if (ret) + if (ret) { kfree(qp->qp_sec); + qp->qp_sec = NULL; + } return ret; } From edd31551148c09608feee6b8756ad148d550ee3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 24 Sep 2017 21:46:31 +0300 Subject: [PATCH 0743/1322] IB: Correct MR length field to be 64-bit The ib_mr->length represents the length of the MR in bytes as per the IBTA spec 1.3 section 11.2.10.3 (REGISTER PHYSICAL MEMORY REGION). Currently ib_mr->length field is defined as only 32-bits field. This might result into truncation and failed WRs of consumers who registers more than 4GB bytes memory regions and whose WRs accessing such MRs. This patch makes the length 64-bit to avoid such truncation. 
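For instance, a 6 GiB registration stored in a 32-bit length wraps to 2 GiB (6,442,450,944 mod 2^32 = 2,147,483,648), so WRs touching the upper part of the region would reference bytes the truncated MR no longer appears to cover; widening the field (and the %llu/%lld format specifiers that print it) avoids that.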
Cc: Sagi Grimberg Cc: Chuck Lever Cc: Faisal Latif Fixes: 4c67e2bfc8b7 ("IB/core: Introduce new fast registration API") Signed-off-by: Ilya Lesokhin Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_verbs.c | 4 ++-- drivers/infiniband/ulp/iser/iser_memory.c | 2 +- include/rdma/ib_verbs.h | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f0dc5f4aa177..442b9bdc0f03 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3232,7 +3232,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->ibmr.iova); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX, - mr->ibmr.length); + lower_32_bits(mr->ibmr.length)); set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0); set_wqe_32bit_value(wqe->wqe_words, @@ -3274,7 +3274,7 @@ static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, mr->npages * 8); nes_debug(NES_DBG_IW_TX, "SQ_REG_MR: iova_start: %llx, " - "length: %d, rkey: %0x, pgl_paddr: %llx, " + "length: %lld, rkey: %0x, pgl_paddr: %llx, " "page_list_len: %u, wqe_misc: %x\n", (unsigned long long) mr->ibmr.iova, mr->ibmr.length, diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9c3e9ab53a41..322209d5ff58 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -154,7 +154,7 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec npages %d data length %d\n", + iser_err("page vec npages %d data length %lld\n", page_vec->npages, page_vec->fake_mr.length); for (i = 0; i < page_vec->npages; i++) iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bbb5f54db882..e8608b2dc844 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1739,7 +1739,7 @@ struct ib_mr { u32 lkey; u32 rkey; u64 iova; - u32 length; + u64 length; unsigned int page_size; bool need_inval; union { diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a3..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); From 9c6f42e9254150d2772242d9f8bd8d0b7b7431ff Mon Sep 17 00:00:00 2001 From: Shalom Lagziel Date: Sun, 24 Sep 2017 21:46:32 +0300 Subject: [PATCH 0744/1322] IB/ipoib: Fix sysfs Pkey create<->remove possible deadlock A possible ABBA lock can happen with RTNL and vlan_rwsem. For example: Flow A: Device Flush __ipoib_ib_dev_flush down_read(vlan_rwsem) // Lock A ipoib_flush_ah flush_workqueue(priv->wq) // Wait for completion A work on shared WQ (Mcast carrier) ipoib_mcast_carrier_on_task while (!rtnl_trylock()) // Wait for lock B Flow B: Sysfs PKEY delete ipoib_vlan_delete lock(RTNL) // Lock B down_write(vlan_rwsem) // Wait for lock A This can happen with PKEY creates as well. 
The solution is to release the RTNL lock in sysfs functions in case it is not possible to lock VLAN RW semaphore and reset the SYS call. Fixes: 69956d83267e ("IB/ipoib: Sync between remove_one to sysfs calls that use rtnl_lock") Signed-off-by: Shalom Lagziel Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 9927cd6b7082..e01c58edca15 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -141,14 +141,17 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); - if (!priv) { + if (!down_write_trylock(&ppriv->vlan_rwsem)) { rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - return -ENOMEM; + return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (!priv) { + result = -ENOMEM; + goto out; + } /* * First ensure this isn't a duplicate. We check the parent device and @@ -175,7 +178,7 @@ out: rtnl_unlock(); mutex_unlock(&ppriv->sysfs_mutex); - if (result) { + if (result && priv) { free_netdev(priv->dev); kfree(priv); } @@ -204,7 +207,12 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) return restart_syscall(); } - down_write(&ppriv->vlan_rwsem); + if (!down_write_trylock(&ppriv->vlan_rwsem)) { + rtnl_unlock(); + mutex_unlock(&ppriv->sysfs_mutex); + return restart_syscall(); + } + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { From 7c9d9662103ae1c11acc7bfc47d988466cff23cf Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 24 Sep 2017 21:46:33 +0300 Subject: [PATCH 0745/1322] IB/ipoib: Fix inconsistency with free_netdev and free_rdma_netdev Call free_rdma_netdev instead of free_netdev each time we want to release a netdevice. This call is also relevant for future freeing of offloaded child interfaces. This patch also adds a missing call for free netdevice when releasing a parent interface that has child interfaces using ipoib_remove_one. 
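The release pattern used in each touched path is the same (sketch):

    struct rdma_netdev *rn = netdev_priv(priv->dev);

    rn->free_rdma_netdev(priv->dev);
    kfree(priv);

so offloaded child interfaces are freed through the same accelerated-netdev hook as their parent instead of a bare free_netdev().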
Fixes: cd565b4b51e5 ('IB/IPoIB: Support acceleration options callbacks') Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 15 +++++++++++---- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 10 ++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index bac95b509a9b..dcc77014018d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2180,6 +2180,7 @@ static struct net_device *ipoib_add_port(const char *format, { struct ipoib_dev_priv *priv; struct ib_port_attr attr; + struct rdma_netdev *rn; int result = -ENOMEM; priv = ipoib_intf_alloc(hca, port, format); @@ -2279,7 +2280,8 @@ register_failed: ipoib_dev_cleanup(priv->dev); device_init_failed: - free_netdev(priv->dev); + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); alloc_mem_failed: @@ -2328,7 +2330,7 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { - struct rdma_netdev *rn = netdev_priv(priv->dev); + struct rdma_netdev *parent_rn = netdev_priv(priv->dev); ib_unregister_event_handler(&priv->event_handler); flush_workqueue(ipoib_workqueue); @@ -2350,10 +2352,15 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) unregister_netdev(priv->dev); mutex_unlock(&priv->sysfs_mutex); - rn->free_rdma_netdev(priv->dev); + parent_rn->free_rdma_netdev(priv->dev); - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + struct rdma_netdev *child_rn; + + child_rn = netdev_priv(cpriv->dev); + child_rn->free_rdma_netdev(cpriv->dev); kfree(cpriv); + } kfree(priv); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index e01c58edca15..55a9b71ed05a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -179,7 +179,10 @@ out: mutex_unlock(&ppriv->sysfs_mutex); if (result && priv) { - free_netdev(priv->dev); + struct rdma_netdev *rn; + + rn = netdev_priv(priv->dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); } @@ -232,7 +235,10 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) mutex_unlock(&ppriv->sysfs_mutex); if (dev) { - free_netdev(dev); + struct rdma_netdev *rn; + + rn = netdev_priv(dev); + rn->free_rdma_netdev(priv->dev); kfree(priv); return 0; } From d67bc5d4e3e100d762c0f57ea67f28bc219698a6 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:34 +0300 Subject: [PATCH 0746/1322] IB/mlx5: Simplify mlx5_ib_cont_pages The patch simplifies mlx5_ib_cont_pages and fixes the following issues in the original implementation: First issues is related to alignment of the PFNs. After the check base + p != PFN, the alignment of the PFN wasn't checked. So the PFN sequence 0, 1, 1, 2 would result in a page_shift of 13 even though the 3rd PFN is not 8KB aligned. This wasn't actually a bug because it was supported by all the existing mlx5 compatible device, but we don't want to require this support in all future devices. 
Another issue is because the inner loop didn't advance PFN so the test "if (base + p != pfn)" always failed for SGE with len > (1< Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mem.c | 47 ++++++++++++-------------------- 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 914f212e7ef6..f3dbd75a0a96 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -50,13 +50,9 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, { unsigned long tmp; unsigned long m; - int i, k; - u64 base = 0; - int p = 0; - int skip; - int mask; - u64 len; - u64 pfn; + u64 base = ~0, p = 0; + u64 len, pfn; + int i = 0; struct scatterlist *sg; int entry; unsigned long page_shift = umem->page_shift; @@ -76,33 +72,24 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, m = find_first_bit(&tmp, BITS_PER_LONG); if (max_page_shift) m = min_t(unsigned long, max_page_shift - page_shift, m); - skip = 1 << m; - mask = skip - 1; - i = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> page_shift; pfn = sg_dma_address(sg) >> page_shift; - for (k = 0; k < len; k++) { - if (!(i & mask)) { - tmp = (unsigned long)pfn; - m = min_t(unsigned long, m, find_first_bit(&tmp, BITS_PER_LONG)); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } else { - if (base + p != pfn) { - tmp = (unsigned long)p; - m = find_first_bit(&tmp, BITS_PER_LONG); - skip = 1 << m; - mask = skip - 1; - base = pfn; - p = 0; - } - } - p++; - i++; + if (base + p != pfn) { + /* If either the offset or the new + * base are unaligned update m + */ + tmp = (unsigned long)(pfn | p); + if (!IS_ALIGNED(tmp, 1 << m)) + m = find_first_bit(&tmp, BITS_PER_LONG); + + base = pfn; + p = 0; } + + p += len; + i += len; } if (i) { From fbcd49838d9094ca45772356e7b33afe4b7c93e7 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Sun, 24 Sep 2017 21:46:35 +0300 Subject: [PATCH 0747/1322] IB/mlx5: Fix NULL deference on mlx5_ib_update_xlt failure mlx5_ib_reg_user_mr called mlx5_ib_dereg_mr in case of MR population failure. This resulted in a NULL dereference as ibmr->device wasn't initialized yet. We address this by adding an internal dereg_mr function that can handle partially initialized MRs, and fixing clean_mr to work on partially initialized MRs. 
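The internal helper takes the device explicitly instead of reading it from the not-yet-initialized ibmr, so the population-failure path in mlx5_ib_reg_user_mr() can now be (sketch):

    if (err) {
            dereg_mr(dev, mr);    /* safe: does not touch mr->ibmr.device */
            return ERR_PTR(err);
    }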
Fixes: ff740aefecb9 ("IB/mlx5: Decouple MR allocation and population flows") Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mr.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0e2789d9bb4d..37bbc543847a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -47,7 +47,8 @@ enum { #define MLX5_UMR_ALIGN 2048 -static int clean_mr(struct mlx5_ib_mr *mr); +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); @@ -1270,8 +1271,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift, update_xlt_flags); + if (err) { - mlx5_ib_dereg_mr(&mr->ibmr); + dereg_mr(dev, mr); return ERR_PTR(err); } } @@ -1356,7 +1358,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); if (err < 0) { - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1410,7 +1412,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); ib_umem_release(mr->umem); - clean_mr(mr); + clean_mr(dev, mr); return err; } } @@ -1469,9 +1471,8 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) } } -static int clean_mr(struct mlx5_ib_mr *mr) +static int clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); int allocated_from_cache = mr->allocated_from_cache; int err; @@ -1507,10 +1508,8 @@ static int clean_mr(struct mlx5_ib_mr *mr) return 0; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +static int dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); int npages = mr->npages; struct ib_umem *umem = mr->umem; @@ -1539,7 +1538,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) } #endif - clean_mr(mr); + clean_mr(dev, mr); if (umem) { ib_umem_release(umem); @@ -1549,6 +1548,14 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + + return dereg_mr(dev, mr); +} + struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) From fe59493240169a2cc3f445ae5f2a2308fda06b63 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Sep 2017 14:29:15 +0200 Subject: [PATCH 0748/1322] PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build If CONFIG_PCI=n and gcc (e.g. 4.1.2) decides not to inline get_pci_function_alias_group(), the build fails with: drivers/iommu/iommu.o: In function `get_pci_function_alias_group': iommu.c:(.text+0xfdc): undefined reference to `pci_acs_enabled' Due to the various dummies for PCI calls in the CONFIG_PCI=n case, pci_acs_enabled() never called, but not all versions of gcc are smart enough to realize that. While explicitly marking get_pci_function_alias_group() inline would fix the build, this would inflate the code for the CONFIG_PCI=y case, as get_pci_function_alias_group() is a not-so-small function called from two places. 
Hence fix the issue by introducing a dummy for pci_acs_enabled() instead. Fixes: 0ae349a0f33f ("iommu/qcom: Add qcom_iommu") Signed-off-by: Geert Uytterhoeven Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci.h b/include/linux/pci.h index f68c58a93dd0..f4f8ee5a7362 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1685,6 +1685,8 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #define dev_is_pci(d) (false) #define dev_is_pf(d) (false) +static inline bool pci_acs_enabled(struct pci_dev *pdev, u16 acs_flags) +{ return false; } #endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ From 9c3340ea7f5dabf88ca096a917cb0ab1f208ef2a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 11 Sep 2017 19:11:07 -0600 Subject: [PATCH 0749/1322] selftests: futex: copy sub-dir test scripts for make O=dir run For make O=dir run_tests to work, test scripts from sub-directories need to be copied over to the object directory. Running tests from the object directory is necessary to avoid making the source tree dirty. Signed-off-by: Shuah Khan Reviewed-by: Darren Hart (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 7c647f619d63..9358cb210fd5 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -11,10 +11,13 @@ all: BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ + if [ -e $$DIR/$(TEST_PROGS) ]; then + rsync -a $$DIR/$(TEST_PROGS) $$BUILD_TARGET/; + fi done override define RUN_TESTS - $(OUTPUT)/run.sh + cd $(OUTPUT); ./run.sh endef override define INSTALL_RULE From 8230b905a6780c60372bf5df7bf5f23041ca3196 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Tue, 12 Sep 2017 08:52:13 -0600 Subject: [PATCH 0750/1322] selftests: mqueue: Use full path to run tests from Makefile Use full path including $(OUTPUT) to run tests from Makefile for normal case when objects reside in the source tree as well as when objects are relocated with make O=dir. In both cases $(OUTPUT) will be set correctly by lib.mk. Signed-off-by: Shuah Khan --- tools/testing/selftests/mqueue/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile index 79a664aeb8d7..0f5e347b068d 100644 --- a/tools/testing/selftests/mqueue/Makefile +++ b/tools/testing/selftests/mqueue/Makefile @@ -5,8 +5,8 @@ TEST_GEN_PROGS := mq_open_tests mq_perf_tests include ../lib.mk override define RUN_TESTS - @./mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" - @./mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" + $(OUTPUT)/mq_open_tests /test1 || echo "selftests: mq_open_tests [FAIL]" + $(OUTPUT)//mq_perf_tests || echo "selftests: mq_perf_tests [FAIL]" endef override define EMIT_TESTS From 1ede053632f6380d7e1dba4d781e5eb78621aa3a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 18 Sep 2017 17:30:50 -0600 Subject: [PATCH 0751/1322] selftests: Makefile: fix for loops in targets to run silently Fix for loops in targets to run silently to avoid cluttering the test results. 
Suppresses the following from targets: e.g run from breakpoints for TARGET in breakpoints; do \ BUILD_TARGET=$BUILD/$TARGET; \ mkdir $BUILD_TARGET -p; \ make OUTPUT=$BUILD_TARGET -C $TARGET;\ done; Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index f4368db011ea..ff805643b5f7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -66,32 +66,32 @@ endif export BUILD all: - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_tests: all - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\ done; hotplug: - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET;\ done; run_hotplug: hotplug - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\ done; clean_hotplug: - for TARGET in $(TARGETS_HOTPLUG); do \ + @for TARGET in $(TARGETS_HOTPLUG); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; @@ -107,7 +107,7 @@ install: ifdef INSTALL_PATH @# Ask all targets to install their files mkdir -p $(INSTALL_PATH) - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ done; @@ -132,7 +132,7 @@ else endif clean: - for TARGET in $(TARGETS); do \ + @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; From 659dbfd8c47adeb03f401d1a1f17091bb63cc5a2 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 18 Sep 2017 18:46:23 -0600 Subject: [PATCH 0752/1322] selftests: futex: Makefile: fix for loops in targets to run silently Fix for loops in targets to run silently to avoid cluttering the test results. 
Suppresses the following from targets: for DIR in functional; do \ BUILD_TARGET=./tools/testing/selftests/futex/$DIR; \ mkdir $BUILD_TARGET -p; \ make OUTPUT=$BUILD_TARGET -C $DIR all;\ done ./tools/testing/selftests/futex/run.sh Signed-off-by: Shuah Khan Reviewed-by: Darren Hart (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile index 9358cb210fd5..f0c0369ccb79 100644 --- a/tools/testing/selftests/futex/Makefile +++ b/tools/testing/selftests/futex/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run.sh include ../lib.mk all: - for DIR in $(SUBDIRS); do \ + @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ @@ -17,7 +17,7 @@ all: done override define RUN_TESTS - cd $(OUTPUT); ./run.sh + @cd $(OUTPUT); ./run.sh endef override define INSTALL_RULE @@ -36,7 +36,7 @@ override define EMIT_TESTS endef override define CLEAN - for DIR in $(SUBDIRS); do \ + @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ mkdir $$BUILD_TARGET -p; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ From 10859f3855db4c6f10dc7974ff4b3a292f3de8e0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 Sep 2017 16:32:46 -0700 Subject: [PATCH 0753/1322] selftests/seccomp: Support glibc 2.26 siginfo_t.h The 2.26 release of glibc changed how siginfo_t is defined, and the earlier work-around to using the kernel definition are no longer needed. The old way needs to stay around for a while, though. Reported-by: Seth Forshee Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Tested-by: Seth Forshee Signed-off-by: Shuah Khan --- tools/testing/selftests/seccomp/seccomp_bpf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 4d6f92a9df6b..19cd272c234d 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -6,10 +6,18 @@ */ #include -#include -#define __have_siginfo_t 1 -#define __have_sigval_t 1 -#define __have_sigevent_t 1 + +/* + * glibc 2.26 and later have SIGSYS in siginfo_t. Before that, + * we need to use the kernel's siginfo.h file and trick glibc + * into accepting it. + */ +#if !__GLIBC_PREREQ(2, 26) +# include +# define __have_siginfo_t 1 +# define __have_sigval_t 1 +# define __have_sigevent_t 1 +#endif #include #include @@ -676,7 +684,7 @@ TEST_F_SIGNAL(TRAP, ign, SIGSYS) syscall(__NR_getpid); } -static struct siginfo TRAP_info; +static siginfo_t TRAP_info; static volatile int TRAP_nr; static void TRAP_action(int nr, siginfo_t *info, void *void_context) { From 21aadfa2426d5d199ceb474d0159d079c7f17bfa Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 21 Sep 2017 17:13:27 +0800 Subject: [PATCH 0754/1322] selftests/memfd: correct run_tests.sh permission to fix the following issue: ------------------ TAP version 13 selftests: run_tests.sh ======================================== selftests: Warning: file run_tests.sh is not executable, correct this. 
not ok 1..1 selftests: run_tests.sh [FAIL] ------------------ Signed-off-by: Li Zhijian Signed-off-by: Shuah Khan --- tools/testing/selftests/memfd/run_tests.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/memfd/run_tests.sh diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh old mode 100644 new mode 100755 From 01db7fbf5487505b887fbd6a03c51f2adc952196 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Sep 2017 13:46:01 -0600 Subject: [PATCH 0755/1322] selftests: timers: set-timer-lat: fix hang when std out/err are redirected do_timer_oneshot() uses select() as a timer with FD_SETSIZE and readfs is cleared with FD_ZERO without FD_SET. When stdout and stderr are redirected, the test hangs in select forever. Fix the problem calling select() with readfds empty and nfds zero. This is sufficient for using select() for timer. With this fix "./set-timer-lat > /dev/null 2>&1" no longer hangs. Signed-off-by: Shuah Khan Acked-by: Greg Hackmann Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/set-timer-lat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 9c92b7bd5641..ea1af5dbc7b6 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -228,7 +228,6 @@ int do_timer_oneshot(int clock_id, int flags) timer_t tm1; const int interval = 0; struct timeval timeout; - fd_set fds; int err; err = setup_timer(clock_id, flags, interval, &tm1); @@ -237,9 +236,8 @@ int do_timer_oneshot(int clock_id, int flags) memset(&timeout, 0, sizeof(timeout)); timeout.tv_sec = 5; - FD_ZERO(&fds); do { - err = select(FD_SETSIZE, &fds, NULL, NULL, &timeout); + err = select(0, NULL, NULL, NULL, &timeout); } while (err == -1 && errno == EINTR); timer_delete(tm1); From eefd95e1f3d47b90dc768e9ebc77d390c4f34809 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 21 Sep 2017 13:05:18 -0600 Subject: [PATCH 0756/1322] selftests: timers: set-timer-lat: Fix hang when testing unsupported alarms When timer_create() fails on a bootime or realtime clock, setup_timer() returns 0 as if timer has been set. Callers wait forever for the timer to expire. This hang is seen on a system that doesn't have support for: CLOCK_REALTIME_ALARM ABSTIME missing CAP_WAKE_ALARM? : [UNSUPPORTED] Test hangs waiting for a timer that hasn't been set to expire. Fix setup_timer() to return 1, add handling in callers to detect the unsupported case and return 0 without waiting to not fail the test. Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/set-timer-lat.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index ea1af5dbc7b6..50da45437daa 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -143,7 +143,8 @@ int setup_timer(int clock_id, int flags, int interval, timer_t *tm1) printf("%-22s %s missing CAP_WAKE_ALARM? : [UNSUPPORTED]\n", clockstring(clock_id), flags ? 
"ABSTIME":"RELTIME"); - return 0; + /* Indicate timer isn't set, so caller doesn't wait */ + return 1; } printf("%s - timer_create() failed\n", clockstring(clock_id)); return -1; @@ -213,8 +214,9 @@ int do_timer(int clock_id, int flags) int err; err = setup_timer(clock_id, flags, interval, &tm1); + /* Unsupported case - return 0 to not fail the test */ if (err) - return err; + return err == 1 ? 0 : err; while (alarmcount < 5) sleep(1); @@ -231,8 +233,9 @@ int do_timer_oneshot(int clock_id, int flags) int err; err = setup_timer(clock_id, flags, interval, &tm1); + /* Unsupported case - return 0 to not fail the test */ if (err) - return err; + return err == 1 ? 0 : err; memset(&timeout, 0, sizeof(timeout)); timeout.tv_sec = 5; From 10201655b085df8e000822e496e5d4016a167a36 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 07:15:35 -0500 Subject: [PATCH 0757/1322] gfs2: Fix debugfs glocks dump The switch to rhashtables (commit 88ffbf3e03) broke the debugfs glock dump (/sys/kernel/debug/gfs2//glocks) for dumps bigger than a single buffer: the right function for restarting an rhashtable iteration from the beginning of the hash table is rhashtable_walk_enter; rhashtable_walk_stop + rhashtable_walk_start will just resume from the current position. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Cc: stable@vger.kernel.org # v4.3+ --- fs/gfs2/glock.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 98e845b7841b..11066d8647d2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1945,13 +1945,9 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - int ret; - if (gi->last_pos <= *pos) - n = (*pos - gi->last_pos); - - ret = rhashtable_walk_start(&gi->hti); - if (ret) + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + if (rhashtable_walk_start(&gi->hti) != 0) return NULL; do { @@ -1959,6 +1955,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) } while (gi->gl && n--); gi->last_pos = *pos; + return gi->gl; } @@ -1970,6 +1967,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi); + return gi->gl; } @@ -1980,6 +1978,7 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) gi->gl = NULL; rhashtable_walk_stop(&gi->hti); + rhashtable_walk_exit(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -2042,12 +2041,10 @@ static int __gfs2_glocks_open(struct inode *inode, struct file *file, struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } @@ -2063,7 +2060,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } From 8cbd96a6285e8eb65232b5afd3e8d9418453a61c Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 21 Sep 2017 08:13:49 -0700 Subject: [PATCH 0758/1322] nvme: fix sqhd reference when admin queue connect fails Fix bug in sqhd patch. It wasn't the sq that was at risk. In the case where the admin queue connect command fails, the sq->size field is not set. 
Therefore, this becomes a divide by zero error. Add a quick check to bypass under this failure condition. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index c2a768a94235..1b208beeef50 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -390,7 +390,8 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (status) nvmet_set_status(req, status); - req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; + if (req->sq->size) + req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size; req->rsp->sq_head = cpu_to_le16(req->sq->sqhd); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; From 1a40d97288c6ffea9b355139e88fa62f0e5439f7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:36 +0300 Subject: [PATCH 0759/1322] nvme-core: Use nvme_wq to queue async events and fw activation async_event_work might race as it is executed from two different workqueues at the moment. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5589f67d2cd8..bb2aad078637 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2676,7 +2676,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, case NVME_SC_ABORT_REQ: ++ctrl->event_limit; if (ctrl->state == NVME_CTRL_LIVE) - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); break; default: break; @@ -2691,7 +2691,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, nvme_queue_scan(ctrl); break; case NVME_AER_NOTICE_FW_ACT_STARTING: - schedule_work(&ctrl->fw_act_work); + queue_work(nvme_wq, &ctrl->fw_act_work); break; default: dev_warn(ctrl->device, "async event result %08x\n", result); From 0a960afd60d02808c7f7f36d4aa8a2e07045e1e9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:37 +0300 Subject: [PATCH 0760/1322] nvme-rdma: give up reconnect if state change fails If we failed to transition to state LIVE after a successful reconnect, then controller deletion already started. In this case there is no point moving forward with reconnect. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 58983000964b..8441f6b3f617 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -942,7 +942,12 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } + ctrl->ctrl.nr_reconnects = 0; nvme_start_ctrl(&ctrl->ctrl); From e4d753d7e51c0648b9ee33efeed55d45f362fc3d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Sep 2017 17:01:38 +0300 Subject: [PATCH 0761/1322] nvme-rdma: don't fully stop the controller in error recovery By calling nvme_stop_ctrl on a already failed controller will wait for the scan work to complete (only by identify timeout expiration which is 60 seconds). This is unnecessary when we already know that the controller has failed. Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 8441f6b3f617..92a03ff5fb4d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -967,7 +967,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_stop_ctrl(&ctrl->ctrl); + nvme_stop_keep_alive(&ctrl->ctrl); if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); From 3688feb582a1bc4e58ad50f5eccfdb90615de27b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 15:13:11 -0700 Subject: [PATCH 0762/1322] nvmet-fc: on port remove call put outside lock Avoid calling the put routine, as it may traverse to free routines while holding the target lock. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index c48c83d97e30..6850672ad2a2 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2530,13 +2530,17 @@ nvmet_fc_remove_port(struct nvmet_port *port) { struct nvmet_fc_tgtport *tgtport = port->priv; unsigned long flags; + bool matched = false; spin_lock_irqsave(&nvmet_fc_tgtlock, flags); if (tgtport->port == port) { - nvmet_fc_tgtport_put(tgtport); + matched = true; tgtport->port = NULL; } spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + if (matched) + nvmet_fc_tgtport_put(tgtport); } static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { From 0c319d3a144d4b8f1ea2047fd614d2149b68f889 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 16:33:56 -0700 Subject: [PATCH 0763/1322] nvmet-fc: ensure target queue id within range. When searching for queue id's ensure they are within the expected range. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 6850672ad2a2..58e010bdda3e 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -783,6 +783,9 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, u16 qid = nvmet_fc_getqueueid(connection_id); unsigned long flags; + if (qid > NVMET_NR_QUEUES) + return NULL; + spin_lock_irqsave(&tgtport->lock, flags); list_for_each_entry(assoc, &tgtport->assoc_list, a_list) { if (association_id == assoc->association_id) { From 6b71f9e1e849f82abb4a8d54ce7f4b1c71f19ac4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 20 Sep 2017 11:07:26 -0700 Subject: [PATCH 0764/1322] nvmet-fc: sync header templates with comments Comments were incorrect: - defer_rcv was in host port template. moved to target port template - Added Mandatory statements for target port template items Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 9c5cb4480806..a726f96010d5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -346,11 +346,6 @@ struct nvme_fc_remote_port { * indicating an FC transport Aborted status. * Entrypoint is Mandatory. * - * @defer_rcv: Called by the transport to signal the LLLD that it has - * begun processing of a previously received NVME CMD IU. The LLDD - * is now free to re-use the rcv buffer associated with the - * nvmefc_tgt_fcp_req. - * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -806,11 +801,19 @@ struct nvmet_fc_target_port { * outstanding operation (if there was one) to complete, then will * call the fcp_req_release() callback to return the command's * exchange context back to the LLDD. + * Entrypoint is Mandatory. * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of * abort cleanup. + * Entrypoint is Mandatory. + * + * @defer_rcv: Called by the transport to signal the LLLD that it has + * begun processing of a previously received NVME CMD IU. The LLDD + * is now free to re-use the rcv buffer associated with the + * nvmefc_tgt_fcp_req. + * Entrypoint is Optional. * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. From fddc9923c6d41de9fe7b1f323a3cece53e046c88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 19 Sep 2017 14:01:50 -0700 Subject: [PATCH 0765/1322] nvme-fcloop: fix port deletes and callbacks Now that there are potentially long delays between when a remoteport or targetport delete calls is made and when the callback occurs (dev_loss_tmo timeout), no longer block in the delete routines and move the final nport puts to the callbacks. Moved the fcloop_nport_get/put/free routines to avoid forward declarations. Ensure port_info structs used in registrations are nulled in case fields are not set (ex: devloss_tmo values). 
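Nulling the on-stack info struct follows a general C rule worth spelling out: a structure handed to a registration call must be zeroed before the caller fills in the fields it knows about, otherwise any field it leaves alone (such as a dev_loss_tmo value) reaches the callee as stack garbage. A minimal userspace sketch of the hazard, using a made-up port-info layout rather than the real nvme-fc structures:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Hypothetical stand-in for the real registration info structure. */
struct demo_port_info {
	uint64_t node_name;
	uint64_t port_name;
	uint32_t port_role;
	uint32_t dev_loss_tmo;	/* optional; 0 means "use the default" */
};

int main(void)
{
	struct demo_port_info pinfo;

	memset(&pinfo, 0, sizeof(pinfo));	/* without this, dev_loss_tmo is whatever was on the stack */
	pinfo.node_name = 0x20000090fa5264a1ULL;
	pinfo.port_name = 0x10000090fa5264a1ULL;
	pinfo.port_role = 0x7;

	printf("dev_loss_tmo seen by the callee: %u\n", pinfo.dev_loss_tmo);
	return 0;
}
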
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fcloop.c | 102 +++++++++++++---------------------- 1 file changed, 38 insertions(+), 64 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1fd1afbb8b2a..7b75d9de55ab 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -224,8 +224,6 @@ struct fcloop_nport { struct fcloop_lport *lport; struct list_head nport_list; struct kref ref; - struct completion rport_unreg_done; - struct completion tport_unreg_done; u64 node_name; u64 port_name; u32 port_role; @@ -630,6 +628,32 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, schedule_work(&inireq->iniwork); } +static void +fcloop_nport_free(struct kref *ref) +{ + struct fcloop_nport *nport = + container_of(ref, struct fcloop_nport, ref); + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(nport); +} + +static void +fcloop_nport_put(struct fcloop_nport *nport) +{ + kref_put(&nport->ref, fcloop_nport_free); +} + +static int +fcloop_nport_get(struct fcloop_nport *nport) +{ + return kref_get_unless_zero(&nport->ref); +} + static void fcloop_localport_delete(struct nvme_fc_local_port *localport) { @@ -644,8 +668,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; - /* release any threads waiting for the unreg to complete */ - complete(&rport->nport->rport_unreg_done); + fcloop_nport_put(rport->nport); } static void @@ -653,8 +676,7 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) { struct fcloop_tport *tport = targetport->private; - /* release any threads waiting for the unreg to complete */ - complete(&tport->nport->tport_unreg_done); + fcloop_nport_put(tport->nport); } #define FCLOOP_HW_QUEUES 4 @@ -722,6 +744,7 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, goto out_free_opts; } + memset(&pinfo, 0, sizeof(pinfo)); pinfo.node_name = opts->wwnn; pinfo.port_name = opts->wwpn; pinfo.port_role = opts->roles; @@ -804,32 +827,6 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, return ret ? 
ret : count; } -static void -fcloop_nport_free(struct kref *ref) -{ - struct fcloop_nport *nport = - container_of(ref, struct fcloop_nport, ref); - unsigned long flags; - - spin_lock_irqsave(&fcloop_lock, flags); - list_del(&nport->nport_list); - spin_unlock_irqrestore(&fcloop_lock, flags); - - kfree(nport); -} - -static void -fcloop_nport_put(struct fcloop_nport *nport) -{ - kref_put(&nport->ref, fcloop_nport_free); -} - -static int -fcloop_nport_get(struct fcloop_nport *nport) -{ - return kref_get_unless_zero(&nport->ref); -} - static struct fcloop_nport * fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) { @@ -938,6 +935,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, if (!nport) return -EIO; + memset(&pinfo, 0, sizeof(pinfo)); pinfo.node_name = nport->node_name; pinfo.port_name = nport->port_name; pinfo.port_role = nport->port_role; @@ -979,24 +977,12 @@ __unlink_remote_port(struct fcloop_nport *nport) } static int -__wait_remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) +__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) { - int ret; - if (!rport) return -EALREADY; - init_completion(&nport->rport_unreg_done); - - ret = nvme_fc_unregister_remoteport(rport->remoteport); - if (ret) - return ret; - - wait_for_completion(&nport->rport_unreg_done); - - fcloop_nport_put(nport); - - return ret; + return nvme_fc_unregister_remoteport(rport->remoteport); } static ssize_t @@ -1029,7 +1015,7 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, if (!nport) return -ENOENT; - ret = __wait_remoteport_unreg(nport, rport); + ret = __remoteport_unreg(nport, rport); return ret ? ret : count; } @@ -1086,24 +1072,12 @@ __unlink_target_port(struct fcloop_nport *nport) } static int -__wait_targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) +__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) { - int ret; - if (!tport) return -EALREADY; - init_completion(&nport->tport_unreg_done); - - ret = nvmet_fc_unregister_targetport(tport->targetport); - if (ret) - return ret; - - wait_for_completion(&nport->tport_unreg_done); - - fcloop_nport_put(nport); - - return ret; + return nvmet_fc_unregister_targetport(tport->targetport); } static ssize_t @@ -1136,7 +1110,7 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, if (!nport) return -ENOENT; - ret = __wait_targetport_unreg(nport, tport); + ret = __targetport_unreg(nport, tport); return ret ? ret : count; } @@ -1223,11 +1197,11 @@ static void __exit fcloop_exit(void) spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __wait_targetport_unreg(nport, tport); + ret = __targetport_unreg(nport, tport); if (ret) pr_warn("%s: Failed deleting target port\n", __func__); - ret = __wait_remoteport_unreg(nport, rport); + ret = __remoteport_unreg(nport, rport); if (ret) pr_warn("%s: Failed deleting remote port\n", __func__); From a08588ea486a5590b50c36f437dc86350271b250 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Sep 2017 23:24:39 -0700 Subject: [PATCH 0766/1322] irqchip/mips-gic: Fix shifts to extract register fields The MIPS GIC driver is incorrectly using __fls to shift registers, intending to shift to the least significant bit of a value based upon its mask but instead shifting off all but the value's top bit. It should actually be using __ffs to shift to the first, not last, bit of the value. 
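As a rough illustration of the difference, here is a standalone userspace sketch that mimics the kernel's __ffs()/__fls() with compiler builtins and uses a made-up field mask; only the __ffs()-based shift recovers the field value:

#include <stdio.h>

/* Userspace stand-ins for the kernel helpers (assumes a non-zero mask):
 * demo_ffs() returns the index of the lowest set bit, demo_fls() the index
 * of the highest set bit. */
#define demo_ffs(x) ((unsigned)__builtin_ctzl(x))
#define demo_fls(x) ((unsigned)(8 * sizeof(unsigned long) - 1 - __builtin_clzl(x)))

#define DEMO_FIELD_MASK 0x0000ff00UL	/* invented mask covering bits 8..15 */

int main(void)
{
	unsigned long reg = 0x00003f00UL;	/* pretend register value, field = 0x3f */
	unsigned long field = reg & DEMO_FIELD_MASK;

	/* Correct: shift down by the position of the mask's lowest bit. */
	printf("field >> demo_ffs(mask) = %#lx\n", field >> demo_ffs(DEMO_FIELD_MASK));	/* prints 0x3f */

	/* Wrong: shifting by the mask's top bit discards everything below it. */
	printf("field >> demo_fls(mask) = %#lx\n", field >> demo_fls(DEMO_FIELD_MASK));	/* prints 0 */
	return 0;
}
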
Apparently the system I used when testing commit 3680746abd87 ("irqchip: mips-gic: Convert remaining shared reg access to new accessors") and commit b2b2e584ceab ("irqchip: mips-gic: Clean up mti, reserved-cpu-vectors handling") managed to work correctly despite this issue, but not all systems do... Fixes: 3680746abd87 ("irqchip: mips-gic: Convert remaining shared reg access to new accessors") Fixes: b2b2e584ceab ("irqchip: mips-gic: Clean up mti, reserved-cpu-vectors handling") Signed-off-by: Paul Burton Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/20170922062440.23701-2-paul.burton@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 40159ac12ac8..0022b31ad2c5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -645,7 +645,7 @@ static int __init gic_of_init(struct device_node *node, /* Find the first available CPU vector. */ i = 0; - reserved = (C_SW0 | C_SW1) >> __fls(C_SW0); + reserved = (C_SW0 | C_SW1) >> __ffs(C_SW0); while (!of_property_read_u32_index(node, "mti,reserved-cpu-vectors", i++, &cpu_vec)) reserved |= BIT(cpu_vec); @@ -684,11 +684,11 @@ static int __init gic_of_init(struct device_node *node, gicconfig = read_gic_config(); gic_shared_intrs = gicconfig & GIC_CONFIG_NUMINTERRUPTS; - gic_shared_intrs >>= __fls(GIC_CONFIG_NUMINTERRUPTS); + gic_shared_intrs >>= __ffs(GIC_CONFIG_NUMINTERRUPTS); gic_shared_intrs = (gic_shared_intrs + 1) * 8; gic_vpes = gicconfig & GIC_CONFIG_PVPS; - gic_vpes >>= __fls(GIC_CONFIG_PVPS); + gic_vpes >>= __ffs(GIC_CONFIG_PVPS); gic_vpes = gic_vpes + 1; if (cpu_has_veic) { From d9f82930a5b41f28fadb1e4838b877ae528456d3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 21 Sep 2017 23:24:40 -0700 Subject: [PATCH 0767/1322] irqchip/mips-gic: Use effective affinity to unmask Commit 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") adjusted the way we handle masking interrupts to set & clear the interrupt's bit in each pcpu_mask. This allows us to avoid needing to read the GIC mask registers and perform a bitwise and of their values with the pending & pcpu_masks. Unfortunately this didn't quite work for IPIs, which were mapped to a particular CPU/VP during initialisation but never set the affinity or effective_affinity fields of their struct irq_desc. This led to them losing their affinity when gic_unmask_irq() was called for them, and they'd all become affine to cpu0. Fix this by: 1) Setting the effective affinity of interrupts in gic_shared_irq_domain_map(), which is where we actually map an interrupt to a CPU/VP. This ensures that the effective affinity mask is always valid, not just after explicitly setting affinity. 2) Using an interrupt's effective affinity when unmasking it, which prevents gic_unmask_irq() from unintentionally changing which pcpu_mask includes an interrupt. 
Fixes: 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") Signed-off-by: Paul Burton Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/20170922062440.23701-3-paul.burton@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 0022b31ad2c5..c90976d7e53c 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -175,14 +175,13 @@ static void gic_mask_irq(struct irq_data *d) static void gic_unmask_irq(struct irq_data *d) { - struct cpumask *affinity = irq_data_get_affinity_mask(d); unsigned int intr = GIC_HWIRQ_TO_SHARED(d->hwirq); unsigned int cpu; write_gic_smask(intr); gic_clear_pcpu_masks(intr); - cpu = cpumask_first_and(affinity, cpu_online_mask); + cpu = cpumask_first(irq_data_get_effective_affinity_mask(d)); set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); } @@ -420,13 +419,17 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw, unsigned int cpu) { int intr = GIC_HWIRQ_TO_SHARED(hw); + struct irq_data *data; unsigned long flags; + data = irq_get_irq_data(virq); + spin_lock_irqsave(&gic_lock, flags); write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu))); gic_clear_pcpu_masks(intr); set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); + irq_data_update_effective_affinity(data, cpumask_of(cpu)); spin_unlock_irqrestore(&gic_lock, flags); return 0; From 7755d83e48397e822aac751b1545f8bcf71d133e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 22 Sep 2017 21:20:41 +0900 Subject: [PATCH 0768/1322] irqdomain: Add __rcu annotations to radix tree accessors Fix various address spaces warning of sparse. kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces) kernel/irq/irqdomain.c:1463:14: expected void **slot kernel/irq/irqdomain.c:1463:14: got void [noderef] ** kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces) kernel/irq/irqdomain.c:1465:66: expected void [noderef] **slot kernel/irq/irqdomain.c:1465:66: got void **slot Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jason Cooper Link: https://lkml.kernel.org/r/1506082841-11530-1-git-send-email-yamada.masahiro@socionext.com --- kernel/irq/irqdomain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. 
*/ From c88f0e6b06f4092995688211a631bb436125d77b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 27 Aug 2017 20:25:26 +0800 Subject: [PATCH 0769/1322] scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly ChunYu found a kernel crash by syzkaller: [ 651.617875] kasan: CONFIG_KASAN_INLINE enabled [ 651.618217] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 651.618731] general protection fault: 0000 [#1] SMP KASAN [ 651.621543] CPU: 1 PID: 9539 Comm: scsi Not tainted 4.11.0.cov #32 [ 651.621938] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 651.622309] task: ffff880117780000 task.stack: ffff8800a3188000 [ 651.622762] RIP: 0010:skb_release_data+0x26c/0x590 [...] [ 651.627260] Call Trace: [ 651.629156] skb_release_all+0x4f/0x60 [ 651.629450] consume_skb+0x1a5/0x600 [ 651.630705] netlink_unicast+0x505/0x720 [ 651.632345] netlink_sendmsg+0xab2/0xe70 [ 651.633704] sock_sendmsg+0xcf/0x110 [ 651.633942] ___sys_sendmsg+0x833/0x980 [ 651.637117] __sys_sendmsg+0xf3/0x240 [ 651.638820] SyS_sendmsg+0x32/0x50 [ 651.639048] entry_SYSCALL_64_fastpath+0x1f/0xc2 It's caused by skb_shared_info at the end of sk_buff was overwritten by ISCSI_KEVENT_IF_ERROR when parsing nlmsg info from skb in iscsi_if_rx. During the loop if skb->len == nlh->nlmsg_len and both are sizeof(*nlh), ev = nlmsg_data(nlh) will acutally get skb_shinfo(SKB) instead and set a new value to skb_shinfo(SKB)->nr_frags by ev->type. This patch is to fix it by checking nlh->nlmsg_len properly there to avoid over accessing sk_buff. Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Chris Leech Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 8934f19bce8e..0190aeff5f7f 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -3689,7 +3689,7 @@ iscsi_if_rx(struct sk_buff *skb) uint32_t group; nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || + if (nlh->nlmsg_len < sizeof(*nlh) + sizeof(*ev) || skb->len < nlh->nlmsg_len) { break; } From f3a0c7b3fa7cdf6783827c245c62772687f8b3ac Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 9 Sep 2017 20:37:03 +0800 Subject: [PATCH 0770/1322] MAINTAINERS: Add entry for MediaTek PMIC LED driver Add myself as a maintainer to support existing SoCs and push forward following MediaTek PMICs with LEDs to reuse the driver. Signed-off-by: Sean Wang Signed-off-by: Jacek Anaszewski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2281af4b41b6..22cb6cc91081 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8586,6 +8586,12 @@ M: Sean Wang S: Maintained F: drivers/media/rc/mtk-cir.c +MEDIATEK PMIC LED DRIVER +M: Sean Wang +S: Maintained +F: drivers/leds/leds-mt6323.c +F: Documentation/devicetree/bindings/leds/leds-mt6323.txt + MEDIATEK ETHERNET DRIVER M: Felix Fietkau M: John Crispin From fac1c2040203363eab6c6e86ce883cb71390418f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:15 +0200 Subject: [PATCH 0771/1322] smp/hotplug: Add state diagram Add a state diagram to clarify when which states are ran where. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.661598270@infradead.org --- include/linux/cpuhotplug.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f24bfb2b9a2d..477b2e6f60f7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -3,6 +3,24 @@ #include +/* + * CPU-up CPU-down + * + * BP AP BP AP + * + * OFFLINE OFFLINE + * | ^ + * v | + * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) + * | AP_OFFLINE + * v (IRQ-off) ,---------------^ + * AP_ONLNE | (stop_machine) + * | TEARDOWN_CPU <- AP_ONLINE_IDLE + * | ^ + * v | + * AP_ACTIVE AP_ACTIVE + */ + enum cpuhp_state { CPUHP_OFFLINE, CPUHP_CREATE_THREADS, From 96abb968549cdefd0964d1f7af0a79f4e6e7f897 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:16 +0200 Subject: [PATCH 0772/1322] smp/hotplug: Allow external multi-instance rollback Currently the rollback of multi-instance states is handled inside cpuhp_invoke_callback(). The problem is that when we want to allow an explicit state change for rollback, we need to return from the function without doing the rollback. Change cpuhp_invoke_callback() to optionally return the multi-instance state, such that rollback can be done from a subsequent call. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.720361181@infradead.org --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..323b71050b54 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -123,13 +123,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked - * @step: The step in the state machine + * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked + * @node: For multi-instance, do a single entry callback for install/remove + * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup, struct hlist_node *node) + bool bringup, struct hlist_node *node, + struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); @@ -138,6 +141,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int ret, cnt; if (!step->multi_instance) { + WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; @@ -152,6 +156,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* Single invocation for instance add/remove */ if (node) { + WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +166,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* State transition. 
Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { + if (lastp && node == *lastp) + break; + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); - if (ret) - goto err; + if (ret) { + if (!lastp) + goto err; + + *lastp = node; + return ret; + } cnt++; } + if (lastp) + *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ @@ -323,7 +338,7 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -334,7 +349,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, int ret = 0; for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_down(cpu, st); @@ -350,7 +365,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } } @@ -362,7 +377,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -428,11 +443,13 @@ static void cpuhp_thread_fun(unsigned int cpu) if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); local_irq_enable(); } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + st->bringup, st->node, + NULL); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -472,7 +489,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup, node); + return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); st->cb_state = state; st->single = true; @@ -595,7 +612,7 @@ static int take_cpu_down(void *_param) st->state--; /* Invoke the former CPU_DYING callbacks */ for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -776,7 +793,7 @@ void notify_cpu_starting(unsigned int cpu) rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. 
*/ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL); + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); } } @@ -1307,9 +1324,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; From 4dddfb5faa6118564b0c54a163353d13882299d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:17 +0200 Subject: [PATCH 0773/1322] smp/hotplug: Rewrite AP state machine core There is currently no explicit state change on rollback. That is, st->bringup, st->rollback and st->target are not consistent when doing the rollback. Rework the AP state handling to be more coherent. This does mean we have to do a second AP kick-and-wait for rollback, but since rollback is the slow path of a slowpath, this really should not matter. Take this opportunity to simplify the AP thread function to only run a single callback per invocation. This unifies the three single/up/down modes is supports. The looping it used to do for up/down are achieved by retaining should_run and relying on the main smpboot_thread_fn() loop. (I have most of a patch that does the same for the BP state handling, but that's not critical and gets a little complicated because CPUHP_BRINGUP_CPU does the AP handoff from a callback, which gets recursive @st usage, I still have de-fugly that.) [ tglx: Move cpuhp_down_callbacks() et al. into the HOTPLUG_CPU section to avoid gcc complaining about unused functions. Make the HOTPLUG_CPU one piece instead of having two consecutive ifdef sections of the same type. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.769658088@infradead.org --- kernel/cpu.c | 327 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 209 insertions(+), 118 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 323b71050b54..1139063de5af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -58,6 +58,7 @@ struct cpuhp_cpu_state { bool single; bool bringup; struct hlist_node *node; + struct hlist_node *last; enum cpuhp_state cb_state; int result; struct completion done; @@ -112,6 +113,14 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. 
+ */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -286,7 +295,72 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + + st->rollback = false; + st->last = NULL; + + st->target = target; + st->single = false; + st->bringup = st->state < target; + + return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ + st->rollback = true; + + /* + * If we have st->last we need to undo partial multi_instance of this + * state first. Otherwise start undo at the previous state. + */ + if (!st->last) { + if (st->bringup) + st->state--; + else + st->state++; + } + + st->target = prev_state; + st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ + if (!st->single && st->state == st->target) + return; + + st->result = 0; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state; + int ret; + + prev_state = cpuhp_set_state(st, target); + __cpuhp_kick_ap(st); + if ((ret = st->result)) { + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); + } + + return ret; +} static int bringup_wait_for_ap(unsigned int cpu) { @@ -301,12 +375,10 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) { - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - } - return st->result; + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -332,32 +404,6 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); - } -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - enum cpuhp_state target) -{ - enum cpuhp_state prev_state = st->state; - int ret = 0; - - for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } - return ret; -} static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { @@ -404,71 +450,90 @@ static int cpuhp_should_run(unsigned int cpu) return st->should_run; } -/* Execute the teardown callbacks. 
Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - - return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - return cpuhp_up_callbacks(cpu, st, st->target); -} - /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + * - single: runs st->cb_state + * - up: runs ++st->state, while st->state < st->target + * - down: runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - int ret = 0; + bool bringup = st->bringup; + enum cpuhp_state state; /* - * Paired with the mb() in cpuhp_kick_ap_work and - * cpuhp_invoke_ap_callback, so the work set is consistent visible. + * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures + * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (!st->should_run) + + if (WARN_ON_ONCE(!st->should_run)) return; - st->should_run = false; - lock_map_acquire(&cpuhp_state_lock_map); - /* Single callback invocation for [un]install ? */ + if (st->single) { - if (st->cb_state < CPUHP_AP_ONLINE) { - local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); - local_irq_enable(); - } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node, - NULL); - } - } else if (st->rollback) { - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - undo_cpu_down(cpu, st); - st->rollback = false; + state = st->cb_state; + st->should_run = false; } else { - /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - /* Regular hotplug work */ - if (st->state < st->target) - ret = cpuhp_ap_online(cpu, st); - else if (st->state > st->target) - ret = cpuhp_ap_offline(cpu, st); + if (bringup) { + st->state++; + state = st->state; + st->should_run = (st->state < st->target); + WARN_ON_ONCE(st->state > st->target); + } else { + state = st->state; + st->state--; + st->should_run = (st->state > st->target); + WARN_ON_ONCE(st->state < st->target); + } } + + WARN_ON_ONCE(!cpuhp_is_ap_state(state)); + + if (st->rollback) { + struct cpuhp_step *step = cpuhp_get_step(state); + if (step->skip_onerr) + goto next; + } + + if (cpuhp_is_atomic_state(state)) { + local_irq_disable(); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + local_irq_enable(); + + /* + * STARTING/DYING must not fail! + */ + WARN_ON_ONCE(st->result); + } else { + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + } + + if (st->result) { + /* + * If we fail on a rollback, we're up a creek without no + * paddle, no way forward, no way back. We loose, thanks for + * playing. 
+ */ + WARN_ON_ONCE(st->rollback); + st->should_run = false; + } + +next: lock_map_release(&cpuhp_state_lock_map); - st->result = ret; - complete(&st->done); + + if (!st->should_run) + complete(&st->done); } /* Invoke a single callback on a remote cpu */ @@ -477,6 +542,7 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; if (!cpu_online(cpu)) return 0; @@ -491,48 +557,43 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!st->thread) return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + st->rollback = false; + st->last = NULL; + + st->node = node; + st->bringup = bringup; st->cb_state = state; st->single = true; - st->bringup = bringup; - st->node = node; + + __cpuhp_kick_ap(st); /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() + * If we failed and did a partial, do a rollback. */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); - wait_for_completion(&st->done); - return st->result; -} + if ((ret = st->result) && st->last) { + st->rollback = true; + st->bringup = !bringup; -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ - st->result = 0; - st->single = false; - /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() - */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); + __cpuhp_kick_ap(st); + } + + return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; + enum cpuhp_state prev_state = st->state; + int ret; - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); lock_map_acquire(&cpuhp_state_lock_map); lock_map_release(&cpuhp_state_lock_map); - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - trace_cpuhp_exit(cpu, st->state, state, st->result); - return st->result; + + trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); + ret = cpuhp_kick_ap(st, st->target); + trace_cpuhp_exit(cpu, st->state, prev_state, ret); + + return ret; } static struct smp_hotplug_thread cpuhp_threads = { @@ -693,11 +754,32 @@ void cpuhp_report_idle_dead(void) cpuhp_complete_idle_dead, st, 0); } -#else -#define takedown_cpu NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + return ret; +} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -716,13 +798,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; - st->target = target; + prev_state = cpuhp_set_state(st, target); /* * If the current 
CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { + st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -737,6 +819,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; + + st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -744,9 +828,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { - st->target = prev_state; - st->rollback = true; - cpuhp_kick_ap_work(cpu); + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); } out: @@ -771,11 +854,15 @@ out: cpu_maps_update_done(); return err; } + int cpu_down(unsigned int cpu) { return do_cpu_down(cpu, CPUHP_OFFLINE); } EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** @@ -846,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - st->target = target; + cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1313,6 +1400,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; + /* + * If there's nothing to do, we done. + * Relies on the union for multi_instance. + */ if ((bringup && !sp->startup.single) || (!bringup && !sp->teardown.single)) return 0; From 724a86881d03ee5794148e65142e24ed3621be66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:18 +0200 Subject: [PATCH 0774/1322] smp/hotplug: Callback vs state-machine consistency While the generic callback functions have an 'int' return and thus appear to be allowed to return error, this is not true for all states. Specifically, what used to be STARTING/DYING are ran with IRQs disabled from critical parts of CPU bringup/teardown and are not allowed to fail. Add WARNs to enforce this rule. But since some callbacks are indeed allowed to fail, we have the situation where a state-machine rollback encounters a failure, in this case we're stuck, we can't go forward and we can't go back. Also add a WARN for that case. AFAICT this is a fundamental 'problem' with no real obvious solution. We want the 'prepare' callbacks to allow failure on either up or down. Typically on prepare-up this would be things like -ENOMEM from resource allocations, and the typical usage in prepare-down would be something like -EBUSY to avoid CPUs being taken away. 
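For illustration, a prepare-stage callback pair along those lines could look like the sketch below; the per-CPU buffer and the busy flag are invented for the example, and such callbacks would typically be registered at a prepare state via cpuhp_setup_state():

#include <linux/cpuhotplug.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/errno.h>

/* Sketch only: prepare-up may fail with -ENOMEM, prepare-down may veto with -EBUSY. */
static DEFINE_PER_CPU(void *, demo_buf);
static bool demo_cpu_busy[NR_CPUS];	/* stand-in for a real "still in use" check */

static int demo_prepare_up(unsigned int cpu)
{
	void *buf = kzalloc(1024, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;			/* resource allocation failed */
	per_cpu(demo_buf, cpu) = buf;
	return 0;
}

static int demo_prepare_down(unsigned int cpu)
{
	if (demo_cpu_busy[cpu])
		return -EBUSY;			/* refuse to let this CPU be taken away */
	kfree(per_cpu(demo_buf, cpu));
	per_cpu(demo_buf, cpu) = NULL;
	return 0;
}

Because these run at a prepare stage rather than in the STARTING/DYING range, a failure here can still be rolled back by the state machine.
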
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.819539119@infradead.org --- kernel/cpu.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 1139063de5af..d6f1b8c36400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -202,7 +202,14 @@ err: hlist_for_each(node, &step->list) { if (!cnt--) break; - cbm(cpu, node); + + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + /* + * Rollback must not fail, + */ + WARN_ON_ONCE(ret); } return ret; } @@ -659,6 +666,7 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); + int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -672,8 +680,13 @@ static int take_cpu_down(void *_param) WARN_ON(st->state != CPUHP_TEARDOWN_CPU); st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + /* + * DYING must not fail! + */ + WARN_ON_ONCE(ret); + } /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -876,11 +889,16 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + /* + * STARTING must not fail! + */ + WARN_ON_ONCE(ret); } } From 5f4b55e10645b7371322c800a5ec745cab487a6c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:20 +0200 Subject: [PATCH 0775/1322] smp/hotplug: Differentiate the AP-work lockdep class between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. CPU0 CPU1 CPU2 cpuhp_up_callbacks: takedown_cpu: cpuhp_thread_fun: cpuhp_state irq_lock_sparse() irq_lock_sparse() wait_for_completion() cpuhp_state complete() Now that we have consistent AP state, we can trivially separate the AP-work class between up and down using st->bringup. 
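As a loose userspace analogy of that split, the sketch below uses two pthread mutexes where the kernel really only has two lockdep annotation classes; all names are invented. The point is merely that acquiring by direction keeps an up-path and a down-path from ever being folded into the same dependency class.

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t hp_up_map = PTHREAD_MUTEX_INITIALIZER;
  static pthread_mutex_t hp_down_map = PTHREAD_MUTEX_INITIALIZER;

  /* Mirrors the shape of cpuhp_lock_acquire()/cpuhp_lock_release(). */
  static void hp_lock_acquire(bool bringup)
  {
  	pthread_mutex_lock(bringup ? &hp_up_map : &hp_down_map);
  }

  static void hp_lock_release(bool bringup)
  {
  	pthread_mutex_unlock(bringup ? &hp_up_map : &hp_down_map);
  }

  int main(void)
  {
  	hp_lock_acquire(true);
  	printf("annotating the cpu-up class\n");
  	hp_lock_release(true);

  	hp_lock_acquire(false);
  	printf("annotating the cpu-down class\n");
  	hp_lock_release(false);
  	return 0;
  }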
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.922524234@infradead.org --- kernel/cpu.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d6f1b8c36400..d5c09985fbb6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -68,9 +68,26 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = - STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct lockdep_map cpuhp_state_up_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ + lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ + lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } + #endif /** @@ -486,7 +503,7 @@ static void cpuhp_thread_fun(unsigned int cpu) if (WARN_ON_ONCE(!st->should_run)) return; - lock_map_acquire(&cpuhp_state_lock_map); + cpuhp_lock_acquire(bringup); if (st->single) { state = st->cb_state; @@ -537,7 +554,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } next: - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_release(bringup); if (!st->should_run) complete(&st->done); @@ -554,8 +571,11 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls @@ -593,8 +613,11 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state prev_state = st->state; int ret; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); ret = cpuhp_kick_ap(st, st->target); From 5ebe7742fff8be5f1359bc50f5d43fb6ff7bd060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:19 +0200 Subject: [PATCH 0776/1322] smp/hotplug: Differentiate the AP completion between up and down With lockdep-crossrelease we get deadlock reports that span cpu-up and cpu-down chains. Such deadlocks cannot possibly happen because cpu-up and cpu-down are globally serialized. takedown_cpu() irq_lock_sparse() wait_for_completion(&st->done) cpuhp_thread_fun cpuhp_up_callback cpuhp_invoke_callback irq_affinity_online_cpu irq_local_spare() irq_unlock_sparse() complete(&st->done) Now that we have consistent AP state, we can trivially separate the AP completion between up and down using st->bringup. 
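A userspace sketch of the same idea follows, with invented names and a pthread condition variable standing in for the kernel's struct completion. The control side waits only on the completion matching the direction it kicked, so a completer running for the other direction can never satisfy the wait by accident.

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  struct completion {
  	pthread_mutex_t lock;
  	pthread_cond_t cond;
  	bool done;
  };

  struct hp_state {
  	struct completion done_up;
  	struct completion done_down;
  };

  static void init_completion(struct completion *c)
  {
  	pthread_mutex_init(&c->lock, NULL);
  	pthread_cond_init(&c->cond, NULL);
  	c->done = false;
  }

  static void complete(struct completion *c)
  {
  	pthread_mutex_lock(&c->lock);
  	c->done = true;
  	pthread_cond_signal(&c->cond);
  	pthread_mutex_unlock(&c->lock);
  }

  static void wait_for_completion(struct completion *c)
  {
  	pthread_mutex_lock(&c->lock);
  	while (!c->done)
  		pthread_cond_wait(&c->cond, &c->lock);
  	pthread_mutex_unlock(&c->lock);
  }

  /* Selector helpers shaped like wait_for_ap_thread()/complete_ap_thread(). */
  static void wait_for_ap(struct hp_state *st, bool bringup)
  {
  	wait_for_completion(bringup ? &st->done_up : &st->done_down);
  }

  static void complete_ap(struct hp_state *st, bool bringup)
  {
  	complete(bringup ? &st->done_up : &st->done_down);
  }

  static void *ap_thread(void *arg)
  {
  	struct hp_state *st = arg;

  	printf("AP: bringup work finished\n");
  	complete_ap(st, true);	/* touches only the up completion */
  	return NULL;
  }

  int main(void)
  {
  	struct hp_state st;
  	pthread_t t;

  	init_completion(&st.done_up);
  	init_completion(&st.done_down);

  	pthread_create(&t, NULL, ap_thread, &st);
  	wait_for_ap(&st, true);
  	printf("control: cpu-up signalled\n");
  	pthread_join(t, NULL);
  	return 0;
  }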
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: max.byungchul.park@gmail.com Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20170920170546.872472799@infradead.org --- kernel/cpu.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index d5c09985fbb6..6bbe261b851f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -46,7 +46,8 @@ * @bringup: Single callback bringup or teardown selector * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation - * @done: Signal completion to the issuer of the task + * @done_up: Signal completion to the issuer of the task for cpu-up + * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; @@ -61,7 +62,8 @@ struct cpuhp_cpu_state { struct hlist_node *last; enum cpuhp_state cb_state; int result; - struct completion done; + struct completion done_up; + struct completion done_down; #endif }; @@ -130,14 +132,6 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } -/* - * The former STARTING/DYING states, ran with IRQs disabled and must not fail. - */ -static bool cpuhp_is_atomic_state(enum cpuhp_state state) -{ - return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; -} - static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) { struct cpuhp_step *sp; @@ -232,6 +226,26 @@ err: } #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -368,7 +382,7 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) smp_mb(); st->should_run = true; wake_up_process(st->thread); - wait_for_completion(&st->done); + wait_for_ap_thread(st, st->bringup); } static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) @@ -391,7 +405,7 @@ static int bringup_wait_for_ap(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; @@ -464,7 +478,8 @@ static void cpuhp_create(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - init_completion(&st->done); + init_completion(&st->done_up); + init_completion(&st->done_down); } static int cpuhp_should_run(unsigned int cpu) @@ -557,7 +572,7 @@ next: cpuhp_lock_release(bringup); if (!st->should_run) - complete(&st->done); + complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ @@ -753,7 +768,7 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. 
*/ - wait_for_completion(&st->done); + wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -772,7 +787,7 @@ static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; - complete(&st->done); + complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) @@ -939,7 +954,7 @@ void cpuhp_online_idle(enum cpuhp_state state) return; st->state = CPUHP_AP_ONLINE_IDLE; - complete(&st->done); + complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ From 1db49484f21ed0fcdadd0635a3669f5f386546fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Sep 2017 19:00:21 +0200 Subject: [PATCH 0777/1322] smp/hotplug: Hotplug state fail injection Add a sysfs file to one-time fail a specific state. This can be used to test the state rollback code paths. Something like this (hotplug-up.sh): #!/bin/bash echo 0 > /debug/sched_debug echo 1 > /debug/tracing/events/cpuhp/enable ALL_STATES=`cat /sys/devices/system/cpu/hotplug/states | cut -d':' -f1` STATES=${1:-$ALL_STATES} for state in $STATES do echo 0 > /sys/devices/system/cpu/cpu1/online echo 0 > /debug/tracing/trace echo Fail state: $state echo $state > /sys/devices/system/cpu/cpu1/hotplug/fail cat /sys/devices/system/cpu/cpu1/hotplug/fail echo 1 > /sys/devices/system/cpu/cpu1/online cat /debug/tracing/trace > hotfail-${state}.trace sleep 1 done Can be used to test for all possible rollback (barring multi-instance) scenarios on CPU-up, CPU-down is a trivial modification of the above. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: bigeasy@linutronix.de Cc: efault@gmx.de Cc: rostedt@goodmis.org Cc: max.byungchul.park@gmail.com Link: https://lkml.kernel.org/r/20170920170546.972581715@infradead.org --- include/linux/cpuhotplug.h | 3 +- kernel/cpu.c | 60 +++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 477b2e6f60f7..6d508767e144 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -22,7 +22,8 @@ */ enum cpuhp_state { - CPUHP_OFFLINE, + CPUHP_INVALID = -1, + CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, diff --git a/kernel/cpu.c b/kernel/cpu.c index 6bbe261b851f..8de11a29e495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -52,6 +52,7 @@ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -67,7 +68,9 @@ struct cpuhp_cpu_state { #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = @@ -160,6 +163,15 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { WARN_ON_ONCE(lastp && *lastp); cb = bringup ? 
step->startup.single : step->teardown.single; @@ -1805,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. + */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; From 2d04cfcfa2e86cac751979b584e5420dd470903b Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:02 +0530 Subject: [PATCH 0778/1322] net: af_packet: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/packet/af_packet.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d288f52c53f7..b0f221885dfd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -544,9 +544,7 @@ static void prb_init_blk_timer(struct packet_sock *po, struct tpacket_kbdq_core *pkc, void (*func) (unsigned long)) { - init_timer(&pkc->retire_blk_timer); - pkc->retire_blk_timer.data = (long)po; - pkc->retire_blk_timer.function = func; + setup_timer(&pkc->retire_blk_timer, func, (long)po); pkc->retire_blk_timer.expires = jiffies; } From b1e07c5486f0df26caf93234bd7db5545a37aba8 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:03 +0530 Subject: [PATCH 0779/1322] net: nfc: hci: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/hci/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index b740fef0acc5..a8a6e7814e09 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -1004,9 +1004,8 @@ int nfc_hci_register_device(struct nfc_hci_dev *hdev) INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work); - init_timer(&hdev->cmd_timer); - hdev->cmd_timer.data = (unsigned long)hdev; - hdev->cmd_timer.function = nfc_hci_cmd_timeout; + setup_timer(&hdev->cmd_timer, nfc_hci_cmd_timeout, + (unsigned long)hdev); skb_queue_head_init(&hdev->rx_hcp_frags); From 22d387e13ac357279ddac55694883fd3e30a5639 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:04 +0530 Subject: [PATCH 0780/1322] net: nfc: hci: llc_shdlc: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. 
Miller --- net/nfc/hci/llc_shdlc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 17e59a009ce6..58df37eae1e8 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -763,17 +763,14 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, mutex_init(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; - init_timer(&shdlc->connect_timer); - shdlc->connect_timer.data = (unsigned long)shdlc; - shdlc->connect_timer.function = llc_shdlc_connect_timeout; + setup_timer(&shdlc->connect_timer, llc_shdlc_connect_timeout, + (unsigned long)shdlc); - init_timer(&shdlc->t1_timer); - shdlc->t1_timer.data = (unsigned long)shdlc; - shdlc->t1_timer.function = llc_shdlc_t1_timeout; + setup_timer(&shdlc->t1_timer, llc_shdlc_t1_timeout, + (unsigned long)shdlc); - init_timer(&shdlc->t2_timer); - shdlc->t2_timer.data = (unsigned long)shdlc; - shdlc->t2_timer.function = llc_shdlc_t2_timeout; + setup_timer(&shdlc->t2_timer, llc_shdlc_t2_timeout, + (unsigned long)shdlc); shdlc->w = SHDLC_MAX_WINDOW; shdlc->srej_support = SHDLC_SREJ_SUPPORT; From d835b63cc4ee67e59eed9d1957f729c0a30b7331 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 25 Sep 2017 13:00:05 +0530 Subject: [PATCH 0781/1322] net: nfc: llcp_core: use setup_timer() helper. Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: David S. Miller --- net/nfc/llcp_core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 02eef5cf3cce..7988185072e5 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1573,9 +1573,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); - init_timer(&local->link_timer); - local->link_timer.data = (unsigned long) local; - local->link_timer.function = nfc_llcp_symm_timer; + setup_timer(&local->link_timer, nfc_llcp_symm_timer, + (unsigned long)local); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); @@ -1601,9 +1600,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); - init_timer(&local->sdreq_timer); - local->sdreq_timer.data = (unsigned long) local; - local->sdreq_timer.function = nfc_llcp_sdreq_timer; + setup_timer(&local->sdreq_timer, nfc_llcp_sdreq_timer, + (unsigned long)local); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); list_add(&local->list, &llcp_devices); From b5d7388f9db78f19e6af7b56a469ca8d1860329d Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 21 Sep 2017 18:43:29 -0400 Subject: [PATCH 0782/1322] bpf: Optimize lpm trie delete Before the delete operator was added, this datastructure maintained an invariant that intermediate nodes were only present when necessary to build the tree. This patch updates the delete operation to reinstate that invariant by removing unnecessary intermediate nodes after a node is removed and thus keeping the tree structure at a minimal size. Suggested-by: Daniel Mack Signed-off-by: Craig Gallek Signed-off-by: David S. 
Miller --- kernel/bpf/lpm_trie.c | 71 ++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9d58a576b2ae..34d8a690ea05 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -394,8 +394,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key; - struct lpm_trie_node __rcu **trim; - struct lpm_trie_node *node; + struct lpm_trie_node __rcu **trim, **trim2; + struct lpm_trie_node *node, *parent; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; @@ -407,31 +407,26 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping - * track of where we could begin trimming the tree. The trim-point - * is the sub-tree along the walk consisting of only single-child - * intermediate nodes and ending at a leaf node that we want to - * remove. + * track of the path we traverse. We will need to know the node + * we wish to delete, and the slot that points to the node we want + * to delete. We may also need to know the nodes parent and the + * slot that contains it. */ trim = &trie->root; - node = rcu_dereference_protected( - trie->root, lockdep_is_held(&trie->lock)); - while (node) { + trim2 = trim; + parent = NULL; + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || node->prefixlen == key->prefixlen) break; + parent = node; + trim2 = trim; next_bit = extract_bit(key->data, node->prefixlen); - /* If we hit a node that has more than one child or is a valid - * prefix itself, do not remove it. Reset the root of the trim - * path to its descendant on our path. - */ - if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || - (node->child[0] && node->child[1])) - trim = &node->child[next_bit]; - node = rcu_dereference_protected( - node->child[next_bit], lockdep_is_held(&trie->lock)); + trim = &node->child[next_bit]; } if (!node || node->prefixlen != key->prefixlen || @@ -442,27 +437,47 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) trie->n_entries--; - /* If the node we are removing is not a leaf node, simply mark it + /* If the node we are removing has two children, simply mark it * as intermediate and we are done. */ - if (rcu_access_pointer(node->child[0]) || + if (rcu_access_pointer(node->child[0]) && rcu_access_pointer(node->child[1])) { node->flags |= LPM_TREE_NODE_FLAG_IM; goto out; } - /* trim should now point to the slot holding the start of a path from - * zero or more intermediate nodes to our leaf node for deletion. + /* If the parent of the node we are about to delete is an intermediate + * node, and the deleted node doesn't have any children, we can delete + * the intermediate parent as well and promote its other child + * up the tree. Doing this maintains the invariant that all + * intermediate nodes have exactly 2 children and that there are no + * unnecessary intermediate nodes in the tree. */ - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { - RCU_INIT_POINTER(*trim, NULL); - trim = rcu_access_pointer(node->child[0]) ? 
- &node->child[0] : - &node->child[1]; + if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) && + !node->child[0] && !node->child[1]) { + if (node == rcu_access_pointer(parent->child[0])) + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[1])); + else + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[0])); + kfree_rcu(parent, rcu); kfree_rcu(node, rcu); + goto out; } + /* The node we are removing has either zero or one child. If there + * is a child, move it into the removed node's slot then delete + * the node. Otherwise just clear the slot and delete the node. + */ + if (node->child[0]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0])); + else if (node->child[1]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); + else + RCU_INIT_POINTER(*trim, NULL); + kfree_rcu(node, rcu); + out: raw_spin_unlock_irqrestore(&trie->lock, irq_flags); From cdd10c9627496ad25c87ce6394e29752253c69d3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:23 +0200 Subject: [PATCH 0783/1322] l2tp: ensure sessions are freed after their PPPOL2TP socket If l2tp_tunnel_delete() or l2tp_tunnel_closeall() deletes a session right after pppol2tp_release() orphaned its socket, then the 'sock' variable of the pppol2tp_session_close() callback is NULL. Yet the session is still used by pppol2tp_release(). Therefore we need to take an extra reference in any case, to prevent l2tp_tunnel_delete() or l2tp_tunnel_closeall() from freeing the session. Since the pppol2tp_session_close() callback is only set if the session is associated to a PPPOL2TP socket and that both l2tp_tunnel_delete() and l2tp_tunnel_closeall() hold the PPPOL2TP socket before calling pppol2tp_session_close(), we're sure that pppol2tp_session_close() and pppol2tp_session_destruct() are paired and called in the right order. So the reference taken by the former will be released by the later. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if From b228a94066406b6c456321d69643b0d7ce11cfa6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 22 Sep 2017 15:39:24 +0200 Subject: [PATCH 0784/1322] l2tp: fix race between l2tp_session_delete() and l2tp_tunnel_closeall() There are several ways to remove L2TP sessions: * deleting a session explicitly using the netlink interface (with L2TP_CMD_SESSION_DELETE), * deleting the session's parent tunnel (either by closing the tunnel's file descriptor or using the netlink interface), * closing the PPPOL2TP file descriptor of a PPP pseudo-wire. In some cases, when these methods are used concurrently on the same session, the session can be removed twice, leading to use-after-free bugs. This patch adds a 'dead' flag, used by l2tp_session_delete() and l2tp_tunnel_closeall() to prevent them from stepping on each other's toes. 
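The mechanism is the usual test_and_set_bit() pattern. A rough userspace rendering, with invented names and a C11 atomic_flag in place of the bit operation, looks as follows: whichever teardown path wins the flag does the removal, the loser simply backs off.

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>

  struct session {
  	atomic_flag dead;
  	int id;
  };

  static void session_delete(struct session *s, const char *who)
  {
  	/* Returns the previous value: only the first caller proceeds. */
  	if (atomic_flag_test_and_set(&s->dead)) {
  		printf("%s: session %d already being removed, backing off\n",
  		       who, s->id);
  		return;
  	}
  	printf("%s: removing session %d\n", who, s->id);
  }

  static void *tunnel_closeall(void *arg)
  {
  	session_delete(arg, "tunnel close");
  	return NULL;
  }

  int main(void)
  {
  	struct session s = { .id = 1 };
  	pthread_t t;

  	atomic_flag_clear(&s.dead);
  	pthread_create(&t, NULL, tunnel_closeall, &s);
  	session_delete(&s, "netlink delete");	/* concurrent second path */
  	pthread_join(t, NULL);
  	return 0;
  }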
The session deletion path used when closing a PPPOL2TP file descriptor doesn't need to be adapted. It already has to ensure that a session remains valid for the lifetime of its PPPOL2TP file descriptor. So it takes an extra reference on the session in the ->session_close() callback (pppol2tp_session_close()), which is eventually dropped in the ->sk_destruct() callback of the PPPOL2TP socket (pppol2tp_session_destruct()). Still, __l2tp_session_unhash() and l2tp_session_queue_purge() can be called twice and even concurrently for a given session, but thanks to proper locking and re-initialisation of list fields, this is not an issue. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 6 ++++++ net/l2tp/l2tp_core.h | 1 + 2 files changed, 7 insertions(+) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..d8c2a89a76e1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1750,6 +1753,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..70a12df40a5f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ From 910801809b2e40a4baedd080ef5d80b4a180e70e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:38 +0200 Subject: [PATCH 0785/1322] security/keys: properly zero out sensitive key material in big_key Error paths forgot to zero out sensitive material, so this patch changes some kfrees into a kzfrees. Signed-off-by: Jason A. 
Donenfeld Signed-off-by: David Howells Reviewed-by: Eric Biggers Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Cc: stable@vger.kernel.org --- security/keys/big_key.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 6acb00f6f22c..507d6fb86a4f 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -195,7 +195,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) *path = file->f_path; path_get(path); fput(file); - kfree(data); + kzfree(data); } else { /* Just store the data in a buffer */ void *data = kmalloc(datalen, GFP_KERNEL); @@ -211,9 +211,9 @@ int big_key_preparse(struct key_preparsed_payload *prep) err_fput: fput(file); err_enckey: - kfree(enckey); + kzfree(enckey); error: - kfree(data); + kzfree(data); return ret; } @@ -227,7 +227,7 @@ void big_key_free_preparse(struct key_preparsed_payload *prep) path_put(path); } - kfree(prep->payload.data[big_key_data]); + kzfree(prep->payload.data[big_key_data]); } /* @@ -259,7 +259,7 @@ void big_key_destroy(struct key *key) path->mnt = NULL; path->dentry = NULL; } - kfree(key->payload.data[big_key_data]); + kzfree(key->payload.data[big_key_data]); key->payload.data[big_key_data] = NULL; } @@ -328,7 +328,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) err_fput: fput(file); error: - kfree(data); + kzfree(data); } else { ret = datalen; if (copy_to_user(buffer, key->payload.data[big_key_data], From 428490e38b2e352812e0b765d8bceafab0ec441d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 20 Sep 2017 16:58:39 +0200 Subject: [PATCH 0786/1322] security/keys: rewrite all of big_key crypto This started out as just replacing the use of crypto/rng with get_random_bytes_wait, so that we wouldn't use bad randomness at boot time. But, upon looking further, it appears that there were even deeper underlying cryptographic problems, and that this seems to have been committed with very little crypto review. So, I rewrote the whole thing, trying to keep to the conventions introduced by the previous author, to fix these cryptographic flaws. It makes no sense to seed crypto/rng at boot time and then keep using it like this, when in fact there's already get_random_bytes_wait, which can ensure there's enough entropy and be a much more standard way of generating keys. Since this sensitive material is being stored untrusted, using ECB and no authentication is simply not okay at all. I find it surprising and a bit horrifying that this code even made it past basic crypto review, which perhaps points to some larger issues. This patch moves from using AES-ECB to using AES-GCM. Since keys are uniquely generated each time, we can set the nonce to zero. There was also a race condition in which the same key would be reused at the same time in different threads. A mutex fixes this issue now. So, to summarize, this commit fixes the following vulnerabilities: * Low entropy key generation, allowing an attacker to potentially guess or predict keys. * Unauthenticated encryption, allowing an attacker to modify the cipher text in particular ways in order to manipulate the plaintext, which is is even more frightening considering the next point. * Use of ECB mode, allowing an attacker to trivially swap blocks or compare identical plaintext blocks. * Key re-use. * Faulty memory zeroing. Signed-off-by: Jason A. 
Donenfeld Reviewed-by: Eric Biggers Signed-off-by: David Howells Cc: Herbert Xu Cc: Kirill Marinushkin Cc: security@kernel.org Cc: stable@vger.kernel.org --- security/keys/Kconfig | 4 +- security/keys/big_key.c | 125 +++++++++++++++++++--------------------- 2 files changed, 59 insertions(+), 70 deletions(-) diff --git a/security/keys/Kconfig b/security/keys/Kconfig index a7a23b5541f8..91eafada3164 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -45,10 +45,8 @@ config BIG_KEYS bool "Large payload keys" depends on KEYS depends on TMPFS - depends on (CRYPTO_ANSI_CPRNG = y || CRYPTO_DRBG = y) select CRYPTO_AES - select CRYPTO_ECB - select CRYPTO_RNG + select CRYPTO_GCM help This option provides support for holding large keys within the kernel (for example Kerberos ticket caches). The data may be stored out to diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 507d6fb86a4f..e607830b6154 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -1,5 +1,6 @@ /* Large capacity key type * + * Copyright (C) 2017 Jason A. Donenfeld . All Rights Reserved. * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * @@ -16,10 +17,10 @@ #include #include #include +#include #include #include -#include -#include +#include /* * Layout of key payload words. @@ -49,7 +50,12 @@ enum big_key_op { /* * Key size for big_key data encryption */ -#define ENC_KEY_SIZE 16 +#define ENC_KEY_SIZE 32 + +/* + * Authentication tag length + */ +#define ENC_AUTHTAG_SIZE 16 /* * big_key defined keys take an arbitrary string as the description and an @@ -64,57 +70,62 @@ struct key_type key_type_big_key = { .destroy = big_key_destroy, .describe = big_key_describe, .read = big_key_read, + /* no ->update(); don't add it without changing big_key_crypt() nonce */ }; /* - * Crypto names for big_key data encryption + * Crypto names for big_key data authenticated encryption */ -static const char big_key_rng_name[] = "stdrng"; -static const char big_key_alg_name[] = "ecb(aes)"; +static const char big_key_alg_name[] = "gcm(aes)"; /* - * Crypto algorithms for big_key data encryption + * Crypto algorithms for big_key data authenticated encryption */ -static struct crypto_rng *big_key_rng; -static struct crypto_skcipher *big_key_skcipher; +static struct crypto_aead *big_key_aead; /* - * Generate random key to encrypt big_key data + * Since changing the key affects the entire object, we need a mutex. */ -static inline int big_key_gen_enckey(u8 *key) -{ - return crypto_rng_get_bytes(big_key_rng, key, ENC_KEY_SIZE); -} +static DEFINE_MUTEX(big_key_aead_lock); /* * Encrypt/decrypt big_key data */ static int big_key_crypt(enum big_key_op op, u8 *data, size_t datalen, u8 *key) { - int ret = -EINVAL; + int ret; struct scatterlist sgio; - SKCIPHER_REQUEST_ON_STACK(req, big_key_skcipher); + struct aead_request *aead_req; + /* We always use a zero nonce. The reason we can get away with this is + * because we're using a different randomly generated key for every + * different encryption. Notably, too, key_type_big_key doesn't define + * an .update function, so there's no chance we'll wind up reusing the + * key to encrypt updated data. Simply put: one key, one encryption. 
+ */ + u8 zero_nonce[crypto_aead_ivsize(big_key_aead)]; - if (crypto_skcipher_setkey(big_key_skcipher, key, ENC_KEY_SIZE)) { + aead_req = aead_request_alloc(big_key_aead, GFP_KERNEL); + if (!aead_req) + return -ENOMEM; + + memset(zero_nonce, 0, sizeof(zero_nonce)); + sg_init_one(&sgio, data, datalen + (op == BIG_KEY_ENC ? ENC_AUTHTAG_SIZE : 0)); + aead_request_set_crypt(aead_req, &sgio, &sgio, datalen, zero_nonce); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + aead_request_set_ad(aead_req, 0); + + mutex_lock(&big_key_aead_lock); + if (crypto_aead_setkey(big_key_aead, key, ENC_KEY_SIZE)) { ret = -EAGAIN; goto error; } - - skcipher_request_set_tfm(req, big_key_skcipher); - skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, - NULL, NULL); - - sg_init_one(&sgio, data, datalen); - skcipher_request_set_crypt(req, &sgio, &sgio, datalen, NULL); - if (op == BIG_KEY_ENC) - ret = crypto_skcipher_encrypt(req); + ret = crypto_aead_encrypt(aead_req); else - ret = crypto_skcipher_decrypt(req); - - skcipher_request_zero(req); - + ret = crypto_aead_decrypt(aead_req); error: + mutex_unlock(&big_key_aead_lock); + aead_request_free(aead_req); return ret; } @@ -146,16 +157,13 @@ int big_key_preparse(struct key_preparsed_payload *prep) * * File content is stored encrypted with randomly generated key. */ - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; loff_t pos = 0; - /* prepare aligned data to encrypt */ data = kmalloc(enclen, GFP_KERNEL); if (!data) return -ENOMEM; - memcpy(data, prep->data, datalen); - memset(data + datalen, 0x00, enclen - datalen); /* generate random key */ enckey = kmalloc(ENC_KEY_SIZE, GFP_KERNEL); @@ -163,13 +171,12 @@ int big_key_preparse(struct key_preparsed_payload *prep) ret = -ENOMEM; goto error; } - - ret = big_key_gen_enckey(enckey); - if (ret) + ret = get_random_bytes_wait(enckey, ENC_KEY_SIZE); + if (unlikely(ret)) goto err_enckey; /* encrypt aligned data */ - ret = big_key_crypt(BIG_KEY_ENC, data, enclen, enckey); + ret = big_key_crypt(BIG_KEY_ENC, data, datalen, enckey); if (ret) goto err_enckey; @@ -295,7 +302,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) struct file *file; u8 *data; u8 *enckey = (u8 *)key->payload.data[big_key_data]; - size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + size_t enclen = datalen + ENC_AUTHTAG_SIZE; loff_t pos = 0; data = kmalloc(enclen, GFP_KERNEL); @@ -344,47 +351,31 @@ error: */ static int __init big_key_init(void) { - struct crypto_skcipher *cipher; - struct crypto_rng *rng; int ret; - rng = crypto_alloc_rng(big_key_rng_name, 0, 0); - if (IS_ERR(rng)) { - pr_err("Can't alloc rng: %ld\n", PTR_ERR(rng)); - return PTR_ERR(rng); - } - - big_key_rng = rng; - - /* seed RNG */ - ret = crypto_rng_reset(rng, NULL, crypto_rng_seedsize(rng)); - if (ret) { - pr_err("Can't reset rng: %d\n", ret); - goto error_rng; - } - /* init block cipher */ - cipher = crypto_alloc_skcipher(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(cipher)) { - ret = PTR_ERR(cipher); + big_key_aead = crypto_alloc_aead(big_key_alg_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(big_key_aead)) { + ret = PTR_ERR(big_key_aead); pr_err("Can't alloc crypto: %d\n", ret); - goto error_rng; + return ret; + } + ret = crypto_aead_setauthsize(big_key_aead, ENC_AUTHTAG_SIZE); + if (ret < 0) { + pr_err("Can't set crypto auth tag len: %d\n", ret); + goto free_aead; } - - big_key_skcipher = cipher; ret = 
register_key_type(&key_type_big_key); if (ret < 0) { pr_err("Can't register type: %d\n", ret); - goto error_cipher; + goto free_aead; } return 0; -error_cipher: - crypto_free_skcipher(big_key_skcipher); -error_rng: - crypto_free_rng(big_key_rng); +free_aead: + crypto_free_aead(big_key_aead); return ret; } From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Sep 2017 10:44:36 -0700 Subject: [PATCH 0787/1322] PM / OPP: Call notifier without holding opp_table->lock The notifier callbacks may want to call some OPP helper routines which may try to take the same opp_table->lock again and cause a deadlock. One such usecase was reported by Chanwoo Choi, where calling dev_pm_opp_disable() leads us to the devfreq's OPP notifier handler, which further calls dev_pm_opp_find_freq_floor() and it deadlocks. We don't really need the opp_table->lock to be held across the notifier call though, all we want to make sure is that the 'opp' doesn't get freed while being used from within the notifier chain. We can do it with help of dev_pm_opp_get/put() as well. Let's do it. Cc: 4.11+ # 4.11+ Fixes: 5b650b388844 "PM / OPP: Take kref from _find_opp_table()" Reported-by: Chanwoo Choi Tested-by: Chanwoo Choi Reviewed-by: Stephen Boyd Reviewed-by: Chanwoo Choi Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index a8cc14fd8ae4..a6de32530693 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, opp->available = availability_req; + dev_pm_opp_get(opp); + mutex_unlock(&opp_table->lock); + /* Notify the change of the OPP availability */ if (availability_req) blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE, @@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct device *dev, unsigned long freq, blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_DISABLE, opp); + dev_pm_opp_put(opp); + goto put_table; + unlock: mutex_unlock(&opp_table->lock); +put_table: dev_pm_opp_put_opp_table(opp_table); return r; } From 675195d0be27391d48d8d23c7c62991505168528 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 20 Sep 2017 08:58:53 +0200 Subject: [PATCH 0788/1322] scsi: scsi_transport_fc: set scsi_target_id upon rescan When an rport is found in the bindings array there is no guarantee that it had been a target port, so we need to call fc_remote_port_rolechg() here to ensure the scsi_target_id is set correctly. Otherwise the port will never be scanned. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Chad Dupuis Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_transport_fc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index ba9d70f8a6a1..e74fffc32c75 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2876,7 +2876,6 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, memcpy(&rport->port_name, &ids->port_name, sizeof(rport->port_name)); rport->port_id = ids->port_id; - rport->roles = ids->roles; rport->port_state = FC_PORTSTATE_ONLINE; rport->flags &= ~FC_RPORT_FAST_FAIL_TIMEDOUT; @@ -2885,15 +2884,7 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, fci->f->dd_fcrport_size); spin_unlock_irqrestore(shost->host_lock, flags); - if (ids->roles & FC_PORT_ROLE_FCP_TARGET) { - scsi_target_unblock(&rport->dev, SDEV_RUNNING); - - /* initiate a scan of the target */ - spin_lock_irqsave(shost->host_lock, flags); - rport->flags |= FC_RPORT_SCAN_PENDING; - scsi_queue_work(shost, &rport->scan_work); - spin_unlock_irqrestore(shost->host_lock, flags); - } + fc_remote_port_rolechg(rport, ids->roles); return rport; } } From d477bf3af1e88fe27c893f84136647fe11963198 Mon Sep 17 00:00:00 2001 From: Suniel Mahesh Date: Thu, 21 Sep 2017 19:09:03 +0530 Subject: [PATCH 0789/1322] cpufreq: dt: Fix sysfs duplicate filename creation for platform-device ti-cpufreq and cpufreq-dt-platdev drivers are registering platform-device with same name "cpufreq-dt" using platform_device_register_*() routines. This is leading to build warnings appended below. Providing hardware information to OPP framework along with the platform- device creation should be done by ti-cpufreq driver before cpufreq-dt driver comes into place. This patch add's TI am33xx, am43 and dra7 platforms (which use opp-v2 property) to the blacklist of devices in cpufreq-dt-platform driver to avoid creating platform-device twice and remove build warnings. 
[ 2.370167] ------------[ cut here ]------------ [ 2.375087] WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x58/0x78 [ 2.383112] sysfs: cannot create duplicate filename '/devices/platform/cpufreq-dt' [ 2.391219] Modules linked in: [ 2.394506] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-next-20170912 #1 [ 2.402006] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.408437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.416568] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.424165] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.431488] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.439351] [] (warn_slowpath_fmt) from [] (sysfs_warn_dup+0x58/0x78) [ 2.447938] [] (sysfs_warn_dup) from [] (sysfs_create_dir_ns+0x80/0x98) [ 2.456719] [] (sysfs_create_dir_ns) from [] (kobject_add_internal+0x9c/0x2d4) [ 2.466124] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.474712] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.482489] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.491085] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.501305] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.511253] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.519838] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.528974] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.537565] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.545981] ---[ end trace 2fc00e213c13ab20 ]--- [ 2.551051] ------------[ cut here ]------------ [ 2.555931] WARNING: CPU: 0 PID: 1 at lib/kobject.c:240 kobject_add_internal+0x254/0x2d4 [ 2.564578] kobject_add_internal failed for cpufreq-dt with -EEXIST, don't try to register things with the same name in the same directory. [ 2.577977] Modules linked in: [ 2.581261] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.13.0-next-20170912 #1 [ 2.590013] Hardware name: Generic AM33XX (Flattened Device Tree) [ 2.596437] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 2.604573] [] (show_stack) from [] (dump_stack+0xac/0xe0) [ 2.612172] [] (dump_stack) from [] (__warn+0xd8/0x104) [ 2.619494] [] (__warn) from [] (warn_slowpath_fmt+0x34/0x44) [ 2.627362] [] (warn_slowpath_fmt) from [] (kobject_add_internal+0x254/0x2d4) [ 2.636666] [] (kobject_add_internal) from [] (kobject_add+0x4c/0x9c) [ 2.645255] [] (kobject_add) from [] (device_add+0xcc/0x57c) [ 2.653027] [] (device_add) from [] (platform_device_add+0x100/0x220) [ 2.661615] [] (platform_device_add) from [] (platform_device_register_full+0xf4/0x118) [ 2.671833] [] (platform_device_register_full) from [] (ti_cpufreq_init+0x150/0x22c) [ 2.681779] [] (ti_cpufreq_init) from [] (do_one_initcall+0x3c/0x170) [ 2.690377] [] (do_one_initcall) from [] (kernel_init_freeable+0x1fc/0x2c4) [ 2.699510] [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [ 2.708106] [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) [ 2.716217] ---[ end trace 2fc00e213c13ab21 ]--- Fixes: edeec420de24 (cpufreq: dt-cpufreq: platdev Automatically create device with OPP v2) Signed-off-by: Suniel Mahesh Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq-dt-platdev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 430edadca527..a753c50e9e41 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -118,6 +118,10 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "sigma,tango4", }, + { .compatible = "ti,am33xx", }, + { .compatible = "ti,am43", }, + { .compatible = "ti,dra7", }, + { } }; From 1b17ca044a82342fb96529e1bdff6da7af90bc53 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 22 Sep 2017 16:50:23 +0100 Subject: [PATCH 0790/1322] hv_netvsc: make const array ver_list static, reduces object code size Don't populate const array ver_list on the stack, instead make it static. Makes the object code smaller by over 400 bytes: Before: text data bss dec hex filename 18444 3168 320 21932 55ac drivers/net/hyperv/netvsc.o After: text data bss dec hex filename 17950 3224 320 21494 53f6 drivers/net/hyperv/netvsc.o (gcc 6.3.0, x86-64) Signed-off-by: Colin Ian King Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 8d5077fb0492..b0d323e24978 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -484,7 +484,7 @@ static int netvsc_connect_vsp(struct hv_device *device, struct netvsc_device *net_device, const struct netvsc_device_info *device_info) { - const u32 ver_list[] = { + static const u32 ver_list[] = { NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2, NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5 }; From e94cd8113ce63bca34040aae52d0603baf6ec07c Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Fri, 22 Sep 2017 23:57:49 +0800 Subject: [PATCH 0791/1322] net: remove MTU limits for dummy and ifb device These two drivers (dummy and ifb) call ether_setup(), after commit 61e84623ace3 ("net: centralize net_device min/max MTU checking"), the range of mtu is [min_mtu, max_mtu], which is [68, 1500] by default. These two devices should not have limits on MTU. This patch set their min_mtu/max_mtu to 0. So that dev_set_mtu() will not check the mtu range, and can be set with any value. CC: Eric Dumazet CC: Sabrina Dubroca Signed-off-by: Zhang Shengju Signed-off-by: David S. 
Miller --- drivers/net/dummy.c | 2 +- drivers/net/ifb.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index e31ab3b94c6f..58483af80bdb 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -353,7 +353,7 @@ static void dummy_setup(struct net_device *dev) eth_hw_addr_random(dev); dev->min_mtu = 0; - dev->max_mtu = ETH_MAX_MTU; + dev->max_mtu = 0; } static int dummy_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 8870bd2a2e8a..0008da7e9d4c 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -231,6 +231,9 @@ static void ifb_setup(struct net_device *dev) eth_hw_addr_random(dev); dev->needs_free_netdev = true; dev->priv_destructor = ifb_dev_free; + + dev->min_mtu = 0; + dev->max_mtu = 0; } static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) From a93ad944f4ff9a797abff17c73fc4b1e4a1d9141 Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 22 Sep 2017 15:32:44 -0500 Subject: [PATCH 0792/1322] net: qcom/emac: specify the correct size when mapping a DMA buffer When mapping the RX DMA buffers, the driver was accidentally specifying zero for the buffer length. Under normal circumstances, SWIOTLB does not need to allocate a bounce buffer, so the address is just mapped without checking the size field. This is why the error was not detected earlier. Fixes: b9b17debc69d ("net: emac: emac gigabit ethernet controller driver") Cc: stable@vger.kernel.org Signed-off-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-mac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index 0ea3ca09c689..3ed9033e56db 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -898,7 +898,8 @@ static void emac_mac_rx_descs_refill(struct emac_adapter *adpt, curr_rxbuf->dma_addr = dma_map_single(adpt->netdev->dev.parent, skb->data, - curr_rxbuf->length, DMA_FROM_DEVICE); + adpt->rxbuf_size, DMA_FROM_DEVICE); + ret = dma_mapping_error(adpt->netdev->dev.parent, curr_rxbuf->dma_addr); if (ret) { From 9561475db680f7144d2223a409dd3d7e322aca03 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:40 +0200 Subject: [PATCH 0793/1322] PCI: Fix race condition with driver_override The driver_override implementation is susceptible to a race condition when different threads are reading vs. storing a different driver override. Add locking to avoid the race condition. This is in close analogy to commit 6265539776a0 ("driver core: platform: fix race condition with driver_override") from Adrian Salido. 
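Reduced to a userspace sketch, the pattern looks like the snippet below; the names are invented and a pthread mutex stands in for device_lock(). Both the store side and the show side take the lock around the pointer swap and the copy, so a reader can neither observe a torn update nor keep using a string the writer has already freed.

  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
  static char *driver_override;	/* protected by dev_lock */

  static void override_store(const char *buf)
  {
  	char *newp = (buf && *buf) ? strdup(buf) : NULL;
  	char *old;

  	pthread_mutex_lock(&dev_lock);
  	old = driver_override;
  	driver_override = newp;
  	pthread_mutex_unlock(&dev_lock);

  	free(old);	/* safe: readers only touch the pointer under the lock */
  }

  static void override_show(char *out, size_t len)
  {
  	pthread_mutex_lock(&dev_lock);
  	snprintf(out, len, "%s\n", driver_override ? driver_override : "");
  	pthread_mutex_unlock(&dev_lock);
  }

  int main(void)
  {
  	char buf[64];

  	override_store("vfio-pci");
  	override_show(buf, sizeof(buf));
  	printf("override: %s", buf);
  	override_store("");	/* clearing frees the old string under the lock */
  	return 0;
  }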
Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") Signed-off-by: Nicolai Stange Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v3.16+ --- drivers/pci/pci-sysfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 1eecfa301f7f..8e075ea2743e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -686,7 +686,7 @@ static ssize_t driver_override_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - char *driver_override, *old = pdev->driver_override, *cp; + char *driver_override, *old, *cp; /* We need to keep extra room for a newline */ if (count >= (PAGE_SIZE - 1)) @@ -700,12 +700,15 @@ static ssize_t driver_override_store(struct device *dev, if (cp) *cp = '\0'; + device_lock(dev); + old = pdev->driver_override; if (strlen(driver_override)) { pdev->driver_override = driver_override; } else { kfree(driver_override); pdev->driver_override = NULL; } + device_unlock(dev); kfree(old); @@ -716,8 +719,12 @@ static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + ssize_t len; - return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_unlock(dev); + return len; } static DEVICE_ATTR_RW(driver_override); From b776e4b1a990045a7c70798f1f353c3160c26594 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Sep 2017 20:38:45 -0400 Subject: [PATCH 0794/1322] fix a typo in put_compat_shm_info() "uip" misspelled as "up"; unfortunately, the latter happens to be a function and gcc is happy to convert it to void *... Signed-off-by: Al Viro --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 1e2b1692ba2c..badac463e2c8 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1154,7 +1154,7 @@ static int put_compat_shm_info(struct shm_info *ip, info.shm_swp = ip->shm_swp; info.swap_attempts = ip->swap_attempts; info.swap_successes = ip->swap_successes; - return copy_to_user(up, &info, sizeof(info)); + return copy_to_user(uip, &info, sizeof(info)); } static int copy_compat_shmid_to_user(void __user *buf, struct shmid64_ds *in, From cc6f77710a6de6210f9feda7cd53e2f5ee7a7e69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:16 -0700 Subject: [PATCH 0795/1322] xfs: don't unconditionally clear the reflink flag on zero-block files If we have speculative cow preallocations hanging around in the cow fork, don't let a truncate operation clear the reflink flag because if we do then there's a chance we'll forget to free those extents when we destroy the incore inode. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5599dda4727a..4ec5b7f45401 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1624,10 +1624,12 @@ xfs_itruncate_extents( goto out; /* - * Clear the reflink flag if we truncated everything. + * Clear the reflink flag if there are no data fork blocks and + * there are no extents staged in the cow fork. 
*/ - if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { - ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + if (xfs_is_reflink_inode(ip) && ip->i_cnextents == 0) { + if (ip->i_d.di_nblocks == 0) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); } From 3af423b03435c81036fa710623d3ae92fbe346a3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: [PATCH 0796/1322] xfs: evict CoW fork extents when performing finsert/fcollapse When we perform an finsert/fcollapse operation, cancel all the CoW extents for the affected file offset range so that they don't end up pointing to the wrong blocks. Reported-by: Amir Goldstein Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index cd9a5400ba4f..bc6c6e10a969 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1459,7 +1459,19 @@ xfs_shift_file_space( return error; /* - * The extent shiting code works on extent granularity. So, if + * Clean out anything hanging around in the cow fork now that + * we've flushed all the dirty data out to disk to avoid having + * CoW extents at the wrong offsets. + */ + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF, + true); + if (error) + return error; + } + + /* + * The extent shifting code works on extent granularity. So, if * stop_fsb is not the starting block of extent, we need to split * the extent at stop_fsb. */ From e150dcd459e1b441eaf08f341a986f04e61bf3b8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Sep 2017 11:34:16 -0700 Subject: [PATCH 0797/1322] fs/xfs: Use %pS printk format for direct addresses Use the %pS instead of the %pF printk format specifier for printing symbols from direct addresses. This is needed for the ia64, ppc64 and parisc64 architectures. Signed-off-by: Helge Deller Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index bd786a9ac2c3..eaf86f55b7f2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -347,7 +347,7 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", __return_address, bp->b_ops->name, bp->b_bn); From 64671bafbdd984535aa382bccadd91fbe7be0e80 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:38:58 -0700 Subject: [PATCH 0798/1322] xfs: kill meaningless variable 'zero' In xfs_file_aio_write_checks(), variable 'zero' is there only to satisfy xfs_zero_eof(), the result of it is ignored. Now, with iomap_zero_range() based xfs_zero_eof(), we can safely pass NULL as the last param of it and kill 'zero'. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebdd0bd2b261..261d83f1db76 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -377,8 +377,6 @@ restart: */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { - bool zero = false; - spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { @@ -399,7 +397,7 @@ restart: drained_dio = true; goto restart; } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else From d20a5e3851969fa685f118a80e4df670255a4e8d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 18 Sep 2017 11:39:23 -0700 Subject: [PATCH 0799/1322] xfs: report zeroed or not correctly in xfs_zero_range() The 'did_zero' param of xfs_zero_range() was not passed to iomap_zero_range() correctly. This was introduced by commit 7bb41db3ea16 ("xfs: handle 64-bit length in xfs_iozero"), and found by code inspection. Signed-off-by: Eryu Guan Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 261d83f1db76..350b6d43ba23 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -58,7 +58,7 @@ xfs_zero_range( xfs_off_t count, bool *did_zero) { - return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); + return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); } int From 1e6fa688bffc0ff419a4c3e78dbaf7aabfb55183 Mon Sep 17 00:00:00 2001 From: Kenjiro Nakayama Date: Mon, 18 Sep 2017 12:03:56 -0700 Subject: [PATCH 0800/1322] xfs: Output warning message when discard option was enabled even though the device does not support discard In order to using discard function, it is necessary that not only xfs is mounted with discard option, but also the discard function is supported by the device. Current code doesn't output any message when users mount with discard option on unsupported device, so it is difficult to notice that it was not enabled actually. This patch adds the warning message to notice that discard option is not enabled due to unsupported device when the filesystem is mounted. Changes in v2 (Suggested by Brian Foster): - Move the unsupported device check into xfs_fs_fill_super(). - Clear the discard flag when device is unsupported. Signed-off-by: Kenjiro Nakayama Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c996f4ae4a5f..584cf2d573ba 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1654,6 +1654,16 @@ xfs_fs_fill_super( "DAX and reflink have not been tested together!"); } + if (mp->m_flags & XFS_MOUNT_DISCARD) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + + if (!blk_queue_discard(q)) { + xfs_warn(mp, "mounting with \"discard\" option, but " + "the device does not support discard"); + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } + } + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, From 60915f83cd1e021a66fc1503a446aef5c772553a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Sep 2017 13:38:46 -0700 Subject: [PATCH 0801/1322] xfs: remove redundant re-initialization of total_nr_pages Variable total_nr_pages is being initialized and then updated with the same value, this latter assignment is redundant and can be removed. Cleans up clang build warning: Value stored to 'total_nr_pages' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index da14658da310..2f97c12ca75e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1258,8 +1258,6 @@ xfs_buf_ioapply_map( int size; int offset; - total_nr_pages = bp->b_page_count; - /* skip the pages in the buffer before the start offset */ page_index = 0; offset = *buf_offset; From f091fb8c344ce13cbf058d304c6cbb042be97058 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 25 Sep 2017 13:47:23 +0200 Subject: [PATCH 0802/1322] scsi: scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add() During failover there is a small race window between fc_remote_port_add() and fc_timeout_deleted_rport(); the latter drops the lock after setting the port to NOTPRESENT, so if fc_remote_port_add() is called right at that time it will fail to detect the existing rport and happily adding a new structure, causing rports to get registered twice. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_fc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index e74fffc32c75..cbd4495d0ff9 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -2739,7 +2739,8 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel, list_for_each_entry(rport, &fc_host->rports, peers) { - if ((rport->port_state == FC_PORTSTATE_BLOCKED) && + if ((rport->port_state == FC_PORTSTATE_BLOCKED || + rport->port_state == FC_PORTSTATE_NOTPRESENT) && (rport->channel == channel)) { switch (fc_host->tgtid_bind_type) { From 943170998b200190f99d3fe7e771437e2c51f319 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:14 -0700 Subject: [PATCH 0803/1322] tun: enable NAPI for TUN/TAP driver Changes TUN driver to use napi_gro_receive() upon receiving packets rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these changes and operation is not affected if the flag is disabled. SKBs are constructed upon packet arrival and are queued to be processed later. 
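For illustration only (not part of this patch): the new receive path is opt-in from userspace via the IFF_NAPI flag added below. A minimal sketch of requesting it when creating a TAP device might look like the following (error handling trimmed; IFF_NAPI is the flag this patch defines, the rest is the usual TUNSETIFF sequence):

	/* userspace sketch: open a TAP device and opt in to NAPI receive */
	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	int open_tap_napi(const char *name)
	{
		struct ifreq ifr;
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0)
			return -1;

		memset(&ifr, 0, sizeof(ifr));
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI;
		strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

		if (ioctl(fd, TUNSETIFF, &ifr) < 0)
			return -1;

		return fd;
	}
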
The new path was evaluated with a benchmark with the following setup: Open two tap devices and a receiver thread that reads in a loop for each device. Start one sender thread and pin all threads to different CPUs. Send 1M minimum UDP packets to each device and measure sending time for each of the sending methods: napi_gro_receive(): 4.90s netif_rx_ni(): 4.90s netif_receive_skb(): 7.20s Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/tun.c | 133 ++++++++++++++++++++++++++++++++---- include/uapi/linux/if_tun.h | 1 + 2 files changed, 119 insertions(+), 15 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3c9985f29950..f16407242b18 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -121,7 +121,7 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE) + IFF_MULTI_QUEUE | IFF_NAPI) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -172,6 +172,7 @@ struct tun_file { u16 queue_index; unsigned int ifindex; }; + struct napi_struct napi; struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -229,6 +230,68 @@ struct tun_struct { struct bpf_prog __rcu *xdp_prog; }; +static int tun_napi_receive(struct napi_struct *napi, int budget) +{ + struct tun_file *tfile = container_of(napi, struct tun_file, napi); + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; + struct sk_buff_head process_queue; + struct sk_buff *skb; + int received = 0; + + __skb_queue_head_init(&process_queue); + + spin_lock(&queue->lock); + skb_queue_splice_tail_init(queue, &process_queue); + spin_unlock(&queue->lock); + + while (received < budget && (skb = __skb_dequeue(&process_queue))) { + napi_gro_receive(napi, skb); + ++received; + } + + if (!skb_queue_empty(&process_queue)) { + spin_lock(&queue->lock); + skb_queue_splice(&process_queue, queue); + spin_unlock(&queue->lock); + } + + return received; +} + +static int tun_napi_poll(struct napi_struct *napi, int budget) +{ + unsigned int received; + + received = tun_napi_receive(napi, budget); + + if (received < budget) + napi_complete_done(napi, received); + + return received; +} + +static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, + bool napi_en) +{ + if (napi_en) { + netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, + NAPI_POLL_WEIGHT); + napi_enable(&tfile->napi); + } +} + +static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) +{ + if (tun->flags & IFF_NAPI) + napi_disable(&tfile->napi); +} + +static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) +{ + if (tun->flags & IFF_NAPI) + netif_napi_del(&tfile->napi); +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -541,6 +604,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); + if (tun && clean) { + tun_napi_disable(tun, tfile); + tun_napi_del(tun, tfile); + } + if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); @@ -598,6 +666,7 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); + tun_napi_disable(tun, tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, 
NULL); @@ -613,6 +682,7 @@ static void tun_detach_all(struct net_device *dev) synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); + tun_napi_del(tun, tfile); /* Drop read queue */ tun_queue_purge(tfile); sock_put(&tfile->sk); @@ -631,7 +701,8 @@ static void tun_detach_all(struct net_device *dev) module_put(THIS_MODULE); } -static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter) +static int tun_attach(struct tun_struct *tun, struct file *file, + bool skip_filter, bool napi) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; @@ -677,10 +748,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; - if (tfile->detached) + if (tfile->detached) { tun_enable_queue(tfile); - else + } else { sock_hold(&tfile->sk); + tun_napi_init(tun, tfile, napi); + } tun_set_real_num_queues(tun); @@ -956,13 +1029,28 @@ static void tun_poll_controller(struct net_device *dev) * Tun only receives frames when: * 1) the char device endpoint gets data from user space * 2) the tun socket gets a sendmsg call from user space - * Since both of those are synchronous operations, we are guaranteed - * never to have pending data when we poll for it - * so there is nothing to do here but return. + * If NAPI is not enabled, since both of those are synchronous + * operations, we are guaranteed never to have pending data when we poll + * for it so there is nothing to do here but return. * We need this though so netpoll recognizes us as an interface that * supports polling, which enables bridge devices in virt setups to * still use netconsole + * If NAPI is enabled, however, we need to schedule polling for all + * queues. 
*/ + struct tun_struct *tun = netdev_priv(dev); + + if (tun->flags & IFF_NAPI) { + struct tun_file *tfile; + int i; + + rcu_read_lock(); + for (i = 0; i < tun->numqueues; i++) { + tfile = rcu_dereference(tun->tfiles[i]); + napi_schedule(&tfile->napi); + } + rcu_read_unlock(); + } return; } #endif @@ -1549,11 +1637,25 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } rxhash = __skb_get_hash_symmetric(skb); -#ifndef CONFIG_4KSTACKS - tun_rx_batched(tun, tfile, skb, more); -#else - netif_rx_ni(skb); -#endif + + if (tun->flags & IFF_NAPI) { + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; + int queue_len; + + spin_lock_bh(&queue->lock); + __skb_queue_tail(queue, skb); + queue_len = skb_queue_len(queue); + spin_unlock(&queue->lock); + + if (!more || queue_len > NAPI_POLL_WEIGHT) + napi_schedule(&tfile->napi); + + local_bh_enable(); + } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { + tun_rx_batched(tun, tfile, skb, more); + } else { + netif_rx_ni(skb); + } stats = get_cpu_ptr(tun->pcpu_stats); u64_stats_update_begin(&stats->syncp); @@ -1980,7 +2082,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (err < 0) return err; - err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); + err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, + ifr->ifr_flags & IFF_NAPI); if (err < 0) return err; @@ -2066,7 +2169,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) NETIF_F_HW_VLAN_STAG_TX); INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false); + err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); if (err < 0) goto err_free_flow; @@ -2216,7 +2319,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file, false); + ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 3cb5e1d85ddd..30b6184884eb 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 +#define IFF_NAPI 0x0010 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 From 90e33d45940793def6f773b2d528e9f3c84ffdc7 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 22 Sep 2017 13:49:15 -0700 Subject: [PATCH 0804/1322] tun: enable napi_gro_frags() for TUN/TAP driver Add a TUN/TAP receive mode that exercises the napi_gro_frags() interface. This mode is available only in TAP mode, as the interface expects packets with Ethernet headers. Furthermore, packets follow the layout of the iovec_iter that was received. The first iovec is the linear data, and every one after the first is a fragment. If there are more fragments than the max number, drop the packet. Additionally, invoke eth_get_headlen() to exercise flow dissector code and to verify that the header resides in the linear data. The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option. This is imposed because this mode is intended for testing via tools like syzkaller and packetdrill, and the increased flexibility it provides can introduce security vulnerabilities. This flag is accepted only if the device is in TAP mode and has the IFF_NAPI flag set as well. 
This is done because both of these are explicit requirements for correct operation in this mode. Signed-off-by: Petar Penkov Cc: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Cc: davem@davemloft.net Cc: ppenkov@stanford.edu Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/tun.c | 134 ++++++++++++++++++++++++++++++++++-- include/uapi/linux/if_tun.h | 1 + 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f16407242b18..9880b3bc8fa5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -121,7 +122,8 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE | IFF_NAPI) + IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) + #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -173,6 +175,7 @@ struct tun_file { unsigned int ifindex; }; struct napi_struct napi; + struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); napi_enable(&tfile->napi); + mutex_init(&tfile->napi_mutex); } } @@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) netif_napi_del(&tfile->napi); } +static bool tun_napi_frags_enabled(const struct tun_struct *tun) +{ + return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev) * supports polling, which enables bridge devices in virt setups to * still use netconsole * If NAPI is enabled, however, we need to schedule polling for all - * queues. + * queues unless we are using napi_gro_frags(), which we call in + * process context and not in NAPI context. 
*/ struct tun_struct *tun = netdev_priv(dev); @@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev) struct tun_file *tfile; int i; + if (tun_napi_frags_enabled(tun)) + return; + rcu_read_lock(); for (i = 0; i < tun->numqueues; i++) { tfile = rcu_dereference(tun->tfiles[i]); @@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) return mask; } +static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, + size_t len, + const struct iov_iter *it) +{ + struct sk_buff *skb; + size_t linear; + int err; + int i; + + if (it->nr_segs > MAX_SKB_FRAGS + 1) + return ERR_PTR(-ENOMEM); + + local_bh_disable(); + skb = napi_get_frags(&tfile->napi); + local_bh_enable(); + if (!skb) + return ERR_PTR(-ENOMEM); + + linear = iov_iter_single_seg_count(it); + err = __skb_grow(skb, linear); + if (err) + goto free; + + skb->len = len; + skb->data_len = len - linear; + skb->truesize += skb->data_len; + + for (i = 1; i < it->nr_segs; i++) { + size_t fragsz = it->iov[i].iov_len; + unsigned long offset; + struct page *page; + void *data; + + if (fragsz == 0 || fragsz > PAGE_SIZE) { + err = -EINVAL; + goto free; + } + + local_bh_disable(); + data = napi_alloc_frag(fragsz); + local_bh_enable(); + if (!data) { + err = -ENOMEM; + goto free; + } + + page = virt_to_head_page(data); + offset = data - page_address(page); + skb_fill_page_desc(skb, i - 1, page, offset, fragsz); + } + + return skb; +free: + /* frees skb and all frags allocated with napi_alloc_frag() */ + napi_free_frags(&tfile->napi); + return ERR_PTR(err); +} + /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, @@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int err; u32 rxhash; int skb_xdp = 1; + bool frags = tun_napi_frags_enabled(tun); if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, zerocopy = true; } - if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { + if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. @@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, linear = tun16_to_cpu(tun, gso.hdr_len); } - skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (frags) { + mutex_lock(&tfile->napi_mutex); + skb = tun_napi_alloc_frags(tfile, copylen, from); + /* tun_napi_alloc_frags() enforces a layout for the skb. + * If zerocopy is enabled, then this layout will be + * overwritten by zerocopy_sg_from_iter(). 
+ */ + zerocopy = false; + } else { + skb = tun_alloc_skb(tfile, align, copylen, linear, + noblock); + } + if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) this_cpu_inc(tun->pcpu_stats->rx_dropped); + if (frags) + mutex_unlock(&tfile->napi_mutex); return PTR_ERR(skb); } @@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (err) { this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EFAULT; } } @@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { this_cpu_inc(tun->pcpu_stats->rx_frame_errors); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EINVAL; } @@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb->dev = tun->dev; break; case IFF_TAP: - skb->protocol = eth_type_trans(skb, tun->dev); + if (!frags) + skb->protocol = eth_type_trans(skb, tun->dev); break; } @@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rxhash = __skb_get_hash_symmetric(skb); - if (tun->flags & IFF_NAPI) { + if (frags) { + /* Exercise flow dissector code path. */ + u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); + + if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) { + this_cpu_inc(tun->pcpu_stats->rx_dropped); + napi_free_frags(&tfile->napi); + mutex_unlock(&tfile->napi_mutex); + WARN_ON(1); + return -ENOMEM; + } + + local_bh_disable(); + napi_gro_frags(&tfile->napi); + local_bh_enable(); + mutex_unlock(&tfile->napi_mutex); + } else if (tun->flags & IFF_NAPI) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; @@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (tfile->detached) return -EINVAL; + if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!(ifr->ifr_flags & IFF_NAPI) || + (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) + return -EINVAL; + } + dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 30b6184884eb..365ade5685c9 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -61,6 +61,7 @@ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 #define IFF_NAPI 0x0010 +#define IFF_NAPI_FRAGS 0x0020 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000 From 6098d7ddd62f532f80ee2a4b01aca500a8e4e9e4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:18 +0200 Subject: [PATCH 0805/1322] rocker: fix rocker_tlv_put_* functions for KASAN Inlining these functions creates lots of stack variables that each take 64 bytes when KASAN is enabled, leading to this warning about potential stack overflow: drivers/net/ethernet/rocker/rocker_ofdpa.c: In function 'ofdpa_cmd_flow_tbl_add': drivers/net/ethernet/rocker/rocker_ofdpa.c:621:1: error: the frame size of 2752 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] gcc-8 can now consolidate the stack slots itself, but on older versions we get the same behavior by using a temporary variable that holds a copy of the inline function argument. 
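For illustration only (not part of this patch): the blow-up comes from callers that emit many attributes from a single stack frame. In a hypothetical builder like the sketch below (attribute type numbers are made up), asan-stack=1 on pre-8 GCC used to keep one 64-byte redzoned stack slot per inlined put, because each put passes the address of its local copy of "value"; ofdpa_cmd_flow_tbl_add() does this often enough to reach the 2752-byte frame quoted above:

	static int example_build_cmd(struct rocker_desc_info *desc_info)
	{
		/* under KASAN, each inlined call below used to add its own
		 * addressable temporary to this function's stack frame
		 */
		if (rocker_tlv_put_u16(desc_info, 1, 0x0800) ||
		    rocker_tlv_put_u32(desc_info, 2, 0x12345678) ||
		    rocker_tlv_put_u64(desc_info, 3, 0x1122334455667788ULL))
			return -EMSGSIZE;
		return 0;
	}
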
Cc: stable@vger.kernel.org Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_tlv.h | 48 +++++++++++++++--------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_tlv.h b/drivers/net/ethernet/rocker/rocker_tlv.h index a63ef82e7c72..dfae3c9d57c6 100644 --- a/drivers/net/ethernet/rocker/rocker_tlv.h +++ b/drivers/net/ethernet/rocker/rocker_tlv.h @@ -139,40 +139,52 @@ rocker_tlv_start(struct rocker_desc_info *desc_info) int rocker_tlv_put(struct rocker_desc_info *desc_info, int attrtype, int attrlen, const void *data); -static inline int rocker_tlv_put_u8(struct rocker_desc_info *desc_info, - int attrtype, u8 value) +static inline int +rocker_tlv_put_u8(struct rocker_desc_info *desc_info, int attrtype, u8 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u8), &value); + u8 tmp = value; /* work around GCC PR81715 */ + + return rocker_tlv_put(desc_info, attrtype, sizeof(u8), &tmp); } -static inline int rocker_tlv_put_u16(struct rocker_desc_info *desc_info, - int attrtype, u16 value) +static inline int +rocker_tlv_put_u16(struct rocker_desc_info *desc_info, int attrtype, u16 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u16), &value); + u16 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u16), &tmp); } -static inline int rocker_tlv_put_be16(struct rocker_desc_info *desc_info, - int attrtype, __be16 value) +static inline int +rocker_tlv_put_be16(struct rocker_desc_info *desc_info, int attrtype, __be16 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(__be16), &value); + __be16 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(__be16), &tmp); } -static inline int rocker_tlv_put_u32(struct rocker_desc_info *desc_info, - int attrtype, u32 value) +static inline int +rocker_tlv_put_u32(struct rocker_desc_info *desc_info, int attrtype, u32 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u32), &value); + u32 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u32), &tmp); } -static inline int rocker_tlv_put_be32(struct rocker_desc_info *desc_info, - int attrtype, __be32 value) +static inline int +rocker_tlv_put_be32(struct rocker_desc_info *desc_info, int attrtype, __be32 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(__be32), &value); + __be32 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(__be32), &tmp); } -static inline int rocker_tlv_put_u64(struct rocker_desc_info *desc_info, - int attrtype, u64 value) +static inline int +rocker_tlv_put_u64(struct rocker_desc_info *desc_info, int attrtype, u64 value) { - return rocker_tlv_put(desc_info, attrtype, sizeof(u64), &value); + u64 tmp = value; + + return rocker_tlv_put(desc_info, attrtype, sizeof(u64), &tmp); } static inline struct rocker_tlv * From b4391db42308c9940944b5d7be5ca4b78fb88dd0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:19 +0200 Subject: [PATCH 0806/1322] netlink: fix nla_put_{u8,u16,u32} for KASAN When CONFIG_KASAN is enabled, the "--param asan-stack=1" causes rather large stack frames in some functions. This goes unnoticed normally because CONFIG_FRAME_WARN is disabled with CONFIG_KASAN by default as of commit 3f181b4d8652 ("lib/Kconfig.debug: disable -Wframe-larger-than warnings with KASAN=y"). 
The kernelci.org build bot however has the warning enabled and that led me to investigate it a little further, as every build produces these warnings: net/wireless/nl80211.c:4389:1: warning: the frame size of 2240 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/wireless/nl80211.c:1895:1: warning: the frame size of 3776 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/wireless/nl80211.c:1410:1: warning: the frame size of 2208 bytes is larger than 2048 bytes [-Wframe-larger-than=] net/bridge/br_netlink.c:1282:1: warning: the frame size of 2544 bytes is larger than 2048 bytes [-Wframe-larger-than=] Most of this problem is now solved in gcc-8, which can consolidate the stack slots for the inline function arguments. On older compilers we can add a workaround by declaring a local variable in each function to pass the inline function argument. Cc: stable@vger.kernel.org Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/net/netlink.h | 73 ++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index e51cf5f81597..14c289393071 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -773,7 +773,10 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { - return nla_put(skb, attrtype, sizeof(u8), &value); + /* temporary variables to work around GCC PR81715 with asan-stack=1 */ + u8 tmp = value; + + return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** @@ -784,7 +787,9 @@ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { - return nla_put(skb, attrtype, sizeof(u16), &value); + u16 tmp = value; + + return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** @@ -795,7 +800,9 @@ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { - return nla_put(skb, attrtype, sizeof(__be16), &value); + __be16 tmp = value; + + return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** @@ -806,7 +813,9 @@ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { - return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, value); + __be16 tmp = value; + + return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** @@ -817,7 +826,9 @@ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { - return nla_put(skb, attrtype, sizeof(__le16), &value); + __le16 tmp = value; + + return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** @@ -828,7 +839,9 @@ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { - return nla_put(skb, attrtype, sizeof(u32), &value); + u32 tmp = value; + + return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** @@ -839,7 +852,9 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { - return nla_put(skb, attrtype, sizeof(__be32), &value); + __be32 tmp = value; + + 
return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** @@ -850,7 +865,9 @@ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { - return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, value); + __be32 tmp = value; + + return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** @@ -861,7 +878,9 @@ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { - return nla_put(skb, attrtype, sizeof(__le32), &value); + __le32 tmp = value; + + return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** @@ -874,7 +893,9 @@ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(u64), &value, padattr); + u64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** @@ -887,7 +908,9 @@ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr); + __be64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** @@ -900,7 +923,9 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { - return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value, + __be64 tmp = value; + + return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } @@ -914,7 +939,9 @@ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(__le64), &value, padattr); + __le64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** @@ -925,7 +952,9 @@ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { - return nla_put(skb, attrtype, sizeof(s8), &value); + s8 tmp = value; + + return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** @@ -936,7 +965,9 @@ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { - return nla_put(skb, attrtype, sizeof(s16), &value); + s16 tmp = value; + + return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** @@ -947,7 +978,9 @@ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { - return nla_put(skb, attrtype, sizeof(s32), &value); + s32 tmp = value; + + return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** @@ -960,7 +993,9 @@ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { - return nla_put_64bit(skb, attrtype, sizeof(s64), &value, padattr); + s64 tmp = value; + + return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** @@ -1010,7 +1045,9 @@ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, 
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { - return nla_put_be32(skb, attrtype, addr); + __be32 tmp = addr; + + return nla_put_be32(skb, attrtype, tmp); } /** From 6450f8f269a9271985e4a8c13920b7e4cf21c0f3 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 22 Sep 2017 15:31:38 -0700 Subject: [PATCH 0807/1322] hv_netvsc: Fix the real number of queues of non-vRSS cases For older hosts without multi-channel (vRSS) support, and some error cases, we still need to set the real number of queues to one. This patch adds this missing setting. Fixes: 8195b1396ec8 ("hv_netvsc: fix deadlock on hotplug") Signed-off-by: Haiyang Zhang Reviewed-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index a32ae02e1b6c..e9d54c9ee78c 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1935,6 +1935,12 @@ static int netvsc_probe(struct hv_device *dev, /* We always need headroom for rndis header */ net->needed_headroom = RNDIS_AND_PPI_SIZE; + /* Initialize the number of queues to be 1, we may change it if more + * channels are offered later. + */ + netif_set_real_num_tx_queues(net, 1); + netif_set_real_num_rx_queues(net, 1); + /* Notify the netvsc driver of the new device */ memset(&device_info, 0, sizeof(device_info)); device_info.ring_size = ring_size; From 6457edfe7344ac1da334b9f24a42aacea084a451 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 22 Sep 2017 19:01:55 -0400 Subject: [PATCH 0808/1322] net: dsa: make slave close symmetrical to open The DSA slave open function configures the unicast MAC addresses on the master device, enable the switch port, change its STP state, then start the PHY device. Make the close function symmetric, by first stopping the PHY device, then changing the STP state, disabling the switch port and restore the master device. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 02ace7d462c4..c2bb48579032 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -133,6 +133,11 @@ static int dsa_slave_close(struct net_device *dev) if (p->phy) phy_stop(p->phy); + dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); + + if (ds->ops->port_disable) + ds->ops->port_disable(ds, p->dp->index, p->phy); + dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); if (dev->flags & IFF_ALLMULTI) @@ -143,11 +148,6 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); - if (ds->ops->port_disable) - ds->ops->port_disable(ds, p->dp->index, p->phy); - - dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); - return 0; } From fb8a6a2b8b7cfee8a0cf2cd431e1d0f45dd1c9e0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 22 Sep 2017 19:01:56 -0400 Subject: [PATCH 0809/1322] net: dsa: add port enable and disable helpers Provide dsa_port_enable and dsa_port_disable helpers to respectively enable and disable a switch port. This makes the dsa_port_set_state_now helper static. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 3 ++- net/dsa/port.c | 31 ++++++++++++++++++++++++++++++- net/dsa/slave.c | 19 +++++-------------- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9803952a5b40..0298a0f6a349 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -117,7 +117,8 @@ void dsa_master_ethtool_restore(struct net_device *dev); /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); -void dsa_port_set_state_now(struct dsa_port *dp, u8 state); +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); +void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, diff --git a/net/dsa/port.c b/net/dsa/port.c index 76d43a82d397..72c8dbd3d3f2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -56,7 +56,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state, return 0; } -void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) { int err; @@ -65,6 +65,35 @@ void dsa_port_set_state_now(struct dsa_port *dp, u8 state) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +{ + u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; + struct dsa_switch *ds = dp->ds; + int port = dp->index; + int err; + + if (ds->ops->port_enable) { + err = ds->ops->port_enable(ds, port, phy); + if (err) + return err; + } + + dsa_port_set_state_now(dp, stp_state); + + return 0; +} + +void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + dsa_port_set_state_now(dp, BR_STATE_DISABLED); + + if (ds->ops->port_disable) + ds->ops->port_disable(ds, port, phy); +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c2bb48579032..bd51ef56ec5b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -73,9 +73,7 @@ static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; struct net_device *master = dsa_master_netdev(p); - u8 stp_state = dp->bridge_dev ? 
BR_STATE_BLOCKING : BR_STATE_FORWARDING; int err; if (!(master->flags & IFF_UP)) @@ -98,13 +96,9 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - if (ds->ops->port_enable) { - err = ds->ops->port_enable(ds, p->dp->index, p->phy); - if (err) - goto clear_promisc; - } - - dsa_port_set_state_now(p->dp, stp_state); + err = dsa_port_enable(dp, p->phy); + if (err) + goto clear_promisc; if (p->phy) phy_start(p->phy); @@ -128,15 +122,12 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = dsa_master_netdev(p); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = p->dp; if (p->phy) phy_stop(p->phy); - dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); - - if (ds->ops->port_disable) - ds->ops->port_disable(ds, p->dp->index, p->phy); + dsa_port_disable(dp, p->phy); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); From 088b8749da1e35b0dd9cb0e6500ca1c94c9bf547 Mon Sep 17 00:00:00 2001 From: Rick Farrington Date: Fri, 22 Sep 2017 17:12:43 -0700 Subject: [PATCH 0810/1322] liquidio: allow override of firmware present in flash Signed-off-by: Rick Farrington Signed-off-by: Felix Manlunas Signed-off-by: David S. Miller --- .../net/ethernet/cavium/liquidio/lio_main.c | 68 ++++++++++++------- .../ethernet/cavium/liquidio/liquidio_image.h | 1 + .../ethernet/cavium/liquidio/octeon_device.c | 11 ++- .../ethernet/cavium/liquidio/octeon_device.h | 10 +++ 4 files changed, 64 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index e7f54948173f..ce08f710de0b 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -59,9 +59,9 @@ static int debug = -1; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "NETIF_MSG debug bits"); -static char fw_type[LIO_MAX_FW_TYPE_LEN] = LIO_FW_NAME_TYPE_NIC; +static char fw_type[LIO_MAX_FW_TYPE_LEN] = LIO_FW_NAME_TYPE_AUTO; module_param_string(fw_type, fw_type, sizeof(fw_type), 0444); -MODULE_PARM_DESC(fw_type, "Type of firmware to be loaded. Default \"nic\". Use \"none\" to load firmware from flash."); +MODULE_PARM_DESC(fw_type, "Type of firmware to be loaded (default is \"auto\"), which uses firmware in flash, if present, else loads \"nic\"."); static u32 console_bitmask; module_param(console_bitmask, int, 0644); @@ -1115,10 +1115,10 @@ liquidio_probe(struct pci_dev *pdev, return 0; } -static bool fw_type_is_none(void) +static bool fw_type_is_auto(void) { - return strncmp(fw_type, LIO_FW_NAME_TYPE_NONE, - sizeof(LIO_FW_NAME_TYPE_NONE)) == 0; + return strncmp(fw_type, LIO_FW_NAME_TYPE_AUTO, + sizeof(LIO_FW_NAME_TYPE_AUTO)) == 0; } /** @@ -1302,7 +1302,7 @@ static void octeon_destroy_resources(struct octeon_device *oct) * Implementation note: only soft-reset the device * if it is a CN6XXX OR the LAST CN23XX device. 
*/ - if (fw_type_is_none()) + if (atomic_read(oct->adapter_fw_state) == FW_IS_PRELOADED) octeon_pci_flr(oct); else if (OCTEON_CN6XXX(oct) || !refcount) oct->fn_list.soft_reset(oct); @@ -1934,7 +1934,7 @@ static int load_firmware(struct octeon_device *oct) char fw_name[LIO_MAX_FW_FILENAME_LEN]; char *tmp_fw_type; - if (fw_type[0] == '\0') + if (fw_type_is_auto()) tmp_fw_type = LIO_FW_NAME_TYPE_NIC; else tmp_fw_type = fw_type; @@ -3882,9 +3882,9 @@ octeon_recv_vf_drv_notice(struct octeon_recv_info *recv_info, void *buf) static int octeon_device_init(struct octeon_device *octeon_dev) { int j, ret; - int fw_loaded = 0; char bootcmd[] = "\n"; char *dbg_enb = NULL; + enum lio_fw_state fw_state; struct octeon_device_priv *oct_priv = (struct octeon_device_priv *)octeon_dev->priv; atomic_set(&octeon_dev->status, OCT_DEV_BEGIN_STATE); @@ -3916,24 +3916,40 @@ static int octeon_device_init(struct octeon_device *octeon_dev) octeon_dev->app_mode = CVM_DRV_INVALID_APP; - if (OCTEON_CN23XX_PF(octeon_dev)) { - if (!cn23xx_fw_loaded(octeon_dev) && !fw_type_is_none()) { - fw_loaded = 0; - /* Do a soft reset of the Octeon device. */ - if (octeon_dev->fn_list.soft_reset(octeon_dev)) - return 1; - /* things might have changed */ - if (!cn23xx_fw_loaded(octeon_dev)) - fw_loaded = 0; - else - fw_loaded = 1; - } else { - fw_loaded = 1; - } - } else if (octeon_dev->fn_list.soft_reset(octeon_dev)) { - return 1; + /* CN23XX supports preloaded firmware if the following is true: + * + * The adapter indicates that firmware is currently running AND + * 'fw_type' is 'auto'. + * + * (default state is NEEDS_TO_BE_LOADED, override it if appropriate). + */ + if (OCTEON_CN23XX_PF(octeon_dev) && + cn23xx_fw_loaded(octeon_dev) && fw_type_is_auto()) { + atomic_cmpxchg(octeon_dev->adapter_fw_state, + FW_NEEDS_TO_BE_LOADED, FW_IS_PRELOADED); } + /* If loading firmware, only first device of adapter needs to do so. */ + fw_state = atomic_cmpxchg(octeon_dev->adapter_fw_state, + FW_NEEDS_TO_BE_LOADED, + FW_IS_BEING_LOADED); + + /* Here, [local variable] 'fw_state' is set to one of: + * + * FW_IS_PRELOADED: No firmware is to be loaded (see above) + * FW_NEEDS_TO_BE_LOADED: The driver's first instance will load + * firmware to the adapter. + * FW_IS_BEING_LOADED: The driver's second instance will not load + * firmware to the adapter. + */ + + /* Prior to f/w load, perform a soft reset of the Octeon device; + * if error resetting, return w/error. + */ + if (fw_state == FW_NEEDS_TO_BE_LOADED) + if (octeon_dev->fn_list.soft_reset(octeon_dev)) + return 1; + /* Initialize the dispatch mechanism used to push packets arriving on * Octeon Output queues. 
*/ @@ -4063,7 +4079,7 @@ static int octeon_device_init(struct octeon_device *octeon_dev) atomic_set(&octeon_dev->status, OCT_DEV_IO_QUEUES_DONE); - if ((!OCTEON_CN23XX_PF(octeon_dev)) || !fw_loaded) { + if (fw_state == FW_NEEDS_TO_BE_LOADED) { dev_dbg(&octeon_dev->pci_dev->dev, "Waiting for DDR initialization...\n"); if (!ddr_timeout) { dev_info(&octeon_dev->pci_dev->dev, @@ -4125,6 +4141,8 @@ static int octeon_device_init(struct octeon_device *octeon_dev) dev_err(&octeon_dev->pci_dev->dev, "Could not load firmware to board\n"); return 1; } + + atomic_set(octeon_dev->adapter_fw_state, FW_HAS_BEEN_LOADED); } handshake[octeon_dev->octeon_id].init_ok = 1; diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_image.h b/drivers/net/ethernet/cavium/liquidio/liquidio_image.h index 78a3685f6fe0..5bf5e8791dfb 100644 --- a/drivers/net/ethernet/cavium/liquidio/liquidio_image.h +++ b/drivers/net/ethernet/cavium/liquidio/liquidio_image.h @@ -24,6 +24,7 @@ #define LIO_FW_BASE_NAME "lio_" #define LIO_FW_NAME_SUFFIX ".bin" #define LIO_FW_NAME_TYPE_NIC "nic" +#define LIO_FW_NAME_TYPE_AUTO "auto" #define LIO_FW_NAME_TYPE_NONE "none" #define LIO_MAX_FIRMWARE_VERSION_LEN 16 diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c index 29d53b1763a7..e4aa3395a578 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c @@ -541,6 +541,7 @@ static char oct_dev_app_str[CVM_DRV_APP_COUNT + 1][32] = { static struct octeon_device *octeon_device[MAX_OCTEON_DEVICES]; static atomic_t adapter_refcounts[MAX_OCTEON_DEVICES]; +static atomic_t adapter_fw_states[MAX_OCTEON_DEVICES]; static u32 octeon_device_count; /* locks device array (i.e. octeon_device[]) */ @@ -770,6 +771,10 @@ int octeon_register_device(struct octeon_device *oct, oct->adapter_refcount = &adapter_refcounts[oct->octeon_id]; atomic_set(oct->adapter_refcount, 0); + /* Like the reference count, the f/w state is shared 'per-adapter' */ + oct->adapter_fw_state = &adapter_fw_states[oct->octeon_id]; + atomic_set(oct->adapter_fw_state, FW_NEEDS_TO_BE_LOADED); + spin_lock(&octeon_devices_lock); for (idx = (int)oct->octeon_id - 1; idx >= 0; idx--) { if (!octeon_device[idx]) { @@ -780,11 +785,15 @@ int octeon_register_device(struct octeon_device *oct, atomic_inc(oct->adapter_refcount); return 1; /* here, refcount is guaranteed to be 1 */ } - /* if another device is at same bus/dev, use its refcounter */ + /* If another device is at same bus/dev, use its refcounter + * (and f/w state variable). 
+ */ if ((octeon_device[idx]->loc.bus == bus) && (octeon_device[idx]->loc.dev == dev)) { oct->adapter_refcount = octeon_device[idx]->adapter_refcount; + oct->adapter_fw_state = + octeon_device[idx]->adapter_fw_state; break; } } diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h index 894af199ddef..33d19c4509bc 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h @@ -50,6 +50,13 @@ enum octeon_pci_swap_mode { OCTEON_PCI_32BIT_LW_SWAP = 3 }; +enum lio_fw_state { + FW_IS_PRELOADED = 0, + FW_NEEDS_TO_BE_LOADED = 1, + FW_IS_BEING_LOADED = 2, + FW_HAS_BEEN_LOADED = 3, +}; + enum { OCTEON_CONFIG_TYPE_DEFAULT = 0, NUM_OCTEON_CONFS, @@ -557,6 +564,9 @@ struct octeon_device { } loc; atomic_t *adapter_refcount; /* reference count of adapter */ + + atomic_t *adapter_fw_state; /* per-adapter, lio_fw_state */ + bool ptp_enable; }; From b36e48209157fdd98a5589a3dd60ff3fbf51e16d Mon Sep 17 00:00:00 2001 From: Rick Farrington Date: Fri, 22 Sep 2017 17:12:47 -0700 Subject: [PATCH 0811/1322] liquidio: verify firmware version when auto-loaded from flash. Signed-off-by: Rick Farrington Signed-off-by: Felix Manlunas Signed-off-by: David S. Miller --- .../net/ethernet/cavium/liquidio/lio_main.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index ce08f710de0b..a3c9867c0340 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -3303,7 +3303,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) { struct lio *lio = NULL; struct net_device *netdev; - u8 mac[6], i, j; + u8 mac[6], i, j, *fw_ver; struct octeon_soft_command *sc; struct liquidio_if_cfg_context *ctx; struct liquidio_if_cfg_resp *resp; @@ -3414,6 +3414,22 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) goto setup_nic_dev_fail; } + /* Verify f/w version (in case of 'auto' loading from flash) */ + fw_ver = octeon_dev->fw_info.liquidio_firmware_version; + if (memcmp(LIQUIDIO_BASE_VERSION, + fw_ver, + strlen(LIQUIDIO_BASE_VERSION))) { + dev_err(&octeon_dev->pci_dev->dev, + "Unmatched firmware version. Expected %s.x, got %s.\n", + LIQUIDIO_BASE_VERSION, fw_ver); + goto setup_nic_dev_fail; + } else if (atomic_read(octeon_dev->adapter_fw_state) == + FW_IS_PRELOADED) { + dev_info(&octeon_dev->pci_dev->dev, + "Using auto-loaded firmware version %s.\n", + fw_ver); + } + octeon_swap_8B_data((u64 *)(&resp->cfg_info), (sizeof(struct liquidio_if_cfg_info)) >> 3); From 429cbf6bde1adff108171ad4b2387e62f851d609 Mon Sep 17 00:00:00 2001 From: Rick Farrington Date: Fri, 22 Sep 2017 17:12:51 -0700 Subject: [PATCH 0812/1322] liquidio: update module parameter fw_type to reflect firmware type loaded Signed-off-by: Rick Farrington Signed-off-by: Felix Manlunas Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cavium/liquidio/lio_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index a3c9867c0340..963803bc6633 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -1934,10 +1934,12 @@ static int load_firmware(struct octeon_device *oct) char fw_name[LIO_MAX_FW_FILENAME_LEN]; char *tmp_fw_type; - if (fw_type_is_auto()) + if (fw_type_is_auto()) { tmp_fw_type = LIO_FW_NAME_TYPE_NIC; - else + strncpy(fw_type, tmp_fw_type, sizeof(fw_type)); + } else { tmp_fw_type = fw_type; + } sprintf(fw_name, "%s%s%s_%s%s", LIO_FW_DIR, LIO_FW_BASE_NAME, octeon_get_conf(oct)->card_name, tmp_fw_type, From ba581f77df23c8ee70b372966e69cf10bc5453d8 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Sat, 23 Sep 2017 16:07:28 +0530 Subject: [PATCH 0813/1322] cxgb4: do DCB state reset in couple of places reset the driver's DCB state in couple of places where it was missing. Signed-off-by: Casey Leedom Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c | 15 +++++++++++---- drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h | 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 10 ++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c index 6ee2ed30626b..4e7f72b17e82 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c @@ -40,8 +40,7 @@ static inline bool cxgb4_dcb_state_synced(enum cxgb4_dcb_state state) return false; } -/* Initialize a port's Data Center Bridging state. Typically used after a - * Link Down event. +/* Initialize a port's Data Center Bridging state. */ void cxgb4_dcb_state_init(struct net_device *dev) { @@ -106,6 +105,15 @@ static void cxgb4_dcb_cleanup_apps(struct net_device *dev) } } +/* Reset a port's Data Center Bridging state. Typically used after a + * Link Down event. + */ +void cxgb4_dcb_reset(struct net_device *dev) +{ + cxgb4_dcb_cleanup_apps(dev); + cxgb4_dcb_state_init(dev); +} + /* Finite State machine for Data Center Bridging. */ void cxgb4_dcb_state_fsm(struct net_device *dev, @@ -194,8 +202,7 @@ void cxgb4_dcb_state_fsm(struct net_device *dev, * state. We need to reset back to a ground state * of incomplete. 
*/ - cxgb4_dcb_cleanup_apps(dev); - cxgb4_dcb_state_init(dev); + cxgb4_dcb_reset(dev); dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE; dcb->supported = CXGB4_DCBX_FW_SUPPORT; linkwatch_fire_event(dev); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h index ccf24d3dc982..02040b99c78a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h @@ -131,6 +131,7 @@ struct port_dcb_info { void cxgb4_dcb_state_init(struct net_device *); void cxgb4_dcb_version_init(struct net_device *); +void cxgb4_dcb_reset(struct net_device *dev); void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input); void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *); void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index aa93ae95d3b9..13b636b0af5f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -281,7 +281,7 @@ void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat) else { #ifdef CONFIG_CHELSIO_T4_DCB if (cxgb4_dcb_enabled(dev)) { - cxgb4_dcb_state_init(dev); + cxgb4_dcb_reset(dev); dcb_tx_queue_prio_enable(dev, false); } #endif /* CONFIG_CHELSIO_T4_DCB */ @@ -2304,10 +2304,16 @@ static int cxgb_close(struct net_device *dev) { struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; + int ret; netif_tx_stop_all_queues(dev); netif_carrier_off(dev); - return t4_enable_vi(adapter, adapter->pf, pi->viid, false, false); + ret = t4_enable_vi(adapter, adapter->pf, pi->viid, false, false); +#ifdef CONFIG_CHELSIO_T4_DCB + cxgb4_dcb_reset(dev); + dcb_tx_queue_prio_enable(dev, false); +#endif + return ret; } int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid, From 9484dc74fcf0750cd6726c9aa27edf97223916a8 Mon Sep 17 00:00:00 2001 From: yuan linyu Date: Sat, 23 Sep 2017 22:36:52 +0800 Subject: [PATCH 0814/1322] tun: delete original tun_get() and rename __tun_get() to tun_get() it seems no need to keep tun_get() and __tun_get() at same time. Signed-off-by: yuan linyu Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9880b3bc8fa5..2c36f6ebad79 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -774,7 +774,7 @@ out: return err; } -static struct tun_struct *__tun_get(struct tun_file *tfile) +static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; @@ -787,11 +787,6 @@ static struct tun_struct *__tun_get(struct tun_file *tfile) return tun; } -static struct tun_struct *tun_get(struct file *file) -{ - return __tun_get(file->private_data); -} - static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); @@ -1250,7 +1245,7 @@ static void tun_net_init(struct net_device *dev) static unsigned int tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); struct sock *sk; unsigned int mask = 0; @@ -1784,8 +1779,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct tun_struct *tun = tun_get(file); struct tun_file *tfile = file->private_data; + struct tun_struct *tun = tun_get(tfile); ssize_t result; if (!tun) @@ -1969,7 +1964,7 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; if (!tun) @@ -2046,7 +2041,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; struct tun_file *tfile = container_of(sock, struct tun_file, socket); - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); if (!tun) return -EBADFD; @@ -2062,7 +2057,7 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); - struct tun_struct *tun = __tun_get(tfile); + struct tun_struct *tun = tun_get(tfile); int ret; if (!tun) @@ -2094,7 +2089,7 @@ static int tun_peek_len(struct socket *sock) struct tun_struct *tun; int ret = 0; - tun = __tun_get(tfile); + tun = tun_get(tfile); if (!tun) return 0; @@ -2490,7 +2485,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ret = 0; rtnl_lock(); - tun = __tun_get(tfile); + tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) @@ -2837,15 +2832,16 @@ static int tun_chr_close(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS -static void tun_chr_show_fdinfo(struct seq_file *m, struct file *f) +static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { + struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); - tun = tun_get(f); + tun = tun_get(tfile); if (tun) tun_get_iff(current->nsproxy->net_ns, tun, &ifr); rtnl_unlock(); From 3aa605f28b0d004a640a826380b39c7dcf70195d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Sep 2017 11:07:28 -0700 Subject: [PATCH 0815/1322] sch_netem: faster rb tree removal While running TCP tests involving netem storing millions of packets, I had the idea to speed up tfifo_reset() and did experiments. 
I tried the rbtree_postorder_for_each_entry_safe() method that is used in skb_rbtree_purge() but discovered it was slower than the current tfifo_reset() method. I measured time taken to release skbs with three occupation levels : 10^4, 10^5 and 10^6 skbs with three methods : 1) (current 'naive' method) while ((p = rb_first(&q->t_root))) { struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); rtnl_kfree_skbs(skb, skb); } 2) Use rb_next() instead of rb_first() in the loop : p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = netem_rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } 3) "optimized" method using rbtree_postorder_for_each_entry_safe() struct sk_buff *skb, *next; rbtree_postorder_for_each_entry_safe(skb, next, &q->t_root, rbnode) { rtnl_kfree_skbs(skb, skb); } q->t_root = RB_ROOT; Results : method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 10000 skbs in 541846 ns (54 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb) method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 100000 skbs in 5942456 ns (59 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb) method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 1000000 skbs in 82619635 ns (82 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb) Method 2) is simply faster, probably because it maintains a smaller working size set. Note that this is the method we use in tcp_ofo_queue() already. I will also change skb_rbtree_purge() in a second patch. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 063a4bdb9ee6..5a4f10080290 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -361,12 +361,13 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct rb_node *p; + struct rb_node *p = rb_first(&q->t_root); - while ((p = rb_first(&q->t_root))) { + while (p) { struct sk_buff *skb = netem_rb_to_skb(p); - rb_erase(p, &q->t_root); + p = rb_next(p); + rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } } From 7c90584c66cc4b033a3b684b0e0950f79e7b7166 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Sep 2017 12:39:12 -0700 Subject: [PATCH 0816/1322] net: speed up skb_rbtree_purge() As measured in my prior patch ("sch_netem: faster rb tree removal"), rbtree_postorder_for_each_entry_safe() is nice looking but much slower than using rb_next() directly, except when tree is small enough to fit in CPU caches (then the cost is the same) Also note that there is not even an increase of text size : $ size net/core/skbuff.o.before net/core/skbuff.o text data bss dec hex filename 40711 1298 0 42009 a419 net/core/skbuff.o.before 40711 1298 0 42009 a419 net/core/skbuff.o From: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16982de649b9..000ce735fa8d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2848,12 +2848,15 @@ EXPORT_SYMBOL(skb_queue_purge); */ void skb_rbtree_purge(struct rb_root *root) { - struct sk_buff *skb, *next; + struct rb_node *p = rb_first(root); - rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); kfree_skb(skb); - - *root = RB_ROOT; + } } /** From e451ae8e4f6b3f6bd3b83a5595657b5421b3bf69 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:01:06 +0300 Subject: [PATCH 0817/1322] neigh: make struct neigh_table::entry_size unsigned int Neigh entry size can't be negative. Space savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-7 (-7) function old new delta lowpan_neigh_construct 25 24 -1 clip_seq_sub_iter 152 151 -1 clip_ioctl 1475 1474 -1 clip_constructor 93 92 -1 __neigh_create 2455 2452 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9816df225af3..9a25512e0a6e 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -190,7 +190,7 @@ struct neigh_hash_table { struct neigh_table { int family; - int entry_size; + unsigned int entry_size; int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, From 01ccdf126ca5f9d4fe0889f65ee67afac910f19c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:03:04 +0300 Subject: [PATCH 0818/1322] neigh: make struct neigh_table::key_len unsigned int Key length can't be negative. Leave comparisons against nla_len() signed just in case truncated attribute can sneak in there. Space savings: add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-7 (-7) function old new delta pneigh_delete 273 272 -1 mlx5e_rep_netevent_event 1415 1414 -1 mlx5e_create_encap_header_ipv6 1194 1193 -1 mlx5e_create_encap_header_ipv4 1071 1070 -1 cxgb4_l2t_get 1104 1103 -1 __pneigh_lookup 69 68 -1 __neigh_create 2452 2451 -1 Signed-off-by: Alexey Dobriyan Signed-off-by: David S.
Miller --- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 4 ++-- include/net/neighbour.h | 2 +- net/core/neighbour.c | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index f7ef8871dd0b..1817a0307d26 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -422,7 +422,7 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, u8 lport; u16 vlan; struct l2t_entry *e; - int addr_len = neigh->tbl->key_len; + unsigned int addr_len = neigh->tbl->key_len; u32 *addr = (u32 *)neigh->primary_key; int ifidx = neigh->dev->ifindex; int hash = addr_hash(d, addr, addr_len, ifidx); @@ -536,7 +536,7 @@ void t4_l2t_update(struct adapter *adap, struct neighbour *neigh) struct l2t_entry *e; struct sk_buff_head *arpq = NULL; struct l2t_data *d = adap->l2t; - int addr_len = neigh->tbl->key_len; + unsigned int addr_len = neigh->tbl->key_len; u32 *addr = (u32 *) neigh->primary_key; int ifidx = neigh->dev->ifindex; int hash = addr_hash(d, addr, addr_len, ifidx); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a25512e0a6e..2492000e1035 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -191,7 +191,7 @@ struct neigh_hash_table { struct neigh_table { int family; unsigned int entry_size; - int key_len; + unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 16a1a4c4eb57..6ea3a1a7f36a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -457,7 +457,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey) { struct neighbour *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val; struct neigh_hash_table *nht; @@ -488,7 +488,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; @@ -572,7 +572,7 @@ out_neigh_release: } EXPORT_SYMBOL(__neigh_create); -static u32 pneigh_hash(const void *pkey, int key_len) +static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); @@ -585,7 +585,7 @@ static u32 pneigh_hash(const void *pkey, int key_len) static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, - int key_len, + unsigned int key_len, struct net_device *dev) { while (n) { @@ -601,7 +601,7 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], @@ -614,7 +614,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net_device *dev, int creat) { struct pneigh_entry *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); @@ -659,7 +659,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { 
struct pneigh_entry *n, **np; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); @@ -1662,7 +1662,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) goto out; if (ndm->ndm_flags & NTF_PROXY) { @@ -1730,7 +1730,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; From 4618e90965f272fe522f2af2523a60d0d4bc78f3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:10 +0200 Subject: [PATCH 0819/1322] x86/fpu: Fix fpu__activate_fpstate_read() and update comments fpu__activate_fpstate_read() can be called for the current task when coredumping - or for stopped tasks when ptrace-ing. Implement this properly in the code and update the comments. This also fixes an incorrect (but harmless) warning introduced by one of the earlier patches. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-28-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 93103a909c47..afd3f2a5c64e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -254,18 +254,21 @@ EXPORT_SYMBOL_GPL(fpu__activate_curr); /* * This function must be called before we read a task's fpstate. * - * If the task has not used the FPU before then initialize its - * fpstate. + * There's two cases where this gets called: + * + * - for the current task (when coredumping), in which case we have + * to save the latest FPU registers into the fpstate, + * + * - or it's called for stopped tasks (ptrace), in which case the + * registers were already saved by the context-switch code when + * the task scheduled out - we only have to initialize the registers + * if they've never been initialized. * * If the task has used the FPU before then save it. */ void fpu__activate_fpstate_read(struct fpu *fpu) { - /* - * If fpregs are active (in the current CPU), then - * copy them to the fpstate: - */ - if (fpu->fpstate_active) { + if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { if (!fpu->fpstate_active) { From 685c930d6e58e31e251ec354f9dca3958a4c5040 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:11 +0200 Subject: [PATCH 0820/1322] x86/fpu: Remove fpu__current_fpstate_write_begin/end() These functions are not used anymore, so remove them. 
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Bobby Powers Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-29-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 - arch/x86/kernel/fpu/core.c | 63 ----------------------------- 2 files changed, 65 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index cf290d424e48..508e4181c4af 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,8 +26,6 @@ extern void fpu__activate_curr(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); -extern void fpu__current_fpstate_write_begin(void); -extern void fpu__current_fpstate_write_end(void); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index afd3f2a5c64e..b2cdeb3b1860 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -316,69 +316,6 @@ void fpu__activate_fpstate_write(struct fpu *fpu) } } -/* - * This function must be called before we write the current - * task's fpstate. - * - * This call gets the current FPU register state and moves - * it in to the 'fpstate'. Preemption is disabled so that - * no writes to the 'fpstate' can occur from context - * swiches. - * - * Must be followed by a fpu__current_fpstate_write_end(). - */ -void fpu__current_fpstate_write_begin(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * Ensure that the context-switching code does not write - * over the fpstate while we are doing our update. - */ - preempt_disable(); - - /* - * Move the fpregs in to the fpu's 'fpstate'. - */ - fpu__activate_fpstate_read(fpu); - - /* - * The caller is about to write to 'fpu'. Ensure that no - * CPU thinks that its fpregs match the fpstate. This - * ensures we will not be lazy and skip a XRSTOR in the - * future. - */ - __fpu_invalidate_fpregs_state(fpu); -} - -/* - * This function must be paired with fpu__current_fpstate_write_begin() - * - * This will ensure that the modified fpstate gets placed back in - * the fpregs if necessary. - * - * Note: This function may be called whether or not an _actual_ - * write to the fpstate occurred. - */ -void fpu__current_fpstate_write_end(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * 'fpu' now has an updated copy of the state, but the - * registers may still be out of date. Update them with - * an XRSTOR if they are active. - */ - if (fpu->fpstate_active) - copy_kernel_to_fpregs(&fpu->state); - - /* - * Our update is done and the fpregs/fpstate are in sync - * if necessary. Context switches can happen again. - */ - preempt_enable(); -} - /* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate From e4a81bfcaae1ebbdc6efe74e8ea563144d90e9a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 Sep 2017 09:43:36 +0200 Subject: [PATCH 0821/1322] x86/fpu: Rename fpu::fpstate_active to fpu::initialized The x86 FPU code used to have a complex state machine where both the FPU registers and the FPU state context could be 'active' (or inactive) independently of each other - which enabled features like lazy FPU restore. 
Much of this complexity is gone in the current code: now we basically can have FPU-less tasks (kernel threads) that don't use (and save/restore) FPU state at all, plus full FPU users that save/restore directly with no laziness whatsoever. But the fpu::fpstate_active still carries bits of the old complexity - meanwhile this flag has become a simple flag that shows whether the FPU context saving area in the thread struct is initialized and used, or not. Rename it to fpu::initialized to express this simplicity in the name as well. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-30-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/include/asm/fpu/types.h | 6 +++--- arch/x86/include/asm/trace/fpu.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 24 ++++++++++++------------ arch/x86/kernel/fpu/init.c | 2 +- arch/x86/kernel/fpu/regset.c | 6 +++--- arch/x86/kernel/fpu/signal.c | 8 ++++---- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/signal.c | 6 +++--- arch/x86/mm/pkeys.c | 2 +- 11 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e0bb46c02857..0e2a5edbce00 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -231,7 +231,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->fpstate_active) { + if (fpu->initialized) { unsigned long fx_aligned, math_size; sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 508e4181c4af..b26ae05da18a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -527,7 +527,7 @@ static inline void fpregs_activate(struct fpu *fpu) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpstate_active) { + if (old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -550,7 +550,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active; + new_fpu->initialized; if (preload) { if (!fpregs_state_valid(new_fpu, cpu)) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 71db45ca8870..a1520575d86b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -293,13 +293,13 @@ struct fpu { unsigned int last_cpu; /* - * @fpstate_active: + * @initialized: * - * This flag indicates whether this context is active: if the task + * This flag indicates whether this context is initialized: if the task * is not running then we can restore from this context, if the task * is running then we should save into this context. 
*/ - unsigned char fpstate_active; + unsigned char initialized; /* * @state: diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index da565aae9fd2..39f7a27bef13 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -12,22 +12,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpstate_active) + __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->fpstate_active = fpu->fpstate_active; + __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpstate_active, + __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index b2cdeb3b1860..c8d6032f04d0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpstate_active) { + if (fpu->initialized) { /* * Ignore return value -- we don't care if reg state * is clobbered. @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpstate_active) + if (fpu->initialized) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->fpstate_active) { + if (fpu->initialized) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -191,7 +191,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) + if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -240,13 +240,13 @@ void fpu__activate_curr(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } EXPORT_SYMBOL_GPL(fpu__activate_curr); @@ -271,13 +271,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu) if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } } @@ -303,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == ¤t->thread.fpu); - if (fpu->fpstate_active) { + if (fpu->initialized) { /* Invalidate any lazy state: */ __fpu_invalidate_fpregs_state(fpu); } else { @@ -312,7 +312,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } @@ -354,7 +354,7 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - if (fpu->fpstate_active) { + if (fpu->initialized) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" 
"2:\n" @@ -363,7 +363,7 @@ void fpu__drop(struct fpu *fpu) } } - fpu->fpstate_active = 0; + fpu->initialized = 0; trace_x86_fpu_dropped(fpu); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d5d44c452624..7affb7e3d9a5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - WARN_ON_FPU(current->thread.fpu.fpstate_active); + WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c764f7405322..19e82334e811 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -16,14 +16,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r { struct fpu *target_fpu = &target->thread.fpu; - return target_fpu->fpstate_active ? regset->n : 0; + return target_fpu->initialized ? regset->n : 0; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) return regset->n; else return 0; @@ -380,7 +380,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) struct fpu *fpu = &tsk->thread.fpu; int fpvalid; - fpvalid = fpu->fpstate_active; + fpvalid = fpu->initialized; if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index da68ea1c3a44..ab2dd24cfea4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -171,7 +171,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->fpstate_active || using_compacted_format()) { + if (fpu->initialized || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; @@ -315,12 +315,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int err = 0; /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures + * Drop the current fpu which clears fpu->initialized. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. + * fpu->initialized is again set. 
*/ fpu__drop(fpu); @@ -342,7 +342,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } - fpu->fpstate_active = 1; + fpu->initialized = 1; preempt_disable(); fpu__restore(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fda1109cc355..703e76d027ee 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -867,7 +867,7 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->fpstate_active) + if (!fpu->initialized) return NULL; /* * fpu__save() takes the CPU's xstate registers diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e04442345fc0..4e188fda5961 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -263,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->fpstate_active) { + if (fpu->initialized) { sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -279,7 +279,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->fpstate_active && + if (fpu->initialized && copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -755,7 +755,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->fpstate_active) + if (fpu->initialized) fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 4d24269c071f..d7bc0eea20a5 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -44,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.fpstate_active && + current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; From 7f1487c59b7c6dcb20155f4302985da2659a2997 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:13 +0200 Subject: [PATCH 0822/1322] x86/fpu: Fix stale comments about lazy FPU logic We don't do any lazy restore anymore, what we have are two pieces of optimization: - no-FPU tasks that don't save/restore the FPU context (kernel threads are such) - cached FPU registers maintained via the fpu->last_cpu field. This means that if an FPU task context switches to a non-FPU task then we can maintain the FPU registers as an in-FPU copies (cache), and skip the restoration of them once we switch back to the original FPU-using task. Update all the comments that still referred to old 'lazy' and 'unlazy' concepts. 
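(Aside: a conceptual sketch of the register-cache idea described above — restore from memory only when fpu->last_cpu no longer matches the CPU we are running on. This is deliberately simplified for illustration and is not the kernel's actual helper; the real check also verifies which fpu context currently owns this CPU's registers.)

/* Simplified illustration only, not the kernel's actual helpers. */
static inline bool fpu_regs_cached(struct fpu *fpu, unsigned int cpu)
{
	/* Registers were last loaded on this CPU for this context. */
	return fpu->last_cpu == cpu;
}

static void sketch_switch_fpu_finish(struct fpu *new_fpu, unsigned int cpu)
{
	/* Skip the memory-to-register restore when the cached copy is still valid. */
	if (!fpu_regs_cached(new_fpu, cpu))
		copy_kernel_to_fpregs(&new_fpu->state);
}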
Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-31-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c8d6032f04d0..77668d91fdc1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -205,9 +205,6 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * In lazy mode, if the FPU context isn't loaded into - * fpregs, CR0.TS will be set and do_device_not_available - * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -285,13 +282,13 @@ void fpu__activate_fpstate_read(struct fpu *fpu) /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then unlazy it. + * If the task has used the FPU before then invalidate any cached FPU registers. * If the task has not used the FPU before then initialize its fpstate. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we - * didn't clear its lazy status here then the lazy in-registers + * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ @@ -304,7 +301,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) WARN_ON_FPU(fpu == ¤t->thread.fpu); if (fpu->initialized) { - /* Invalidate any lazy state: */ + /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); From e10078eba69859359ce8644dd423b4132a6a8913 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:14 +0200 Subject: [PATCH 0823/1322] x86/fpu: Simplify and speed up fpu__copy() fpu__copy() has a preempt_disable()/enable() pair, which it had to do to be able to atomically unlazy the current task when doing an FNSAVE. But we don't unlazy tasks anymore, we always do direct saves/restores of FPU context. So remove both the unnecessary critical section, and update the comments. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-32-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 77668d91fdc1..52122dd418ae 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -206,22 +206,13 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. * - * We have to do all this with preemption disabled, - * mostly because of the FNSAVE case, because in that - * case we must not allow preemption in the window - * between the FNSAVE and us marking the context lazy. 
- * - * It shouldn't be an issue as even FNSAVE is plenty - * fast in terms of critical section length. + * ( The function 'fails' in the FNSAVE case, which destroys + * register contents so we have to copy them back. ) */ - preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, - fpu_kernel_xstate_size); - + memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); copy_kernel_to_fpregs(&src_fpu->state); } - preempt_enable(); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); From 2ce03d850b9a2f17d55596ecfa86e72b5687a627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 15:00:15 +0200 Subject: [PATCH 0824/1322] x86/fpu: Rename fpu__activate_curr() to fpu__initialize() Rename this function to better express that it's all about initializing the FPU state of a task which goes hand in hand with the fpu::initialized field. Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Eric Biggers Cc: Fenghua Yu Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/20170923130016.21448-33-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/kernel/fpu/core.c | 8 ++++---- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index b26ae05da18a..7c980aafb8aa 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -23,7 +23,7 @@ /* * High level FPU state handling functions: */ -extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__initialize(struct fpu *fpu); extern void fpu__activate_fpstate_read(struct fpu *fpu); extern void fpu__activate_fpstate_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 52122dd418ae..07db9d94b68b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -224,7 +224,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__activate_curr(struct fpu *fpu) +void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); @@ -237,7 +237,7 @@ void fpu__activate_curr(struct fpu *fpu) fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__activate_curr); +EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. 
@@ -316,7 +316,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ void fpu__restore(struct fpu *fpu) { - fpu__activate_curr(fpu); + fpu__initialize(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -392,7 +392,7 @@ void fpu__clear(struct fpu *fpu) */ if (static_cpu_has(X86_FEATURE_FPU)) { preempt_disable(); - fpu__activate_curr(fpu); + fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); preempt_enable(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ab2dd24cfea4..7fa3bdb331e9 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -280,7 +280,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd17b7d9a107..03869eb7fcd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7225,7 +7225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index d4a7df2205b8..220638a4cb94 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -114,7 +114,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - fpu__activate_curr(fpu); + fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { From 369a036de206710ff27a66f9bffe78ef657648c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 23 Sep 2017 13:37:45 +0200 Subject: [PATCH 0825/1322] x86/fpu: Rename fpu__activate_fpstate_read/write() to fpu__prepare_[read|write]() As per the new nomenclature we don't 'activate' the FPU state anymore, we initialize it. So drop the _activate_fpstate name from these functions, which were a bit of a mouthful anyway, and name them: fpu__prepare_read() fpu__prepare_write() Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/fpu/regset.c | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7c980aafb8aa..e3221ffa304e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -24,8 +24,8 @@ * High level FPU state handling functions: */ extern void fpu__initialize(struct fpu *fpu); -extern void fpu__activate_fpstate_read(struct fpu *fpu); -extern void fpu__activate_fpstate_write(struct fpu *fpu); +extern void fpu__prepare_read(struct fpu *fpu); +extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 07db9d94b68b..f92a6593de1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -254,7 +254,7 @@ EXPORT_SYMBOL_GPL(fpu__initialize); * * If the task has used the FPU before then save it. */ -void fpu__activate_fpstate_read(struct fpu *fpu) +void fpu__prepare_read(struct fpu *fpu) { if (fpu == ¤t->thread.fpu) { fpu__save(fpu); @@ -283,7 +283,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu) * state pending on its former CPU could be restored, corrupting * the modifications. */ -void fpu__activate_fpstate_write(struct fpu *fpu) +void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 19e82334e811..ee8d2f049818 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -38,7 +38,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -55,7 +55,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -89,7 +89,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (using_compacted_format()) { if (kbuf) @@ -132,7 +132,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) { if (kbuf) @@ -310,7 +310,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -340,7 +340,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); if 
(!boot_cpu_has(X86_FEATURE_FPU)) From e63e5d5c15c6b1dba26f7cbd1b1089a1d6155db5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:04 +0200 Subject: [PATCH 0826/1322] x86/fpu: Introduce validate_xstate_header() Move validation of user-supplied xstate_header into a helper function, in preparation of calling it from both the ptrace and sigreturn syscall paths. The new function also considers it to be an error if *any* reserved bits are set, whereas before we were just clearing most of them silently. This should reduce the chance of bugs that fail to correctly validate user-supplied XSAVE areas. It also will expose any broken userspace programs that set the other reserved bits; this is desirable because such programs will lose compatibility with future CPUs and kernels if those bits are ever used for anything. (There shouldn't be any such programs, and in fact in the case where the compacted format is in use we were already validating xfeatures. But you never know...) Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-2-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/xstate.h | 4 ++++ arch/x86/kernel/fpu/xstate.c | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 579ac2358e63..83fee2469eb7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -52,4 +52,8 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); + +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +extern int validate_xstate_header(const struct xstate_header *hdr); + #endif diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 703e76d027ee..2427aeea33b5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -483,6 +483,30 @@ int using_compacted_format(void) return boot_cpu_has(X86_FEATURE_XSAVES); } +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +int validate_xstate_header(const struct xstate_header *hdr) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! 
+ */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + static void __xstate_dump_leaves(void) { int i; From cf9df81b139b6ebaec188d73758f02ca3b2110e4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:05 +0200 Subject: [PATCH 0827/1322] x86/fpu: Use validate_xstate_header() to validate the xstate_header in xstateregs_set() Tighten the checks in xstateregs_set(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-3-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ee8d2f049818..b831d5b9de99 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -141,28 +141,21 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, ret = copy_user_to_xstate(xsave, ubuf); } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - - /* xcomp_bv must be 0 when using uncompacted format */ - if (!ret && xsave->header.xcomp_bv) - ret = -EINVAL; + if (!ret) + ret = validate_xstate_header(&xsave->header); } + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + /* * In case of failure, mark all states as init: */ if (ret) fpstate_init(&fpu->state); - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xfeatures &= xfeatures_mask; - /* - * These bits must be zero. - */ - memset(&xsave->header.reserved, 0, 48); - return ret; } From b11e2e18a7fc8eaa3d592c260d50c7129e094ded Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:06 +0200 Subject: [PATCH 0828/1322] x86/fpu: Use validate_xstate_header() to validate the xstate_header in __fpu__restore_sig() Tighten the checks in __fpu__restore_sig() and update comments. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-4-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/signal.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 7fa3bdb331e9..fb639e70048f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -214,8 +214,11 @@ sanitize_restored_xstate(struct task_struct *tsk, struct xstate_header *header = &xsave->header; if (use_xsave()) { - /* These bits must be zero. 
*/ - memset(header->reserved, 0, 48); + /* + * Note: we don't need to zero the reserved bits in the + * xstate_header here because we either didn't copy them at all, + * or we checked earlier that they aren't set. + */ /* * Init the state that is not present in the memory @@ -224,7 +227,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= (xfeatures_mask & xfeatures); + header->xfeatures &= xfeatures; } if (use_fxsr()) { @@ -308,7 +311,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) /* * For 32-bit frames with fxstate, copy the user state to the * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. + * header. Validate and sanitize the copied state. */ struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; @@ -329,9 +332,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - /* xcomp_bv must be 0 when using uncompacted format */ - if (!err && state_size > offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) - err = -EINVAL; + if (!err && state_size > offsetof(struct xregs_state, header)) + err = validate_xstate_header(&fpu->state.xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { From 80d8ae86b36791a545ca28ddc95133ea59bba6e0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:07 +0200 Subject: [PATCH 0829/1322] x86/fpu: Copy the full state_header in copy_kernel_to_xstate() This is in preparation to verify the full xstate header as supplied by user-space. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-5-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2427aeea33b5..02591b96bb25 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1148,11 +1148,13 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) int i; u64 xfeatures; u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - memcpy(&xfeatures, kbuf + offset, size); + memcpy(&hdr, kbuf + offset, size); + xfeatures = hdr.xfeatures; /* * Reject if the user sets any disabled or supervisor features: From b89eda482d7849a1c146b6d0a42f4e76369bb08e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:08 +0200 Subject: [PATCH 0830/1322] x86/fpu: Eliminate the 'xfeatures' local variable in copy_kernel_to_xstate() We have this information in the xstate_header. 
Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-6-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 02591b96bb25..c97c4a9db52a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1146,7 +1146,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 xfeatures; u64 allowed_features; struct xstate_header hdr; @@ -1154,20 +1153,19 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) size = sizeof(hdr); memcpy(&hdr, kbuf + offset, size); - xfeatures = hdr.xfeatures; /* * Reject if the user sets any disabled or supervisor features: */ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - if (xfeatures & ~allowed_features) + if (hdr.xfeatures & ~allowed_features) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; @@ -1177,7 +1175,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } } - if (xfeatures_mxcsr_quirk(xfeatures)) { + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; memcpy(&xsave->i387.mxcsr, kbuf + offset, size); @@ -1192,7 +1190,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } From af95774b3ca080b0e1e651c0fc7680f3444ddda7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:09 +0200 Subject: [PATCH 0831/1322] x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_kernel_to_xstate() Tighten the checks in copy_kernel_to_xstate(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-7-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c97c4a9db52a..325db7850335 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1138,15 +1138,12 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * and copy to the target thread. 
This is called from xstateregs_set(). */ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 allowed_features; struct xstate_header hdr; offset = offsetof(struct xregs_state, header); @@ -1154,12 +1151,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - /* - * Reject if the user sets any disabled or supervisor features: - */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - - if (hdr.xfeatures & ~allowed_features) + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { From af2c4322d986a08a6e793b74b83a62b325019c20 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:10 +0200 Subject: [PATCH 0832/1322] x86/fpu: Copy the full header in copy_user_to_xstate() This is in preparation to verify the full xstate header as supplied by user-space. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-8-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 325db7850335..0cd7b73c25e8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1199,13 +1199,16 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) int i; u64 xfeatures; u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - if (__copy_from_user(&xfeatures, ubuf + offset, size)) + if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; + xfeatures = hdr.xfeatures; + /* * Reject if the user sets any disabled or supervisor features: */ From 3d703477bcfe8bb57079d97198cf1e342fe1fef9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:11 +0200 Subject: [PATCH 0833/1322] x86/fpu: Eliminate the 'xfeatures' local variable in copy_user_to_xstate() We now have this field in hdr.xfeatures. 
Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-9-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0cd7b73c25e8..b6d78b78b5c2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1197,7 +1197,6 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; - u64 xfeatures; u64 allowed_features; struct xstate_header hdr; @@ -1207,20 +1206,18 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - xfeatures = hdr.xfeatures; - /* * Reject if the user sets any disabled or supervisor features: */ allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - if (xfeatures & ~allowed_features) + if (hdr.xfeatures & ~allowed_features) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; @@ -1231,7 +1228,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) } } - if (xfeatures_mxcsr_quirk(xfeatures)) { + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) @@ -1247,7 +1244,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } From 98c0fad9d60e8b2cd47e15b7bee7df343648f5bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:12 +0200 Subject: [PATCH 0834/1322] x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_user_to_xstate() Tighten the checks in copy_user_to_xstate(). Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-10-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/xstate.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b6d78b78b5c2..f1d5476c9022 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1188,16 +1188,15 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) } /* - * Convert from a ptrace standard-format user-space buffer to kernel XSAVES format - * and copy to the target thread. 
This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * Convert from a ptrace or sigreturn standard-format user-space buffer to + * kernel XSAVES format and copy to the target thread. This is called from + * xstateregs_set(), as well as potentially from the sigreturn() and + * rt_sigreturn() system calls. */ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { unsigned int offset, size; int i; - u64 allowed_features; struct xstate_header hdr; offset = offsetof(struct xregs_state, header); @@ -1206,12 +1205,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - /* - * Reject if the user sets any disabled or supervisor features: - */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; - - if (hdr.xfeatures & ~allowed_features) + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { From 738f48cb5fdd5878d11934f1898aa2bcf1578289 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Sep 2017 12:59:13 +0200 Subject: [PATCH 0835/1322] x86/fpu: Use using_compacted_format() instead of open coded X86_FEATURE_XSAVES This is the canonical method to use. Signed-off-by: Eric Biggers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kees Cook Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20170924105913.9157-11-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/regset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b831d5b9de99..3ea151372389 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -134,7 +134,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, fpu__prepare_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (using_compacted_format()) { if (kbuf) ret = copy_kernel_to_xstate(xsave, kbuf); else From a98c75fcd0ec02623f4f56d824d76e659410a52b Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Aug 2017 19:13:26 +0200 Subject: [PATCH 0836/1322] drm/tegra: trace: Fix path to include The TRACE_INCLUDE_FILE macro needs to specify the path relative to the define_trace.h header rather than relative to the file defining it. Reported-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20170823171326.23620-1-thierry.reding@gmail.com --- drivers/gpu/drm/tegra/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/tegra/trace.h b/drivers/gpu/drm/tegra/trace.h index e9b7cdad5c4c..5a1ab4046e92 100644 --- a/drivers/gpu/drm/tegra/trace.h +++ b/drivers/gpu/drm/tegra/trace.h @@ -63,6 +63,6 @@ DEFINE_EVENT(register_access, sor_readl, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/tegra #define TRACE_INCLUDE_FILE trace #include From ff40adf7fbdff96860b1153332c0b1c7bab6e0c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 24 Aug 2017 18:19:48 -0600 Subject: [PATCH 0837/1322] Btrfs: use the new helper wbc_to_write_flags This updates btrfs to use the helper wbc_to_write_flags which has been applied in ext4/xfs/f2fs/block. Please note that, with this, btrfs's dirty pages written by a writeback job will carry the flag REQ_BACKGROUND, which is currently used by writeback-throttle to determine whether it should go to get a request or wait. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d17783d70228..4ead6da5a645 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3471,8 +3471,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned int write_flags = 0; unsigned long nr_written = 0; - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = REQ_SYNC; + write_flags = wbc_to_write_flags(wbc); trace___extent_writepage(page, inode, wbc); @@ -3718,7 +3717,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); From 5f14efd3d437205143dcffcf776e0122eae1755a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 23 Aug 2017 12:15:09 -0600 Subject: [PATCH 0838/1322] Btrfs: do not reset bio->bi_ops while writing bio flush_epd_write_bio() sets bio->bi_opf by itself to honor REQ_SYNC, but it's not needed at all since bio->bi_opf has set up properly in both __extent_writepage() and write_one_eb(), and in the case of write_one_eb(), it also sets REQ_META, which we will lose in flush_epd_write_bio(). This remove this unnecessary bio->bi_opf setting. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ead6da5a645..3738d245518c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4062,9 +4062,6 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? REQ_SYNC : 0); - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; From bea7eafdbda3ba1d4b2ccb9cca829eefb7989bb9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 22 Aug 2017 23:46:00 -0700 Subject: [PATCH 0839/1322] Btrfs: fix incorrect {node,sector}size endianness from BTRFS_IOC_FS_INFO fs_info->super_copy->{node,sector}size are little-endian, but the ioctl should return the values in native endianness. Use the cached values in btrfs_fs_info instead. Found with sparse. 
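The underlying issue generalizes: on-disk superblock fields are stored little-endian, so copying them straight into an ioctl reply only happens to work on little-endian hosts. Below is a self-contained sketch of the two correct options, convert explicitly or use a value already cached in native endianness; it is plain user-space C with made-up struct names and assumes glibc's <endian.h> helpers:

#include <endian.h>	/* htole32()/le32toh(); assumed glibc/musl extension */
#include <stdint.h>
#include <stdio.h>

struct ondisk_super {
	uint32_t sectorsize;	/* little-endian on disk */
};

struct mounted_fs {
	uint32_t sectorsize;	/* native-endian copy made at mount time */
};

int main(void)
{
	struct ondisk_super sb = { .sectorsize = htole32(4096) };
	struct mounted_fs fs = { .sectorsize = le32toh(sb.sectorsize) };

	/* Wrong on big-endian hosts: raw little-endian bytes reinterpreted natively. */
	printf("raw on-disk value: %u\n", (unsigned int)sb.sectorsize);

	/* Correct everywhere: converted once, then used in native form. */
	printf("cached native value: %u\n", (unsigned int)fs.sectorsize);
	return 0;
}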
Fixes: 80a773fbfc2d ("btrfs: retrieve more info from FS_INFO ioctl") Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ae8fbf9d3de2..cf1c2ee030dd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2769,9 +2769,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } mutex_unlock(&fs_devices->device_list_mutex); - fi_args->nodesize = fs_info->super_copy->nodesize; - fi_args->sectorsize = fs_info->super_copy->sectorsize; - fi_args->clone_alignment = fs_info->super_copy->sectorsize; + fi_args->nodesize = fs_info->nodesize; + fi_args->sectorsize = fs_info->sectorsize; + fi_args->clone_alignment = fs_info->sectorsize; if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; From 63d71450c8d817649a79e37d685523f988b9cc98 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 1 Sep 2017 17:58:47 +0900 Subject: [PATCH 0840/1322] btrfs: clear ordered flag on cleaning up ordered extents Commit 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") introduced btrfs_cleanup_ordered_extents() to cleanup submitted ordered extents. However, it does not clear the ordered bit (Private2) of corresponding pages. Thus, the following BUG occurs from free_pages_check_bad() (on btrfs/125 with nospace_cache). BUG: Bad page state in process btrfs pfn:3fa787 page:ffffdf2acfe9e1c0 count:0 mapcount:0 mapping: (null) index:0xd flags: 0x8000000000002008(uptodate|private_2) raw: 8000000000002008 0000000000000000 000000000000000d 00000000ffffffff raw: ffffdf2acf5c1b20 ffffb443802238b0 0000000000000000 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x2000(private_2) This patch clears the flag same as other places calling btrfs_dec_test_ordered_pending() for every page in the specified range. Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Cc: # 4.12 Signed-off-by: Naohiro Aota Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d184a46e46c4..455c0f22fe2d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, const u64 offset, const u64 bytes) { + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + index++; + if (!page) + continue; + ClearPagePrivate2(page); + put_page(page); + } return __endio_write_update_ordered(inode, offset + PAGE_SIZE, bytes - PAGE_SIZE, false); } From 67c003f90fd68062d92a7ffade36f9b2a9098bd8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 1 Sep 2017 17:59:07 +0900 Subject: [PATCH 0841/1322] btrfs: finish ordered extent cleaning if no progress is found __endio_write_update_ordered() repeats the search until it reaches the end of the specified range. This works well with direct IO path, because before the function is called, it's ensured that there are ordered extents filling whole the range. It's not the case, however, when it's called from run_delalloc_range(): it is possible to have error in the midle of the loop in e.g. 
run_delalloc_nocow(), so that there exists a range not covered by any ordered extents. When cleaning up such an "incomplete" range, __endio_write_update_ordered() gets stuck at an offset where there are no ordered extents. Since the ordered extents are created from head to tail, we can stop the search if the offset makes no progress. Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") Cc: # 4.12 Signed-off-by: Naohiro Aota Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 455c0f22fe2d..f78c5640c6dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8396,6 +8396,7 @@ static void __endio_write_update_ordered(struct inode *inode, btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; + u64 last_offset; int ret; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -8407,6 +8408,7 @@ static void __endio_write_update_ordered(struct inode *inode, } again: + last_offset = ordered_offset; ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, @@ -8417,6 +8419,12 @@ again: btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered->work); out_test: + /* + * If btrfs_dec_test_ordered_pending does not find any ordered extent + * in the range, we can exit. + */ + if (ordered_offset == last_offset) + return; /* * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again From bb166d7207432d3c7d10c45dc052f12ba3a2121d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 25 Aug 2017 14:15:14 +0900 Subject: [PATCH 0842/1322] btrfs: fix NULL pointer dereference from free_reloc_roots() __del_reloc_root should be called before freeing up reloc_root->node. If not, calling __del_reloc_root() dereferences reloc_root->node, causing a system BUG. Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error") Cc: # 4.9 Signed-off-by: Naohiro Aota Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3a49a3c2fca4..9841faef08ea 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2400,11 +2400,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } From ca6842bf01dc1ad41195eac1e343b4f08c496ba8 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Fri, 22 Jan 2016 09:13:25 +0900 Subject: [PATCH 0843/1322] Btrfs: send: fix error number for unknown inode types ENOTSUPP should not be returned to the user program. (cf. include/linux/errno.h) Therefore, EOPNOTSUPP is used instead of ENOTSUPP.
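The difference is visible directly from user space: EOPNOTSUPP belongs to the uapi errno space, while ENOTSUPP (524) is kernel-internal and has no message in libc. A small demo, assuming a Linux/glibc system:

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	printf("EOPNOTSUPP (%d): %s\n", EOPNOTSUPP, strerror(EOPNOTSUPP));
	printf("524 (kernel ENOTSUPP): %s\n", strerror(524));	/* typically "Unknown error 524" */
	return 0;
}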
Signed-off-by: Tsutomu Itoh Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f1d3d6e7087..43430e6c99aa 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2640,7 +2640,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) } else { btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; goto out; } From 6d6d282932d1a609e60dc4467677e0e863682f57 Mon Sep 17 00:00:00 2001 From: satoru takeuchi Date: Tue, 12 Sep 2017 22:42:52 +0900 Subject: [PATCH 0844/1322] btrfs: prevent to set invalid default subvolid `btrfs sub set-default` succeeds to set an ID which isn't corresponding to any fs/file tree. If such the bad ID is set to a filesystem, we can't mount this filesystem without specifying `subvol` or `subvolid` mount options. Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol") Cc: Signed-off-by: Satoru Takeuchi Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cf1c2ee030dd..d4a77993e52f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4057,6 +4057,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { From 78ad4ce014d025f41b8dde3a81876832ead643cf Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 8 Sep 2017 17:48:55 +0900 Subject: [PATCH 0845/1322] btrfs: propagate error to btrfs_cmp_data_prepare caller btrfs_cmp_data_prepare() (almost) always returns 0 i.e. ignoring errors from gather_extent_pages(). While the pages are freed by btrfs_cmp_data_free(), cmp->num_pages still has > 0. Then, btrfs_extent_same() try to access the already freed pages causing faults (or violates PageLocked assertion). This patch just return the error as is so that the caller stop the process. Signed-off-by: Naohiro Aota Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage") Cc: # 4.2 Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4a77993e52f..802df5755cd3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3028,7 +3028,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) From c2faff790ccd11ea5be8e3ca99713f116fcd6030 Mon Sep 17 00:00:00 2001 From: "Misono, Tomohiro" Date: Wed, 30 Aug 2017 16:33:16 +0900 Subject: [PATCH 0846/1322] btrfs: remove BTRFS_FS_QUOTA_DISABLING flag Currently, "btrfs quota enable" would fail after "btrfs quota disable" on the first time with syslog output "qgroup_rescan_init failed with -22", but it would succeed on the second time. When "quota disable" is called, BTRFS_FS_QUOTA_DISABLING flag bit will be set in fs_info->flags in btrfs_quota_disable(), but it will not be droppd in btrfs_run_qgroups() (which is called in btrfs_commit_transaction()) because quota_root has already been freed. 
If "quota enable" is called after that, both BTRFS_FS_QUOTA_DISABLING and BTRFS_FS_QUOTA_ENABLED flag would be dropped in the btrfs_run_qgroups() since quota_root is not NULL. This leads to the failure of "quota enable" on the first time. BTRFS_FS_QUOTA_DISABLING flag is not used outside of "quota disable" context and is equivalent to whether quota_root is NULL or not. btrfs_run_qgroups() checks whether quota_root is NULL or not in the first place. So, let's remove BTRFS_FS_QUOTA_DISABLING flag. Signed-off-by: Tomohiro Misono Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/qgroup.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2add002662f4..b7ccfcc01732 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -708,7 +708,6 @@ struct btrfs_delayed_root; #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 #define BTRFS_FS_QUOTA_ENABLING 7 -#define BTRFS_FS_QUOTA_DISABLING 8 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5c8b61c86e61..770f667269f5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -807,7 +807,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, } ret = 0; out: - set_bit(BTRFS_FS_QUOTA_DISABLING, &root->fs_info->flags); btrfs_free_path(path); return ret; } @@ -953,7 +952,6 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, if (!fs_info->quota_root) goto out; clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - set_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2086,8 +2084,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { From fed3b381145e2e7c66b0b3f640851e1633ebd07f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 13 Sep 2017 12:25:21 -0600 Subject: [PATCH 0847/1322] Btrfs: do not backup tree roots when fsync It doesn't make sense to backup tree roots when doing fsync, since during fsync those tree roots have not been consistent on disk. Signed-off-by: Liu Bo Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 27d458640536..0f2271815eb6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3641,7 +3641,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); - backup_super_roots(fs_info); + + /* + * max_mirrors == 0 indicates we're from commit_transaction, + * not from fsync where the tree roots in fs_info have not + * been consistent on disk. 
+ */ + if (max_mirrors == 0) + backup_super_roots(fs_info); sb = fs_info->super_for_commit; dev_item = &sb->dev_item; From bd7d63c2ceaf737eeb21630a2b62fc5fe34dba29 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 19 Sep 2017 17:50:09 -0600 Subject: [PATCH 0848/1322] Btrfs: use btrfs_op instead of bio_op in __btrfs_map_block This seems to be a leftover of commit cf8cddd38bab ("btrfs: don't abuse REQ_OP_* flags for btrfs_map_block"). It should use btrfs_op() helper to provide one of 'enum btrfs_map_op' types. Fixes: cf8cddd38bab ("btrfs: don't abuse REQ_OP_* flags for btrfs_map_block") Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d024f1b07282..6a72f88f77b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6166,7 +6166,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, bio_op(bio), logical, + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); From cf1167d5c1abf3bc42b2a1562bfa7937c05337e2 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Sep 2017 17:50:18 -0600 Subject: [PATCH 0849/1322] Btrfs: fix kernel oops while reading compressed data The kernel oops happens at kernel BUG at fs/btrfs/extent_io.c:2104! ... RIP: clean_io_failure+0x263/0x2a0 [btrfs] It's showing that read-repair code is using an improper mirror index. This is due to the fact that compression read's endio hasn't recorded the failed mirror index in %cb->orig_bio. With this, btrfs's read-repair can work properly on reading compressed data. Signed-off-by: Liu Bo Reported-by: Paul Jones Tested-by: Paul Jones Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 883ecc58fd0d..c6aa53c4c102 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -107,6 +107,7 @@ static void end_compressed_bio_read(struct bio *bio) struct inode *inode; struct page *page; unsigned long index; + unsigned int mirror = btrfs_io_bio(bio)->mirror_num; int ret; if (bio->bi_status) @@ -118,6 +119,14 @@ static void end_compressed_bio_read(struct bio *bio) if (!refcount_dec_and_test(&cb->pending_bios)) goto out; + /* + * Record the correct mirror_num in cb->orig_bio so that + * read-repair can work properly. + */ + ASSERT(btrfs_io_bio(cb->orig_bio)); + btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + cb->mirror_num = mirror; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); From e6311f240c946788131ba2b97e14f37312688072 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Sep 2017 17:50:19 -0600 Subject: [PATCH 0850/1322] Btrfs: skip checksum when reading compressed data if some IO have failed Currently even if the underlying disk reports failure on IO, compressed read endio still gets to verify checksum and reports it as a checksum error. In fact, if some IO have failed during reading a compressed data extent , there's no way the checksum could match, therefore, we can skip that in order to return error quickly to the upper layer. 
Please note that we need to do this after recording the failed mirror index so that read-repair in the upper layer's endio can work properly. Signed-off-by: Liu Bo Tested-by: Paul Jones Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6aa53c4c102..fc31af98a41b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -108,7 +108,7 @@ static void end_compressed_bio_read(struct bio *bio) struct page *page; unsigned long index; unsigned int mirror = btrfs_io_bio(bio)->mirror_num; - int ret; + int ret = 0; if (bio->bi_status) cb->errors = 1; @@ -127,6 +127,13 @@ static void end_compressed_bio_read(struct bio *bio) btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; + /* + * Some IO in this cb have failed, just skip checksum as there + * is no way it could be correct. + */ + if (cb->errors == 1) + goto csum_failed; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); From 36b96fdc6b2dc6f4a0fedc563fa7508c91b90a10 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Sun, 17 Sep 2017 09:02:29 +0000 Subject: [PATCH 0851/1322] btrfs: Report error on removing qgroup if del_qgroup_item fails Previously, we were calling del_qgroup_item, and ignoring the return code resulting in a potential to have divergent in-memory state without an error. Perhaps, it makes sense to handle this error code, and put the filesystem into a read only, or similar state. This patch only adds reporting of the error if the error is fatal, (any error other than qgroup not found). Signed-off-by: Sargun Dhillon Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 770f667269f5..e172d4843eae 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1305,6 +1305,8 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, } } ret = del_qgroup_item(trans, quota_root, qgroupid); + if (ret && ret != -ENOENT) + goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, From 99c4e3b96c797f047be4e6b7c03cfca01959f146 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Sep 2017 15:06:51 -0600 Subject: [PATCH 0852/1322] Btrfs: fix unexpected result when dio reading corrupted blocks commit 4246a0b63bd8 ("block: add a bi_error field to struct bio") changed the logic of how dio read endio reports errors. For single stripe dio read, %bio->bi_status reflects the error before verifying checksum, and now we're updating it when data block matches with its checksum, while in the mismatching case, %bio->bi_status is not updated to relfect that. When some blocks in a file have been corrupted on disk, reading such a file ends up with 1) checksum errors are reported in kernel log 2) read(2) returns successfully with some content being 0x01. In order to fix it, we need to report its checksum mismatch error to the upper layer (dio layer in this case) as well. 
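Put another way, the read-completion path has to combine the device-level status with the checksum-verification result before handing it to the dio layer, rather than clearing it on the way up. A minimal stand-alone sketch of that flow, using simplified types and made-up helpers rather than the btrfs functions:

#include <stdio.h>

/* 0 means success, anything else is an error status, loosely mirroring blk_status_t. */
typedef int status_t;

static status_t verify_checksum(int data_is_corrupted)
{
	return data_is_corrupted ? 10 : 0;	/* 10 stands in for a checksum error */
}

/* Combine the device I/O result with checksum verification for the upper layer. */
static status_t read_endio(status_t device_status, int data_is_corrupted)
{
	status_t status = device_status;

	if (status == 0)
		status = verify_checksum(data_is_corrupted);

	return status;	/* never reset to 0 once an error has been seen */
}

int main(void)
{
	printf("clean read: %d\n", read_endio(0, 0));		/* 0 */
	printf("corrupted read: %d\n", read_endio(0, 1));	/* non-zero */
	return 0;
}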
Fixes: 4246a0b63bd8 ("block: add a bi_error field to struct bio") Signed-off-by: Liu Bo Reported-by: Goffredo Baroncelli Tested-by: Goffredo Baroncelli Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f78c5640c6dc..c242d0230db9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8366,11 +8366,8 @@ static void btrfs_endio_direct_read(struct bio *bio) struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); - if (!err) - bio->bi_status = 0; - } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8378,7 +8375,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_status = bio->bi_status; + dio_bio->bi_status = err; dio_end_io(dio_bio); if (io_bio->end_io) From 8c6c592831a09a28428448e68fb08c6bbb8b9b8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Aug 2017 10:11:39 -0400 Subject: [PATCH 0853/1322] btrfs: log csums for all modified extents Amir reported a bug discovered by his cleaned up version of my dm-log-writes xfstests where we were missing csums at certain replay points. This is because fsx was doing an msync(), which essentially fsync()'s a specific range of a file. We will log all modified extents, but only search for the checksums in the range we are being asked to sync. We cannot simply log the extents in the range we're being asked because we are logging the inode item as it is currently, which if it has had a i_size update before the msync means we will miss extents when replaying. We could possibly get around this by marking the inode with the transaction that extended the i_size to see if we have this case, but this would be racy and we'd have to lock the whole range of the inode to make sure we didn't have an ordered extent outside of our range that was in the middle of completing. Fix this simply by keeping track of the modified extents range and logging the csums for the entire range of extents that we are logging. This makes the xfstest pass. 
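The mechanical part of the change is widening the requested [start, end] window until it covers every modified extent before checksums are looked up. A stand-alone sketch of that widening step, using a simplified extent type rather than the btrfs structures:

#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start;
	uint64_t len;
};

/* Grow [*start, *end] so it covers every extent that will be logged. */
static void widen_logged_range(const struct extent *ems, size_t n,
			       uint64_t *start, uint64_t *end)
{
	size_t i;

	for (i = 0; i < n; i++) {
		uint64_t last = ems[i].start + ems[i].len - 1;

		if (ems[i].start < *start)
			*start = ems[i].start;
		if (last > *end)
			*end = last;
	}
}

int main(void)
{
	struct extent ems[] = { { 0, 4096 }, { 8192, 4096 } };
	uint64_t start = 4096, end = 8191;	/* the range msync() asked for */

	widen_logged_range(ems, 2, &start, &end);
	printf("log csums for [%llu, %llu]\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}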
Reported-by: Amir Goldstein Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ad7f4bab640b..c800d067fcbf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4181,6 +4181,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; + u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4190,10 +4191,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; + logged_start = start; + logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); - /* * Just an arbitrary number, this can be really CPU intensive * once we start getting a lot of extents, and really once we @@ -4208,6 +4210,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + + if (em->start < logged_start) + logged_start = em->start; + if ((em->start + em->len - 1) > logged_end) + logged_end = em->start + em->len - 1; + /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -4216,7 +4224,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, start, end); + btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* * Some ordered extents started by fsync might have completed * before we could collect them into the list logged_list, which From 0bc15d85d97d44e8979ff91d0c1fbafe6fd4172c Mon Sep 17 00:00:00 2001 From: Nickey Yang Date: Tue, 26 Sep 2017 15:55:22 +0800 Subject: [PATCH 0854/1322] arm64: dts: rockchip: add the grf clk for dw-mipi-dsi on rk3399 The clk of grf must be enabled before writing grf register for rk3399. Signed-off-by: Nickey Yang [the grf clock is already part of the binding since march 2017] Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 6aa43fd47148..ab7629c5b856 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1630,8 +1630,8 @@ reg = <0x0 0xff960000 0x0 0x8000>; interrupts = ; clocks = <&cru SCLK_DPHY_PLL>, <&cru PCLK_MIPI_DSI0>, - <&cru SCLK_DPHY_TX0_CFG>; - clock-names = "ref", "pclk", "phy_cfg"; + <&cru SCLK_DPHY_TX0_CFG>, <&cru PCLK_VIO_GRF>; + clock-names = "ref", "pclk", "phy_cfg", "grf"; power-domains = <&power RK3399_PD_VIO>; rockchip,grf = <&grf>; status = "disabled"; From e88d62cd4b2f0b1ae55e9008e79c2794b1fc914d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 26 Sep 2017 12:41:52 +0100 Subject: [PATCH 0855/1322] percpu: make this_cpu_generic_read() atomic w.r.t. interrupts As raw_cpu_generic_read() is a plain read from a raw_cpu_ptr() address, it's possible (albeit unlikely) that the compiler will split the access across multiple instructions. In this_cpu_generic_read() we disable preemption but not interrupts before calling raw_cpu_generic_read(). 
Thus, an interrupt could be taken in the middle of the split load instructions. If a this_cpu_write() or RMW this_cpu_*() op is made to the same variable in the interrupt handling path, this_cpu_read() will return a torn value. For native word types, we can avoid tearing using READ_ONCE(), but this won't work in all cases (e.g. 64-bit types on most 32-bit platforms). This patch reworks this_cpu_generic_read() to use READ_ONCE() where possible, otherwise falling back to disabling interrupts. Signed-off-by: Mark Rutland Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Pranith Kumar Cc: Tejun Heo Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo --- include/asm-generic/percpu.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 0504ef8f3aa3..976f8ac26665 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -115,15 +115,35 @@ do { \ (__ret); \ }) -#define this_cpu_generic_read(pcp) \ +#define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ typeof(pcp) __ret; \ preempt_disable_notrace(); \ - __ret = raw_cpu_generic_read(pcp); \ + __ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ __ret; \ }) +#define __this_cpu_generic_read_noirq(pcp) \ +({ \ + typeof(pcp) __ret; \ + unsigned long __flags; \ + raw_local_irq_save(__flags); \ + __ret = raw_cpu_generic_read(pcp); \ + raw_local_irq_restore(__flags); \ + __ret; \ +}) + +#define this_cpu_generic_read(pcp) \ +({ \ + typeof(pcp) __ret; \ + if (__native_word(pcp)) \ + __ret = __this_cpu_generic_read_nopreempt(pcp); \ + else \ + __ret = __this_cpu_generic_read_noirq(pcp); \ + __ret; \ +}) + #define this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long __flags; \ From 1f4cf93b133ba6596ae69cfd09b48aa7b47cca41 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:04:23 +0200 Subject: [PATCH 0856/1322] net: ena: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 52beba8c7a39..ded29af648c9 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -315,7 +315,7 @@ static struct ena_comp_ctx *ena_com_submit_admin_cmd(struct ena_com_admin_queue cmd_size_in_bytes, comp, comp_size_in_bytes); - if (unlikely(IS_ERR(comp_ctx))) + if (IS_ERR(comp_ctx)) admin_queue->running_state = false; spin_unlock_irqrestore(&admin_queue->q_lock, flags); @@ -1130,7 +1130,7 @@ int ena_com_execute_admin_command(struct ena_com_admin_queue *admin_queue, comp_ctx = ena_com_submit_admin_cmd(admin_queue, cmd, cmd_size, comp, comp_size); - if (unlikely(IS_ERR(comp_ctx))) { + if (IS_ERR(comp_ctx)) { if (comp_ctx == ERR_PTR(-ENODEV)) pr_debug("Failed to submit command [%ld]\n", PTR_ERR(comp_ctx)); From 98e4fcff3e755c6c3de2d4f63c080b4009a599bd Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:21:42 +0200 Subject: [PATCH 0857/1322] datagram: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. 
Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index f7fb7e3f2acf..0b7b4c22719e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -188,7 +188,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, } if (!skb->len) { skb = skb_set_peeked(skb); - if (unlikely(IS_ERR(skb))) { + if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } From 63a4e80be4f523dbde6412decfa8244ff73cc4b9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:22:31 +0200 Subject: [PATCH 0858/1322] ipv6: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96861c702c06..13c3b697f8c0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3297,7 +3297,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (unlikely(IS_ERR(rt))) + if (IS_ERR(rt)) return PTR_ERR(rt); /* ifp->rt can be accessed outside of rtnl */ From d9db5e3680c87fdbdf86fcb1c42caea4bf98680c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 11:22:58 +0200 Subject: [PATCH 0859/1322] kcm: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index af4e76ac88ff..0b750a22c4b9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1650,7 +1650,7 @@ static int kcm_clone(struct socket *osock, struct kcm_clone *info, } newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (unlikely(IS_ERR(newfile))) { + if (IS_ERR(newfile)) { err = PTR_ERR(newfile); goto out_sock_alloc_fail; } From 36f6ee22d2d66046e369757ec6bbe1c482957ba6 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Tue, 26 Sep 2017 15:14:29 +0300 Subject: [PATCH 0860/1322] vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit When running LTP IPsec tests, KASan might report: BUG: KASAN: use-after-free in vti_tunnel_xmit+0xeee/0xff0 [ip_vti] Read of size 4 at addr ffff880dc6ad1980 by task swapper/0/0 ... Call Trace: dump_stack+0x63/0x89 print_address_description+0x7c/0x290 kasan_report+0x28d/0x370 ? vti_tunnel_xmit+0xeee/0xff0 [ip_vti] __asan_report_load4_noabort+0x19/0x20 vti_tunnel_xmit+0xeee/0xff0 [ip_vti] ? vti_init_net+0x190/0x190 [ip_vti] ? save_stack_trace+0x1b/0x20 ? save_stack+0x46/0xd0 dev_hard_start_xmit+0x147/0x510 ? icmp_echo.part.24+0x1f0/0x210 __dev_queue_xmit+0x1394/0x1c60 ... Freed by task 0: save_stack_trace+0x1b/0x20 save_stack+0x46/0xd0 kasan_slab_free+0x70/0xc0 kmem_cache_free+0x81/0x1e0 kfree_skbmem+0xb1/0xe0 kfree_skb+0x75/0x170 kfree_skb_list+0x3e/0x60 __dev_queue_xmit+0x1298/0x1c60 dev_queue_xmit+0x10/0x20 neigh_resolve_output+0x3a8/0x740 ip_finish_output2+0x5c0/0xe70 ip_finish_output+0x4ba/0x680 ip_output+0x1c1/0x3a0 xfrm_output_resume+0xc65/0x13d0 xfrm_output+0x1e4/0x380 xfrm4_output_finish+0x5c/0x70 Can be fixed if we get skb->len before dst_output(). Fixes: b9959fd3b0fa ("vti: switch to new ip tunnel code") Fixes: 22e1b23dafa8 ("vti6: Support inter address family tunneling.") Signed-off-by: Alexey Kodanev Signed-off-by: David S. 
Miller --- net/ipv4/ip_vti.c | 3 ++- net/ipv6/ip6_vti.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..89453cf62158 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..bcdc2d557de1 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { From 1fac4b2fdbccab69cb781aae68f540be94d5549e Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 15:12:26 +0200 Subject: [PATCH 0861/1322] bnxt_en: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c25f5b555adf..5ba49938ba55 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1491,7 +1491,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, (struct rx_tpa_end_cmp *)rxcmp, (struct rx_tpa_end_cmp_ext *)rxcmp1, event); - if (unlikely(IS_ERR(skb))) + if (IS_ERR(skb)) return -EBUSY; rc = -ENOMEM; From 92978ee801844b16180f2168ffffd05647da551a Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 15:13:23 +0200 Subject: [PATCH 0862/1322] net/mlx5: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 4614ddfa91eb..6a7c8b04447e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -256,7 +256,7 @@ struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, goto drop; } mdata = mlx5e_ipsec_add_metadata(skb); - if (unlikely(IS_ERR(mdata))) { + if (IS_ERR(mdata)) { atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata); goto drop; } From 2091c227fa855776aafffad7ecd25ac0734df1a0 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 26 Sep 2017 15:14:15 +0200 Subject: [PATCH 0863/1322] ldmvsw: Remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/ldmvsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index 5b56c24b6ed2..5feeaa9f0a9e 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -307,7 +307,7 @@ static int vsw_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) /* Get (or create) the vnet associated with this port */ vp = vsw_get_vnet(hp, vdev->mp, &handle); - if (unlikely(IS_ERR(vp))) { + if (IS_ERR(vp)) { err = PTR_ERR(vp); pr_err("Failed to get vnet for vsw-port\n"); mdesc_release(hp); From 62b982eeb4589b2e6d7c01a90590e3a4c2b2ca19 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Sep 2017 16:16:43 +0200 Subject: [PATCH 0864/1322] l2tp: fix race condition in l2tp_tunnel_delete If we try to delete the same tunnel twice, the first delete operation does a lookup (l2tp_tunnel_get), finds the tunnel, calls l2tp_tunnel_delete, which queues it for deletion by l2tp_tunnel_del_work. The second delete operation also finds the tunnel and calls l2tp_tunnel_delete. If the workqueue has already fired and started running l2tp_tunnel_del_work, then l2tp_tunnel_delete will queue the same tunnel a second time, and try to free the socket again. Add a dead flag to prevent firing the workqueue twice. Then we can remove the check of queue_work's result that was meant to prevent that race but doesn't. Reproducer: ip l2tp add tunnel tunnel_id 3000 peer_tunnel_id 4000 local 192.168.0.2 remote 192.168.0.1 encap udp udp_sport 5000 udp_dport 6000 ip l2tp add session name l2tp1 tunnel_id 3000 session_id 1000 peer_session_id 2000 ip link set l2tp1 up ip l2tp del tunnel tunnel_id 3000 ip l2tp del tunnel tunnel_id 3000 Fixes: f8ccac0e4493 ("l2tp: put tunnel socket release on a workqueue") Reported-by: Jianlin Shi Signed-off-by: Sabrina Dubroca Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 10 ++++------ net/l2tp/l2tp_core.h | 5 ++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d8c2a89a76e1..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1688,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. 
*/ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 70a12df40a5f..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -161,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -255,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 18 Sep 2017 14:46:03 -0700 Subject: [PATCH 0865/1322] xfs: validate bdev support for DAX inode flag Currently only the blocksize is checked, but we should really be calling bdev_dax_supported() which also tests to make sure we can get a struct dax_device and that the dax_direct_access() path is working. This is the same check that we do for the "-o dax" mount option in xfs_fs_fill_super(). This does not fix the race issues that caused the XFS DAX inode option to be disabled, so that option will still be disabled. If/when we re-enable it, though, I think we will want this issue to have been fixed. I also do think that we want to fix this in stable kernels. Signed-off-by: Ross Zwisler CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5049e8ab6e30..aa75389be8cf 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate( int *join_flags) { struct inode *inode = VFS_I(ip); + struct super_block *sb = inode->i_sb; int error; *join_flags = 0; @@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) + if (bdev_dax_supported(sb, sb->s_blocksize) < 0) return -EINVAL; } From 546e7be8244dc050effef0555df5b8d94d10dafc Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Fri, 22 Sep 2017 11:47:33 -0700 Subject: [PATCH 0866/1322] iomap_dio_rw: Allocate AIO completion queue before submitting dio Executing xfs/104 test in a loop on Linux-v4.13 kernel on a ppc64 machine can cause the following NULL pointer dereference, .queue_work_on+0x4c/0x80 .iomap_dio_bio_end_io+0xbc/0x1f0 .bio_endio+0x118/0x1f0 .blk_update_request+0xd0/0x470 .blk_mq_end_request+0x24/0xc0 .lo_complete_rq+0x40/0xe0 .__blk_mq_complete_request_remote+0x28/0x40 .flush_smp_call_function_queue+0xc4/0x1e0 .smp_ipi_demux_relaxed+0x8c/0x100 .icp_hv_ipi_action+0x54/0xa0 .__handle_irq_event_percpu+0x84/0x2c0 .handle_irq_event_percpu+0x28/0x80 .handle_percpu_irq+0x78/0xc0 .generic_handle_irq+0x40/0x70 .__do_irq+0x88/0x200 .call_do_irq+0x14/0x24 .do_IRQ+0x84/0x130 This occurs due to the following sequence of events, 1. Allocate dio for Direct I/O write. 2. Invoke iomap_apply() until iov_iter_count() bytes have been submitted. - Assume that we have submitted atleast one bio. Hence iomap_dio->ref value will be >= 2. - If during the second iteration, iomap_apply() ends up returning -ENOSPC, we would break out of the loop and since the 'ret' value is a negative number we end up not allocating memory for super_block->s_dio_done_wq. 3. Meanwhile, iomap_dio_bio_end_io() is invoked for bios that have been submitted and here the code ends up dereferencing the NULL pointer stored at super_block->s_dio_done_wq. This commit fixes the bug by allocating memory for super_block->s_dio_done_wq before iomap_apply() is invoked. Reported-by: Eryu Guan Reviewed-by: Christoph Hellwig Tested-by: Eryu Guan Signed-off-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..d4f526a3f5b2 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -993,6 +993,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, WARN_ON_ONCE(ret); ret = 0; + if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; + } + inode_dio_begin(inode); blk_start_plug(&plug); @@ -1015,13 +1022,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); - if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && - !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - iomap_dio_set_error(dio, ret); - } - if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; From ee70daaba82d70766d0723b743d9fdeb3b06102a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 21 Sep 2017 11:26:18 -0700 Subject: [PATCH 0867/1322] xfs: update i_size after unwritten conversion in dio completion Since commit d531d91d6990 ("xfs: always use unwritten extents for direct I/O writes"), we start allocating unwritten extents for all direct writes to allow appending aio in XFS. But for dio writes that could extend file size we update the in-core inode size first, then convert the unwritten extents to real allocations at dio completion time in xfs_dio_write_end_io(). Thus a racing direct read could see the new i_size and find the unwritten extents first and read zeros instead of actual data, if the direct writer also takes a shared iolock. Fix it by updating the in-core inode size after the unwritten extent conversion. To do this, introduce a new boolean argument to xfs_iomap_write_unwritten() to tell if we want to update in-core i_size or not. Suggested-by: Brian Foster Reviewed-by: Brian Foster Signed-off-by: Eryu Guan Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 3 ++- fs/xfs/xfs_file.c | 33 +++++++++++++++++++-------------- fs/xfs/xfs_iomap.c | 7 +++++-- fs/xfs/xfs_iomap.h | 2 +- fs/xfs/xfs_pnfs.c | 2 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29172609f2a3..f18e5932aec4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -343,7 +343,8 @@ xfs_end_io( error = xfs_reflink_end_cow(ip, offset, size); break; case XFS_IO_UNWRITTEN: - error = xfs_iomap_write_unwritten(ip, offset, size); + /* writeback should never update isize */ + error = xfs_iomap_write_unwritten(ip, offset, size, false); break; default: ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 350b6d43ba23..309e26c9dddb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -434,7 +434,6 @@ xfs_dio_write_end_io( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; - bool update_size = false; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); @@ -445,6 +444,21 @@ xfs_dio_write_end_io( if (size <= 0) return size; + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + /* + * Unwritten conversion updates the in-core isize after extent + * conversion but before updating the on-disk size. 
Updating isize any + * earlier allows a racing dio read to find unwritten extents before + * they are converted. + */ + if (flags & IOMAP_DIO_UNWRITTEN) + return xfs_iomap_write_unwritten(ip, offset, size, true); + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We @@ -459,20 +473,11 @@ xfs_dio_write_end_io( spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); - update_size = true; - } - spin_unlock(&ip->i_flags_lock); - - if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); - if (error) - return error; - } - - if (flags & IOMAP_DIO_UNWRITTEN) - error = xfs_iomap_write_unwritten(ip, offset, size); - else if (update_size) + spin_unlock(&ip->i_flags_lock); error = xfs_setfilesize(ip, offset, size); + } else { + spin_unlock(&ip->i_flags_lock); + } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a1909bc064e9..f179bdf1644d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -829,7 +829,8 @@ int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -840,6 +841,7 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct xfs_defer_ops dfops; + struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; uint resblks; int error; @@ -899,7 +901,8 @@ xfs_iomap_write_unwritten( i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; - + if (update_isize && i_size > i_size_read(inode)) + i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_d.di_size = i_size; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 00db3ecea084..ee535065c5d0 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 2f2dc3c09ad0..4246876df7b7 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -274,7 +274,7 @@ xfs_fs_commit_blocks( (end - 1) >> PAGE_SHIFT); WARN_ON_ONCE(error); - error = xfs_iomap_write_unwritten(ip, start, length); + error = xfs_iomap_write_unwritten(ip, start, length, false); if (error) goto out_drop_iolock; } From 9789dd9e1d939232e8ff4c50ef8e75aa6781b3fb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:42:09 -0700 Subject: [PATCH 0868/1322] xfs: perag initialization should only touch m_ag_max_usable for AG 0 We call __xfs_ag_resv_init to make a per-AG reservation for each AG. This makes the reservation per-AG, not per-filesystem. Therefore, it is incorrect to adjust m_ag_max_usable for each AG. Adjust it only when we're reserving AG 0's blocks so that we only do it once per fs. Signed-off-by: Darrick J. 
Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_ag_resv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index b008ff3250eb..df3e600835e8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -156,7 +156,8 @@ __xfs_ag_resv_free( trace_xfs_ag_resv_free(pag, type, 0); resv = xfs_perag_resv(pag, type); - pag->pag_mount->m_ag_max_usable += resv->ar_asked; + if (pag->pag_agno == 0) + pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* * AGFL blocks are always considered "free", so whatever * was reserved at mount time must be given back at umount. @@ -216,7 +217,14 @@ __xfs_ag_resv_init( return error; } - mp->m_ag_max_usable -= ask; + /* + * Reduce the maximum per-AG allocation length by however much we're + * trying to reserve for an AG. Since this is a filesystem-wide + * counter, we only make the adjustment for AG 0. This assumes that + * there aren't any AGs hungrier for per-AG reservation than AG 0. + */ + if (pag->pag_agno == 0) + mp->m_ag_max_usable -= ask; resv = xfs_perag_resv(pag, type); resv->ar_asked = ask; From 842f6e9f786226c58fcbd5ef80eadca72fdfe652 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Fri, 22 Sep 2017 11:47:46 -0700 Subject: [PATCH 0869/1322] xfs: Capture state of the right inode in xfs_iflush_done My previous patch: d3a304b6292168b83b45d624784f973fdc1ca674 check for XFS_LI_FAILED flag xfs_iflush done, so the failed item can be properly resubmitted. In the loop scanning other inodes being completed, it should check the current item for the XFS_LI_FAILED, and not the initial one. The state of the initial inode is checked after the loop ends Kudos to Eric for catching this. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6d0f74ec31e8..a705f34b58fa 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -745,7 +745,7 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + (blip->li_flags & XFS_LI_FAILED)) need_ail++; blip = next; From 5e5c943c1f257c2b3424fc3f8a7b18570152dab3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 18 Sep 2017 09:41:17 -0700 Subject: [PATCH 0870/1322] xfs: revert "xfs: factor rmap btree size into the indlen calculations" In commit fd26a88093ba we added a worst case estimate for rmapbt blocks needed to satisfy the block mapping request. Since then, we added the ability to reserve enough space in each AG such that we should never run out of blocks to grow the rmapbt, which makes this calculation unnecessary. Revert the commit because it makes the extra delalloc indlen accounting unnecessary and incorrect. Reported-by: Eryu Guan Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_bmap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 459f4b4f08fe..044a363119be 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,7 +49,6 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_rmap_btree.h" #include "xfs_icache.h" @@ -192,12 +191,8 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ - xfs_filblks_t orig_len; mp = ip->i_mount; - - /* Calculate the worst-case size of the bmbt. */ - orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -205,20 +200,12 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) { - rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; - break; - } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } - - /* Calculate the worst-case size of the rmapbt. */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + - mp->m_rmap_maxlevels; - return rval; } From e0a8f9de16fce34fc2957eca4c71d3ff2ac286d5 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:42 +0300 Subject: [PATCH 0871/1322] qed: Add iWARP enablement support This patch is the last of the initial iWARP patch series. It adds the possiblity to actually detect iWARP from the device and enable it in the critical locations which basically make iWARP available. It wasn't submitted until now as iWARP hadn't been accepted into the rdma tree. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 6 ++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 10 +++++----- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 5 ++++- drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 1 + 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index af106be8cc08..afd07ad91631 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -2069,6 +2069,12 @@ static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn, num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs); + if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) { + DP_NOTICE(p_hwfn, + "Current day drivers don't support RoCE & iWARP simultaneously on the same PF. Default to RoCE-only\n"); + p_hwfn->hw_info.personality = QED_PCI_ETH_ROCE; + } + switch (p_hwfn->hw_info.personality) { case QED_PCI_ETH_IWARP: /* Each QP requires one connection */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 376485d99357..8b99c7d26f34 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -1691,12 +1691,12 @@ qed_mcp_get_shmem_proto_mfw(struct qed_hwfn *p_hwfn, case FW_MB_PARAM_GET_PF_RDMA_ROCE: *p_proto = QED_PCI_ETH_ROCE; break; - case FW_MB_PARAM_GET_PF_RDMA_BOTH: - DP_NOTICE(p_hwfn, - "Current day drivers don't support RoCE & iWARP. 
Default to RoCE-only\n"); - *p_proto = QED_PCI_ETH_ROCE; - break; case FW_MB_PARAM_GET_PF_RDMA_IWARP: + *p_proto = QED_PCI_ETH_IWARP; + break; + case FW_MB_PARAM_GET_PF_RDMA_BOTH: + *p_proto = QED_PCI_ETH_RDMA; + break; default: DP_NOTICE(p_hwfn, "MFW answers GET_PF_RDMA_PROTOCOL but param is %08x\n", diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 6fb99518a61f..06715f7403ef 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -156,7 +156,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, return rc; p_hwfn->p_rdma_info = p_rdma_info; - p_rdma_info->proto = PROTOCOLID_ROCE; + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + p_rdma_info->proto = PROTOCOLID_IWARP; + else + p_rdma_info->proto = PROTOCOLID_ROCE; num_cons = qed_cxt_get_proto_cid_count(p_hwfn, p_rdma_info->proto, NULL); diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c index 46d0c3cb83a5..a1d33f35aad3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c @@ -377,6 +377,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn, p_ramrod->personality = PERSONALITY_ISCSI; break; case QED_PCI_ETH_ROCE: + case QED_PCI_ETH_IWARP: p_ramrod->personality = PERSONALITY_RDMA_AND_ETH; break; default: From d1abfd0b4ee2b83af88098a0c7105622c3d66e73 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:43 +0300 Subject: [PATCH 0872/1322] qed: Add iWARP out of order support iWARP requires OOO support which is already provided by the ll2 interface (until now was used only for iSCSI offload). The changes mostly include opening a ll2 dedicated connection for OOO and notifiying the FW about the handle id. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 44 +++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 11 +++++- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 7 +++- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 9d989c96278c..568e9853cc8d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -41,6 +41,7 @@ #include "qed_rdma.h" #include "qed_reg_addr.h" #include "qed_sp.h" +#include "qed_ooo.h" #define QED_IWARP_ORD_DEFAULT 32 #define QED_IWARP_IRD_DEFAULT 32 @@ -119,6 +120,13 @@ static void qed_iwarp_cid_cleaned(struct qed_hwfn *p_hwfn, u32 cid) spin_unlock_bh(&p_hwfn->p_rdma_info->lock); } +void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn, + struct iwarp_init_func_params *p_ramrod) +{ + p_ramrod->ll2_ooo_q_index = RESC_START(p_hwfn, QED_LL2_QUEUE) + + p_hwfn->p_rdma_info->iwarp.ll2_ooo_handle; +} + static int qed_iwarp_alloc_cid(struct qed_hwfn *p_hwfn, u32 *cid) { int rc; @@ -1876,6 +1884,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; } + if (iwarp_info->ll2_ooo_handle != QED_IWARP_HANDLE_INVAL) { + rc = qed_ll2_terminate_connection(p_hwfn, + iwarp_info->ll2_ooo_handle); + if (rc) + DP_INFO(p_hwfn, "Failed to terminate ooo connection\n"); + + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_ooo_handle); + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; + } + qed_llh_remove_mac_filter(p_hwfn, p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr); return rc; @@ -1927,10 +1945,12 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, struct qed_iwarp_info *iwarp_info; struct qed_ll2_acquire_data data; struct qed_ll2_cbs cbs; + u16 n_ooo_bufs; int rc = 0; iwarp_info = &p_hwfn->p_rdma_info->iwarp; iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; + iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; iwarp_info->max_mtu = params->max_mtu; @@ -1978,6 +1998,29 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, if (rc) goto err; + /* Start OOO connection */ + data.input.conn_type = QED_LL2_TYPE_OOO; + data.input.mtu = params->max_mtu; + + n_ooo_bufs = (QED_IWARP_MAX_OOO * QED_IWARP_RCV_WND_SIZE_DEF) / + iwarp_info->max_mtu; + n_ooo_bufs = min_t(u32, n_ooo_bufs, QED_IWARP_LL2_OOO_MAX_RX_SIZE); + + data.input.rx_num_desc = n_ooo_bufs; + data.input.rx_num_ooo_buffers = n_ooo_bufs; + + data.input.tx_max_bds_per_packet = 1; /* will never be fragmented */ + data.input.tx_num_desc = QED_IWARP_LL2_OOO_DEF_TX_SIZE; + data.p_connection_handle = &iwarp_info->ll2_ooo_handle; + + rc = qed_ll2_acquire_connection(p_hwfn, &data); + if (rc) + goto err; + + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_ooo_handle); + if (rc) + goto err; + return rc; err: qed_iwarp_ll2_stop(p_hwfn, p_ptt); @@ -2014,6 +2057,7 @@ int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, qed_spq_register_async_cb(p_hwfn, PROTOCOLID_IWARP, qed_iwarp_async_event); + qed_ooo_setup(p_hwfn); return qed_iwarp_ll2_start(p_hwfn, params, p_ptt); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h index 148ef3c33a5d..9e2bfde894df 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h @@ -47,7 +47,12 @@ enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state); #define QED_IWARP_LL2_SYN_TX_SIZE (128) #define 
QED_IWARP_LL2_SYN_RX_SIZE (256) #define QED_IWARP_MAX_SYN_PKT_SIZE (128) -#define QED_IWARP_HANDLE_INVAL (0xff) + +#define QED_IWARP_LL2_OOO_DEF_TX_SIZE (256) +#define QED_IWARP_MAX_OOO (16) +#define QED_IWARP_LL2_OOO_MAX_RX_SIZE (16384) + +#define QED_IWARP_HANDLE_INVAL (0xff) struct qed_iwarp_ll2_buff { void *data; @@ -67,6 +72,7 @@ struct qed_iwarp_info { u8 crc_needed; u8 tcp_flags; u8 ll2_syn_handle; + u8 ll2_ooo_handle; u8 peer2peer; enum mpa_negotiation_mode mpa_rev; enum mpa_rtr_type rtr_type; @@ -147,6 +153,9 @@ int qed_iwarp_alloc(struct qed_hwfn *p_hwfn); int qed_iwarp_setup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_rdma_start_in_params *params); +void qed_iwarp_init_fw_ramrod(struct qed_hwfn *p_hwfn, + struct iwarp_init_func_params *p_ramrod); + int qed_iwarp_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn); diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 06715f7403ef..4f46f2851780 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -551,10 +551,13 @@ static int qed_rdma_start_fw(struct qed_hwfn *p_hwfn, if (rc) return rc; - if (QED_IS_IWARP_PERSONALITY(p_hwfn)) + if (QED_IS_IWARP_PERSONALITY(p_hwfn)) { + qed_iwarp_init_fw_ramrod(p_hwfn, + &p_ent->ramrod.iwarp_init_func.iwarp); p_ramrod = &p_ent->ramrod.iwarp_init_func.rdma; - else + } else { p_ramrod = &p_ent->ramrod.roce_init_func.rdma; + } p_params_header = &p_ramrod->params_header; p_params_header->cnq_start_offset = (u8)RESC_START(p_hwfn, From 471115ab9804f45cb8e091e426c9c67fe75e41b0 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:44 +0300 Subject: [PATCH 0873/1322] qed: Fix maximum number of CQs for iWARP The maximum number of CQs supported is bound to the number of connections supported, which differs between RoCE and iWARP. This fixes a crash that occurred in iWARP when running 1000 sessions using perftest. Fixes: 67b40dccc45 ("qed: Implement iWARP initialization, teardown and qp operations") Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 4f46f2851780..c8c4b3940564 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -209,11 +209,11 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, goto free_pd_map; } - /* Allocate bitmap for cq's. The maximum number of CQs is bounded to - * twice the number of QPs. + /* Allocate bitmap for cq's. The maximum number of CQs is bound to + * the number of connections we support. (num_qps in iWARP or + * num_qps/2 in RoCE). */ - rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cq_map, - p_rdma_info->num_qps * 2, "CQ"); + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->cq_map, num_cons, "CQ"); if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to allocate cq bitmap, rc = %d\n", rc); @@ -222,10 +222,10 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, /* Allocate bitmap for toggle bit for cq icids * We toggle the bit every time we create or resize cq for a given icid. - * The maximum number of CQs is bounded to twice the number of QPs. + * Size needs to equal the size of the cq bmap. 
*/ rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->toggle_bits, - p_rdma_info->num_qps * 2, "Toggle"); + num_cons, "Toggle"); if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Failed to allocate toogle bits, rc = %d\n", rc); From 1e99c497012cd8647972876f1bd18545bc907aea Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Sun, 24 Sep 2017 12:09:45 +0300 Subject: [PATCH 0874/1322] qed: iWARP - Add check for errors on a SYN packet A SYN packet which arrives with errors from FW should be dropped. This required adding an additional field to the ll2 rx completion data. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 8 ++++++++ drivers/net/ethernet/qlogic/qed/qed_ll2.c | 1 + include/linux/qed/qed_ll2_if.h | 1 + 3 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 568e9853cc8d..8fc9c811f6e3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1733,6 +1733,14 @@ qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) memset(&cm_info, 0, sizeof(cm_info)); ll2_syn_handle = p_hwfn->p_rdma_info->iwarp.ll2_syn_handle; + + /* Check if packet was received with errors... */ + if (data->err_flags) { + DP_NOTICE(p_hwfn, "Error received on SYN packet: 0x%x\n", + data->err_flags); + goto err; + } + if (GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMWASCALCULATED) && GET_FIELD(data->parse_flags, PARSING_AND_ERR_FLAGS_L4CHKSMERROR)) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index c06ad4f0758e..250afa5486cf 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -413,6 +413,7 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, struct qed_ll2_comp_rx_data *data) { data->parse_flags = le16_to_cpu(p_cqe->rx_cqe_fp.parse_flags.flags); + data->err_flags = le16_to_cpu(p_cqe->rx_cqe_fp.err_flags.flags); data->length.packet_length = le16_to_cpu(p_cqe->rx_cqe_fp.packet_length); data->vlan = le16_to_cpu(p_cqe->rx_cqe_fp.vlan); diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index dd7a3b86bb9e..89fa0bbd54f3 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data { void *cookie; dma_addr_t rx_buf_addr; u16 parse_flags; + u16 err_flags; u16 vlan; bool b_last_packet; u8 connection_handle; From 3bd3b9ed1b602c065aa0b1ba109b9622afa6ff98 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 24 Sep 2017 17:41:24 +0530 Subject: [PATCH 0875/1322] net: bcm63xx_enet: Use setup_timer and mod_timer Use setup_timer and mod_timer API instead of structure assignments. This is done using Coccinelle and semantic patch used for this as follows: @@ expression x,y,z,a,b; @@ -init_timer (&x); +setup_timer (&x, y, z); +mod_timer (&a, b); -x.function = y; -x.data = z; -x.expires = b; -add_timer(&a); Signed-off-by: Himanshu Jha Signed-off-by: David S. 
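For reference, the pattern produced by that semantic patch looks like the sketch below. This is a generic illustration with a hypothetical my_priv structure and my_timer_fn callback, not code from bcm63xx_enet; it only shows the open-coded init_timer/add_timer sequence collapsing into setup_timer() plus mod_timer().

```c
#include <linux/timer.h>
#include <linux/jiffies.h>

/* Hypothetical private structure used only for illustration. */
struct my_priv {
	struct timer_list poll_timer;
	/* ... */
};

static void my_timer_fn(unsigned long data)
{
	struct my_priv *priv = (struct my_priv *)data;

	/* ... poll hardware ... */
	mod_timer(&priv->poll_timer, jiffies + HZ);	/* re-arm */
}

static int my_open(struct my_priv *priv)
{
	/*
	 * Open-coded form replaced by the semantic patch:
	 *
	 *	init_timer(&priv->poll_timer);
	 *	priv->poll_timer.function = my_timer_fn;
	 *	priv->poll_timer.data = (unsigned long)priv;
	 *	priv->poll_timer.expires = jiffies;
	 *	add_timer(&priv->poll_timer);
	 */
	setup_timer(&priv->poll_timer, my_timer_fn, (unsigned long)priv);
	mod_timer(&priv->poll_timer, jiffies);
	return 0;
}
```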
Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index f8bbbbfca06e..c6221f04a748 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -2331,11 +2331,8 @@ static int bcm_enetsw_open(struct net_device *dev) } /* start phy polling timer */ - init_timer(&priv->swphy_poll); - priv->swphy_poll.function = swphy_poll_timer; - priv->swphy_poll.data = (unsigned long)priv; - priv->swphy_poll.expires = jiffies; - add_timer(&priv->swphy_poll); + setup_timer(&priv->swphy_poll, swphy_poll_timer, (unsigned long)priv); + mod_timer(&priv->swphy_poll, jiffies); return 0; out: From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: [PATCH 0876/1322] bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/filter.h | 9 ++++++--- kernel/bpf/sockmap.c | 4 ++-- net/bpf/test_run.c | 2 +- net/core/filter.c | 14 +++++++------- net/core/lwt_bpf.c | 2 +- net/sched/act_bpf.c | 4 ++-- net/sched/cls_bpf.c | 4 ++-- 7 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index d29e58fde364..052bab3d62e7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -496,10 +496,13 @@ struct xdp_buff { void *data_hard_start; }; -/* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf, act_bpf and lwt programs +/* Compute the linear packet data range [data, data_end) which + * will be accessed by various program types (cls_bpf, act_bpf, + * lwt, ...). Subsystems allowing direct data access must (!) + * ensure that cb[] area can be written to when BPF program is + * invoked (otherwise cb[] save/restore is necessary). */ -static inline void bpf_compute_data_end(struct sk_buff *skb) +static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..a298d6666698 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) skb_orphan(skb); skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; @@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. 
*/ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6be41a44d688..df672517b4fd 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, ETH_HLEN); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) __skb_push(skb, ETH_HLEN); diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..c468e7cfad19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, skb_gso_reset(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1307731ddfe4..e7e626fb87bb 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c0c707eb2c96..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 520c5027646a..36671b0fb125 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if 
(at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: [PATCH 0877/1322] bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. 
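A minimal usage sketch of the mechanism described above, assuming the bpf_xdp_adjust_meta() wrapper and helpers from the selftests' bpf_helpers.h: the XDP program reserves 4 bytes of metadata in front of the packet and stores a value there, which a clsact ingress program on the same device can read back (a companion sketch appears at the end of this series). The struct layout, section name and program name are illustrative assumptions, not part of the patch.

```c
#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Metadata layout shared with the tc program (illustrative). */
struct meta_info {
	__u32 mark;
};

SEC("xdp")
int xdp_mark_meta(struct xdp_md *ctx)
{
	struct meta_info *meta;
	void *data;

	/* Grow the meta area by sizeof(*meta) bytes in front of xdp->data.
	 * The size must be a multiple of 4 and at most 32 bytes, and the
	 * driver must support data_meta, otherwise the helper fails.
	 */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
		return XDP_PASS;

	/* Packet pointers must be reloaded after the adjustment. */
	data = (void *)(long)ctx->data;
	meta = (void *)(long)ctx->data_meta;

	/* Verifier-mandated bounds check of data_meta against data. */
	if ((void *)(meta + 1) > data)
		return XDP_PASS;

	meta->mark = 0x42;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
```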
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1 + .../net/ethernet/cavium/thunder/nicvf_main.c | 1 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1 + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 1 + .../ethernet/netronome/nfp/nfp_net_common.c | 1 + drivers/net/ethernet/qlogic/qede/qede_fp.c | 1 + drivers/net/tun.c | 1 + drivers/net/virtio_net.c | 2 + include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++- include/linux/skbuff.h | 68 ++++++++++- include/uapi/linux/bpf.h | 13 +- kernel/bpf/verifier.c | 114 +++++++++++++----- net/bpf/test_run.c | 1 + net/core/dev.c | 33 ++++- net/core/filter.c | 77 +++++++++++- net/core/skbuff.c | 2 + 19 files changed, 298 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index d8f0c837b72c..06ce63c00821 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, xdp.data_hard_start = *data_ptr - offset; xdp.data = *data_ptr; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = *data_ptr + *len; orig_data = xdp.data; mapping = rx_buf->mapping - bp->rx_dma_offset; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 49b80da51ba7..d68478afccbf 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, xdp.data_hard_start = page_address(page); xdp.data = (void *)cpu_addr; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 1519dfb851d0..f426762bd83a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - i40e_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d962368d08d0..04bb03bda1cd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..8f9cb8abc497 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud xdp.data_hard_start = va - frags[0].page_offset; xdp.data = va; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + length; orig_data = xdp.data; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f1dd638384d3..30b3f3fbd719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, return false; xdp.data = va + *rx_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; xdp.data_hard_start = va; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1c0187f0af51..e3a38be3600a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start, xdp.data_hard_start = hard_start; xdp.data = data + *off; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = data + *off + *len; orig_data = xdp.data; diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 6fc854b120b0..48ec4c56cddf 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, xdp.data_hard_start = page_address(bd->data); xdp.data = xdp.data_hard_start + *data_offset; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + *len; /* Queues always have a full reset currently, so for the time diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c36f6ebad79..a6e0bffe3d29 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp.data_hard_start = buf; xdp.data = buf + pad; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index dd14a4547932..fc059f193e7d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; + xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); act = bpf_prog_run_xdp(xdp_prog, &xdp); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8390859e79e7..2b672c50f160 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -137,6 +137,7 @@ enum bpf_reg_type { PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ PTR_TO_STACK, /* reg == frame_pointer + offset */ + PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 052bab3d62e7..911d454af107 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -487,12 +487,14 @@ struct sk_filter { struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; + void *data_meta; void *data_end; }; struct xdp_buff { void *data; void *data_end; + void *data_meta; void 
*data_hard_start; }; @@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb)); - cb->data_end = skb->data + skb_headlen(skb); + cb->data_meta = skb->data - skb_metadata_len(skb); + cb->data_end = skb->data + skb_headlen(skb); } static inline u8 *bpf_skb_cb(struct sk_buff *skb) @@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev, struct bpf_prog *prog); void xdp_do_flush_map(void); +/* Drivers not supporting XDP metadata can use this helper, which + * rejects any room expansion for metadata as a result. + */ +static __always_inline void +xdp_set_data_meta_invalid(struct xdp_buff *xdp) +{ + xdp->data_meta = xdp->data + 1; +} + +static __always_inline bool +xdp_data_meta_unsupported(const struct xdp_buff *xdp) +{ + return unlikely(xdp->data_meta > xdp->data); +} + void bpf_warn_invalid_xdp_action(u32 act); -void bpf_warn_invalid_xdp_redirect(u32 ifindex); struct sock *do_sk_redirect_map(void); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f9db5539a6fb..19e64bfb1a66 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, * the end of the header data, ie. at skb->end. */ struct skb_shared_info { - unsigned short _unused; - unsigned char nr_frags; + __u8 __unused; + __u8 meta_len; + __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ @@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void) return 0; } +static inline u8 skb_metadata_len(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->meta_len; +} + +static inline void *skb_metadata_end(const struct sk_buff *skb) +{ + return skb_mac_header(skb); +} + +static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b, + u8 meta_len) +{ + const void *a = skb_metadata_end(skb_a); + const void *b = skb_metadata_end(skb_b); + /* Using more efficient varaiant than plain call to memcmp(). */ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + u64 diffs = 0; + + switch (meta_len) { +#define __it(x, op) (x -= sizeof(u##op)) +#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) + case 32: diffs |= __it_diff(a, b, 64); + case 24: diffs |= __it_diff(a, b, 64); + case 16: diffs |= __it_diff(a, b, 64); + case 8: diffs |= __it_diff(a, b, 64); + break; + case 28: diffs |= __it_diff(a, b, 64); + case 20: diffs |= __it_diff(a, b, 64); + case 12: diffs |= __it_diff(a, b, 64); + case 4: diffs |= __it_diff(a, b, 32); + break; + } + return diffs; +#else + return memcmp(a - meta_len, b - meta_len, meta_len); +#endif +} + +static inline bool skb_metadata_differs(const struct sk_buff *skb_a, + const struct sk_buff *skb_b) +{ + u8 len_a = skb_metadata_len(skb_a); + u8 len_b = skb_metadata_len(skb_b); + + if (!(len_a | len_b)) + return false; + + return len_a != len_b ? 
+ true : __skb_metadata_differs(skb_a, skb_b, len_a); +} + +static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) +{ + skb_shinfo(skb)->meta_len = meta_len; +} + +static inline void skb_metadata_clear(struct sk_buff *skb) +{ + skb_metadata_set(skb, 0); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..e43491ac4823 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -582,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -638,6 +644,7 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -715,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -723,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -783,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..f849eca36052 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...) va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. 
*/ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) mark_reg_unknown(state->regs, value_regno); @@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if 
(type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index df672517b4fd..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, 
const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/core/dev.c b/net/core/dev.c index 97abddd9039a..e350c768d4b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3864,8 +3864,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ - case XDP_PASS: break; - + case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); + break; default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); diff --git a/net/core/filter.c b/net/core/filter.c index c468e7cfad19..9b6e7e84aafd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 
0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, 
data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 000ce735fa8d..d98c2e3ce2bf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). From ac29991ba137cc0e3b0f647fb41e79300230f15c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:52 +0200 Subject: [PATCH 0878/1322] bpf: update bpf.h uapi header for tools Looks like a couple of updates missed to get carried into tools/include/uapi/, so copy the bpf.h header as usual to pull in latest updates. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- tools/include/uapi/linux/bpf.h | 45 ++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 461811e57140..e43491ac4823 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -143,12 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -enum bpf_sockmap_flags { - BPF_SOCKMAP_UNSPEC, - BPF_SOCKMAP_STRPARSER, - __MAX_BPF_SOCKMAP_FLAG -}; - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -368,9 +362,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) + * redirect to endpoint in map + * @map: pointer to dev map + * @key: index in map to lookup + * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -577,6 +582,12 @@ union bpf_attr { * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem + * + * int bpf_xdp_adjust_meta(xdp_md, delta) + * Adjust the xdp_md.data_meta by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data_meta + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -632,7 +643,8 @@ union bpf_attr { FN(skb_adjust_room), \ FN(redirect_map), \ FN(sk_redirect_map), \ - FN(sock_map_update), + FN(sock_map_update), \ + FN(xdp_adjust_meta), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -710,7 +722,7 @@ struct __sk_buff { __u32 data_end; __u32 napi_id; - /* accessed by BPF_PROG_TYPE_sk_skb types */ + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ __u32 family; __u32 remote_ip4; /* Stored in network byte order */ __u32 local_ip4; /* Stored in network byte order */ @@ -718,6 +730,9 @@ struct __sk_buff { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; }; struct bpf_tunnel_key { @@ -753,20 +768,23 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). 
*/ enum xdp_action { XDP_ABORTED = 0, XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook * @@ -775,6 +793,7 @@ enum xdp_action { struct xdp_md { __u32 data; __u32 data_end; + __u32 data_meta; }; enum sk_action {
From 22c8852624fc90a90709d1237625ca57999c5092 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:53 +0200 Subject: [PATCH 0879/1322] bpf: improve selftests and add tests for meta pointer
Add various test_verifier selftests, and a simple xdp/tc functional test that is attached to veths. Also let new versions of clang use the recently added -mcpu=probe support [1] for the BPF target, so that it can probe the underlying kernel for BPF insn set extensions. We could also just set this option unconditionally, since older versions simply ignore it and print a note telling the user that the -mcpu value is not supported; but since emitting that note cannot be turned off on the clang side, let's not confuse users running the selftests with it, and instead fall back to the default generic target when clang doesn't support probing. Also allow the CPU option to be overridden in the Makefile from the command line.
[1] https://github.com/llvm-mirror/llvm/commit/d7276a40d87b89aed89978dec6457a5b8b3a0db5
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/Makefile | 21 +- tools/testing/selftests/bpf/bpf_helpers.h | 2 + tools/testing/selftests/bpf/test_verifier.c | 247 +++++++++++++++++++ tools/testing/selftests/bpf/test_xdp_meta.c | 53 ++++ tools/testing/selftests/bpf/test_xdp_meta.sh | 51 ++++ 5 files changed, 370 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_xdp_meta.c create mode 100755 tools/testing/selftests/bpf/test_xdp_meta.sh
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f4b23d697448..924af8d79bde 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -15,9 +15,10 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ - test_pkt_md_access.o test_xdp_redirect.o sockmap_parse_prog.o sockmap_verdict_prog.o + test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ + sockmap_verdict_prog.o -TEST_PROGS := test_kmod.sh test_xdp_redirect.sh +TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh include ../lib.mk @@ -34,8 +35,20 @@ $(BPFOBJ): force $(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/ CLANG ?= clang +LLC ?= llc + +PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1) + +# Let newer LLVM versions transparently probe the kernel for availability +# of full BPF instruction set. +ifeq ($(PROBE),) + CPU ?= probe +else + CPU ?= generic +endif %.o: %.c $(CLANG) -I.
-I./include/uapi -I../../../include/uapi \ - -Wno-compare-distinct-pointer-types \ - -O2 -target bpf -c $< -o $@ + -Wno-compare-distinct-pointer-types \ + -O2 -target bpf -emit-llvm -c $< -o - | \ + $(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@ diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 4875395b0b52..a56053db26f5 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -62,6 +62,8 @@ static unsigned long long (*bpf_get_prandom_u32)(void) = (void *) BPF_FUNC_get_prandom_u32; static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = (void *) BPF_FUNC_xdp_adjust_head; +static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_meta; static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) = (void *) BPF_FUNC_setsockopt; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 26f3250bdcd2..a0426147523d 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6645,6 +6645,253 @@ static struct bpf_test tests[] = { .errstr = "BPF_END uses reserved fields", .result = REJECT, }, + { + "meta access, test1", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test2", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 8), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid access to packet, off=-8", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid access to packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test4", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid access to packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test5", + 
.insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_4, 3), + BPF_MOV64_IMM(BPF_REG_2, -8), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_xdp_adjust_meta), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "R3 !read_ok", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test6", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid access to packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test7", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test8", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF), + BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test9", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 1), + BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid access to packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test10", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_MOV64_IMM(BPF_REG_5, 42), + BPF_MOV64_IMM(BPF_REG_6, 24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), + BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), + BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8), + 
BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_5, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid access to packet", + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test11", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_MOV64_IMM(BPF_REG_5, 42), + BPF_MOV64_IMM(BPF_REG_6, 24), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), + BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), + BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), + BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8), + BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_5, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + }, + { + "meta access, test12", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_md, data_meta)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_md, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct xdp_md, data_end)), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_3), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16), + BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 5), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16), + BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_XDP, + }, }; static int probe_filter_length(const struct bpf_insn *fp) diff --git a/tools/testing/selftests/bpf/test_xdp_meta.c b/tools/testing/selftests/bpf/test_xdp_meta.c new file mode 100644 index 000000000000..8d0182650653 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_meta.c @@ -0,0 +1,53 @@ +#include +#include +#include + +#include "bpf_helpers.h" + +#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) +#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) +#define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem + +SEC("t") +int ing_cls(struct __sk_buff *ctx) +{ + __u8 *data, *data_meta, *data_end; + __u32 diff = 0; + + data_meta = ctx_ptr(ctx, data_meta); + data_end = ctx_ptr(ctx, data_end); + data = ctx_ptr(ctx, data); + + if (data + ETH_ALEN > data_end || + data_meta + round_up(ETH_ALEN, 4) > data) + return TC_ACT_SHOT; + + diff |= ((__u32 *)data_meta)[0] ^ ((__u32 *)data)[0]; + diff |= ((__u16 *)data_meta)[2] ^ ((__u16 *)data)[2]; + + return diff ? 
TC_ACT_SHOT : TC_ACT_OK; +} + +SEC("x") +int ing_xdp(struct xdp_md *ctx) +{ + __u8 *data, *data_meta, *data_end; + int ret; + + ret = bpf_xdp_adjust_meta(ctx, -round_up(ETH_ALEN, 4)); + if (ret < 0) + return XDP_DROP; + + data_meta = ctx_ptr(ctx, data_meta); + data_end = ctx_ptr(ctx, data_end); + data = ctx_ptr(ctx, data); + + if (data + ETH_ALEN > data_end || + data_meta + round_up(ETH_ALEN, 4) > data) + return XDP_DROP; + + __builtin_memcpy(data_meta, data, ETH_ALEN); + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh new file mode 100755 index 000000000000..307aa856cee3 --- /dev/null +++ b/tools/testing/selftests/bpf/test_xdp_meta.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +cleanup() +{ + if [ "$?" = "0" ]; then + echo "selftests: test_xdp_meta [PASS]"; + else + echo "selftests: test_xdp_meta [FAILED]"; + fi + + set +e + ip netns del ns1 2> /dev/null + ip netns del ns2 2> /dev/null +} + +ip link set dev lo xdp off 2>/dev/null > /dev/null +if [ $? -ne 0 ];then + echo "selftests: [SKIP] Could not run test without the ip xdp support" + exit 0 +fi +set -e + +ip netns add ns1 +ip netns add ns2 + +trap cleanup 0 2 3 6 9 + +ip link add veth1 type veth peer name veth2 + +ip link set veth1 netns ns1 +ip link set veth2 netns ns2 + +ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth1 +ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth2 + +ip netns exec ns1 tc qdisc add dev veth1 clsact +ip netns exec ns2 tc qdisc add dev veth2 clsact + +ip netns exec ns1 tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t +ip netns exec ns2 tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t + +ip netns exec ns1 ip link set dev veth1 xdp obj test_xdp_meta.o sec x +ip netns exec ns2 ip link set dev veth2 xdp obj test_xdp_meta.o sec x + +ip netns exec ns1 ip link set dev veth1 up +ip netns exec ns2 ip link set dev veth2 up + +ip netns exec ns1 ping -c 1 10.1.1.22 +ip netns exec ns2 ping -c 1 10.1.1.11 + +exit 0 From 65d88fd0baaa5c9def9383ac696097911d4ceb73 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:54 +0200 Subject: [PATCH 0880/1322] bpf, nfp: add meta data support Implement support for transferring XDP meta data into skb for nfp driver; before calling into the program, xdp.data_meta points to xdp.data, where on program return with pass verdict, we call into skb_metadata_set(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- .../ethernet/netronome/nfp/nfp_net_common.c | 40 +++++++------------ 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index e3a38be3600a..d2f73feb8497 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1574,27 +1574,6 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring, return true; } -static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start, - unsigned int *off, unsigned int *len) -{ - struct xdp_buff xdp; - void *orig_data; - int ret; - - xdp.data_hard_start = hard_start; - xdp.data = data + *off; - xdp_set_data_meta_invalid(&xdp); - xdp.data_end = data + *off + *len; - - orig_data = xdp.data; - ret = bpf_prog_run_xdp(prog, &xdp); - - *len -= xdp.data - orig_data; - *off += xdp.data - orig_data; - - return ret; -} - /** * nfp_net_rx() - receive up to @budget packets on @rx_ring * @rx_ring: RX ring to receive from @@ -1630,6 +1609,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) struct nfp_meta_parsed meta; struct net_device *netdev; dma_addr_t new_dma_addr; + u32 meta_len_xdp = 0; void *new_frag; idx = D_IDX(rx_ring, rx_ring->rd_p); @@ -1708,16 +1688,24 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) if (xdp_prog && !(rxd->rxd.flags & PCIE_DESC_RX_BPF && dp->bpf_offload_xdp) && !meta.portid) { + void *orig_data = rxbuf->frag + pkt_off; unsigned int dma_off; - void *hard_start; + struct xdp_buff xdp; int act; - hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM; + xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM; + xdp.data = orig_data; + xdp.data_meta = orig_data; + xdp.data_end = orig_data + pkt_len; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + pkt_len -= xdp.data - orig_data; + pkt_off += xdp.data - orig_data; - act = nfp_net_run_xdp(xdp_prog, rxbuf->frag, hard_start, - &pkt_off, &pkt_len); switch (act) { case XDP_PASS: + meta_len_xdp = xdp.data - xdp.data_meta; break; case XDP_TX: dma_off = pkt_off - NFP_NET_RX_BUF_HEADROOM; @@ -1785,6 +1773,8 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) if (rxd->rxd.flags & PCIE_DESC_RX_VLAN) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), le16_to_cpu(rxd->rxd.vlan)); + if (meta_len_xdp) + skb_metadata_set(skb, meta_len_xdp); napi_gro_receive(&rx_ring->r_vec->napi, skb); } From 366a88fe2f40d6772985ec78cdd34df7f109bb88 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:55 +0200 Subject: [PATCH 0881/1322] bpf, ixgbe: add meta data support Implement support for transferring XDP meta data into skb for ixgbe driver; before calling into the program, xdp.data_meta points to xdp.data, where on program return with pass verdict, we call into skb_metadata_set(). We implement this for the default ixgbe_build_skb() variant. For the ixgbe_construct_skb() that is used when legacy-rx buffer mananagement mode is turned on via ethtool, I found that XDP gets 0 headroom, so neither xdp_adjust_head() nor xdp_adjust_meta() can be used with this. Just add a comment with explanation for this operating mode. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. 
Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 04bb03bda1cd..3942c6208745 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2133,6 +2133,21 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring, #if L1_CACHE_BYTES < 128 prefetch(xdp->data + L1_CACHE_BYTES); #endif + /* Note, we get here by enabling legacy-rx via: + * + * ethtool --set-priv-flags legacy-rx on + * + * In this mode, we currently get 0 extra XDP headroom as + * opposed to having legacy-rx off, where we process XDP + * packets going to stack via ixgbe_build_skb(). The latter + * provides us currently with 192 bytes of headroom. + * + * For ixgbe_construct_skb() mode it means that the + * xdp->data_meta will always point to xdp->data, since + * the helper cannot expand the head. Should this ever + * change in future for legacy-rx mode on, then lets also + * add xdp->data_meta handling here. + */ /* allocate a skb to store the frags */ skb = napi_alloc_skb(&rx_ring->q_vector->napi, IXGBE_RX_HDR_SIZE); @@ -2165,6 +2180,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, struct xdp_buff *xdp, union ixgbe_adv_rx_desc *rx_desc) { + unsigned int metasize = xdp->data - xdp->data_meta; #if (PAGE_SIZE < 8192) unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2; #else @@ -2174,10 +2190,14 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, #endif struct sk_buff *skb; - /* prefetch first cache line of first page */ - prefetch(xdp->data); + /* Prefetch first cache line of first page. If xdp->data_meta + * is unused, this points extactly as xdp->data, otherwise we + * likely have a consumer accessing first few bytes of meta + * data, and then actual data. + */ + prefetch(xdp->data_meta); #if L1_CACHE_BYTES < 128 - prefetch(xdp->data + L1_CACHE_BYTES); + prefetch(xdp->data_meta + L1_CACHE_BYTES); #endif /* build an skb to around the page buffer */ @@ -2188,6 +2208,8 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, /* update pointers within the skb to store the data */ skb_reserve(skb, xdp->data - xdp->data_hard_start); __skb_put(skb, xdp->data_end - xdp->data); + if (metasize) + skb_metadata_set(skb, metasize); /* record DMA address if this is the start of a chain of buffers */ if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)) @@ -2326,7 +2348,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, if (!skb) { xdp.data = page_address(rx_buffer->page) + rx_buffer->page_offset; - xdp_set_data_meta_invalid(&xdp); + xdp.data_meta = xdp.data; xdp.data_hard_start = xdp.data - ixgbe_rx_offset(rx_ring); xdp.data_end = xdp.data + size; From d85fc17beeb06f9979d63fe4d9fbffbb1a00bba4 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 25 Sep 2017 10:48:47 +0300 Subject: [PATCH 0882/1322] aquantia: Setup max_mtu in ndev to enable jumbo frames Although hardware is capable for almost 16K MTU, without max_mtu field correctly set it only allows standard MTU to be used. This patch enables max MTU, calculating it from hardware maximum frame size of 16352 octets (including FCS). Fixes: 5513e16421cb ("net: ethernet: aquantia: Fixes for aq_ndev_change_mtu") Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 11 ++--------- .../aquantia/atlantic/hw_atl/hw_atl_b0_internal.h | 2 +- 2 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 6ac9e2602d6d..bf26a59a9d8e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -214,7 +214,6 @@ struct aq_nic_s *aq_nic_alloc_cold(const struct net_device_ops *ndev_ops, SET_NETDEV_DEV(ndev, dev); ndev->if_port = port; - ndev->min_mtu = ETH_MIN_MTU; self->ndev = ndev; self->aq_pci_func = aq_pci_func; @@ -283,6 +282,7 @@ int aq_nic_ndev_init(struct aq_nic_s *self) self->ndev->features = aq_hw_caps->hw_features; self->ndev->priv_flags = aq_hw_caps->hw_priv_flags; self->ndev->mtu = aq_nic_cfg->mtu - ETH_HLEN; + self->ndev->max_mtu = self->aq_hw_caps.mtu - ETH_FCS_LEN - ETH_HLEN; return 0; } @@ -693,16 +693,9 @@ int aq_nic_set_multicast_list(struct aq_nic_s *self, struct net_device *ndev) int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu) { - int err = 0; - - if (new_mtu > self->aq_hw_caps.mtu) { - err = -EINVAL; - goto err_exit; - } self->aq_nic_cfg.mtu = new_mtu; -err_exit: - return err; + return 0; } int aq_nic_set_mac(struct aq_nic_s *self, struct net_device *ndev)
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h index f3957e930340..fcf89e25a773 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0_internal.h @@ -16,7 +16,7 @@ #include "../aq_common.h" -#define HW_ATL_B0_MTU_JUMBO (16000U) +#define HW_ATL_B0_MTU_JUMBO 16352U #define HW_ATL_B0_MTU 1514U #define HW_ATL_B0_TX_RINGS 4U
From 3aec6412e007b294d4c135f5c7ed5e5ecf37dd2e Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 25 Sep 2017 10:48:48 +0300 Subject: [PATCH 0883/1322] aquantia: Fix Tx queue hangups
The driver did a poor job of managing its Tx queues: it could stop Tx queues in aq_nic_xmit due to a link-down condition, but it never woke them up again, which left the Tx path completely stalled. This patch fixes that and improves generic queue management: - introduces a queue restart counter - uses the generic netif_ interface to disable and enable the Tx path - refactors the link up/down handling and introduces a dmesg log message when the link state changes - introduces a new constant for the minimum number of free descriptors required to wake a queue
Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S.
Miller --- .../net/ethernet/aquantia/atlantic/aq_cfg.h | 4 + .../net/ethernet/aquantia/atlantic/aq_nic.c | 91 ++++++++----------- .../net/ethernet/aquantia/atlantic/aq_nic.h | 2 - .../net/ethernet/aquantia/atlantic/aq_ring.c | 26 ++++++ .../net/ethernet/aquantia/atlantic/aq_ring.h | 4 + .../net/ethernet/aquantia/atlantic/aq_vec.c | 8 +- 6 files changed, 76 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h index 214986436ece..0fdaaa643073 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h @@ -51,6 +51,10 @@ #define AQ_CFG_SKB_FRAGS_MAX 32U +/* Number of descriptors available in one ring to resume this ring queue + */ +#define AQ_CFG_RESTART_DESC_THRES (AQ_CFG_SKB_FRAGS_MAX * 2) + #define AQ_CFG_NAPI_WEIGHT 64U #define AQ_CFG_MULTICAST_ADDRESS_MAX 32U diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index bf26a59a9d8e..072a55029f04 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -119,6 +119,35 @@ int aq_nic_cfg_start(struct aq_nic_s *self) return 0; } +static int aq_nic_update_link_status(struct aq_nic_s *self) +{ + int err = self->aq_hw_ops.hw_get_link_status(self->aq_hw); + + if (err) + return err; + + if (self->link_status.mbps != self->aq_hw->aq_link_status.mbps) + pr_info("%s: link change old %d new %d\n", + AQ_CFG_DRV_NAME, self->link_status.mbps, + self->aq_hw->aq_link_status.mbps); + + self->link_status = self->aq_hw->aq_link_status; + if (!netif_carrier_ok(self->ndev) && self->link_status.mbps) { + aq_utils_obj_set(&self->header.flags, + AQ_NIC_FLAG_STARTED); + aq_utils_obj_clear(&self->header.flags, + AQ_NIC_LINK_DOWN); + netif_carrier_on(self->ndev); + netif_tx_wake_all_queues(self->ndev); + } + if (netif_carrier_ok(self->ndev) && !self->link_status.mbps) { + netif_carrier_off(self->ndev); + netif_tx_disable(self->ndev); + aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN); + } + return 0; +} + static void aq_nic_service_timer_cb(unsigned long param) { struct aq_nic_s *self = (struct aq_nic_s *)param; @@ -131,26 +160,13 @@ static void aq_nic_service_timer_cb(unsigned long param) if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY)) goto err_exit; - err = self->aq_hw_ops.hw_get_link_status(self->aq_hw); - if (err < 0) + err = aq_nic_update_link_status(self); + if (err) goto err_exit; - self->link_status = self->aq_hw->aq_link_status; - self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw, self->aq_nic_cfg.is_interrupt_moderation); - if (self->link_status.mbps) { - aq_utils_obj_set(&self->header.flags, - AQ_NIC_FLAG_STARTED); - aq_utils_obj_clear(&self->header.flags, - AQ_NIC_LINK_DOWN); - netif_carrier_on(self->ndev); - } else { - netif_carrier_off(self->ndev); - aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN); - } - memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s)); memset(&stats_tx, 0U, sizeof(struct aq_ring_stats_tx_s)); for (i = AQ_DIMOF(self->aq_vec); i--;) { @@ -240,7 +256,6 @@ err_exit: int aq_nic_ndev_register(struct aq_nic_s *self) { int err = 0; - unsigned int i = 0U; if (!self->ndev) { err = -EINVAL; @@ -262,8 +277,7 @@ int aq_nic_ndev_register(struct aq_nic_s *self) netif_carrier_off(self->ndev); - for (i = AQ_CFG_VECS_MAX; i--;) - aq_nic_ndev_queue_stop(self, i); + netif_tx_disable(self->ndev); err = register_netdev(self->ndev); if (err < 0) @@ 
-318,12 +332,8 @@ struct aq_nic_s *aq_nic_alloc_hot(struct net_device *ndev) err = -EINVAL; goto err_exit; } - if (netif_running(ndev)) { - unsigned int i; - - for (i = AQ_CFG_VECS_MAX; i--;) - netif_stop_subqueue(ndev, i); - } + if (netif_running(ndev)) + netif_tx_disable(ndev); for (self->aq_vecs = 0; self->aq_vecs < self->aq_nic_cfg.vecs; self->aq_vecs++) { @@ -383,16 +393,6 @@ err_exit: return err; } -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx) -{ - netif_start_subqueue(self->ndev, idx); -} - -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx) -{ - netif_stop_subqueue(self->ndev, idx); -} - int aq_nic_start(struct aq_nic_s *self) { struct aq_vec_s *aq_vec = NULL; @@ -451,10 +451,6 @@ int aq_nic_start(struct aq_nic_s *self) goto err_exit; } - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) - aq_nic_ndev_queue_start(self, i); - err = netif_set_real_num_tx_queues(self->ndev, self->aq_vecs); if (err < 0) goto err_exit; @@ -463,6 +459,8 @@ int aq_nic_start(struct aq_nic_s *self) if (err < 0) goto err_exit; + netif_tx_start_all_queues(self->ndev); + err_exit: return err; } @@ -602,7 +600,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs; unsigned int tc = 0U; int err = NETDEV_TX_OK; - bool is_nic_in_bad_state; frags = skb_shinfo(skb)->nr_frags + 1; @@ -613,13 +610,10 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) goto err_exit; } - is_nic_in_bad_state = aq_utils_obj_test(&self->header.flags, - AQ_NIC_FLAGS_IS_NOT_TX_READY) || - (aq_ring_avail_dx(ring) < - AQ_CFG_SKB_FRAGS_MAX); + aq_ring_update_queue_state(ring); - if (is_nic_in_bad_state) { - aq_nic_ndev_queue_stop(self, ring->idx); + /* Above status update may stop the queue. Check this. 
*/ + if (__netif_subqueue_stopped(self->ndev, ring->idx)) { err = NETDEV_TX_BUSY; goto err_exit; } @@ -631,9 +625,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) ring, frags); if (err >= 0) { - if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1) - aq_nic_ndev_queue_stop(self, ring->idx); - ++ring->stats.tx.packets; ring->stats.tx.bytes += skb->len; } @@ -898,9 +889,7 @@ int aq_nic_stop(struct aq_nic_s *self) struct aq_vec_s *aq_vec = NULL; unsigned int i = 0U; - for (i = 0U, aq_vec = self->aq_vec[0]; - self->aq_vecs > i; ++i, aq_vec = self->aq_vec[i]) - aq_nic_ndev_queue_stop(self, i); + netif_tx_disable(self->ndev); del_timer_sync(&self->service_timer); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h index 7fc2a5ecb2b7..0ddd556ff901 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h @@ -83,8 +83,6 @@ struct net_device *aq_nic_get_ndev(struct aq_nic_s *self); int aq_nic_init(struct aq_nic_s *self); int aq_nic_cfg_start(struct aq_nic_s *self); int aq_nic_ndev_register(struct aq_nic_s *self); -void aq_nic_ndev_queue_start(struct aq_nic_s *self, unsigned int idx); -void aq_nic_ndev_queue_stop(struct aq_nic_s *self, unsigned int idx); void aq_nic_ndev_free(struct aq_nic_s *self); int aq_nic_start(struct aq_nic_s *self); int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 4eee1996a825..02f79b0640ba 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -104,6 +104,32 @@ int aq_ring_init(struct aq_ring_s *self) return 0; } +void aq_ring_update_queue_state(struct aq_ring_s *ring) +{ + if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX) + aq_ring_queue_stop(ring); + else if (aq_ring_avail_dx(ring) > AQ_CFG_RESTART_DESC_THRES) + aq_ring_queue_wake(ring); +} + +void aq_ring_queue_wake(struct aq_ring_s *ring) +{ + struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic); + + if (__netif_subqueue_stopped(ndev, ring->idx)) { + netif_wake_subqueue(ndev, ring->idx); + ring->stats.tx.queue_restarts++; + } +} + +void aq_ring_queue_stop(struct aq_ring_s *ring) +{ + struct net_device *ndev = aq_nic_get_ndev(ring->aq_nic); + + if (!__netif_subqueue_stopped(ndev, ring->idx)) + netif_stop_subqueue(ndev, ring->idx); +} + void aq_ring_tx_clean(struct aq_ring_s *self) { struct device *dev = aq_nic_get_dev(self->aq_nic); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h index 782176c5f4f8..24523b5ac68c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h @@ -94,6 +94,7 @@ struct aq_ring_stats_tx_s { u64 errors; u64 packets; u64 bytes; + u64 queue_restarts; }; union aq_ring_stats_s { @@ -147,6 +148,9 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self, int aq_ring_init(struct aq_ring_s *self); void aq_ring_rx_deinit(struct aq_ring_s *self); void aq_ring_free(struct aq_ring_s *self); +void aq_ring_update_queue_state(struct aq_ring_s *ring); +void aq_ring_queue_wake(struct aq_ring_s *ring); +void aq_ring_queue_stop(struct aq_ring_s *ring); void aq_ring_tx_clean(struct aq_ring_s *self); int aq_ring_rx_clean(struct aq_ring_s *self, struct napi_struct *napi, diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c 
b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index ebf588004c46..305ff8ffac2c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -59,12 +59,7 @@ static int aq_vec_poll(struct napi_struct *napi, int budget) if (ring[AQ_VEC_TX_ID].sw_head != ring[AQ_VEC_TX_ID].hw_head) { aq_ring_tx_clean(&ring[AQ_VEC_TX_ID]); - - if (aq_ring_avail_dx(&ring[AQ_VEC_TX_ID]) > - AQ_CFG_SKB_FRAGS_MAX) { - aq_nic_ndev_queue_start(self->aq_nic, - ring[AQ_VEC_TX_ID].idx); - } + aq_ring_update_queue_state(&ring[AQ_VEC_TX_ID]); was_tx_cleaned = true; } @@ -364,6 +359,7 @@ void aq_vec_add_stats(struct aq_vec_s *self, stats_tx->packets += tx->packets; stats_tx->bytes += tx->bytes; stats_tx->errors += tx->errors; + stats_tx->queue_restarts += tx->queue_restarts; } } From a7bb1bea3a296549ebfc28afa76276ef392f9afa Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 25 Sep 2017 10:48:49 +0300 Subject: [PATCH 0884/1322] aquantia: Fix transient invalid link down/up indications Due to a bug in aquantia atlantic card firmware, it sometimes reports invalid link speed bits. That caused driver to report link down events, although link itself is totally fine. This patch ignores such out of blue readings. Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 4f5ec9a0fbfb..bf734b32e44b 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -351,8 +351,7 @@ int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self) break; default: - link_status->mbps = 0U; - break; + return -EBUSY; } } From c7545689244b50c562b1fbbc71905fba224c8a05 Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 25 Sep 2017 10:48:50 +0300 Subject: [PATCH 0885/1322] atlantic: fix iommu errors Call skb_frag_dma_map multiple times if tx length is greater than device max and avoid processing tx ring until entire packet has been sent. Signed-off-by: Igor Russkikh Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_nic.c | 43 +++++++++++-------- .../net/ethernet/aquantia/atlantic/aq_ring.c | 27 ++++++++---- .../net/ethernet/aquantia/atlantic/aq_ring.h | 6 ++- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 072a55029f04..0a5bb4114eb4 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -473,6 +473,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, unsigned int nr_frags = skb_shinfo(skb)->nr_frags; unsigned int frag_count = 0U; unsigned int dx = ring->sw_tail; + struct aq_ring_buff_s *first = NULL; struct aq_ring_buff_s *dx_buff = &ring->buff_ring[dx]; if (unlikely(skb_is_gso(skb))) { @@ -483,6 +484,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, dx_buff->len_l4 = tcp_hdrlen(skb); dx_buff->mss = skb_shinfo(skb)->gso_size; dx_buff->is_txc = 1U; + dx_buff->eop_index = 0xffffU; dx_buff->is_ipv6 = (ip_hdr(skb)->version == 6) ? 
1U : 0U; @@ -502,6 +504,7 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) goto exit; + first = dx_buff; dx_buff->len_pkt = skb->len; dx_buff->is_sop = 1U; dx_buff->is_mapped = 1U; @@ -530,40 +533,46 @@ static unsigned int aq_nic_map_skb(struct aq_nic_s *self, for (; nr_frags--; ++frag_count) { unsigned int frag_len = 0U; + unsigned int buff_offset = 0U; + unsigned int buff_size = 0U; dma_addr_t frag_pa; skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_count]; frag_len = skb_frag_size(frag); - frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), frag, 0, - frag_len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(aq_nic_get_dev(self), frag_pa))) - goto mapping_error; + while (frag_len) { + if (frag_len > AQ_CFG_TX_FRAME_MAX) + buff_size = AQ_CFG_TX_FRAME_MAX; + else + buff_size = frag_len; + + frag_pa = skb_frag_dma_map(aq_nic_get_dev(self), + frag, + buff_offset, + buff_size, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(aq_nic_get_dev(self), + frag_pa))) + goto mapping_error; - while (frag_len > AQ_CFG_TX_FRAME_MAX) { dx = aq_ring_next_dx(ring, dx); dx_buff = &ring->buff_ring[dx]; dx_buff->flags = 0U; - dx_buff->len = AQ_CFG_TX_FRAME_MAX; + dx_buff->len = buff_size; dx_buff->pa = frag_pa; dx_buff->is_mapped = 1U; + dx_buff->eop_index = 0xffffU; + + frag_len -= buff_size; + buff_offset += buff_size; - frag_len -= AQ_CFG_TX_FRAME_MAX; - frag_pa += AQ_CFG_TX_FRAME_MAX; ++ret; } - - dx = aq_ring_next_dx(ring, dx); - dx_buff = &ring->buff_ring[dx]; - - dx_buff->flags = 0U; - dx_buff->len = frag_len; - dx_buff->pa = frag_pa; - dx_buff->is_mapped = 1U; - ++ret; } + first->eop_index = dx; dx_buff->is_eop = 1U; dx_buff->skb = skb; goto exit; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 02f79b0640ba..0654e0c76bc2 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -104,6 +104,12 @@ int aq_ring_init(struct aq_ring_s *self) return 0; } +static inline bool aq_ring_dx_in_range(unsigned int h, unsigned int i, + unsigned int t) +{ + return (h < t) ? ((h < i) && (i < t)) : ((h < i) || (i < t)); +} + void aq_ring_update_queue_state(struct aq_ring_s *ring) { if (aq_ring_avail_dx(ring) <= AQ_CFG_SKB_FRAGS_MAX) @@ -139,23 +145,28 @@ void aq_ring_tx_clean(struct aq_ring_s *self) struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head]; if (likely(buff->is_mapped)) { - if (unlikely(buff->is_sop)) + if (unlikely(buff->is_sop)) { + if (!buff->is_eop && + buff->eop_index != 0xffffU && + (!aq_ring_dx_in_range(self->sw_head, + buff->eop_index, + self->hw_head))) + break; + dma_unmap_single(dev, buff->pa, buff->len, DMA_TO_DEVICE); - else + } else { dma_unmap_page(dev, buff->pa, buff->len, DMA_TO_DEVICE); + } } if (unlikely(buff->is_eop)) dev_kfree_skb_any(buff->skb); - } -} -static inline unsigned int aq_ring_dx_in_range(unsigned int h, unsigned int i, - unsigned int t) -{ - return (h < t) ? 
((h < i) && (i < t)) : ((h < i) || (i < t)); + buff->pa = 0U; + buff->eop_index = 0xffffU; + } } #define AQ_SKB_ALIGN SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h index 24523b5ac68c..5844078764bd 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h @@ -65,7 +65,7 @@ struct __packed aq_ring_buff_s { }; union { struct { - u32 len:16; + u16 len; u32 is_ip_cso:1; u32 is_udp_cso:1; u32 is_tcp_cso:1; @@ -77,8 +77,10 @@ struct __packed aq_ring_buff_s { u32 is_cleaned:1; u32 is_error:1; u32 rsvd3:6; + u16 eop_index; + u16 rsvd4; }; - u32 flags; + u64 flags; }; }; From fc46820b27a2d9a46f7e90c9ceb4a64a1bc5fab8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 25 Sep 2017 12:23:03 +0200 Subject: [PATCH 0886/1322] vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets In generic_file_llseek_size, return -ENXIO for negative offsets as well as offsets beyond EOF. This affects filesystems which don't implement SEEK_HOLE / SEEK_DATA internally, possibly because they don't support holes. Fixes xfstest generic/448. Signed-off-by: Andreas Gruenbacher Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index a2b9a47235c5..f0d4b16873e8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -112,7 +112,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -120,7 +120,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; From be2336ebfd7a1aec597a26f086fc4235ab87dd2c Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:21 +0200 Subject: [PATCH 0887/1322] mlxsw: spectrum_dpipe: Fix indentation in header description Fix indentation in mlxsw_meta header's description. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlxsw/spectrum_dpipe.c | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index 51e6846da72b..91648094ab4c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -46,18 +46,21 @@ enum mlxsw_sp_field_metadata_id { }; static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = { - { .name = "erif_port", - .id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT, - .bitwidth = 32, - .mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX, + { + .name = "erif_port", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT, + .bitwidth = 32, + .mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX, }, - { .name = "l3_forward", - .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD, - .bitwidth = 1, + { + .name = "l3_forward", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD, + .bitwidth = 1, }, - { .name = "l3_drop", - .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP, - .bitwidth = 1, + { + .name = "l3_drop", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP, + .bitwidth = 1, }, }; From c0859d697c258f7c864e81bc1f83d1c274e7cf4c Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:22 +0200 Subject: [PATCH 0888/1322] mlxsw: Add fields for mlxsw's meta header for adjacency table This patch adds field for mlxsw's meta header which will be used to describe the match/action behavior of the adjacency table. The fields are: 1. Adj_index - The global index of the nexthop group in the adjacency table. 2. Adj_hash_index - Local index offset which is based on packets hash mod the nexthop group size. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index 91648094ab4c..9253273a5c03 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -43,6 +43,8 @@ enum mlxsw_sp_field_metadata_id { MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT, MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD, MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP, + MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX, + MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX, }; static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = { @@ -62,6 +64,16 @@ static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = { .id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP, .bitwidth = 1, }, + { + .name = "adj_index", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX, + .bitwidth = 32, + }, + { + .name = "adj_hash_index", + .id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX, + .bitwidth = 32, + }, }; enum mlxsw_sp_dpipe_header_id { From dbe4598c1e929a24dc352a7dc523a3cc22a093f2 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:23 +0200 Subject: [PATCH 0889/1322] mlxsw: spectrum_router: Keep nexthops in a linked list Keep nexthops in a linked list for easy access. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 0bd93dc88ffa..0cd4b2a7d9d0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -78,6 +78,7 @@ struct mlxsw_sp_router { struct rhashtable neigh_ht; struct rhashtable nexthop_group_ht; struct rhashtable nexthop_ht; + struct list_head nexthop_list; struct { struct mlxsw_sp_lpm_tree *trees; unsigned int tree_count; @@ -2028,6 +2029,7 @@ struct mlxsw_sp_nexthop_key { struct mlxsw_sp_nexthop { struct list_head neigh_list_node; /* member of neigh entry list */ struct list_head rif_list_node; + struct list_head router_list_node; struct mlxsw_sp_nexthop_group *nh_grp; /* pointer back to the group * this belongs to */ @@ -2784,6 +2786,8 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, if (err) return err; + list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); + if (!dev) return 0; @@ -2807,6 +2811,7 @@ static void mlxsw_sp_nexthop4_fini(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh) { mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh); + list_del(&nh->router_list_node); mlxsw_sp_nexthop_remove(mlxsw_sp, nh); } @@ -4045,6 +4050,8 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, nh->nh_grp = nh_grp; memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); + list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); + if (!dev) return 0; nh->ifindex = dev->ifindex; @@ -4056,6 +4063,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh) { mlxsw_sp_nexthop6_type_fini(mlxsw_sp, nh); + list_del(&nh->router_list_node); } static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, @@ -5990,6 +5998,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) if (err) goto err_nexthop_group_ht_init; + INIT_LIST_HEAD(&mlxsw_sp->router->nexthop_list); err = mlxsw_sp_lpm_init(mlxsw_sp); if (err) goto err_lpm_init; From ec2437f42b44edc84054feb943d49e8030154c38 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:24 +0200 Subject: [PATCH 0890/1322] mlxsw: spectrum_router: Use helper to check for last neighbor Use list_is_last helper to check for last neighbor. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 0cd4b2a7d9d0..65e59a989084 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1317,7 +1317,7 @@ mlxsw_sp_rif_neigh_next(struct mlxsw_sp_rif *rif, typeof(*neigh_entry), rif_list_node); } - if (neigh_entry->rif_list_node.next == &rif->neigh_list) + if (list_is_last(&neigh_entry->rif_list_node, &rif->neigh_list)) return NULL; return list_next_entry(neigh_entry, rif_list_node); } From c556cd28930661f337d7989fe74ac31871fd3888 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:25 +0200 Subject: [PATCH 0891/1322] mlxsw: spectrum_router: Add helpers for nexthop access This is done as a preparation before introducing the ability to dump the adjacency table via dpipe, and to count the table size. The current table implementation avoids tunnel entries, thus a helper for checking if the nexthop group contains tunnel entries is also provided. The mlxsw's nexthop representative struct stays private to the router module. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 71 +++++++++++++++++++ .../ethernet/mellanox/mlxsw/spectrum_router.h | 12 ++++ 2 files changed, 83 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 65e59a989084..c062b4f666e3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2068,6 +2068,77 @@ struct mlxsw_sp_nexthop_group { #define nh_rif nexthops[0].rif }; +struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router, + struct mlxsw_sp_nexthop *nh) +{ + if (!nh) { + if (list_empty(&router->nexthop_list)) + return NULL; + else + return list_first_entry(&router->nexthop_list, + typeof(*nh), router_list_node); + } + if (list_is_last(&nh->router_list_node, &router->nexthop_list)) + return NULL; + return list_next_entry(nh, router_list_node); +} + +bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh) +{ + return nh->offloaded; +} + +unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh) +{ + if (!nh->offloaded) + return NULL; + return nh->neigh_entry->ha; +} + +int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index, + u32 *p_adj_hash_index) +{ + struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp; + u32 adj_hash_index = 0; + int i; + + if (!nh->offloaded || !nh_grp->adj_index_valid) + return -EINVAL; + + *p_adj_index = nh_grp->adj_index; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i]; + + if (nh_iter == nh) + break; + if (nh_iter->offloaded) + adj_hash_index++; + } + + *p_adj_hash_index = adj_hash_index; + return 0; +} + +struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh) +{ + return nh->rif; +} + +bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh) +{ + struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp; + int i; + + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i]; + + if (nh_iter->type == MLXSW_SP_NEXTHOP_TYPE_IPIP) + return true; + } + return false; +} + static struct fib_info * 
mlxsw_sp_nexthop4_group_fi(const struct mlxsw_sp_nexthop_group *nh_grp) { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index ae4c99b3f2fc..d6951d516cf4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -62,6 +62,7 @@ enum mlxsw_sp_rif_counter_dir { }; struct mlxsw_sp_neigh_entry; +struct mlxsw_sp_nexthop; struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp, u16 rif_index); @@ -108,5 +109,16 @@ union mlxsw_sp_l3addr mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto, const struct net_device *ol_dev); __be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev); +struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router, + struct mlxsw_sp_nexthop *nh); +bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh); +unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh); +int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index, + u32 *p_adj_hash_index); +struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh); +bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh); +#define mlxsw_sp_nexthop_for_each(nh, router) \ + for (nh = mlxsw_sp_nexthop_next(router, NULL); nh; \ + nh = mlxsw_sp_nexthop_next(router, nh)) #endif /* _MLXSW_ROUTER_H_*/ From c538adb3c6e77e0f6563b71923a81de182b5132c Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:26 +0200 Subject: [PATCH 0892/1322] mlxsw: spectrum_dpipe: Add initial support for the router adjacency table Add initial support for router adjacency table. The table does lookup based on the nexthop-group index and the local nexthop offset. After locating the nexthop entry it sets the destination MAC address and the egress RIF. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlxsw/spectrum_dpipe.c | 100 +++++++++++++++++- .../ethernet/mellanox/mlxsw/spectrum_dpipe.h | 1 + 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index 9253273a5c03..ca16f8924c0a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -841,6 +841,97 @@ static void mlxsw_sp_dpipe_host6_table_fini(struct mlxsw_sp *mlxsw_sp) MLXSW_SP_DPIPE_TABLE_NAME_HOST6); } +static int mlxsw_sp_dpipe_table_adj_matches_dump(void *priv, + struct sk_buff *skb) +{ + struct devlink_dpipe_match match = {0}; + int err; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX; + + err = devlink_dpipe_match_put(skb, &match); + if (err) + return err; + + match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match.header = &mlxsw_sp_dpipe_header_metadata; + match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX; + + return devlink_dpipe_match_put(skb, &match); +} + +static int mlxsw_sp_dpipe_table_adj_actions_dump(void *priv, + struct sk_buff *skb) +{ + struct devlink_dpipe_action action = {0}; + int err; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &devlink_dpipe_header_ethernet; + action.field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC; + + err = devlink_dpipe_action_put(skb, &action); + if (err) + return err; + + action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action.header = &mlxsw_sp_dpipe_header_metadata; + action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; + + return devlink_dpipe_action_put(skb, &action); +} + +static u64 mlxsw_sp_dpipe_table_adj_size(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_nexthop *nh; + u64 size = 0; + + mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) + if (mlxsw_sp_nexthop_offload(nh) && + !mlxsw_sp_nexthop_group_has_ipip(nh)) + size++; + return size; +} + +static u64 +mlxsw_sp_dpipe_table_adj_size_get(void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + u64 size; + + rtnl_lock(); + size = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp); + rtnl_unlock(); + + return size; +} + +static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = { + .matches_dump = mlxsw_sp_dpipe_table_adj_matches_dump, + .actions_dump = mlxsw_sp_dpipe_table_adj_actions_dump, + .size_get = mlxsw_sp_dpipe_table_adj_size_get, +}; + +static int mlxsw_sp_dpipe_adj_table_init(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + return devlink_dpipe_table_register(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ, + &mlxsw_sp_dpipe_table_adj_ops, + mlxsw_sp, false); +} + +static void mlxsw_sp_dpipe_adj_table_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_dpipe_table_unregister(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ); +} + int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp) { struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); @@ -861,8 +952,14 @@ int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp) err = mlxsw_sp_dpipe_host6_table_init(mlxsw_sp); if (err) goto err_host6_table_init; - return 0; + err = mlxsw_sp_dpipe_adj_table_init(mlxsw_sp); + if (err) + goto err_adj_table_init; + + return 0; +err_adj_table_init: + mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp); err_host6_table_init: mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp); 
err_host4_table_init: @@ -876,6 +973,7 @@ void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp) { struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + mlxsw_sp_dpipe_adj_table_fini(mlxsw_sp); mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp); mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp); mlxsw_sp_dpipe_erif_table_fini(mlxsw_sp); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h index 283fde4e6783..815d543cf114 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h @@ -56,5 +56,6 @@ static inline void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp) #define MLXSW_SP_DPIPE_TABLE_NAME_ERIF "mlxsw_erif" #define MLXSW_SP_DPIPE_TABLE_NAME_HOST4 "mlxsw_host4" #define MLXSW_SP_DPIPE_TABLE_NAME_HOST6 "mlxsw_host6" +#define MLXSW_SP_DPIPE_TABLE_NAME_ADJ "mlxsw_adj" #endif /* _MLXSW_PIPELINE_H_*/ From f4de25fb530c936af7c3d9a158a7dde86adb2848 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:27 +0200 Subject: [PATCH 0893/1322] mlxsw: reg: Add support for counters on RATR In order to add the ability for setting counters on nexthops the RATR register should be extended. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 44 ++++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 17eba19100de..d44e673a4c4e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -4549,6 +4549,27 @@ MLXSW_ITEM32(reg, ratr, ipip_ipv4_udip, 0x18, 0, 32); */ MLXSW_ITEM32(reg, ratr, ipip_ipv6_ptr, 0x1C, 0, 24); +enum mlxsw_reg_flow_counter_set_type { + /* No count */ + MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT = 0x00, + /* Count packets and bytes */ + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03, + /* Count only packets */ + MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS = 0x05, +}; + +/* reg_ratr_counter_set_type + * Counter set type for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, counter_set_type, 0x28, 24, 8); + +/* reg_ratr_counter_index + * Counter index for flow counters + * Access: RW + */ +MLXSW_ITEM32(reg, ratr, counter_index, 0x28, 0, 24); + static inline void mlxsw_reg_ratr_pack(char *payload, enum mlxsw_reg_ratr_op op, bool valid, @@ -4576,6 +4597,20 @@ static inline void mlxsw_reg_ratr_ipip4_entry_pack(char *payload, u32 ipv4_udip) mlxsw_reg_ratr_ipip_ipv4_udip_set(payload, ipv4_udip); } +static inline void mlxsw_reg_ratr_counter_pack(char *payload, u64 counter_index, + bool counter_enable) +{ + enum mlxsw_reg_flow_counter_set_type set_type; + + if (counter_enable) + set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES; + else + set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT; + + mlxsw_reg_ratr_counter_index_set(payload, counter_index); + mlxsw_reg_ratr_counter_set_type_set(payload, set_type); +} + /* RICNT - Router Interface Counter Register * ----------------------------------------- * The RICNT register retrieves per port performance counters @@ -5297,15 +5332,6 @@ enum mlxsw_reg_rauht_trap_id { */ MLXSW_ITEM32(reg, rauht, trap_id, 0x60, 0, 9); -enum mlxsw_reg_flow_counter_set_type { - /* No count */ - MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT = 0x00, - /* Count packets and bytes */ - MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03, - /* Count only packets */ - 
MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS = 0x05, -}; - /* reg_rauht_counter_set_type * Counter set type for flow counters * Access: RW From a5390278a5eb573b76d2d28ce576b6b62c2200be Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:28 +0200 Subject: [PATCH 0894/1322] mlxsw: spectrum: Add support for setting counters on nexthops Add support for setting counters on nexthops based on dpipe's adjacency table counter status. This patch also adds the ability for getting the counter value, which will be used by the dpipe adjacency table dump implementation in the next patches. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 52 +++++++++++++++++-- .../ethernet/mellanox/mlxsw/spectrum_router.h | 2 + 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index c062b4f666e3..a75064a8ba80 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2052,6 +2052,8 @@ struct mlxsw_sp_nexthop { struct mlxsw_sp_neigh_entry *neigh_entry; struct mlxsw_sp_ipip_entry *ipip_entry; }; + unsigned int counter_index; + bool counter_valid; }; struct mlxsw_sp_nexthop_group { @@ -2068,6 +2070,41 @@ struct mlxsw_sp_nexthop_group { #define nh_rif nexthops[0].rif }; +static void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + struct devlink *devlink; + + devlink = priv_to_devlink(mlxsw_sp->core); + if (!devlink_dpipe_table_counter_enabled(devlink, + MLXSW_SP_DPIPE_TABLE_NAME_ADJ)) + return; + + if (mlxsw_sp_flow_counter_alloc(mlxsw_sp, &nh->counter_index)) + return; + + nh->counter_valid = true; +} + +static void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) +{ + if (!nh->counter_valid) + return; + mlxsw_sp_flow_counter_free(mlxsw_sp, nh->counter_index); + nh->counter_valid = false; +} + +int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, u64 *p_counter) +{ + if (!nh->counter_valid) + return -EINVAL; + + return mlxsw_sp_flow_counter_get(mlxsw_sp, nh->counter_index, + p_counter, NULL); +} + struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router, struct mlxsw_sp_nexthop *nh) { @@ -2396,8 +2433,8 @@ static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp, return 0; } -static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, - struct mlxsw_sp_nexthop *nh) +static int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh) { struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; char ratr_pl[MLXSW_REG_RATR_LEN]; @@ -2406,6 +2443,11 @@ static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, true, MLXSW_REG_RATR_TYPE_ETHERNET, adj_index, neigh_entry->rif); mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha); + if (nh->counter_valid) + mlxsw_reg_ratr_counter_pack(ratr_pl, nh->counter_index, true); + else + mlxsw_reg_ratr_counter_pack(ratr_pl, 0, false); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl); } @@ -2440,7 +2482,7 @@ mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp, if (nh->update || reallocate) { switch (nh->type) { case MLXSW_SP_NEXTHOP_TYPE_ETH: - err = mlxsw_sp_nexthop_mac_update + err = mlxsw_sp_nexthop_update (mlxsw_sp, 
adj_index, nh); break; case MLXSW_SP_NEXTHOP_TYPE_IPIP: @@ -2857,6 +2899,7 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, if (err) return err; + mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); if (!dev) @@ -2883,6 +2926,7 @@ static void mlxsw_sp_nexthop4_fini(struct mlxsw_sp *mlxsw_sp, { mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh); list_del(&nh->router_list_node); + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); mlxsw_sp_nexthop_remove(mlxsw_sp, nh); } @@ -4120,6 +4164,7 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, nh->nh_grp = nh_grp; memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr)); + mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list); @@ -4135,6 +4180,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, { mlxsw_sp_nexthop6_type_fini(mlxsw_sp, nh); list_del(&nh->router_list_node); + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); } static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index d6951d516cf4..a6e86590939f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -120,5 +120,7 @@ bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh); #define mlxsw_sp_nexthop_for_each(nh, router) \ for (nh = mlxsw_sp_nexthop_next(router, NULL); nh; \ nh = mlxsw_sp_nexthop_next(router, nh)) +int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, u64 *p_counter); #endif /* _MLXSW_ROUTER_H_*/ From 190d38a52a73ef8ac05c1931dda730e0f9b79095 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:29 +0200 Subject: [PATCH 0895/1322] mlxsw: spectrum_dpipe: Add support for adjacency table dump Add support for adjacency table dump. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlxsw/spectrum_dpipe.c | 238 ++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index ca16f8924c0a..e6755a96b269 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -895,6 +895,243 @@ static u64 mlxsw_sp_dpipe_table_adj_size(struct mlxsw_sp *mlxsw_sp) return size; } +enum mlxsw_sp_dpipe_table_adj_match { + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX, + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX, + MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT, +}; + +enum mlxsw_sp_dpipe_table_adj_action { + MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC, + MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT, + MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT, +}; + +static void +mlxsw_sp_dpipe_table_adj_match_action_prepare(struct devlink_dpipe_match *matches, + struct devlink_dpipe_action *actions) +{ + struct devlink_dpipe_action *action; + struct devlink_dpipe_match *match; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT; + match->header = &mlxsw_sp_dpipe_header_metadata; + match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action->header = &devlink_dpipe_header_ethernet; + action->field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY; + action->header = &mlxsw_sp_dpipe_header_metadata; + action->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT; +} + +static int +mlxsw_sp_dpipe_table_adj_entry_prepare(struct devlink_dpipe_entry *entry, + struct devlink_dpipe_value *match_values, + struct devlink_dpipe_match *matches, + struct devlink_dpipe_value *action_values, + struct devlink_dpipe_action *actions) +{ struct devlink_dpipe_value *action_value; + struct devlink_dpipe_value *match_value; + struct devlink_dpipe_action *action; + struct devlink_dpipe_match *match; + + entry->match_values = match_values; + entry->match_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT; + + entry->action_values = action_values; + entry->action_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + + match_value->match = match; + match_value->value_size = sizeof(u32); + match_value->value = kmalloc(match_value->value_size, GFP_KERNEL); + if (!match_value->value) + return -ENOMEM; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + + action_value->action = action; + action_value->value_size = sizeof(u64); + action_value->value = kmalloc(action_value->value_size, 
GFP_KERNEL); + if (!action_value->value) + return -ENOMEM; + + action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + + action_value->action = action; + action_value->value_size = sizeof(u32); + action_value->value = kmalloc(action_value->value_size, GFP_KERNEL); + if (!action_value->value) + return -ENOMEM; + + return 0; +} + +static void +__mlxsw_sp_dpipe_table_adj_entry_fill(struct devlink_dpipe_entry *entry, + u32 adj_index, u32 adj_hash_index, + unsigned char *ha, + struct mlxsw_sp_rif *rif) +{ + struct devlink_dpipe_value *value; + u32 *p_rif_value; + u32 *p_index; + + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX]; + p_index = value->value; + *p_index = adj_index; + + value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX]; + p_index = value->value; + *p_index = adj_hash_index; + + value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC]; + ether_addr_copy(value->value, ha); + + value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT]; + p_rif_value = value->value; + *p_rif_value = mlxsw_sp_rif_index(rif); + value->mapping_value = mlxsw_sp_rif_dev_ifindex(rif); + value->mapping_valid = true; +} + +static void mlxsw_sp_dpipe_table_adj_entry_fill(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh, + struct devlink_dpipe_entry *entry) +{ + struct mlxsw_sp_rif *rif = mlxsw_sp_nexthop_rif(nh); + unsigned char *ha = mlxsw_sp_nexthop_ha(nh); + u32 adj_hash_index = 0; + u32 adj_index = 0; + int err; + + mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_hash_index); + __mlxsw_sp_dpipe_table_adj_entry_fill(entry, adj_index, + adj_hash_index, ha, rif); + err = mlxsw_sp_nexthop_counter_get(mlxsw_sp, nh, &entry->counter); + if (!err) + entry->counter_valid = true; +} + +static int +mlxsw_sp_dpipe_table_adj_entries_get(struct mlxsw_sp *mlxsw_sp, + struct devlink_dpipe_entry *entry, + bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct mlxsw_sp_nexthop *nh; + int entry_index = 0; + int nh_count_max; + int nh_count = 0; + int nh_skip; + int j; + int err; + + rtnl_lock(); + nh_count_max = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp); +start_again: + err = devlink_dpipe_entry_ctx_prepare(dump_ctx); + if (err) + goto err_ctx_prepare; + j = 0; + nh_skip = nh_count; + mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) { + if (!mlxsw_sp_nexthop_offload(nh) || + mlxsw_sp_nexthop_group_has_ipip(nh)) + continue; + + if (nh_count < nh_skip) + goto skip; + + mlxsw_sp_dpipe_table_adj_entry_fill(mlxsw_sp, nh, entry); + entry->index = entry_index; + err = devlink_dpipe_entry_ctx_append(dump_ctx, entry); + if (err) { + if (err == -EMSGSIZE) { + if (!j) + goto err_entry_append; + break; + } + goto err_entry_append; + } + entry_index++; + j++; +skip: + nh_count++; + } + + devlink_dpipe_entry_ctx_close(dump_ctx); + if (nh_count != nh_count_max) + goto start_again; + rtnl_unlock(); + + return 0; + +err_ctx_prepare: +err_entry_append: + rtnl_unlock(); + return err; +} + +static int +mlxsw_sp_dpipe_table_adj_entries_dump(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink_dpipe_value action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT]; + struct devlink_dpipe_value match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT]; + struct devlink_dpipe_action actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT]; + struct devlink_dpipe_match matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT]; + struct devlink_dpipe_entry entry = 
{0}; + struct mlxsw_sp *mlxsw_sp = priv; + int err; + + memset(matches, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT * + sizeof(matches[0])); + memset(match_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT * + sizeof(match_values[0])); + memset(actions, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT * + sizeof(actions[0])); + memset(action_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT * + sizeof(action_values[0])); + + mlxsw_sp_dpipe_table_adj_match_action_prepare(matches, actions); + err = mlxsw_sp_dpipe_table_adj_entry_prepare(&entry, + match_values, matches, + action_values, actions); + if (err) + goto out; + + err = mlxsw_sp_dpipe_table_adj_entries_get(mlxsw_sp, &entry, + counters_enabled, dump_ctx); +out: + devlink_dpipe_entry_clear(&entry); + return err; +} + static u64 mlxsw_sp_dpipe_table_adj_size_get(void *priv) { @@ -911,6 +1148,7 @@ mlxsw_sp_dpipe_table_adj_size_get(void *priv) static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = { .matches_dump = mlxsw_sp_dpipe_table_adj_matches_dump, .actions_dump = mlxsw_sp_dpipe_table_adj_actions_dump, + .entries_dump = mlxsw_sp_dpipe_table_adj_entries_dump, .size_get = mlxsw_sp_dpipe_table_adj_size_get, }; From 427e652aa34d90960f729c0b902c3c4a8a821b2e Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 25 Sep 2017 10:32:30 +0200 Subject: [PATCH 0896/1322] mlxsw: spectrum_dpipe: Add support for controlling nexthop counters Add support for controlling nexthop counters via dpipe. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_dpipe.c | 24 +++++++++++++++++++ .../ethernet/mellanox/mlxsw/spectrum_router.c | 12 +++++----- .../ethernet/mellanox/mlxsw/spectrum_router.h | 6 +++++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index e6755a96b269..a056f23d3a0e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -1132,6 +1132,29 @@ out: return err; } +static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable) +{ + struct mlxsw_sp *mlxsw_sp = priv; + struct mlxsw_sp_nexthop *nh; + u32 adj_hash_index = 0; + u32 adj_index = 0; + + mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) { + if (!mlxsw_sp_nexthop_offload(nh) || + mlxsw_sp_nexthop_group_has_ipip(nh)) + continue; + + mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_hash_index); + if (enable) + mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh); + else + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); + mlxsw_sp_nexthop_update(mlxsw_sp, + adj_index + adj_hash_index, nh); + } + return 0; +} + static u64 mlxsw_sp_dpipe_table_adj_size_get(void *priv) { @@ -1149,6 +1172,7 @@ static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = { .matches_dump = mlxsw_sp_dpipe_table_adj_matches_dump, .actions_dump = mlxsw_sp_dpipe_table_adj_actions_dump, .entries_dump = mlxsw_sp_dpipe_table_adj_entries_dump, + .counters_set_update = mlxsw_sp_dpipe_table_adj_counters_update, .size_get = mlxsw_sp_dpipe_table_adj_size_get, }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index a75064a8ba80..321f7356073c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2070,8 +2070,8 @@ struct mlxsw_sp_nexthop_group { #define nh_rif nexthops[0].rif }; -static void 
mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_nexthop *nh) +void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) { struct devlink *devlink; @@ -2086,8 +2086,8 @@ static void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, nh->counter_valid = true; } -static void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_nexthop *nh) +void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh) { if (!nh->counter_valid) return; @@ -2433,8 +2433,8 @@ static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp, return 0; } -static int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, - struct mlxsw_sp_nexthop *nh) +int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh) { struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry; char ratr_pl[MLXSW_REG_RATR_LEN]; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index a6e86590939f..3d449180b035 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -122,5 +122,11 @@ bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh); nh = mlxsw_sp_nexthop_next(router, nh)) int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop *nh, u64 *p_counter); +int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index, + struct mlxsw_sp_nexthop *nh); +void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh); +void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_nexthop *nh); #endif /* _MLXSW_ROUTER_H_*/ From f4344e0a48121d07cf8f69415278e84c39dbe9bf Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:31 -0400 Subject: [PATCH 0897/1322] net: dsa: return -ENODEV is there is no slave PHY Instead of returning -EOPNOTSUPP when a slave device has no PHY, directly return -ENODEV as ethtool and phylib do. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bd51ef56ec5b..79c5a0cd9923 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -266,10 +266,10 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) - return phy_mii_ioctl(p->phy, ifr, cmd); + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + return phy_mii_ioctl(p->phy, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, @@ -429,7 +429,7 @@ dsa_slave_get_link_ksettings(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); if (!p->phy) - return -EOPNOTSUPP; + return -ENODEV; phy_ethtool_ksettings_get(p->phy, cmd); @@ -442,10 +442,10 @@ dsa_slave_set_link_ksettings(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) - return phy_ethtool_ksettings_set(p->phy, cmd); + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + return phy_ethtool_ksettings_set(p->phy, cmd); } static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -481,22 +481,22 @@ static int dsa_slave_nway_reset(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) - return genphy_restart_aneg(p->phy); + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + return genphy_restart_aneg(p->phy); } static u32 dsa_slave_get_link(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); - if (p->phy != NULL) { - genphy_update_link(p->phy); - return p->phy->link; - } + if (!p->phy) + return -ENODEV; - return -EOPNOTSUPP; + genphy_update_link(p->phy); + + return p->phy->link; } static int dsa_slave_get_eeprom_len(struct net_device *dev) From 0115dcd1787d2d1c50719fab98d6bcb0c17931a1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:32 -0400 Subject: [PATCH 0898/1322] net: dsa: use slave device phydev There is no need to store a phy_device in dsa_slave_priv since net_device already provides one. Simply s/p->phy/dev->phydev/. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 1 - net/dsa/slave.c | 116 +++++++++++++++++++++------------------------ 2 files changed, 53 insertions(+), 64 deletions(-) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 0298a0f6a349..eccc62776283 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -79,7 +79,6 @@ struct dsa_slave_priv { * The phylib phy_device pointer for the PHY connected * to this port. 
*/ - struct phy_device *phy; phy_interface_t phy_interface; int old_link; int old_pause; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 79c5a0cd9923..4ea1c6eb0da8 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -96,12 +96,12 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - err = dsa_port_enable(dp, p->phy); + err = dsa_port_enable(dp, dev->phydev); if (err) goto clear_promisc; - if (p->phy) - phy_start(p->phy); + if (dev->phydev) + phy_start(dev->phydev); return 0; @@ -124,10 +124,10 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_master_netdev(p); struct dsa_port *dp = p->dp; - if (p->phy) - phy_stop(p->phy); + if (dev->phydev) + phy_stop(dev->phydev); - dsa_port_disable(dp, p->phy); + dsa_port_disable(dp, dev->phydev); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); @@ -264,12 +264,10 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - return phy_mii_ioctl(p->phy, ifr, cmd); + return phy_mii_ioctl(dev->phydev, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, @@ -426,12 +424,10 @@ static int dsa_slave_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - phy_ethtool_ksettings_get(p->phy, cmd); + phy_ethtool_ksettings_get(dev->phydev, cmd); return 0; } @@ -440,12 +436,10 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - return phy_ethtool_ksettings_set(p->phy, cmd); + return phy_ethtool_ksettings_set(dev->phydev, cmd); } static void dsa_slave_get_drvinfo(struct net_device *dev, @@ -479,24 +473,20 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) static int dsa_slave_nway_reset(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - return genphy_restart_aneg(p->phy); + return genphy_restart_aneg(dev->phydev); } static u32 dsa_slave_get_link(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) + if (!dev->phydev) return -ENODEV; - genphy_update_link(p->phy); + genphy_update_link(dev->phydev); - return p->phy->link; + return dev->phydev->link; } static int dsa_slave_get_eeprom_len(struct net_device *dev) @@ -631,7 +621,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -642,12 +632,12 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) return ret; if (e->eee_enabled) { - ret = phy_init_eee(p->phy, 0); + ret = phy_init_eee(dev->phydev, 0); if (ret) return ret; } - return phy_ethtool_set_eee(p->phy, e); + return phy_ethtool_set_eee(dev->phydev, e); } static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) @@ -657,7 +647,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->get_mac_eee) @@ 
-667,7 +657,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) if (ret) return ret; - return phy_ethtool_get_eee(p->phy, e); + return phy_ethtool_get_eee(dev->phydev, e); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -976,26 +966,26 @@ static void dsa_slave_adjust_link(struct net_device *dev) struct dsa_switch *ds = p->dp->ds; unsigned int status_changed = 0; - if (p->old_link != p->phy->link) { + if (p->old_link != dev->phydev->link) { status_changed = 1; - p->old_link = p->phy->link; + p->old_link = dev->phydev->link; } - if (p->old_duplex != p->phy->duplex) { + if (p->old_duplex != dev->phydev->duplex) { status_changed = 1; - p->old_duplex = p->phy->duplex; + p->old_duplex = dev->phydev->duplex; } - if (p->old_pause != p->phy->pause) { + if (p->old_pause != dev->phydev->pause) { status_changed = 1; - p->old_pause = p->phy->pause; + p->old_pause = dev->phydev->pause; } if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, p->dp->index, p->phy); + ds->ops->adjust_link(ds, p->dp->index, dev->phydev); if (status_changed) - phy_print_status(p->phy); + phy_print_status(dev->phydev); } static int dsa_slave_fixed_link_update(struct net_device *dev, @@ -1020,17 +1010,18 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; - p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); - if (!p->phy) { + slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); + if (!slave_dev->phydev) { netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) - p->phy_interface = p->phy->interface; - return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); + p->phy_interface = slave_dev->phydev->interface; + + return phy_connect_direct(slave_dev, slave_dev->phydev, + dsa_slave_adjust_link, p->phy_interface); } static int dsa_slave_phy_setup(struct net_device *slave_dev) @@ -1082,22 +1073,23 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } } else { - p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, - phy_flags, - p->phy_interface); + slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, + phy_flags, + p->phy_interface); } of_node_put(phy_dn); } - if (p->phy && phy_is_fixed) - fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + if (slave_dev->phydev && phy_is_fixed) + fixed_phy_set_link_update(slave_dev->phydev, + dsa_slave_fixed_link_update); /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ - if (!p->phy) { + if (!slave_dev->phydev) { ret = dsa_slave_phy_connect(slave_dev, p->dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", @@ -1108,7 +1100,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) } } - phy_attached_info(p->phy); + phy_attached_info(slave_dev->phydev); return 0; } @@ -1128,12 +1120,12 @@ int dsa_slave_suspend(struct net_device *slave_dev) netif_device_detach(slave_dev); - if (p->phy) { - phy_stop(p->phy); + if (slave_dev->phydev) { + phy_stop(slave_dev->phydev); p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; - phy_suspend(p->phy); + phy_suspend(slave_dev->phydev); } return 0; @@ -1141,13 +1133,11 @@ int dsa_slave_suspend(struct net_device *slave_dev) int dsa_slave_resume(struct net_device *slave_dev) { - struct dsa_slave_priv *p = 
netdev_priv(slave_dev); - netif_device_attach(slave_dev); - if (p->phy) { - phy_resume(p->phy); - phy_start(p->phy); + if (slave_dev->phydev) { + phy_resume(slave_dev->phydev); + phy_start(slave_dev->phydev); } return 0; @@ -1240,8 +1230,8 @@ void dsa_slave_destroy(struct net_device *slave_dev) port_dn = p->dp->dn; netif_carrier_off(slave_dev); - if (p->phy) { - phy_disconnect(p->phy); + if (slave_dev->phydev) { + phy_disconnect(slave_dev->phydev); if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); From 771df31ace8a0264bcc3576d3f02660e3366cd6d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:33 -0400 Subject: [PATCH 0899/1322] net: dsa: use phy_ethtool_get_link_ksettings Use phy_ethtool_get_link_ksettings now that dsa_slave_get_link_ksettings does exactly the same. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4ea1c6eb0da8..bb0f64f47ae7 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -420,17 +420,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) } /* ethtool operations *******************************************************/ -static int -dsa_slave_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *cmd) -{ - if (!dev->phydev) - return -ENODEV; - - phy_ethtool_ksettings_get(dev->phydev, cmd); - - return 0; -} static int dsa_slave_set_link_ksettings(struct net_device *dev, @@ -921,8 +910,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .get_link_ksettings = dsa_slave_get_link_ksettings, .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_link_ksettings = phy_ethtool_get_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, }; From aa62a8ca85cd05b4af4d7be70ecb735f65b0449a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:34 -0400 Subject: [PATCH 0900/1322] net: dsa: use phy_ethtool_set_link_ksettings Use phy_ethtool_set_link_ksettings now that dsa_slave_set_link_ksettings does exactly the same. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bb0f64f47ae7..d2b632cae468 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -421,16 +421,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) /* ethtool operations *******************************************************/ -static int -dsa_slave_set_link_ksettings(struct net_device *dev, - const struct ethtool_link_ksettings *cmd) -{ - if (!dev->phydev) - return -ENODEV; - - return phy_ethtool_ksettings_set(dev->phydev, cmd); -} - static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { @@ -910,8 +900,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .set_link_ksettings = dsa_slave_set_link_ksettings, .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, }; From 69b2c1629649b1e765516c7b452797b9697de9ac Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 17:15:35 -0400 Subject: [PATCH 0901/1322] net: dsa: use phy_ethtool_nway_reset Use phy_ethtool_nway_reset now that dsa_slave_nway_reset does exactly the same. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d2b632cae468..bf8800de13c1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -450,14 +450,6 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) ds->ops->get_regs(ds, p->dp->index, regs, _p); } -static int dsa_slave_nway_reset(struct net_device *dev) -{ - if (!dev->phydev) - return -ENODEV; - - return genphy_restart_aneg(dev->phydev); -} - static u32 dsa_slave_get_link(struct net_device *dev) { if (!dev->phydev) @@ -888,7 +880,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, .get_regs = dsa_slave_get_regs, - .nway_reset = dsa_slave_nway_reset, + .nway_reset = phy_ethtool_nway_reset, .get_link = dsa_slave_get_link, .get_eeprom_len = dsa_slave_get_eeprom_len, .get_eeprom = dsa_slave_get_eeprom, From 2a52a8c6e594cdc562f503492ba89ac7bc0c4074 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Sep 2017 10:58:20 +0200 Subject: [PATCH 0902/1322] mlxsw: spectrum_acl: Propagate errors from mlxsw_afa_block_jump/continue Propagate error instead of doing WARN_ON right away. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/core_acl_flex_actions.c | 14 ++++++++------ .../mellanox/mlxsw/core_acl_flex_actions.h | 4 ++-- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 6 +++--- drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 10 +++++----- .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 6 +++++- .../net/ethernet/mellanox/mlxsw/spectrum_flower.c | 4 +++- 6 files changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c index ab3ffe7a8eda..bc55d0e76705 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c @@ -399,23 +399,25 @@ u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block) } EXPORT_SYMBOL(mlxsw_afa_block_first_set_kvdl_index); -void mlxsw_afa_block_continue(struct mlxsw_afa_block *block) +int mlxsw_afa_block_continue(struct mlxsw_afa_block *block) { - if (WARN_ON(block->finished)) - return; + if (block->finished) + return -EINVAL; mlxsw_afa_set_goto_set(block->cur_set, MLXSW_AFA_SET_GOTO_BINDING_CMD_NONE, 0); block->finished = true; + return 0; } EXPORT_SYMBOL(mlxsw_afa_block_continue); -void mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id) +int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id) { - if (WARN_ON(block->finished)) - return; + if (block->finished) + return -EINVAL; mlxsw_afa_set_goto_set(block->cur_set, MLXSW_AFA_SET_GOTO_BINDING_CMD_JUMP, group_id); block->finished = true; + return 0; } EXPORT_SYMBOL(mlxsw_afa_block_jump); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h index 501819c790d6..06b0be432b8f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h @@ -57,8 +57,8 @@ void mlxsw_afa_block_destroy(struct mlxsw_afa_block *block); int mlxsw_afa_block_commit(struct mlxsw_afa_block *block); char *mlxsw_afa_block_first_set(struct mlxsw_afa_block *block); u32 mlxsw_afa_block_first_set_kvdl_index(struct mlxsw_afa_block *block); -void mlxsw_afa_block_continue(struct mlxsw_afa_block *block); -void mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id); +int mlxsw_afa_block_continue(struct mlxsw_afa_block *block); +int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id); int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block); int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id); int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index e907ec446a73..9355d914a4c8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -468,9 +468,9 @@ void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei, enum mlxsw_afk_element element, const char *key_value, const char *mask_value, unsigned int len); -void mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei); -void mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei, - u16 group_id); +int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei); +int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei, + u16 group_id); int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei); int 
mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei); int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index eede75fbd585..93dcd315f7d6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -378,15 +378,15 @@ void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei, key_value, mask_value, len); } -void mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei) +int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei) { - mlxsw_afa_block_continue(rulei->act_block); + return mlxsw_afa_block_continue(rulei->act_block); } -void mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei, - u16 group_id) +int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei, + u16 group_id) { - mlxsw_afa_block_jump(rulei->act_block, group_id); + return mlxsw_afa_block_jump(rulei->act_block, group_id); } int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 50b40de1fb91..7e8284b46968 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -608,7 +608,10 @@ mlxsw_sp_acl_tcam_region_catchall_add(struct mlxsw_sp *mlxsw_sp, goto err_rulei_create; } - mlxsw_sp_acl_rulei_act_continue(rulei); + err = mlxsw_sp_acl_rulei_act_continue(rulei); + if (WARN_ON(err)) + goto err_rulei_act_continue; + err = mlxsw_sp_acl_rulei_commit(rulei); if (err) goto err_rulei_commit; @@ -623,6 +626,7 @@ mlxsw_sp_acl_tcam_region_catchall_add(struct mlxsw_sp *mlxsw_sp, err_rule_insert: err_rulei_commit: +err_rulei_act_continue: mlxsw_sp_acl_rulei_destroy(rulei); err_rulei_create: parman_item_remove(region->parman, parman_prio, parman_item); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 8aace9a06a5d..f1cedccb58cc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -84,7 +84,9 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return PTR_ERR(ruleset); group_id = mlxsw_sp_acl_ruleset_group_id(ruleset); - mlxsw_sp_acl_rulei_act_jump(rulei, group_id); + err = mlxsw_sp_acl_rulei_act_jump(rulei, group_id); + if (err) + return err; } else if (is_tcf_mirred_egress_redirect(a)) { int ifindex = tcf_mirred_ifindex(a); struct net_device *out_dev; From 3b8e9238a8d194e82f0202a5fb68a63686ebe420 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Sep 2017 10:58:21 +0200 Subject: [PATCH 0903/1322] net: sched: introduce helper to identify gact pass action Introduce a helper called is_tcf_gact_pass which could be used to tell if the action is gact pass or not. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/tc_act/tc_gact.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 41afe1ce7b16..d979a0d48f9e 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -33,6 +33,11 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, return false; } +static inline bool is_tcf_gact_ok(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_OK, false); +} + static inline bool is_tcf_gact_shot(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_SHOT, false); From b2925957ec1a9349c6ac42fc5ac95bcf0dd4a6a0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Sep 2017 10:58:22 +0200 Subject: [PATCH 0904/1322] mlxsw: spectrum_flower: Offload "ok" termination action If action is "gact_ok", offload it to HW. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index f1cedccb58cc..2f0e57857ea4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -63,7 +63,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, tcf_exts_to_list(exts, &actions); list_for_each_entry(a, &actions, list) { - if (is_tcf_gact_shot(a)) { + if (is_tcf_gact_ok(a)) { + err = mlxsw_sp_acl_rulei_act_continue(rulei); + if (err) + return err; + } else if (is_tcf_gact_shot(a)) { err = mlxsw_sp_acl_rulei_act_drop(rulei); if (err) return err; From c2cc187e53011c1c4931055984657da9085c763b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 25 Sep 2017 13:19:26 +0300 Subject: [PATCH 0905/1322] sctp: Fix a big endian bug in sctp_diag_dump() The sctp_for_each_transport() function takes an pointer to int. The cb->args[] array holds longs so it's only using the high 32 bits. It works on little endian system but will break on big endian 64 bit machines. Fixes: d25adbeb0cdb ("sctp: fix an use-after-free issue in sctp_sock_dump") Signed-off-by: Dan Carpenter Acked-by: Neil Horman Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 22ed01a76b19..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -493,7 +494,8 @@ skip: goto done; sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, (int *)&cb->args[2], &commp); + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; From 79ede4ae2d01b0282bfaaf761308a5ac485c8144 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:35 +0200 Subject: [PATCH 0906/1322] nfp: add helper to get flower cmsg length Add a helper function that returns the length of the cmsg data when given the cmsg skb Signed-off-by: John Hurley Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/flower/cmsg.h | 5 +++++ drivers/net/ethernet/netronome/nfp/flower/metadata.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index a2ec60344236..7a5ccf0cc7c2 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -323,6 +323,11 @@ static inline void *nfp_flower_cmsg_get_data(struct sk_buff *skb) return (unsigned char *)skb->data + NFP_FLOWER_CMSG_HLEN; } +static inline int nfp_flower_cmsg_get_data_len(struct sk_buff *skb) +{ + return skb->len - NFP_FLOWER_CMSG_HLEN; +} + struct sk_buff * nfp_flower_cmsg_mac_repr_start(struct nfp_app *app, unsigned int num_ports); void diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c b/drivers/net/ethernet/netronome/nfp/flower/metadata.c index 3226ddc55f99..193520ef23f0 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c +++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c @@ -140,7 +140,7 @@ exit_rcu_unlock: void nfp_flower_rx_flow_stats(struct nfp_app *app, struct sk_buff *skb) { - unsigned int msg_len = skb->len - NFP_FLOWER_CMSG_HLEN; + unsigned int msg_len = nfp_flower_cmsg_get_data_len(skb); struct nfp_fl_stats_frame *stats_frame; unsigned char *msg; int i; From 611aec101ab7c19755e8ea6d480f679aaffed5ad Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:36 +0200 Subject: [PATCH 0907/1322] nfp: compile flower vxlan tunnel metadata match fields Compile ovs-tc flower vxlan metadata match fields for offloading. Only support offload of tunnel data when the VXLAN port specifically matches well known port 4789. Signed-off-by: John Hurley Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- .../net/ethernet/netronome/nfp/flower/cmsg.h | 38 ++++++++++ .../net/ethernet/netronome/nfp/flower/main.h | 2 + .../net/ethernet/netronome/nfp/flower/match.c | 60 ++++++++++++++-- .../ethernet/netronome/nfp/flower/offload.c | 70 ++++++++++++++++--- 4 files changed, 158 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index 7a5ccf0cc7c2..af9165b3b652 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -83,6 +83,14 @@ #define NFP_FL_PUSH_VLAN_CFI BIT(12) #define NFP_FL_PUSH_VLAN_VID GENMASK(11, 0) +/* Tunnel ports */ +#define NFP_FL_PORT_TYPE_TUN 0x50000000 + +enum nfp_flower_tun_type { + NFP_FL_TUNNEL_NONE = 0, + NFP_FL_TUNNEL_VXLAN = 2, +}; + struct nfp_fl_output { __be16 a_op; __be16 flags; @@ -230,6 +238,36 @@ struct nfp_flower_ipv6 { struct in6_addr ipv6_dst; }; +/* Flow Frame VXLAN --> Tunnel details (4W/16B) + * ----------------------------------------------------------------- + * 3 2 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ipv4_addr_src | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ipv4_addr_dst | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | tun_flags | tos | ttl | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | gpe_flags | Reserved | Next Protocol | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VNI | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct nfp_flower_vxlan { + __be32 ip_src; + __be32 ip_dst; + __be16 tun_flags; + u8 tos; + u8 ttl; + u8 gpe_flags; + u8 reserved[2]; + u8 nxt_proto; + __be32 tun_id; +}; + +#define NFP_FL_TUN_VNI_OFFSET 8 + /* The base header for a control message packet. * Defines an 8-bit version, and an 8-bit type, padded * to a 32-bit word. Rest of the packet is type-specific. 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index c20dd00a1cae..cd695eabce02 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -58,6 +58,8 @@ struct nfp_app; #define NFP_FL_MASK_REUSE_TIME_NS 40000 #define NFP_FL_MASK_ID_LOCATION 1 +#define NFP_FL_VXLAN_PORT 4789 + struct nfp_fl_mask_id { struct circ_buf mask_id_free_list; struct timespec64 *last_used; diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index d25b5038c3a2..1fd1bab0611f 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -77,14 +77,17 @@ nfp_flower_compile_meta(struct nfp_flower_meta_one *frame, u8 key_type) static int nfp_flower_compile_port(struct nfp_flower_in_port *frame, u32 cmsg_port, - bool mask_version) + bool mask_version, enum nfp_flower_tun_type tun_type) { if (mask_version) { frame->in_port = cpu_to_be32(~0); return 0; } - frame->in_port = cpu_to_be32(cmsg_port); + if (tun_type) + frame->in_port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type); + else + frame->in_port = cpu_to_be32(cmsg_port); return 0; } @@ -189,15 +192,53 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame, } } +static void +nfp_flower_compile_vxlan(struct nfp_flower_vxlan *frame, + struct tc_cls_flower_offload *flow, + bool mask_version) +{ + struct fl_flow_key *target = mask_version ? flow->mask : flow->key; + struct flow_dissector_key_ipv4_addrs *vxlan_ips; + struct flow_dissector_key_keyid *vni; + + /* Wildcard TOS/TTL/GPE_FLAGS/NXT_PROTO for now. */ + memset(frame, 0, sizeof(struct nfp_flower_vxlan)); + + if (dissector_uses_key(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID)) { + u32 temp_vni; + + vni = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target); + temp_vni = be32_to_cpu(vni->keyid) << NFP_FL_TUN_VNI_OFFSET; + frame->tun_id = cpu_to_be32(temp_vni); + } + + if (dissector_uses_key(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + vxlan_ips = + skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target); + frame->ip_src = vxlan_ips->src; + frame->ip_dst = vxlan_ips->dst; + } +} + int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct nfp_fl_key_ls *key_ls, struct net_device *netdev, struct nfp_fl_payload *nfp_flow) { + enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE; int err; u8 *ext; u8 *msk; + if (key_ls->key_layer & NFP_FLOWER_LAYER_VXLAN) + tun_type = NFP_FL_TUNNEL_VXLAN; + memset(nfp_flow->unmasked_data, 0, key_ls->key_size); memset(nfp_flow->mask_data, 0, key_ls->key_size); @@ -216,14 +257,14 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, /* Populate Exact Port data. */ err = nfp_flower_compile_port((struct nfp_flower_in_port *)ext, nfp_repr_get_port_id(netdev), - false); + false, tun_type); if (err) return err; /* Populate Mask Port Data. */ err = nfp_flower_compile_port((struct nfp_flower_in_port *)msk, nfp_repr_get_port_id(netdev), - true); + true, tun_type); if (err) return err; @@ -291,5 +332,16 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, msk += sizeof(struct nfp_flower_ipv6); } + if (key_ls->key_layer & NFP_FLOWER_LAYER_VXLAN) { + /* Populate Exact VXLAN Data. */ + nfp_flower_compile_vxlan((struct nfp_flower_vxlan *)ext, + flow, false); + /* Populate Mask VXLAN Data. 
*/ + nfp_flower_compile_vxlan((struct nfp_flower_vxlan *)msk, + flow, true); + ext += sizeof(struct nfp_flower_vxlan); + msk += sizeof(struct nfp_flower_vxlan); + } + return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index a18b4d2b1d3e..637372ba8f55 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -52,8 +52,25 @@ BIT(FLOW_DISSECTOR_KEY_PORTS) | \ BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \ BIT(FLOW_DISSECTOR_KEY_VLAN) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \ BIT(FLOW_DISSECTOR_KEY_IP)) +#define NFP_FLOWER_WHITELIST_TUN_DISSECTOR \ + (BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS)) + +#define NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R \ + (BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS)) + static int nfp_flower_xmit_flow(struct net_device *netdev, struct nfp_fl_payload *nfp_flow, u8 mtype) @@ -125,15 +142,58 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, if (flow->dissector->used_keys & ~NFP_FLOWER_WHITELIST_DISSECTOR) return -EOPNOTSUPP; + /* If any tun dissector is used then the required set must be used. */ + if (flow->dissector->used_keys & NFP_FLOWER_WHITELIST_TUN_DISSECTOR && + (flow->dissector->used_keys & NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R) + != NFP_FLOWER_WHITELIST_TUN_DISSECTOR_R) + return -EOPNOTSUPP; + + key_layer_two = 0; + key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC; + key_size = sizeof(struct nfp_flower_meta_one) + + sizeof(struct nfp_flower_in_port) + + sizeof(struct nfp_flower_mac_mpls); + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_ipv4_addrs *mask_ipv4 = NULL; + struct flow_dissector_key_ports *mask_enc_ports = NULL; + struct flow_dissector_key_ports *enc_ports = NULL; struct flow_dissector_key_control *mask_enc_ctl = skb_flow_dissector_target(flow->dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, flow->mask); - /* We are expecting a tunnel. For now we ignore offloading. */ - if (mask_enc_ctl->addr_type) + struct flow_dissector_key_control *enc_ctl = + skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + flow->key); + if (mask_enc_ctl->addr_type != 0xffff || + enc_ctl->addr_type != FLOW_DISSECTOR_KEY_IPV4_ADDRS) return -EOPNOTSUPP; + + /* These fields are already verified as used. 
*/ + mask_ipv4 = + skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + flow->mask); + if (mask_ipv4->dst != cpu_to_be32(~0)) + return -EOPNOTSUPP; + + mask_enc_ports = + skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + flow->mask); + enc_ports = + skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + flow->key); + + if (mask_enc_ports->dst != cpu_to_be16(~0) || + enc_ports->dst != htons(NFP_FL_VXLAN_PORT)) + return -EOPNOTSUPP; + + key_layer |= NFP_FLOWER_LAYER_VXLAN; + key_size += sizeof(struct nfp_flower_vxlan); } if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) { @@ -151,12 +211,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, FLOW_DISSECTOR_KEY_IP, flow->mask); - key_layer_two = 0; - key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC; - key_size = sizeof(struct nfp_flower_meta_one) + - sizeof(struct nfp_flower_in_port) + - sizeof(struct nfp_flower_mac_mpls); - if (mask_basic && mask_basic->n_proto) { /* Ethernet type is present in the key. */ switch (key_basic->n_proto) { From b27d6a95a70de551df828de2b658efd949a9864e Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:37 +0200 Subject: [PATCH 0908/1322] nfp: compile flower vxlan tunnel set actions Compile set tunnel actions for tc flower. Only support VXLAN and ensure a tunnel destination port of 4789 is used. Signed-off-by: John Hurley Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/flower/action.c | 171 ++++++++++++++++-- .../net/ethernet/netronome/nfp/flower/cmsg.h | 31 +++- 2 files changed, 180 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index db9750695dc7..38f3835ae176 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "cmsg.h" #include "main.h" @@ -80,14 +81,27 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan, push_vlan->vlan_tci = cpu_to_be16(tmp_push_vlan_tci); } +static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev, + enum nfp_flower_tun_type tun_type) +{ + if (!out_dev->rtnl_link_ops) + return false; + + if (!strcmp(out_dev->rtnl_link_ops->kind, "vxlan")) + return tun_type == NFP_FL_TUNNEL_VXLAN; + + return false; +} + static int nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action, struct nfp_fl_payload *nfp_flow, bool last, - struct net_device *in_dev) + struct net_device *in_dev, enum nfp_flower_tun_type tun_type, + int *tun_out_cnt) { size_t act_size = sizeof(struct nfp_fl_output); + u16 tmp_output_op, tmp_flags; struct net_device *out_dev; - u16 tmp_output_op; int ifindex; /* Set action opcode to output action. */ @@ -97,34 +111,126 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action, output->a_op = cpu_to_be16(tmp_output_op); - /* Set action output parameters. */ - output->flags = cpu_to_be16(last ? NFP_FL_OUT_FLAGS_LAST : 0); - ifindex = tcf_mirred_ifindex(action); out_dev = __dev_get_by_index(dev_net(in_dev), ifindex); if (!out_dev) return -EOPNOTSUPP; - /* Only offload egress ports are on the same device as the ingress - * port. - */ - if (!switchdev_port_same_parent_id(in_dev, out_dev)) - return -EOPNOTSUPP; + tmp_flags = last ? 
NFP_FL_OUT_FLAGS_LAST : 0; - output->port = cpu_to_be32(nfp_repr_get_port_id(out_dev)); - if (!output->port) - return -EOPNOTSUPP; + if (tun_type) { + /* Verify the egress netdev matches the tunnel type. */ + if (!nfp_fl_netdev_is_tunnel_type(out_dev, tun_type)) + return -EOPNOTSUPP; + if (*tun_out_cnt) + return -EOPNOTSUPP; + (*tun_out_cnt)++; + + output->flags = cpu_to_be16(tmp_flags | + NFP_FL_OUT_FLAGS_USE_TUN); + output->port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type); + } else { + /* Set action output parameters. */ + output->flags = cpu_to_be16(tmp_flags); + + /* Only offload if egress ports are on the same device as the + * ingress port. + */ + if (!switchdev_port_same_parent_id(in_dev, out_dev)) + return -EOPNOTSUPP; + + output->port = cpu_to_be32(nfp_repr_get_port_id(out_dev)); + if (!output->port) + return -EOPNOTSUPP; + } nfp_flow->meta.shortcut = output->port; return 0; } +static bool nfp_fl_supported_tun_port(const struct tc_action *action) +{ + struct ip_tunnel_info *tun = tcf_tunnel_info(action); + + return tun->key.tp_dst == htons(NFP_FL_VXLAN_PORT); +} + +static struct nfp_fl_pre_tunnel *nfp_fl_pre_tunnel(char *act_data, int act_len) +{ + size_t act_size = sizeof(struct nfp_fl_pre_tunnel); + struct nfp_fl_pre_tunnel *pre_tun_act; + u16 tmp_pre_tun_op; + + /* Pre_tunnel action must be first on action list. + * If other actions already exist they need pushed forward. + */ + if (act_len) + memmove(act_data + act_size, act_data, act_len); + + pre_tun_act = (struct nfp_fl_pre_tunnel *)act_data; + + memset(pre_tun_act, 0, act_size); + + tmp_pre_tun_op = + FIELD_PREP(NFP_FL_ACT_LEN_LW, act_size >> NFP_FL_LW_SIZ) | + FIELD_PREP(NFP_FL_ACT_JMP_ID, NFP_FL_ACTION_OPCODE_PRE_TUNNEL); + + pre_tun_act->a_op = cpu_to_be16(tmp_pre_tun_op); + + return pre_tun_act; +} + +static int +nfp_fl_set_vxlan(struct nfp_fl_set_vxlan *set_vxlan, + const struct tc_action *action, + struct nfp_fl_pre_tunnel *pre_tun) +{ + struct ip_tunnel_info *vxlan = tcf_tunnel_info(action); + size_t act_size = sizeof(struct nfp_fl_set_vxlan); + u32 tmp_set_vxlan_type_index = 0; + u16 tmp_set_vxlan_op; + /* Currently support one pre-tunnel so index is always 0. */ + int pretun_idx = 0; + + if (vxlan->options_len) { + /* Do not support options e.g. vxlan gpe. */ + return -EOPNOTSUPP; + } + + tmp_set_vxlan_op = + FIELD_PREP(NFP_FL_ACT_LEN_LW, act_size >> NFP_FL_LW_SIZ) | + FIELD_PREP(NFP_FL_ACT_JMP_ID, + NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL); + + set_vxlan->a_op = cpu_to_be16(tmp_set_vxlan_op); + + /* Set tunnel type and pre-tunnel index. */ + tmp_set_vxlan_type_index |= + FIELD_PREP(NFP_FL_IPV4_TUNNEL_TYPE, NFP_FL_TUNNEL_VXLAN) | + FIELD_PREP(NFP_FL_IPV4_PRE_TUN_INDEX, pretun_idx); + + set_vxlan->tun_type_index = cpu_to_be32(tmp_set_vxlan_type_index); + + set_vxlan->tun_id = vxlan->key.tun_id; + set_vxlan->tun_flags = vxlan->key.tun_flags; + set_vxlan->ipv4_ttl = vxlan->key.ttl; + set_vxlan->ipv4_tos = vxlan->key.tos; + + /* Complete pre_tunnel action. 
*/ + pre_tun->ipv4_dst = vxlan->key.u.ipv4.dst; + + return 0; +} + static int nfp_flower_loop_action(const struct tc_action *a, struct nfp_fl_payload *nfp_fl, int *a_len, - struct net_device *netdev) + struct net_device *netdev, + enum nfp_flower_tun_type *tun_type, int *tun_out_cnt) { + struct nfp_fl_pre_tunnel *pre_tun; + struct nfp_fl_set_vxlan *s_vxl; struct nfp_fl_push_vlan *psh_v; struct nfp_fl_pop_vlan *pop_v; struct nfp_fl_output *output; @@ -137,7 +243,8 @@ nfp_flower_loop_action(const struct tc_action *a, return -EOPNOTSUPP; output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len]; - err = nfp_fl_output(output, a, nfp_fl, true, netdev); + err = nfp_fl_output(output, a, nfp_fl, true, netdev, *tun_type, + tun_out_cnt); if (err) return err; @@ -147,7 +254,8 @@ nfp_flower_loop_action(const struct tc_action *a, return -EOPNOTSUPP; output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len]; - err = nfp_fl_output(output, a, nfp_fl, false, netdev); + err = nfp_fl_output(output, a, nfp_fl, false, netdev, *tun_type, + tun_out_cnt); if (err) return err; @@ -170,6 +278,29 @@ nfp_flower_loop_action(const struct tc_action *a, nfp_fl_push_vlan(psh_v, a); *a_len += sizeof(struct nfp_fl_push_vlan); + } else if (is_tcf_tunnel_set(a) && nfp_fl_supported_tun_port(a)) { + /* Pre-tunnel action is required for tunnel encap. + * This checks for next hop entries on NFP. + * If none, the packet falls back before applying other actions. + */ + if (*a_len + sizeof(struct nfp_fl_pre_tunnel) + + sizeof(struct nfp_fl_set_vxlan) > NFP_FL_MAX_A_SIZ) + return -EOPNOTSUPP; + + *tun_type = NFP_FL_TUNNEL_VXLAN; + pre_tun = nfp_fl_pre_tunnel(nfp_fl->action_data, *a_len); + nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL); + *a_len += sizeof(struct nfp_fl_pre_tunnel); + + s_vxl = (struct nfp_fl_set_vxlan *)&nfp_fl->action_data[*a_len]; + err = nfp_fl_set_vxlan(s_vxl, a, pre_tun); + if (err) + return err; + + *a_len += sizeof(struct nfp_fl_set_vxlan); + } else if (is_tcf_tunnel_release(a)) { + /* Tunnel decap is handled by default so accept action. */ + return 0; } else { /* Currently we do not handle any other actions. 
*/ return -EOPNOTSUPP; @@ -182,18 +313,22 @@ int nfp_flower_compile_action(struct tc_cls_flower_offload *flow, struct net_device *netdev, struct nfp_fl_payload *nfp_flow) { - int act_len, act_cnt, err; + int act_len, act_cnt, err, tun_out_cnt; + enum nfp_flower_tun_type tun_type; const struct tc_action *a; LIST_HEAD(actions); memset(nfp_flow->action_data, 0, NFP_FL_MAX_A_SIZ); nfp_flow->meta.act_len = 0; + tun_type = NFP_FL_TUNNEL_NONE; act_len = 0; act_cnt = 0; + tun_out_cnt = 0; tcf_exts_to_list(flow->exts, &actions); list_for_each_entry(a, &actions, list) { - err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev); + err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev, + &tun_type, &tun_out_cnt); if (err) return err; act_cnt++; diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index af9165b3b652..ff42ce8a1e9c 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -67,10 +67,12 @@ #define NFP_FL_LW_SIZ 2 /* Action opcodes */ -#define NFP_FL_ACTION_OPCODE_OUTPUT 0 -#define NFP_FL_ACTION_OPCODE_PUSH_VLAN 1 -#define NFP_FL_ACTION_OPCODE_POP_VLAN 2 -#define NFP_FL_ACTION_OPCODE_NUM 32 +#define NFP_FL_ACTION_OPCODE_OUTPUT 0 +#define NFP_FL_ACTION_OPCODE_PUSH_VLAN 1 +#define NFP_FL_ACTION_OPCODE_POP_VLAN 2 +#define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL 6 +#define NFP_FL_ACTION_OPCODE_PRE_TUNNEL 17 +#define NFP_FL_ACTION_OPCODE_NUM 32 #define NFP_FL_ACT_JMP_ID GENMASK(15, 8) #define NFP_FL_ACT_LEN_LW GENMASK(7, 0) @@ -85,6 +87,8 @@ /* Tunnel ports */ #define NFP_FL_PORT_TYPE_TUN 0x50000000 +#define NFP_FL_IPV4_TUNNEL_TYPE GENMASK(7, 4) +#define NFP_FL_IPV4_PRE_TUN_INDEX GENMASK(2, 0) enum nfp_flower_tun_type { NFP_FL_TUNNEL_NONE = 0, @@ -123,6 +127,25 @@ struct nfp_flower_meta_one { u16 reserved; }; +struct nfp_fl_pre_tunnel { + __be16 a_op; + __be16 reserved; + __be32 ipv4_dst; + /* reserved for use with IPv6 addresses */ + __be32 extra[3]; +}; + +struct nfp_fl_set_vxlan { + __be16 a_op; + __be16 reserved; + __be64 tun_id; + __be32 tun_type_index; + __be16 tun_flags; + u8 ipv4_ttl; + u8 ipv4_tos; + __be32 extra[2]; +} __packed; + /* Metadata with L2 (1W/4B) * ---------------------------------------------------------------- * 3 2 1 From fd0dd1ab1e107369c950796bb9b0e8eab6134bf1 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:38 +0200 Subject: [PATCH 0909/1322] nfp: offload flower vxlan endpoint MAC addresses Generate a list of MAC addresses of netdevs that could be used as VXLAN tunnel end points. Give offloaded MACs an index for storage on the NFP in the ranges: 0x100-0x1ff physical port representors 0x200-0x2ff VF port representors 0x300-0x3ff other offloads (e.g. vxlan netdevs, ovs bridges) Assign phys and vf indexes based on unique 8 bit values in the port num. Maintain list of other netdevs to ensure same netdev is not offloaded twice and each gets a unique ID without exhausting the entries. Because the IDs are unique but constant for a netdev, any changes are implemented by overwriting the index on NFP. Signed-off-by: John Hurley Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/Makefile | 3 +- .../net/ethernet/netronome/nfp/flower/cmsg.c | 7 - .../net/ethernet/netronome/nfp/flower/cmsg.h | 9 + .../net/ethernet/netronome/nfp/flower/main.c | 13 + .../net/ethernet/netronome/nfp/flower/main.h | 18 + .../net/ethernet/netronome/nfp/flower/match.c | 7 + .../netronome/nfp/flower/tunnel_conf.c | 374 ++++++++++++++++++ 7 files changed, 423 insertions(+), 8 deletions(-) create mode 100644 drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index 96e579a15cbe..becaacf1554d 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -37,7 +37,8 @@ nfp-objs += \ flower/main.o \ flower/match.o \ flower/metadata.o \ - flower/offload.o + flower/offload.o \ + flower/tunnel_conf.o endif ifeq ($(CONFIG_BPF_SYSCALL),y) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c index c3ca05d10fe1..b756006dba6f 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -38,17 +38,10 @@ #include #include "main.h" -#include "../nfpcore/nfp_cpp.h" #include "../nfp_net.h" #include "../nfp_net_repr.h" #include "./cmsg.h" -#define nfp_flower_cmsg_warn(app, fmt, args...) \ - do { \ - if (net_ratelimit()) \ - nfp_warn((app)->cpp, fmt, ## args); \ - } while (0) - static struct nfp_flower_cmsg_hdr * nfp_flower_cmsg_get_hdr(struct sk_buff *skb) { diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index ff42ce8a1e9c..dc248193c996 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -39,6 +39,7 @@ #include #include "../nfp_app.h" +#include "../nfpcore/nfp_cpp.h" #define NFP_FLOWER_LAYER_META BIT(0) #define NFP_FLOWER_LAYER_PORT BIT(1) @@ -90,6 +91,12 @@ #define NFP_FL_IPV4_TUNNEL_TYPE GENMASK(7, 4) #define NFP_FL_IPV4_PRE_TUN_INDEX GENMASK(2, 0) +#define nfp_flower_cmsg_warn(app, fmt, args...) 
\ + do { \ + if (net_ratelimit()) \ + nfp_warn((app)->cpp, fmt, ## args); \ + } while (0) + enum nfp_flower_tun_type { NFP_FL_TUNNEL_NONE = 0, NFP_FL_TUNNEL_VXLAN = 2, @@ -310,6 +317,7 @@ enum nfp_flower_cmsg_type_port { NFP_FLOWER_CMSG_TYPE_FLOW_DEL = 2, NFP_FLOWER_CMSG_TYPE_MAC_REPR = 7, NFP_FLOWER_CMSG_TYPE_PORT_MOD = 8, + NFP_FLOWER_CMSG_TYPE_TUN_MAC = 11, NFP_FLOWER_CMSG_TYPE_FLOW_STATS = 15, NFP_FLOWER_CMSG_TYPE_PORT_ECHO = 16, NFP_FLOWER_CMSG_TYPE_MAX = 32, @@ -343,6 +351,7 @@ enum nfp_flower_cmsg_port_type { NFP_FLOWER_CMSG_PORT_TYPE_UNSPEC = 0x0, NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT = 0x1, NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT = 0x2, + NFP_FLOWER_CMSG_PORT_TYPE_OTHER_PORT = 0x3, }; enum nfp_flower_cmsg_port_vnic_type { diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 91fe03617106..e46e7c60d491 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -436,6 +436,16 @@ static void nfp_flower_clean(struct nfp_app *app) app->priv = NULL; } +static int nfp_flower_start(struct nfp_app *app) +{ + return nfp_tunnel_config_start(app); +} + +static void nfp_flower_stop(struct nfp_app *app) +{ + nfp_tunnel_config_stop(app); +} + const struct nfp_app_type app_flower = { .id = NFP_APP_FLOWER_NIC, .name = "flower", @@ -453,6 +463,9 @@ const struct nfp_app_type app_flower = { .repr_open = nfp_flower_repr_netdev_open, .repr_stop = nfp_flower_repr_netdev_stop, + .start = nfp_flower_start, + .stop = nfp_flower_stop, + .ctrl_msg_rx = nfp_flower_cmsg_rx, .sriov_enable = nfp_flower_sriov_enable, diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index cd695eabce02..9de375acc254 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -84,6 +84,13 @@ struct nfp_fl_stats_id { * @flow_table: Hash table used to store flower rules * @cmsg_work: Workqueue for control messages processing * @cmsg_skbs: List of skbs for control message processing + * @nfp_mac_off_list: List of MAC addresses to offload + * @nfp_mac_index_list: List of unique 8-bit indexes for non NFP netdevs + * @nfp_mac_off_lock: Lock for the MAC address list + * @nfp_mac_index_lock: Lock for the MAC index list + * @nfp_mac_off_ids: IDA to manage id assignment for offloaded macs + * @nfp_mac_off_count: Number of MACs in address list + * @nfp_tun_mac_nb: Notifier to monitor link state */ struct nfp_flower_priv { struct nfp_app *app; @@ -96,6 +103,13 @@ struct nfp_flower_priv { DECLARE_HASHTABLE(flow_table, NFP_FLOWER_HASH_BITS); struct work_struct cmsg_work; struct sk_buff_head cmsg_skbs; + struct list_head nfp_mac_off_list; + struct list_head nfp_mac_index_list; + struct mutex nfp_mac_off_lock; + struct mutex nfp_mac_index_lock; + struct ida nfp_mac_off_ids; + int nfp_mac_off_count; + struct notifier_block nfp_tun_mac_nb; }; struct nfp_fl_key_ls { @@ -165,4 +179,8 @@ nfp_flower_remove_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie); void nfp_flower_rx_flow_stats(struct nfp_app *app, struct sk_buff *skb); +int nfp_tunnel_config_start(struct nfp_app *app); +void nfp_tunnel_config_stop(struct nfp_app *app); +void nfp_tunnel_write_macs(struct nfp_app *app); + #endif diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index 1fd1bab0611f..cb3ff6c126e8 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ 
b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -232,6 +232,7 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct nfp_fl_payload *nfp_flow) { enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE; + struct nfp_repr *netdev_repr; int err; u8 *ext; u8 *msk; @@ -341,6 +342,12 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, flow, true); ext += sizeof(struct nfp_flower_vxlan); msk += sizeof(struct nfp_flower_vxlan); + + /* Configure tunnel end point MAC. */ + if (nfp_netdev_is_nfp_repr(netdev)) { + netdev_repr = netdev_priv(netdev); + nfp_tunnel_write_macs(netdev_repr->app); + } } return 0; diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c new file mode 100644 index 000000000000..34be85803020 --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -0,0 +1,374 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "cmsg.h" +#include "main.h" +#include "../nfp_net_repr.h" +#include "../nfp_net.h" + +/** + * struct nfp_tun_mac_addr - configure MAC address of tunnel EP on NFP + * @reserved: reserved for future use + * @count: number of MAC addresses in the message + * @index: index of MAC address in the lookup table + * @addr: interface MAC address + * @addresses: series of MACs to offload + */ +struct nfp_tun_mac_addr { + __be16 reserved; + __be16 count; + struct index_mac_addr { + __be16 index; + u8 addr[ETH_ALEN]; + } addresses[]; +}; + +/** + * struct nfp_tun_mac_offload_entry - list of MACs to offload + * @index: index of MAC address for offloading + * @addr: interface MAC address + * @list: list pointer + */ +struct nfp_tun_mac_offload_entry { + __be16 index; + u8 addr[ETH_ALEN]; + struct list_head list; +}; + +#define NFP_MAX_MAC_INDEX 0xff + +/** + * struct nfp_tun_mac_non_nfp_idx - converts non NFP netdev ifindex to 8-bit id + * @ifindex: netdev ifindex of the device + * @index: index of netdevs mac on NFP + * @list: list pointer + */ +struct nfp_tun_mac_non_nfp_idx { + int ifindex; + u8 index; + struct list_head list; +}; + +static bool nfp_tun_is_netdev_to_offload(struct net_device *netdev) +{ + if (!netdev->rtnl_link_ops) + return false; + if (!strcmp(netdev->rtnl_link_ops->kind, "openvswitch")) + return true; + if (!strcmp(netdev->rtnl_link_ops->kind, "vxlan")) + return true; + + return false; +} + +static int +nfp_flower_xmit_tun_conf(struct nfp_app *app, u8 mtype, u16 plen, void *pdata) +{ + struct sk_buff *skb; + unsigned char *msg; + + skb = nfp_flower_cmsg_alloc(app, plen, mtype); + if (!skb) + return -ENOMEM; + + msg = nfp_flower_cmsg_get_data(skb); + memcpy(msg, pdata, nfp_flower_cmsg_get_data_len(skb)); + + nfp_ctrl_tx(app->ctrl, skb); + return 0; +} + +void nfp_tunnel_write_macs(struct nfp_app *app) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_mac_offload_entry *entry; + struct nfp_tun_mac_addr *payload; + struct list_head *ptr, *storage; + int mac_count, err, pay_size; + + mutex_lock(&priv->nfp_mac_off_lock); + if (!priv->nfp_mac_off_count) { + mutex_unlock(&priv->nfp_mac_off_lock); + return; + } + + pay_size = sizeof(struct nfp_tun_mac_addr) + + sizeof(struct index_mac_addr) * priv->nfp_mac_off_count; + + payload = kzalloc(pay_size, GFP_KERNEL); + if (!payload) { + mutex_unlock(&priv->nfp_mac_off_lock); + return; + } + + payload->count = cpu_to_be16(priv->nfp_mac_off_count); + + mac_count = 0; + list_for_each_safe(ptr, storage, &priv->nfp_mac_off_list) { + entry = list_entry(ptr, struct nfp_tun_mac_offload_entry, + list); + payload->addresses[mac_count].index = entry->index; + ether_addr_copy(payload->addresses[mac_count].addr, + entry->addr); + mac_count++; + } + + err = nfp_flower_xmit_tun_conf(app, NFP_FLOWER_CMSG_TYPE_TUN_MAC, + pay_size, payload); + + kfree(payload); + + if (err) { + mutex_unlock(&priv->nfp_mac_off_lock); + /* Write failed so retain list for future retry. */ + return; + } + + /* If list was successfully offloaded, flush it. 
*/ + list_for_each_safe(ptr, storage, &priv->nfp_mac_off_list) { + entry = list_entry(ptr, struct nfp_tun_mac_offload_entry, + list); + list_del(&entry->list); + kfree(entry); + } + + priv->nfp_mac_off_count = 0; + mutex_unlock(&priv->nfp_mac_off_lock); +} + +static int nfp_tun_get_mac_idx(struct nfp_app *app, int ifindex) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_mac_non_nfp_idx *entry; + struct list_head *ptr, *storage; + int idx; + + mutex_lock(&priv->nfp_mac_index_lock); + list_for_each_safe(ptr, storage, &priv->nfp_mac_index_list) { + entry = list_entry(ptr, struct nfp_tun_mac_non_nfp_idx, list); + if (entry->ifindex == ifindex) { + idx = entry->index; + mutex_unlock(&priv->nfp_mac_index_lock); + return idx; + } + } + + idx = ida_simple_get(&priv->nfp_mac_off_ids, 0, + NFP_MAX_MAC_INDEX, GFP_KERNEL); + if (idx < 0) { + mutex_unlock(&priv->nfp_mac_index_lock); + return idx; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + mutex_unlock(&priv->nfp_mac_index_lock); + return -ENOMEM; + } + entry->ifindex = ifindex; + entry->index = idx; + list_add_tail(&entry->list, &priv->nfp_mac_index_list); + mutex_unlock(&priv->nfp_mac_index_lock); + + return idx; +} + +static void nfp_tun_del_mac_idx(struct nfp_app *app, int ifindex) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_mac_non_nfp_idx *entry; + struct list_head *ptr, *storage; + + mutex_lock(&priv->nfp_mac_index_lock); + list_for_each_safe(ptr, storage, &priv->nfp_mac_index_list) { + entry = list_entry(ptr, struct nfp_tun_mac_non_nfp_idx, list); + if (entry->ifindex == ifindex) { + ida_simple_remove(&priv->nfp_mac_off_ids, + entry->index); + list_del(&entry->list); + kfree(entry); + break; + } + } + mutex_unlock(&priv->nfp_mac_index_lock); +} + +static void nfp_tun_add_to_mac_offload_list(struct net_device *netdev, + struct nfp_app *app) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_mac_offload_entry *entry; + u16 nfp_mac_idx; + int port = 0; + + /* Check if MAC should be offloaded. */ + if (!is_valid_ether_addr(netdev->dev_addr)) + return; + + if (nfp_netdev_is_nfp_repr(netdev)) + port = nfp_repr_get_port_id(netdev); + else if (!nfp_tun_is_netdev_to_offload(netdev)) + return; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + nfp_flower_cmsg_warn(app, "Mem fail when offloading MAC.\n"); + return; + } + + if (FIELD_GET(NFP_FLOWER_CMSG_PORT_TYPE, port) == + NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT) { + nfp_mac_idx = port << 8 | NFP_FLOWER_CMSG_PORT_TYPE_PHYS_PORT; + } else if (FIELD_GET(NFP_FLOWER_CMSG_PORT_TYPE, port) == + NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT) { + port = FIELD_GET(NFP_FLOWER_CMSG_PORT_VNIC, port); + nfp_mac_idx = port << 8 | NFP_FLOWER_CMSG_PORT_TYPE_PCIE_PORT; + } else { + /* Must assign our own unique 8-bit index. 
*/ + int idx = nfp_tun_get_mac_idx(app, netdev->ifindex); + + if (idx < 0) { + nfp_flower_cmsg_warn(app, "Can't assign non-repr MAC index.\n"); + kfree(entry); + return; + } + nfp_mac_idx = idx << 8 | NFP_FLOWER_CMSG_PORT_TYPE_OTHER_PORT; + } + + entry->index = cpu_to_be16(nfp_mac_idx); + ether_addr_copy(entry->addr, netdev->dev_addr); + + mutex_lock(&priv->nfp_mac_off_lock); + priv->nfp_mac_off_count++; + list_add_tail(&entry->list, &priv->nfp_mac_off_list); + mutex_unlock(&priv->nfp_mac_off_lock); +} + +static int nfp_tun_mac_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct nfp_flower_priv *app_priv; + struct net_device *netdev; + struct nfp_app *app; + + if (event == NETDEV_DOWN || event == NETDEV_UNREGISTER) { + app_priv = container_of(nb, struct nfp_flower_priv, + nfp_tun_mac_nb); + app = app_priv->app; + netdev = netdev_notifier_info_to_dev(ptr); + + /* If non-nfp netdev then free its offload index. */ + if (nfp_tun_is_netdev_to_offload(netdev)) + nfp_tun_del_mac_idx(app, netdev->ifindex); + } else if (event == NETDEV_UP || event == NETDEV_CHANGEADDR || + event == NETDEV_REGISTER) { + app_priv = container_of(nb, struct nfp_flower_priv, + nfp_tun_mac_nb); + app = app_priv->app; + netdev = netdev_notifier_info_to_dev(ptr); + + nfp_tun_add_to_mac_offload_list(netdev, app); + + /* Force a list write to keep NFP up to date. */ + nfp_tunnel_write_macs(app); + } + return NOTIFY_OK; +} + +int nfp_tunnel_config_start(struct nfp_app *app) +{ + struct nfp_flower_priv *priv = app->priv; + struct net_device *netdev; + int err; + + /* Initialise priv data for MAC offloading. */ + priv->nfp_mac_off_count = 0; + mutex_init(&priv->nfp_mac_off_lock); + INIT_LIST_HEAD(&priv->nfp_mac_off_list); + priv->nfp_tun_mac_nb.notifier_call = nfp_tun_mac_event_handler; + mutex_init(&priv->nfp_mac_index_lock); + INIT_LIST_HEAD(&priv->nfp_mac_index_list); + ida_init(&priv->nfp_mac_off_ids); + + err = register_netdevice_notifier(&priv->nfp_tun_mac_nb); + if (err) + goto err_free_mac_ida; + + /* Parse netdevs already registered for MACs that need offloaded. */ + rtnl_lock(); + for_each_netdev(&init_net, netdev) + nfp_tun_add_to_mac_offload_list(netdev, app); + rtnl_unlock(); + + return 0; + +err_free_mac_ida: + ida_destroy(&priv->nfp_mac_off_ids); + return err; +} + +void nfp_tunnel_config_stop(struct nfp_app *app) +{ + struct nfp_tun_mac_offload_entry *mac_entry; + struct nfp_flower_priv *priv = app->priv; + struct nfp_tun_mac_non_nfp_idx *mac_idx; + struct list_head *ptr, *storage; + + unregister_netdevice_notifier(&priv->nfp_tun_mac_nb); + + /* Free any memory that may be occupied by MAC list. */ + mutex_lock(&priv->nfp_mac_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_mac_off_list) { + mac_entry = list_entry(ptr, struct nfp_tun_mac_offload_entry, + list); + list_del(&mac_entry->list); + kfree(mac_entry); + } + mutex_unlock(&priv->nfp_mac_off_lock); + + /* Free any memory that may be occupied by MAC index list. 
*/ + mutex_lock(&priv->nfp_mac_index_lock); + list_for_each_safe(ptr, storage, &priv->nfp_mac_index_list) { + mac_idx = list_entry(ptr, struct nfp_tun_mac_non_nfp_idx, + list); + list_del(&mac_idx->list); + kfree(mac_idx); + } + mutex_unlock(&priv->nfp_mac_index_lock); + + ida_destroy(&priv->nfp_mac_off_ids); +} From 2d9ad71a8ce67eea9ee38512a215e1893bd5cf87 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:39 +0200 Subject: [PATCH 0910/1322] nfp: offload vxlan IPv4 endpoints of flower rules Maintain a list of IPv4 addresses used as the tunnel destination IP match fields in currently active flower rules. Offload the entire list of NFP_FL_IPV4_ADDRS_MAX (even if some are unused) when new IPs are added or removed. The NFP should only be aware of tunnel end points that are currently used by rules on the device Signed-off-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/cmsg.h | 1 + .../net/ethernet/netronome/nfp/flower/main.h | 7 + .../net/ethernet/netronome/nfp/flower/match.c | 14 +- .../ethernet/netronome/nfp/flower/offload.c | 4 + .../netronome/nfp/flower/tunnel_conf.c | 120 ++++++++++++++++++ 5 files changed, 143 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index dc248193c996..6540bb1ceefb 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -318,6 +318,7 @@ enum nfp_flower_cmsg_type_port { NFP_FLOWER_CMSG_TYPE_MAC_REPR = 7, NFP_FLOWER_CMSG_TYPE_PORT_MOD = 8, NFP_FLOWER_CMSG_TYPE_TUN_MAC = 11, + NFP_FLOWER_CMSG_TYPE_TUN_IPS = 14, NFP_FLOWER_CMSG_TYPE_FLOW_STATS = 15, NFP_FLOWER_CMSG_TYPE_PORT_ECHO = 16, NFP_FLOWER_CMSG_TYPE_MAX = 32, diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 9de375acc254..53306af6cfe8 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -86,8 +86,10 @@ struct nfp_fl_stats_id { * @cmsg_skbs: List of skbs for control message processing * @nfp_mac_off_list: List of MAC addresses to offload * @nfp_mac_index_list: List of unique 8-bit indexes for non NFP netdevs + * @nfp_ipv4_off_list: List of IPv4 addresses to offload * @nfp_mac_off_lock: Lock for the MAC address list * @nfp_mac_index_lock: Lock for the MAC index list + * @nfp_ipv4_off_lock: Lock for the IPv4 address list * @nfp_mac_off_ids: IDA to manage id assignment for offloaded macs * @nfp_mac_off_count: Number of MACs in address list * @nfp_tun_mac_nb: Notifier to monitor link state @@ -105,8 +107,10 @@ struct nfp_flower_priv { struct sk_buff_head cmsg_skbs; struct list_head nfp_mac_off_list; struct list_head nfp_mac_index_list; + struct list_head nfp_ipv4_off_list; struct mutex nfp_mac_off_lock; struct mutex nfp_mac_index_lock; + struct mutex nfp_ipv4_off_lock; struct ida nfp_mac_off_ids; int nfp_mac_off_count; struct notifier_block nfp_tun_mac_nb; @@ -142,6 +146,7 @@ struct nfp_fl_payload { struct rcu_head rcu; spinlock_t lock; /* lock stats */ struct nfp_fl_stats stats; + __be32 nfp_tun_ipv4_addr; char *unmasked_data; char *mask_data; char *action_data; @@ -182,5 +187,7 @@ void nfp_flower_rx_flow_stats(struct nfp_app *app, struct sk_buff *skb); int nfp_tunnel_config_start(struct nfp_app *app); void nfp_tunnel_config_stop(struct nfp_app *app); void nfp_tunnel_write_macs(struct nfp_app *app); +void 
nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 ipv4); +void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4); #endif diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index cb3ff6c126e8..865a815ab92a 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -195,7 +195,7 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame, static void nfp_flower_compile_vxlan(struct nfp_flower_vxlan *frame, struct tc_cls_flower_offload *flow, - bool mask_version) + bool mask_version, __be32 *tun_dst) { struct fl_flow_key *target = mask_version ? flow->mask : flow->key; struct flow_dissector_key_ipv4_addrs *vxlan_ips; @@ -223,6 +223,7 @@ nfp_flower_compile_vxlan(struct nfp_flower_vxlan *frame, target); frame->ip_src = vxlan_ips->src; frame->ip_dst = vxlan_ips->dst; + *tun_dst = vxlan_ips->dst; } } @@ -232,6 +233,7 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct nfp_fl_payload *nfp_flow) { enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE; + __be32 tun_dst, tun_dst_mask = 0; struct nfp_repr *netdev_repr; int err; u8 *ext; @@ -336,10 +338,10 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, if (key_ls->key_layer & NFP_FLOWER_LAYER_VXLAN) { /* Populate Exact VXLAN Data. */ nfp_flower_compile_vxlan((struct nfp_flower_vxlan *)ext, - flow, false); + flow, false, &tun_dst); /* Populate Mask VXLAN Data. */ nfp_flower_compile_vxlan((struct nfp_flower_vxlan *)msk, - flow, true); + flow, true, &tun_dst_mask); ext += sizeof(struct nfp_flower_vxlan); msk += sizeof(struct nfp_flower_vxlan); @@ -347,6 +349,12 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, if (nfp_netdev_is_nfp_repr(netdev)) { netdev_repr = netdev_priv(netdev); nfp_tunnel_write_macs(netdev_repr->app); + + /* Store the tunnel destination in the rule data. + * This must be present and be an exact match. 
+ */ + nfp_flow->nfp_tun_ipv4_addr = tun_dst; + nfp_tunnel_add_ipv4_off(netdev_repr->app, tun_dst); } } diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 637372ba8f55..3d9537ebdea4 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -306,6 +306,7 @@ nfp_flower_allocate_new(struct nfp_fl_key_ls *key_layer) if (!flow_pay->action_data) goto err_free_mask; + flow_pay->nfp_tun_ipv4_addr = 0; flow_pay->meta.flags = 0; spin_lock_init(&flow_pay->lock); @@ -415,6 +416,9 @@ nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev, if (err) goto err_free_flow; + if (nfp_flow->nfp_tun_ipv4_addr) + nfp_tunnel_del_ipv4_off(app, nfp_flow->nfp_tun_ipv4_addr); + err = nfp_flower_xmit_flow(netdev, nfp_flow, NFP_FLOWER_CMSG_TYPE_FLOW_DEL); if (err) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index 34be85803020..185505140f5e 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -32,6 +32,7 @@ */ #include +#include #include #include @@ -40,6 +41,30 @@ #include "../nfp_net_repr.h" #include "../nfp_net.h" +#define NFP_FL_IPV4_ADDRS_MAX 32 + +/** + * struct nfp_tun_ipv4_addr - set the IP address list on the NFP + * @count: number of IPs populated in the array + * @ipv4_addr: array of IPV4_ADDRS_MAX 32 bit IPv4 addresses + */ +struct nfp_tun_ipv4_addr { + __be32 count; + __be32 ipv4_addr[NFP_FL_IPV4_ADDRS_MAX]; +}; + +/** + * struct nfp_ipv4_addr_entry - cached IPv4 addresses + * @ipv4_addr: IP address + * @ref_count: number of rules currently using this IP + * @list: list pointer + */ +struct nfp_ipv4_addr_entry { + __be32 ipv4_addr; + int ref_count; + struct list_head list; +}; + /** * struct nfp_tun_mac_addr - configure MAC address of tunnel EP on NFP * @reserved: reserved for future use @@ -112,6 +137,87 @@ nfp_flower_xmit_tun_conf(struct nfp_app *app, u8 mtype, u16 plen, void *pdata) return 0; } +static void nfp_tun_write_ipv4_list(struct nfp_app *app) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_addr_entry *entry; + struct nfp_tun_ipv4_addr payload; + struct list_head *ptr, *storage; + int count; + + memset(&payload, 0, sizeof(struct nfp_tun_ipv4_addr)); + mutex_lock(&priv->nfp_ipv4_off_lock); + count = 0; + list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) { + if (count >= NFP_FL_IPV4_ADDRS_MAX) { + mutex_unlock(&priv->nfp_ipv4_off_lock); + nfp_flower_cmsg_warn(app, "IPv4 offload exceeds limit.\n"); + return; + } + entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list); + payload.ipv4_addr[count++] = entry->ipv4_addr; + } + payload.count = cpu_to_be32(count); + mutex_unlock(&priv->nfp_ipv4_off_lock); + + nfp_flower_xmit_tun_conf(app, NFP_FLOWER_CMSG_TYPE_TUN_IPS, + sizeof(struct nfp_tun_ipv4_addr), + &payload); +} + +void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_addr_entry *entry; + struct list_head *ptr, *storage; + + mutex_lock(&priv->nfp_ipv4_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) { + entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list); + if (entry->ipv4_addr == ipv4) { + entry->ref_count++; + mutex_unlock(&priv->nfp_ipv4_off_lock); + return; + } + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + 
mutex_unlock(&priv->nfp_ipv4_off_lock); + nfp_flower_cmsg_warn(app, "Mem error when offloading IP address.\n"); + return; + } + entry->ipv4_addr = ipv4; + entry->ref_count = 1; + list_add_tail(&entry->list, &priv->nfp_ipv4_off_list); + mutex_unlock(&priv->nfp_ipv4_off_lock); + + nfp_tun_write_ipv4_list(app); +} + +void nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 ipv4) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_addr_entry *entry; + struct list_head *ptr, *storage; + + mutex_lock(&priv->nfp_ipv4_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) { + entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list); + if (entry->ipv4_addr == ipv4) { + entry->ref_count--; + if (!entry->ref_count) { + list_del(&entry->list); + kfree(entry); + } + break; + } + } + mutex_unlock(&priv->nfp_ipv4_off_lock); + + nfp_tun_write_ipv4_list(app); +} + void nfp_tunnel_write_macs(struct nfp_app *app) { struct nfp_flower_priv *priv = app->priv; @@ -324,6 +430,10 @@ int nfp_tunnel_config_start(struct nfp_app *app) INIT_LIST_HEAD(&priv->nfp_mac_index_list); ida_init(&priv->nfp_mac_off_ids); + /* Initialise priv data for IPv4 offloading. */ + mutex_init(&priv->nfp_ipv4_off_lock); + INIT_LIST_HEAD(&priv->nfp_ipv4_off_list); + err = register_netdevice_notifier(&priv->nfp_tun_mac_nb); if (err) goto err_free_mac_ida; @@ -346,6 +456,7 @@ void nfp_tunnel_config_stop(struct nfp_app *app) struct nfp_tun_mac_offload_entry *mac_entry; struct nfp_flower_priv *priv = app->priv; struct nfp_tun_mac_non_nfp_idx *mac_idx; + struct nfp_ipv4_addr_entry *ip_entry; struct list_head *ptr, *storage; unregister_netdevice_notifier(&priv->nfp_tun_mac_nb); @@ -371,4 +482,13 @@ void nfp_tunnel_config_stop(struct nfp_app *app) mutex_unlock(&priv->nfp_mac_index_lock); ida_destroy(&priv->nfp_mac_off_ids); + + /* Free any memory that may be occupied by ipv4 list. */ + mutex_lock(&priv->nfp_ipv4_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_ipv4_off_list) { + ip_entry = list_entry(ptr, struct nfp_ipv4_addr_entry, list); + list_del(&ip_entry->list); + kfree(ip_entry); + } + mutex_unlock(&priv->nfp_ipv4_off_lock); } From 8e6a9046b66a7dfb11ae8be226afaaf417649411 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:40 +0200 Subject: [PATCH 0911/1322] nfp: flower vxlan neighbour offload Receive a request when the NFP does not know the next hop for a packet that is to be encapsulated in a VXLAN tunnel. Do a route lookup, determine the next hop entry and update neighbour table on NFP. Monitor the kernel neighbour table for link changes and update NFP with relevant information. Overwrite routes with zero values on the NFP when they expire. Signed-off-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- .../net/ethernet/netronome/nfp/flower/cmsg.c | 6 + .../net/ethernet/netronome/nfp/flower/cmsg.h | 2 + .../net/ethernet/netronome/nfp/flower/main.h | 7 + .../netronome/nfp/flower/tunnel_conf.c | 253 ++++++++++++++++++ 4 files changed, 268 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c index b756006dba6f..862787daaa68 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -181,6 +181,12 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb) case NFP_FLOWER_CMSG_TYPE_FLOW_STATS: nfp_flower_rx_flow_stats(app, skb); break; + case NFP_FLOWER_CMSG_TYPE_NO_NEIGH: + nfp_tunnel_request_route(app, skb); + break; + case NFP_FLOWER_CMSG_TYPE_TUN_NEIGH: + /* Acks from the NFP that the route is added - ignore. */ + break; default: nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n", type); diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index 6540bb1ceefb..1dc72a1ed577 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -317,7 +317,9 @@ enum nfp_flower_cmsg_type_port { NFP_FLOWER_CMSG_TYPE_FLOW_DEL = 2, NFP_FLOWER_CMSG_TYPE_MAC_REPR = 7, NFP_FLOWER_CMSG_TYPE_PORT_MOD = 8, + NFP_FLOWER_CMSG_TYPE_NO_NEIGH = 10, NFP_FLOWER_CMSG_TYPE_TUN_MAC = 11, + NFP_FLOWER_CMSG_TYPE_TUN_NEIGH = 13, NFP_FLOWER_CMSG_TYPE_TUN_IPS = 14, NFP_FLOWER_CMSG_TYPE_FLOW_STATS = 15, NFP_FLOWER_CMSG_TYPE_PORT_ECHO = 16, diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 53306af6cfe8..93ad969c3653 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -87,12 +87,15 @@ struct nfp_fl_stats_id { * @nfp_mac_off_list: List of MAC addresses to offload * @nfp_mac_index_list: List of unique 8-bit indexes for non NFP netdevs * @nfp_ipv4_off_list: List of IPv4 addresses to offload + * @nfp_neigh_off_list: List of neighbour offloads * @nfp_mac_off_lock: Lock for the MAC address list * @nfp_mac_index_lock: Lock for the MAC index list * @nfp_ipv4_off_lock: Lock for the IPv4 address list + * @nfp_neigh_off_lock: Lock for the neighbour address list * @nfp_mac_off_ids: IDA to manage id assignment for offloaded macs * @nfp_mac_off_count: Number of MACs in address list * @nfp_tun_mac_nb: Notifier to monitor link state + * @nfp_tun_neigh_nb: Notifier to monitor neighbour state */ struct nfp_flower_priv { struct nfp_app *app; @@ -108,12 +111,15 @@ struct nfp_flower_priv { struct list_head nfp_mac_off_list; struct list_head nfp_mac_index_list; struct list_head nfp_ipv4_off_list; + struct list_head nfp_neigh_off_list; struct mutex nfp_mac_off_lock; struct mutex nfp_mac_index_lock; struct mutex nfp_ipv4_off_lock; + struct mutex nfp_neigh_off_lock; struct ida nfp_mac_off_ids; int nfp_mac_off_count; struct notifier_block nfp_tun_mac_nb; + struct notifier_block nfp_tun_neigh_nb; }; struct nfp_fl_key_ls { @@ -189,5 +195,6 @@ void nfp_tunnel_config_stop(struct nfp_app *app); void nfp_tunnel_write_macs(struct nfp_app *app); void nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 ipv4); void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4); +void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb); #endif diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c 
b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index 185505140f5e..8c6b88a1306b 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -41,6 +42,44 @@ #include "../nfp_net_repr.h" #include "../nfp_net.h" +/** + * struct nfp_tun_neigh - neighbour/route entry on the NFP + * @dst_ipv4: destination IPv4 address + * @src_ipv4: source IPv4 address + * @dst_addr: destination MAC address + * @src_addr: source MAC address + * @port_id: NFP port to output packet on - associated with source IPv4 + */ +struct nfp_tun_neigh { + __be32 dst_ipv4; + __be32 src_ipv4; + u8 dst_addr[ETH_ALEN]; + u8 src_addr[ETH_ALEN]; + __be32 port_id; +}; + +/** + * struct nfp_tun_req_route_ipv4 - NFP requests a route/neighbour lookup + * @ingress_port: ingress port of packet that signalled request + * @ipv4_addr: destination ipv4 address for route + * @reserved: reserved for future use + */ +struct nfp_tun_req_route_ipv4 { + __be32 ingress_port; + __be32 ipv4_addr; + __be32 reserved[2]; +}; + +/** + * struct nfp_ipv4_route_entry - routes that are offloaded to the NFP + * @ipv4_addr: destination of route + * @list: list pointer + */ +struct nfp_ipv4_route_entry { + __be32 ipv4_addr; + struct list_head list; +}; + #define NFP_FL_IPV4_ADDRS_MAX 32 /** @@ -137,6 +176,197 @@ nfp_flower_xmit_tun_conf(struct nfp_app *app, u8 mtype, u16 plen, void *pdata) return 0; } +static bool nfp_tun_has_route(struct nfp_app *app, __be32 ipv4_addr) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_route_entry *entry; + struct list_head *ptr, *storage; + + mutex_lock(&priv->nfp_neigh_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) { + entry = list_entry(ptr, struct nfp_ipv4_route_entry, list); + if (entry->ipv4_addr == ipv4_addr) { + mutex_unlock(&priv->nfp_neigh_off_lock); + return true; + } + } + mutex_unlock(&priv->nfp_neigh_off_lock); + return false; +} + +static void nfp_tun_add_route_to_cache(struct nfp_app *app, __be32 ipv4_addr) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_route_entry *entry; + struct list_head *ptr, *storage; + + mutex_lock(&priv->nfp_neigh_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) { + entry = list_entry(ptr, struct nfp_ipv4_route_entry, list); + if (entry->ipv4_addr == ipv4_addr) { + mutex_unlock(&priv->nfp_neigh_off_lock); + return; + } + } + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + mutex_unlock(&priv->nfp_neigh_off_lock); + nfp_flower_cmsg_warn(app, "Mem error when storing new route.\n"); + return; + } + + entry->ipv4_addr = ipv4_addr; + list_add_tail(&entry->list, &priv->nfp_neigh_off_list); + mutex_unlock(&priv->nfp_neigh_off_lock); +} + +static void nfp_tun_del_route_from_cache(struct nfp_app *app, __be32 ipv4_addr) +{ + struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_route_entry *entry; + struct list_head *ptr, *storage; + + mutex_lock(&priv->nfp_neigh_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) { + entry = list_entry(ptr, struct nfp_ipv4_route_entry, list); + if (entry->ipv4_addr == ipv4_addr) { + list_del(&entry->list); + kfree(entry); + break; + } + } + mutex_unlock(&priv->nfp_neigh_off_lock); +} + +static void +nfp_tun_write_neigh(struct net_device *netdev, struct nfp_app *app, + struct flowi4 *flow, struct neighbour *neigh) +{ + struct nfp_tun_neigh payload; + + /* Only offload representor IPv4s for now. 
*/ + if (!nfp_netdev_is_nfp_repr(netdev)) + return; + + memset(&payload, 0, sizeof(struct nfp_tun_neigh)); + payload.dst_ipv4 = flow->daddr; + + /* If entry has expired send dst IP with all other fields 0. */ + if (!(neigh->nud_state & NUD_VALID)) { + nfp_tun_del_route_from_cache(app, payload.dst_ipv4); + /* Trigger ARP to verify invalid neighbour state. */ + neigh_event_send(neigh, NULL); + goto send_msg; + } + + /* Have a valid neighbour so populate rest of entry. */ + payload.src_ipv4 = flow->saddr; + ether_addr_copy(payload.src_addr, netdev->dev_addr); + neigh_ha_snapshot(payload.dst_addr, neigh, netdev); + payload.port_id = cpu_to_be32(nfp_repr_get_port_id(netdev)); + /* Add destination of new route to NFP cache. */ + nfp_tun_add_route_to_cache(app, payload.dst_ipv4); + +send_msg: + nfp_flower_xmit_tun_conf(app, NFP_FLOWER_CMSG_TYPE_TUN_NEIGH, + sizeof(struct nfp_tun_neigh), + (unsigned char *)&payload); +} + +static int +nfp_tun_neigh_event_handler(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct nfp_flower_priv *app_priv; + struct netevent_redirect *redir; + struct flowi4 flow = {}; + struct neighbour *n; + struct nfp_app *app; + struct rtable *rt; + int err; + + switch (event) { + case NETEVENT_REDIRECT: + redir = (struct netevent_redirect *)ptr; + n = redir->neigh; + break; + case NETEVENT_NEIGH_UPDATE: + n = (struct neighbour *)ptr; + break; + default: + return NOTIFY_DONE; + } + + flow.daddr = *(__be32 *)n->primary_key; + + /* Only concerned with route changes for representors. */ + if (!nfp_netdev_is_nfp_repr(n->dev)) + return NOTIFY_DONE; + + app_priv = container_of(nb, struct nfp_flower_priv, nfp_tun_neigh_nb); + app = app_priv->app; + + /* Only concerned with changes to routes already added to NFP. */ + if (!nfp_tun_has_route(app, flow.daddr)) + return NOTIFY_DONE; + +#if IS_ENABLED(CONFIG_INET) + /* Do a route lookup to populate flow data. */ + rt = ip_route_output_key(dev_net(n->dev), &flow); + err = PTR_ERR_OR_ZERO(rt); + if (err) + return NOTIFY_DONE; +#else + return NOTIFY_DONE; +#endif + + flow.flowi4_proto = IPPROTO_UDP; + nfp_tun_write_neigh(n->dev, app, &flow, n); + + return NOTIFY_OK; +} + +void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb) +{ + struct nfp_tun_req_route_ipv4 *payload; + struct net_device *netdev; + struct flowi4 flow = {}; + struct neighbour *n; + struct rtable *rt; + int err; + + payload = nfp_flower_cmsg_get_data(skb); + + netdev = nfp_app_repr_get(app, be32_to_cpu(payload->ingress_port)); + if (!netdev) + goto route_fail_warning; + + flow.daddr = payload->ipv4_addr; + flow.flowi4_proto = IPPROTO_UDP; + +#if IS_ENABLED(CONFIG_INET) + /* Do a route lookup on same namespace as ingress port. */ + rt = ip_route_output_key(dev_net(netdev), &flow); + err = PTR_ERR_OR_ZERO(rt); + if (err) + goto route_fail_warning; +#else + goto route_fail_warning; +#endif + + /* Get the neighbour entry for the lookup */ + n = dst_neigh_lookup(&rt->dst, &flow.daddr); + ip_rt_put(rt); + if (!n) + goto route_fail_warning; + nfp_tun_write_neigh(n->dev, app, &flow, n); + neigh_release(n); + return; + +route_fail_warning: + nfp_flower_cmsg_warn(app, "Requested route not found.\n"); +} + static void nfp_tun_write_ipv4_list(struct nfp_app *app) { struct nfp_flower_priv *priv = app->priv; @@ -434,10 +664,19 @@ int nfp_tunnel_config_start(struct nfp_app *app) mutex_init(&priv->nfp_ipv4_off_lock); INIT_LIST_HEAD(&priv->nfp_ipv4_off_list); + /* Initialise priv data for neighbour offloading. 
*/ + mutex_init(&priv->nfp_neigh_off_lock); + INIT_LIST_HEAD(&priv->nfp_neigh_off_list); + priv->nfp_tun_neigh_nb.notifier_call = nfp_tun_neigh_event_handler; + err = register_netdevice_notifier(&priv->nfp_tun_mac_nb); if (err) goto err_free_mac_ida; + err = register_netevent_notifier(&priv->nfp_tun_neigh_nb); + if (err) + goto err_unreg_mac_nb; + /* Parse netdevs already registered for MACs that need offloaded. */ rtnl_lock(); for_each_netdev(&init_net, netdev) @@ -446,6 +685,8 @@ int nfp_tunnel_config_start(struct nfp_app *app) return 0; +err_unreg_mac_nb: + unregister_netdevice_notifier(&priv->nfp_tun_mac_nb); err_free_mac_ida: ida_destroy(&priv->nfp_mac_off_ids); return err; @@ -455,11 +696,13 @@ void nfp_tunnel_config_stop(struct nfp_app *app) { struct nfp_tun_mac_offload_entry *mac_entry; struct nfp_flower_priv *priv = app->priv; + struct nfp_ipv4_route_entry *route_entry; struct nfp_tun_mac_non_nfp_idx *mac_idx; struct nfp_ipv4_addr_entry *ip_entry; struct list_head *ptr, *storage; unregister_netdevice_notifier(&priv->nfp_tun_mac_nb); + unregister_netevent_notifier(&priv->nfp_tun_neigh_nb); /* Free any memory that may be occupied by MAC list. */ mutex_lock(&priv->nfp_mac_off_lock); @@ -491,4 +734,14 @@ void nfp_tunnel_config_stop(struct nfp_app *app) kfree(ip_entry); } mutex_unlock(&priv->nfp_ipv4_off_lock); + + /* Free any memory that may be occupied by the route list. */ + mutex_lock(&priv->nfp_neigh_off_lock); + list_for_each_safe(ptr, storage, &priv->nfp_neigh_off_list) { + route_entry = list_entry(ptr, struct nfp_ipv4_route_entry, + list); + list_del(&route_entry->list); + kfree(route_entry); + } + mutex_unlock(&priv->nfp_neigh_off_lock); } From 856f5b135758ad80053a49f7ce9d1dc0166e3006 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Sep 2017 12:23:41 +0200 Subject: [PATCH 0912/1322] nfp: flower vxlan neighbour keep-alive Periodically receive messages containing the destination IPs of tunnels that have recently forwarded traffic. Update the neighbour entries 'used' value for these IPs next hop. This prevents the neighbour entry from expiring on timeout but rather signals an ARP to verify the connection. From an NFP perspective, packets will not fall back mid-flow unless the link is verified to be down. Signed-off-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/cmsg.c | 3 + .../net/ethernet/netronome/nfp/flower/cmsg.h | 1 + .../net/ethernet/netronome/nfp/flower/main.h | 1 + .../netronome/nfp/flower/tunnel_conf.c | 64 +++++++++++++++++++ 4 files changed, 69 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c index 862787daaa68..6b71c719deba 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -184,6 +184,9 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb) case NFP_FLOWER_CMSG_TYPE_NO_NEIGH: nfp_tunnel_request_route(app, skb); break; + case NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS: + nfp_tunnel_keep_alive(app, skb); + break; case NFP_FLOWER_CMSG_TYPE_TUN_NEIGH: /* Acks from the NFP that the route is added - ignore. 
*/ break; diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index 1dc72a1ed577..504ddaa21701 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -319,6 +319,7 @@ enum nfp_flower_cmsg_type_port { NFP_FLOWER_CMSG_TYPE_PORT_MOD = 8, NFP_FLOWER_CMSG_TYPE_NO_NEIGH = 10, NFP_FLOWER_CMSG_TYPE_TUN_MAC = 11, + NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS = 12, NFP_FLOWER_CMSG_TYPE_TUN_NEIGH = 13, NFP_FLOWER_CMSG_TYPE_TUN_IPS = 14, NFP_FLOWER_CMSG_TYPE_FLOW_STATS = 15, diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 93ad969c3653..12c319a219d8 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -196,5 +196,6 @@ void nfp_tunnel_write_macs(struct nfp_app *app); void nfp_tunnel_del_ipv4_off(struct nfp_app *app, __be32 ipv4); void nfp_tunnel_add_ipv4_off(struct nfp_app *app, __be32 ipv4); void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb); +void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb); #endif diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index 8c6b88a1306b..c495f8f38506 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -36,12 +36,36 @@ #include #include #include +#include #include "cmsg.h" #include "main.h" #include "../nfp_net_repr.h" #include "../nfp_net.h" +#define NFP_FL_MAX_ROUTES 32 + +/** + * struct nfp_tun_active_tuns - periodic message of active tunnels + * @seq: sequence number of the message + * @count: number of tunnels report in message + * @flags: options part of the request + * @ipv4: dest IPv4 address of active route + * @egress_port: port the encapsulated packet egressed + * @extra: reserved for future use + * @tun_info: tunnels that have sent traffic in reported period + */ +struct nfp_tun_active_tuns { + __be32 seq; + __be32 count; + __be32 flags; + struct route_ip_info { + __be32 ipv4; + __be32 egress_port; + __be32 extra[2]; + } tun_info[]; +}; + /** * struct nfp_tun_neigh - neighbour/route entry on the NFP * @dst_ipv4: destination IPv4 address @@ -147,6 +171,46 @@ struct nfp_tun_mac_non_nfp_idx { struct list_head list; }; +void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb) +{ + struct nfp_tun_active_tuns *payload; + struct net_device *netdev; + int count, i, pay_len; + struct neighbour *n; + __be32 ipv4_addr; + u32 port; + + payload = nfp_flower_cmsg_get_data(skb); + count = be32_to_cpu(payload->count); + if (count > NFP_FL_MAX_ROUTES) { + nfp_flower_cmsg_warn(app, "Tunnel keep-alive request exceeds max routes.\n"); + return; + } + + pay_len = nfp_flower_cmsg_get_data_len(skb); + if (pay_len != sizeof(struct nfp_tun_active_tuns) + + sizeof(struct route_ip_info) * count) { + nfp_flower_cmsg_warn(app, "Corruption in tunnel keep-alive message.\n"); + return; + } + + for (i = 0; i < count; i++) { + ipv4_addr = payload->tun_info[i].ipv4; + port = be32_to_cpu(payload->tun_info[i].egress_port); + netdev = nfp_app_repr_get(app, port); + if (!netdev) + continue; + + n = neigh_lookup(&arp_tbl, &ipv4_addr, netdev); + if (!n) + continue; + + /* Update the used timestamp of neighbour */ + neigh_event_send(n, NULL); + neigh_release(n); + } +} + static bool nfp_tun_is_netdev_to_offload(struct net_device *netdev) { if 
(!netdev->rtnl_link_ops) From ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 18 Sep 2017 23:00:59 +0300 Subject: [PATCH 0913/1322] platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not presnt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My Fujitsu-Siemens Lifebook S6120 doesn't have the FUJ02E3 device, but it does have FUJ02B1. That means we do register the backlight device (and it even seems to work), but the code will oops as soon as we try to set the backlight brightness because it's trying to call call_fext_func() with a NULL device. Let's just skip those function calls when the FUJ02E3 device is not present. Cc: Jonathan Woithe Cc: Andy Shevchenko Signed-off-by: Ville Syrjälä Cc: # 4.13.x Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/fujitsu-laptop.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index 85de30f93a9c..56a8195096a2 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -254,10 +254,12 @@ static int bl_update_status(struct backlight_device *b) { struct acpi_device *device = bl_get_data(b); - if (b->props.power == FB_BLANK_POWERDOWN) - call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); - else - call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); + if (fext) { + if (b->props.power == FB_BLANK_POWERDOWN) + call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); + else + call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); + } return set_lcd_level(device, b->props.brightness); } From 4c6bb69663b3a3f2db8f488356e96acb5460f25f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Sep 2017 10:36:05 +0200 Subject: [PATCH 0914/1322] quota: Fix quota corruption with generic/232 test Eric has reported that since commit d2faa415166b "quota: Do not acquire dqio_sem for dquot overwrites in v2 format" test generic/232 occasionally fails due to quota information being incorrect. Indeed that commit was too eager to remove dqio_sem completely from the path that just overwrites quota structure with updated information. Although that is innocent on its own, another process that inserts new quota structure to the same block can perform read-modify-write cycle of that block thus effectively discarding quota information update if they race in a wrong way. Fix the problem by acquiring dqio_sem for reading for overwrites of quota structure. Note that it *is* possible to completely avoid taking dqio_sem in the overwrite path however that will require modifying path inserting / deleting quota structures to avoid RMW cycles of the full block and for now it is not clear whether it is worth the hassle. 
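To make the race concrete (an illustrative interleaving, not an observed trace): task A overwrites an existing dquot whose dq_off is already set, while task B inserts a new dquot into the same tree block. B reads the block into memory, A writes its updated dquot into that block on disk, then B writes back its modified copy of the whole block, silently discarding A's update. The locking rule adopted below is simply:

	if (!dquot->dq_off)
		down_write(&dqopt->dqio_sem);	/* may insert: full-block RMW */
	else
		down_read(&dqopt->dqio_sem);	/* plain in-place overwrite */

so in-place overwrites exclude full-block rewrites without having to serialise against each other.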
Fixes: d2faa415166b2883428efa92f451774ef44373ac Reported-and-tested-by: Eric Whitney Signed-off-by: Jan Kara --- fs/quota/quota_v2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index c0187cda2c1e..a73e5b34db41 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -328,12 +328,16 @@ static int v2_write_dquot(struct dquot *dquot) if (!dquot->dq_off) { alloc = true; down_write(&dqopt->dqio_sem); + } else { + down_read(&dqopt->dqio_sem); } ret = qtree_write_dquot( sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); if (alloc) up_write(&dqopt->dqio_sem); + else + up_read(&dqopt->dqio_sem); return ret; } From 5371513fb338fb9989c569dc071326d369d6ade8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Sep 2017 15:57:16 +0100 Subject: [PATCH 0915/1322] arm64: Make sure SPsel is always set When the kernel is entered at EL2 on an ARMv8.0 system, we construct the EL1 pstate and make sure this uses the the EL1 stack pointer (we perform an exception return to EL1h). But if the kernel is either entered at EL1 or stays at EL2 (because we're on a VHE-capable system), we fail to set SPsel, and use whatever stack selection the higher exception level has choosen for us. Let's not take any chance, and make sure that SPsel is set to one before we decide the mode we're going to run in. Cc: Acked-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7434ec0c7a27..0b243ecaf7ac 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -384,6 +384,7 @@ ENTRY(kimage_vaddr) * booted in EL1 or EL2 respectively. */ ENTRY(el2_setup) + msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:04 +0200 Subject: [PATCH 0916/1322] KVM: VMX: extract __pi_post_block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simple code movement patch, preparing for the next one. 
Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 71 +++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c83d28b0ab05..0002b14307ab 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11705,6 +11705,43 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. @@ -11798,44 +11835,12 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || !kvm_vcpu_apicv_active(vcpu)) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + __pi_post_block(vcpu); } static void vmx_post_block(struct kvm_vcpu *vcpu) From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:05 +0200 Subject: [PATCH 0917/1322] KVM: VMX: avoid double list add with VT-d posted interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases, for example involving hot-unplug of assigned devices, pi_post_block can forget to remove the vCPU from the blocked_vcpu_list. When this happens, the next call to pi_pre_block corrupts the list. Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block and WARN instead of adding the element twice in the list. Second, always do the list removal in pi_post_block if vcpu->pre_pcpu is set (not -1). The new code keeps interrupts disabled for the whole duration of pi_pre_block/pi_post_block. This is not strictly necessary, but easier to follow. 
For the same reason, PI.ON is checked only after the cmpxchg, and to handle it we just call the post-block code. This removes duplication of the list removal code. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 62 +++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0002b14307ab..0bfe97e50a40 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11710,10 +11710,11 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; unsigned int dest; - unsigned long flags; do { old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); dest = cpu_physical_id(vcpu->cpu); @@ -11730,14 +11731,10 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } } @@ -11757,7 +11754,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11767,34 +11763,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11819,7 +11801,12 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. 
*/ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11835,12 +11822,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; + WARN_ON(irqs_disabled()); + local_irq_disable(); __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 12:57:06 +0200 Subject: [PATCH 0918/1322] KVM: VMX: simplify and fix vmx_vcpu_pi_load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The simplify part: do not touch pi_desc.nv, we can set it when the VCPU is first created. Likewise, pi_desc.sn is only handled by vmx_vcpu_pi_load, do not touch it in __pi_post_block. The fix part: do not check kvm_arch_has_assigned_device, instead check the SN bit to figure out whether vmx_vcpu_pi_put ran before. This matches what the previous patch did in pi_post_block. Cc: Huangweidong Cc: Gonglei Cc: wangxin Cc: Radim Krčmář Tested-by: Longpeng (Mike) Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 68 ++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0bfe97e50a40..b9d2140eb212 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2202,43 +2202,41 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + return; + } + + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. 
- */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); + dest = cpu_physical_id(cpu); - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } - - /* Allow posting non-urgent interrupts */ new.sn = 0; } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); @@ -9592,6 +9590,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -11723,9 +11728,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) else new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ - new.sn = 0; - /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; } while (cmpxchg(&pi_desc->control, old.control, From 7e439681af82984045efc215437ebb2ca8d33a4c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 25 Sep 2017 10:19:57 +0200 Subject: [PATCH 0919/1322] mtd: Fix partition alignment check on multi-erasesize devices Commit 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") introduced a regression on heterogeneous erase region devices. Alignment of the partition was tested against the master eraseblock size which can be bigger than the slave one, thus leading to some partitions being marked as read-only. Update wr_alignment to match this slave erasesize after this erasesize has been determined by picking the biggest erasesize of all the regions embedded in the MTD partition. Reported-by: Mathias Thore Fixes: 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") Cc: Signed-off-by: Boris Brezillon Tested-by: Mathias Thore Reviewed-by: Mathias Thore --- drivers/mtd/mtdpart.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 5736b0c90b33..a308e707392d 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -581,6 +581,14 @@ static struct mtd_part *allocate_partition(struct mtd_info *parent, slave->mtd.erasesize = parent->erasesize; } + /* + * Slave erasesize might differ from the master one if the master + * exposes several regions with different erasesize. Adjust + * wr_alignment accordingly. + */ + if (!(slave->mtd.flags & MTD_NO_ERASE)) + wr_alignment = slave->mtd.erasesize; + tmp = slave->offset; remainder = do_div(tmp, wr_alignment); if ((slave->mtd.flags & MTD_WRITEABLE) && remainder) { From 5c62c1c67903621cfa715d6f690548ee53301620 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 25 Sep 2017 17:28:47 +0800 Subject: [PATCH 0920/1322] iommu/io-pgtable-arm-v7s: Need dma-sync while there is no QUIRK_NO_DMA Fix the commit 81b3c2521844 ("iommu/io-pgtable: Introduce explicit coherency"). If there is no IO_PGTABLE_QUIRK_NO_DMA, we should call dma_sync_single_for_device for cache synchronization. 
Signed-off-by: Yong Wu Fixes: 81b3c2521844 ('iommu/io-pgtable: Introduce explicit coherency') Reviewed-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index d665d0dc16e8..6961fc393f0b 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -245,7 +245,7 @@ static void __arm_v7s_free_table(void *table, int lvl, static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries, struct io_pgtable_cfg *cfg) { - if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) + if (cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) return; dma_sync_single_for_device(cfg->iommu_dev, __arm_v7s_dma_addr(ptep), From 1ff9b17cedb39bc78f9e3f82485765f9b467177d Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 25 Sep 2017 18:15:26 +0800 Subject: [PATCH 0921/1322] iommu/mediatek: Limit the physical address in 32bit for v7s The ARM short descriptor has already limited the physical address to 32bit after the commit <76557391433c> ("iommu/io-pgtable: Sanitise map/unmap addresses"). But in MediaTek 4GB mode, the physical address is from 0x1_0000_0000 to 0x1_ffff_ffff. this will cause: WARNING: CPU: 4 PID: 3900 at xxx/drivers/iommu/io-pgtable-arm-v7s.c:482 arm_v7s_map+0x40/0xf8 Modules linked in: CPU: 4 PID: 3900 Comm: weston Tainted: G S W 4.9.44 #1 Hardware name: MediaTek MT2712m1v1 board (DT) task: ffffffc0eaa5b280 task.stack: ffffffc0e9858000 PC is at arm_v7s_map+0x40/0xf8 LR is at mtk_iommu_map+0x64/0x90 pc : [] lr : [] pstate: 000001c5 sp : ffffffc0e985b920 x29: ffffffc0e985b920 x28: 0000000127d00000 x27: 0000000000100000 x26: ffffff8008f9e000 x25: 0000000000000003 x24: 0000000000100000 x23: 0000000127d00000 x22: 00000000ff800000 x21: ffffffc0f7ec8ce0 x20: 0000000000000003 x19: 0000000000000003 x18: 0000000000000002 x17: 0000007f7e5d72c0 x16: ffffff80082b0f08 x15: 0000000000000001 x14: 000000000000003f x13: 0000000000000000 x12: 0000000000000028 x11: 0088000000000000 x10: 0000000000000000 x9 : ffffff80092fa000 x8 : ffffffc0e9858000 x7 : ffffff80085b29d8 x6 : 0000000000000000 x5 : ffffff80085b09a8 x4 : 0000000000000003 x3 : 0000000000100000 x2 : 0000000127d00000 x1 : 00000000ff800000 x0 : 0000000000000001 ... Call trace: [] arm_v7s_map+0x40/0xf8 [] mtk_iommu_map+0x64/0x90 [] iommu_map+0x100/0x3a0 [] default_iommu_map_sg+0x104/0x168 [] iommu_dma_alloc+0x238/0x3f8 [] __iommu_alloc_attrs+0xa8/0x260 [] mtk_drm_gem_create+0xac/0x180 [] mtk_drm_gem_dumb_create+0x54/0xc8 [] drm_mode_create_dumb_ioctl+0xa4/0xd8 [] drm_ioctl+0x1c0/0x490 In order to satify this, Limit the physical address to 32bit. 
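As a worked example of the masking applied below (the address is taken from the register dump above, DMA_BIT_MASK(32) being the generic 0xffffffff mask):

	phys_addr_t paddr = 0x127d00000ULL;		/* x28/x2 in the oops */
	phys_addr_t v7s_pa = paddr & DMA_BIT_MASK(32);	/* 0x27d00000 */

so the short-descriptor code only ever sees a 32-bit physical address.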
Signed-off-by: Yong Wu Acked-by: Will Deacon Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index bd515be5b380..16d33ac19db0 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -371,7 +371,8 @@ static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova, int ret; spin_lock_irqsave(&dom->pgtlock, flags); - ret = dom->iop->map(dom->iop, iova, paddr, size, prot); + ret = dom->iop->map(dom->iop, iova, paddr & DMA_BIT_MASK(32), + size, prot); spin_unlock_irqrestore(&dom->pgtlock, flags); return ret; From 3c6bae62136ba5b24f0b113e68121b783457ca4b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 26 Sep 2017 13:07:46 +0530 Subject: [PATCH 0922/1322] iommu/amd: pr_err() strings should end with newlines pr_err() messages should end with a new-line to avoid other messages being concatenated. So replace '/n' with '\n'. Signed-off-by: Arvind Yadav Fixes: 45a01c42933b ('iommu/amd: Add function copy_dev_tables()') Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 382de42b8359..6fe2d0346073 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -874,7 +874,7 @@ static bool copy_device_table(void) hi = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET + 4); entry = (((u64) hi) << 32) + lo; if (last_entry && last_entry != entry) { - pr_err("IOMMU:%d should use the same dev table as others!/n", + pr_err("IOMMU:%d should use the same dev table as others!\n", iommu->index); return false; } @@ -882,7 +882,7 @@ static bool copy_device_table(void) old_devtb_size = ((entry & ~PAGE_MASK) + 1) << 12; if (old_devtb_size != dev_table_size) { - pr_err("The device table size of IOMMU:%d is not expected!/n", + pr_err("The device table size of IOMMU:%d is not expected!\n", iommu->index); return false; } @@ -890,7 +890,7 @@ static bool copy_device_table(void) old_devtb_phys = entry & PAGE_MASK; if (old_devtb_phys >= 0x100000000ULL) { - pr_err("The address of old device table is above 4G, not trustworthy!/n"); + pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); @@ -901,7 +901,7 @@ static bool copy_device_table(void) old_dev_tbl_cpy = (void *)__get_free_pages(gfp_flag, get_order(dev_table_size)); if (old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!/n"); + pr_err("Failed to allocate memory for copying old device table!\n"); return false; } From 50ce6312f293e129eedf2affc7bd791c71d8287e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 26 Sep 2017 19:32:52 +0100 Subject: [PATCH 0923/1322] iommu: Fix comment for iommu_ops.map_sg The definition of map_sg was split during a recent addition to iommu_ops. Put it back together. 
Fixes: add02cfdc9bc ("iommu: Introduce Interface for IOMMU TLB Flushing") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a7f2ac689d29..41b8c5757859 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -167,11 +167,11 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @map_sg: map a scatter-gather list of physically contiguous memory chunks + * to an iommu domain * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain * @tlb_range_add: Add a given iova range to the flush queue for this domain * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue - * to an iommu domain * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping From df5efdd97029f2cff7e5c91ea1c9f2b94d009b0f Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:05:57 -0700 Subject: [PATCH 0924/1322] IB/hfi1: Turn off AOC TX after offline substates Offline.quietDuration was added in the 8051 firmware, and the driver only turns off the AOC transmitters when offline.quiet is reached. However, the AOC transmitters need to be turned off at the new state. Therefore, turn off the AOC transmitters at any offline substates including offline.quiet and offline.quietDuration, then recheck we reached offline.quiet to support backwards compatibility. Reviewed-by: Jakub Byczkowski Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 85 +++++++++++++++++++++++-------- drivers/infiniband/hw/hfi1/chip.h | 1 + 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b2ed4b9cda6e..1c810d65721a 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1066,6 +1066,8 @@ static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); static int thermal_init(struct hfi1_devdata *dd); static void update_statusp(struct hfi1_pportdata *ppd, u32 state); +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void log_state_transition(struct hfi1_pportdata *ppd, u32 state); @@ -10305,6 +10307,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) { struct hfi1_devdata *dd = ppd->dd; u32 previous_state; + int offline_state_ret; int ret; update_lcb_cache(dd); @@ -10326,28 +10329,11 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); - /* - * Wait for offline transition. It can take a while for - * the link to go down. - */ - ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 10000); - if (ret < 0) - return ret; - - /* - * Now in charge of LCB - must be after the physical state is - * offline.quiet and before host_link_state is changed. 
- */ - set_host_lcb_access(dd); - write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ - - /* make sure the logical state is also down */ - ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); - if (ret) - force_logical_link_state_down(ppd); - - ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + offline_state_ret = wait_phys_link_offline_substates(ppd, 10000); + if (offline_state_ret < 0) + return offline_state_ret; + /* Disabling AOC transmitters */ if (ppd->port_type == PORT_TYPE_QSFP && ppd->qsfp_info.limiting_active && qsfp_mod_present(ppd)) { @@ -10364,6 +10350,30 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) } } + /* + * Wait for the offline.Quiet transition if it hasn't happened yet. It + * can take a while for the link to go down. + */ + if (offline_state_ret != PLS_OFFLINE_QUIET) { + ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 30000); + if (ret < 0) + return ret; + } + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + /* make sure the logical state is also down */ + ret = wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + if (ret) + force_logical_link_state_down(ppd); + + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + /* * The LNI has a mandatory wait time after the physical state * moves to Offline.Quiet. The wait time may be different @@ -12804,6 +12814,39 @@ static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state, return 0; } +/* + * wait_phys_link_offline_quiet_substates - wait for any offline substate + * @ppd: port device + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for any offline physical link + * state change to occur. + * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT. + */ +static int wait_phys_link_offline_substates(struct hfi1_pportdata *ppd, + int msecs) +{ + u32 read_state; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + read_state = read_physical_state(ppd->dd); + if ((read_state & 0xF0) == PLS_OFFLINE) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(ppd->dd, + "timeout waiting for phy link offline.quiet substates. Read state 0x%x, %dms\n", + read_state, msecs); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + log_state_transition(ppd, read_state); + return read_state; +} + #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index b8345a60a0fb..461f937fe110 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -204,6 +204,7 @@ #define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 #define PLS_OFFLINE_REPORT_FAILURE 0x93 #define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_OFFLINE_QUIET_DURATION 0x95 #define PLS_POLLING 0x20 #define PLS_POLLING_QUIET 0x20 #define PLS_POLLING_ACTIVE 0x21 From 30e10527bcce376114e627abb7fabfbe9bfee91e Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Tue, 26 Sep 2017 06:06:03 -0700 Subject: [PATCH 0925/1322] IB/hfi1: Only reset QSFP after link up and turn off AOC TX QSFP reset enables AOC transmitters by default. They should be off before moving to high power mode to complete the setup. 
There is no need to reset the QSFP during LNI failure as it was reset at link down. Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 12 +++++++++++- drivers/infiniband/hw/hfi1/chip.h | 2 +- drivers/infiniband/hw/hfi1/platform.c | 4 +++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 1c810d65721a..27b75a8f5097 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -9415,7 +9415,7 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); } -void reset_qsfp(struct hfi1_pportdata *ppd) +int reset_qsfp(struct hfi1_pportdata *ppd) { struct hfi1_devdata *dd = ppd->dd; u64 mask, qsfp_mask; @@ -9445,6 +9445,13 @@ void reset_qsfp(struct hfi1_pportdata *ppd) * for alarms and warnings */ set_qsfp_int_n(ppd, 1); + + /* + * After the reset, AOC transmitters are enabled by default. They need + * to be turned off to complete the QSFP setup before they can be + * enabled again. + */ + return set_qsfp_tx(ppd, 0); } static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, @@ -10406,6 +10413,9 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { /* went down while attempting link up */ check_lni_states(ppd); + + /* The QSFP doesn't need to be reset on LNI failure */ + ppd->qsfp_info.reset_needed = 0; } /* the active link width (downgrade) is 0 on link down */ diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 461f937fe110..50b8645d0b87 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -723,7 +723,7 @@ void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_start_link(struct work_struct *work); void handle_sma_message(struct work_struct *work); -void reset_qsfp(struct hfi1_pportdata *ppd); +int reset_qsfp(struct hfi1_pportdata *ppd); void qsfp_event(struct work_struct *work); void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); int send_idle_sma(struct hfi1_devdata *dd, u64 message); diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c index a8af96d2b1b0..d486355880cb 100644 --- a/drivers/infiniband/hw/hfi1/platform.c +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -790,7 +790,9 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, * reuse of stale settings established in our previous pass through. */ if (ppd->qsfp_info.reset_needed) { - reset_qsfp(ppd); + ret = reset_qsfp(ppd); + if (ret) + return ret; refresh_qsfp_cache(ppd, &ppd->qsfp_info); } else { ppd->qsfp_info.reset_needed = 1; From 753b19afb19dd97d0767df5e8afb13faff605315 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Tue, 26 Sep 2017 06:06:09 -0700 Subject: [PATCH 0926/1322] IB/hfi1: Check eeprom config partition validity Relying on a trailing magic value is incorrect. There are instances where this is not present as trailing magic value has a specific purpose which is not partition validation. Instead use the header magic value which is present in all variants of the platform configuration and is intended for validation. This is also used in other locations in the driver. 
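Roughly, the intended check becomes (a sketch of the logic, not the exact eprom.c hunk below):

	if (memcmp(buffer, "APO=", strlen("APO=")))
		return -ENOENT;			/* no header magic: invalid */
	p = strnstr(buffer, "egamiAPO", P1_SIZE);
	*size = p ? p - buffer : P1_SIZE;	/* trailer only bounds length */

i.e. the header magic decides validity, and the trailing magic, when present, only trims the reported image size.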
Fixes: bc5214ee2922 (IB/hfi1: Handle missing magic values in config file) Reviewed-by: Jakub Byczkowski Signed-off-by: Jan Sokolowski Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/eprom.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c index d46b17107901..1613af1c58d9 100644 --- a/drivers/infiniband/hw/hfi1/eprom.c +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -204,7 +204,10 @@ done_asic: return ret; } -/* magic character sequence that trails an image */ +/* magic character sequence that begins an image */ +#define IMAGE_START_MAGIC "APO=" + +/* magic character sequence that might trail an image */ #define IMAGE_TRAIL_MAGIC "egamiAPO" /* EPROM file types */ @@ -250,6 +253,7 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, { void *buffer; void *p; + u32 length; int ret; buffer = kmalloc(P1_SIZE, GFP_KERNEL); @@ -262,15 +266,21 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data, return ret; } - /* scan for image magic that may trail the actual data */ - p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); - if (!p) { + /* config partition is valid only if it starts with IMAGE_START_MAGIC */ + if (memcmp(buffer, IMAGE_START_MAGIC, strlen(IMAGE_START_MAGIC))) { kfree(buffer); return -ENOENT; } + /* scan for image magic that may trail the actual data */ + p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE); + if (p) + length = p - buffer; + else + length = P1_SIZE; + *data = buffer; - *size = p - buffer; + *size = length; return 0; } From 09592af5fdd722615ebe435fb34308de26c74bcf Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Tue, 26 Sep 2017 06:06:15 -0700 Subject: [PATCH 0927/1322] IB/hfi1: Return correct value in general interrupt handler The general interrupt handler returns IRQ_HANDLED whether an IRQ was handled or not. Determine if an IRQ was handled and return the correct value. Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 27b75a8f5097..0be42787759f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8240,6 +8240,7 @@ static irqreturn_t general_interrupt(int irq, void *data) u64 regs[CCE_NUM_INT_CSRS]; u32 bit; int i; + irqreturn_t handled = IRQ_NONE; this_cpu_inc(*dd->int_counter); @@ -8260,9 +8261,10 @@ static irqreturn_t general_interrupt(int irq, void *data) for_each_set_bit(bit, (unsigned long *)®s[0], CCE_NUM_INT_CSRS * 64) { is_interrupt(dd, bit); + handled = IRQ_HANDLED; } - return IRQ_HANDLED; + return handled; } static irqreturn_t sdma_interrupt(int irq, void *data) From 612601d0013f03de9dc134809f242ba6da9ca252 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Tue, 26 Sep 2017 06:06:22 -0700 Subject: [PATCH 0928/1322] Revert "IB/ipoib: Update broadcast object if PKey value was changed in index 0" commit 9a9b8112699d will cause core to fail UD QP from being destroyed on ipoib unload, therefore cause resources leakage. On pkey change event above patch modifies mgid before calling underlying driver to detach it from QP. Drivers' detach_mcast() will fail to find modified mgid it was never given to attach in a first place. 
Core qp->usecnt will never go down, so ib_destroy_qp() will fail. IPoIB driver actually does take care of new broadcast mgid based on new pkey by destroying an old mcast object in ipoib_mcast_dev_flush()) .... if (priv->broadcast) { rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); list_add_tail(&priv->broadcast->list, &remove_list); priv->broadcast = NULL; } ... then in restarted ipoib_macst_join_task() creating a new broadcast mcast object, sending join request and on completion tells the driver to attach to reinitialized QP: ... if (!priv->broadcast) { ... broadcast = ipoib_mcast_alloc(dev, 0); ... memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; ... Fixes: 9a9b8112699d ("IB/ipoib: Update broadcast object if PKey value was changed in index 0") Cc: stable@vger.kernel.org Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Feras Daoud Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 2e075377242e..6cd61638b441 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1000,19 +1000,6 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) */ priv->dev->broadcast[8] = priv->pkey >> 8; priv->dev->broadcast[9] = priv->pkey & 0xff; - - /* - * Update the broadcast address in the priv->broadcast object, - * in case it already exists, otherwise no one will do that. - */ - if (priv->broadcast) { - spin_lock_irq(&priv->lock); - memcpy(priv->broadcast->mcmember.mgid.raw, - priv->dev->broadcast + 4, - sizeof(union ib_gid)); - spin_unlock_irq(&priv->lock); - } - return 0; } From b8f42738acaddf67731c34935c0994e09a588ca7 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 26 Sep 2017 06:06:28 -0700 Subject: [PATCH 0929/1322] IB/hfi1: On error, fix use after free during user context setup During base context setup, if setup_base_ctxt() fails, the context is deallocated. This is incorrect because the context is referenced on return, to notify any waiting subcontext. If there are no subcontexts the pointer will be invalid. Reorganize the error path so that deallocate_ctxt() is called after all the possible subcontexts have been notified. Reviewed-by: Ira Weiny Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 41 ++++++++++++++------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2bc89260235a..d9a1e9893136 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -930,15 +930,8 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo) switch (ret) { case 0: ret = setup_base_ctxt(fd, uctxt); - if (uctxt->subctxt_cnt) { - /* - * Base context is done (successfully or not), notify - * anybody using a sub-context that is waiting for - * this completion. - */ - clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); - wake_up(&uctxt->wait); - } + if (ret) + deallocate_ctxt(uctxt); break; case 1: ret = complete_subctxt(fd); @@ -1305,25 +1298,25 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, /* Now allocate the RcvHdr queue and eager buffers. 
*/ ret = hfi1_create_rcvhdrq(dd, uctxt); if (ret) - return ret; + goto done; ret = hfi1_setup_eagerbufs(uctxt); if (ret) - goto setup_failed; + goto done; /* If sub-contexts are enabled, do the appropriate setup */ if (uctxt->subctxt_cnt) ret = setup_subctxt(uctxt); if (ret) - goto setup_failed; + goto done; ret = hfi1_alloc_ctxt_rcv_groups(uctxt); if (ret) - goto setup_failed; + goto done; ret = init_user_ctxt(fd, uctxt); if (ret) - goto setup_failed; + goto done; user_init(uctxt); @@ -1331,12 +1324,22 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, fd->uctxt = uctxt; hfi1_rcd_get(uctxt); - return 0; +done: + if (uctxt->subctxt_cnt) { + /* + * On error, set the failed bit so sub-contexts will clean up + * correctly. + */ + if (ret) + set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); -setup_failed: - /* Set the failed bit so sub-context init can do the right thing */ - set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags); - deallocate_ctxt(uctxt); + /* + * Base context is done (successfully or not), notify anybody + * using a sub-context that is waiting for this completion. + */ + clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } return ret; } From 828bcbdc975fbcfb27946c33d4b1d1bfab70789b Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Tue, 26 Sep 2017 06:06:34 -0700 Subject: [PATCH 0930/1322] IB/hfi1: Unsuccessful PCIe caps tuning should not fail driver load Failure to tune PCIe capabilities should not fail driver load. This can cause the driver load to fail on systems with any of the following: 1. HFI's parent is not root. Example: HFI card is behind a PCIe bridge. 2. HFI's parent is not PCI Express capable. In these situations, failure to tune PCIe capabilities should be logged in the system message logs but not cause the driver load to fail. This patch also ensures pcie capability word DevCtl is written only after a successful read and the capability tuning process continues even if read/write of the pcie capability word DevCtl fails. Fixes: c53df62c7a9a ("IB/hfi1: Check return values from PCI config API calls") Fixes: bf70a7757736 ("staging/rdma/hfi1: Enable WFR PCIe extended tags from the driver") Reviewed-by: Michael J. Ruhl Reviewed-by: Mike Marciniszyn Reviewed-by: Jakub Byczkowski Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pcie.c | 50 +++++++++++++------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 82447b7cdda1..09e50fd2a08f 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -68,7 +68,7 @@ /* * Code to adjust PCIe capabilities. */ -static int tune_pcie_caps(struct hfi1_devdata *); +static void tune_pcie_caps(struct hfi1_devdata *); /* * Do all the common PCIe setup and initialization. 
@@ -351,7 +351,7 @@ int pcie_speeds(struct hfi1_devdata *dd) */ int request_msix(struct hfi1_devdata *dd, u32 msireq) { - int nvec, ret; + int nvec; nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, PCI_IRQ_MSIX | PCI_IRQ_LEGACY); @@ -360,12 +360,7 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq) return nvec; } - ret = tune_pcie_caps(dd); - if (ret) { - dd_dev_err(dd, "tune_pcie_caps() failed: %d\n", ret); - pci_free_irq_vectors(dd->pcidev); - return ret; - } + tune_pcie_caps(dd); /* check for legacy IRQ */ if (nvec == 1 && !dd->pcidev->msix_enabled) @@ -502,7 +497,7 @@ uint aspm_mode = ASPM_MODE_DISABLED; module_param_named(aspm, aspm_mode, uint, S_IRUGO); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static int tune_pcie_caps(struct hfi1_devdata *dd) +static void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; @@ -513,22 +508,14 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * Turn on extended tags in DevCtl in case the BIOS has turned it off * to improve WFR SDMA bandwidth */ - ret = pcie_capability_read_word(dd->pcidev, - PCI_EXP_DEVCTL, &ectl); - if (ret) { - dd_dev_err(dd, "Unable to read from PCI config\n"); - return ret; - } - - if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if ((!ret) && !(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { dd_dev_info(dd, "Enabling PCIe extended tags\n"); ectl |= PCI_EXP_DEVCTL_EXT_TAG; ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); - if (ret) { - dd_dev_err(dd, "Unable to write to PCI config\n"); - return ret; - } + if (ret) + dd_dev_info(dd, "Unable to write to PCI config\n"); } /* Find out supported and configured values for parent (root) */ parent = dd->pcidev->bus->self; @@ -536,15 +523,22 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) * The driver cannot perform the tuning if it does not have * access to the upstream component. */ - if (!parent) - return -EINVAL; + if (!parent) { + dd_dev_info(dd, "Parent not found\n"); + return; + } if (!pci_is_root_bus(parent->bus)) { dd_dev_info(dd, "Parent not root\n"); - return -EINVAL; + return; + } + if (!pci_is_pcie(parent)) { + dd_dev_info(dd, "Parent is not PCI Express capable\n"); + return; + } + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_info(dd, "PCI device is not PCI Express capable\n"); + return; } - - if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) - return -EINVAL; rc_mpss = parent->pcie_mpss; rc_mps = ffs(pcie_get_mps(parent)) - 8; /* Find out supported and configured values for endpoint (us) */ @@ -590,8 +584,6 @@ static int tune_pcie_caps(struct hfi1_devdata *dd) ep_mrrs = max_mrrs; pcie_set_readrq(dd->pcidev, ep_mrrs); } - - return 0; } /* End of PCIe capability tuning */ From 36de80740008e6a4a55115b4a92e2059e47c1cba Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Wed, 27 Sep 2017 14:49:17 +0200 Subject: [PATCH 0931/1322] mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user When calculating the size needed by struct atmel_pmecc_user *user, the dmu and delta buffer sizes were forgotten. This lead to a memory corruption (especially with a large ecc_strength). 
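For a sense of scale (illustrative numbers, assuming req->ecc.strength == 32): the mu, dmu and delta arrays each need (strength + 1) * sizeof(s32) bytes, so the scratch area must reserve 3 * 33 * 4 = 396 bytes, whereas the old calculation set aside only 33 * 4 = 132, leaving the remaining 264 bytes to be written past the end of the kzalloc'ed buffer.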
Link: http://lkml.kernel.org/r/1506503157.3016.5.camel@gmail.com Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver") Cc: stable@vger.kernel.org Reported-by: Richard Genoud Pointed-at-by: Boris Brezillon Signed-off-by: Richard Genoud Reviewed-by: Nicolas Ferre Signed-off-by: Boris Brezillon --- drivers/mtd/nand/atmel/pmecc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/atmel/pmecc.c b/drivers/mtd/nand/atmel/pmecc.c index 146af8218314..8268636675ef 100644 --- a/drivers/mtd/nand/atmel/pmecc.c +++ b/drivers/mtd/nand/atmel/pmecc.c @@ -363,7 +363,7 @@ atmel_pmecc_create_user(struct atmel_pmecc *pmecc, size += (req->ecc.strength + 1) * sizeof(u16); /* Reserve space for mu, dmu and delta. */ size = ALIGN(size, sizeof(s32)); - size += (req->ecc.strength + 1) * sizeof(s32); + size += (req->ecc.strength + 1) * sizeof(s32) * 3; user = kzalloc(size, GFP_KERNEL); if (!user) From 85e482285bbbd508483cbe08de69c8fe00cdbbfe Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:11 +0200 Subject: [PATCH 0932/1322] fib: notifier: Add VIF add and delete event types In order for an interface to forward packets according to the kernel multicast routing table, it must be configured with a VIF index according to the mroute user API. The VIF index is then used to refer to that interface in the mroute user API, for example, to set the iif and oifs of an MFC entry. In order to allow drivers to be aware and offload multicast routes, they have to be aware of the VIF add and delete notifications. Due to the fact that a specific VIF can be deleted and re-added pointing to another netdevice, and the MFC routes that point to it will forward the matching packets to the new netdevice, a driver willing to offload MFC cache entries must be aware of the VIF add and delete events in addition to MFC routes notifications. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 669b9716dc7a..54cd6b839d2f 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -20,6 +20,8 @@ enum fib_event_type { FIB_EVENT_RULE_DEL, FIB_EVENT_NH_ADD, FIB_EVENT_NH_DEL, + FIB_EVENT_VIF_ADD, + FIB_EVENT_VIF_DEL, }; struct fib_notifier_ops { From 310ebbba3b7396b00bce08a33f1d2de2c74fa257 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:12 +0200 Subject: [PATCH 0933/1322] ipmr: Add reference count to MFC entries Next commits will introduce MFC notifications through the atomic fib_notification chain, thus allowing modules to be aware of MFC entries. Due to the fact that modules may need to hold a reference to an MFC entry, add reference count to MFC entries to prevent them from being freed while these modules use them. The reference counting is done only on resolved MFC entries currently. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/mroute.h | 21 +++++++++++++++++++++ net/ipv4/ipmr.c | 8 +++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/include/linux/mroute.h b/include/linux/mroute.h index d7f63339ef0b..10028f208efb 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -109,6 +109,7 @@ struct mfc_cache_cmp_arg { * @wrong_if: number of wrong source interface hits * @lastuse: time of last use of the group (traffic or update) * @ttls: OIF TTL threshold array + * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction */ @@ -138,6 +139,7 @@ struct mfc_cache { unsigned long wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; + refcount_t refcount; } res; } mfc_un; struct list_head list; @@ -148,4 +150,23 @@ struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); + +#ifdef CONFIG_IP_MROUTE +void ipmr_cache_free(struct mfc_cache *mfc_cache); +#else +static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) +{ +} +#endif + +static inline void ipmr_cache_put(struct mfc_cache *c) +{ + if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + ipmr_cache_free(c); +} +static inline void ipmr_cache_hold(struct mfc_cache *c) +{ + refcount_inc(&c->mfc_un.res.refcount); +} + #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..86dc5f98c5dd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -652,10 +652,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -949,6 +950,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1162,7 +1164,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1264,7 +1266,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:13 +0200 Subject: [PATCH 0934/1322] ipmr: Add FIB notification access functions Make the ipmr module register as a FIB notifier. To do that, implement both the ipmr_seq_read and ipmr_dump ops. The ipmr_seq_read op returns a sequence counter that is incremented on every notification related operation done by the ipmr. To implement that, add a sequence counter in the netns_ipv4 struct and increment it whenever a new MFC route or VIF are added or deleted. The sequence operations are protected by the RTNL lock. The ipmr_dump iterates the list of MFC routes and the list of VIF entries and sends notifications about them. 
The entries dump is done under RCU where the VIF dump uses the mrt_lock too, as the vif->dev field can change under RCU. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 15 +++++ include/net/netns/ipv4.h | 3 + net/ipv4/ipmr.c | 137 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 153 insertions(+), 2 deletions(-) diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 10028f208efb..54c5cb82ddcb 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #ifdef CONFIG_IP_MROUTE @@ -58,6 +59,14 @@ struct vif_device { int link; /* Physical interface index */ }; +struct vif_entry_notifier_info { + struct fib_notifier_info info; + struct net_device *dev; + vifi_t vif_index; + unsigned short vif_flags; + u32 tb_id; +}; + #define VIFF_STATIC 0x8000 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) @@ -146,6 +155,12 @@ struct mfc_cache { struct rcu_head rcu; }; +struct mfc_entry_notifier_info { + struct fib_notifier_info info; + struct mfc_cache *mfc; + u32 tb_id; +}; + struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8387f099115e..abc84d986da4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -163,6 +163,9 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* protected by rtnl_mutex */ + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 86dc5f98c5dd..49879c338357 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -264,6 +264,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +308,16 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +607,43 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + 
}; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -3050,14 +3107,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3074,7 +3204,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3084,6 +3216,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } From b362053a7cc0fcc09b92642ba5dd1ca7fddc9004 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:14 +0200 Subject: [PATCH 0935/1322] ipmr: Send FIB notifications on MFC and VIF entries Use the newly introduced notification chain to send events upon VIF and MFC addition and deletion. The MFC notifications are sent only on resolved MFC entries, as unresolved cannot be offloaded. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/ipv4/ipmr.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 49879c338357..ba71bc402336 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -627,6 +627,27 @@ static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, return call_fib_notifier(nb, net, event_type, &info.info); } +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, @@ -644,6 +665,24 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, return call_fib_notifier(nb, net, event_type, &info.info); } +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -651,6 +690,7 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -660,6 +700,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -909,6 +953,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -1209,6 +1254,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1220,6 +1266,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_put(c); @@ -1248,6 +1295,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1297,6 +1346,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, 
c, RTM_NEWROUTE); return 0; } @@ -1304,6 +1354,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1322,6 +1373,8 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_put(c); } From c7c0bbeae9501a7e42f2fd306d6a6399aca688b6 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:15 +0200 Subject: [PATCH 0936/1322] net: ipmr: Add MFC offload indication Allow drivers, registered to the fib notification chain indicate whether a multicast MFC route is offloaded or not, similarly to unicast routes. The indication of whether a route is offloaded is done using the mfc_flags field on an mfc_cache struct, and the information is sent to the userspace via the RTNetlink interface only. Currently, MFC routes are either offloaded or not, thus there is no need to add per-VIF offload indication. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 2 ++ net/ipv4/ipmr.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 54c5cb82ddcb..5566580811ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -90,9 +90,11 @@ struct mr_table { /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) + * MFC_OFFLOAD - the entry was offloaded to the hardware */ enum { MFC_STATIC = BIT(0), + MFC_OFFLOAD = BIT(1), }; struct mfc_cache_cmp_arg { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ba71bc402336..2a795d2c0502 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2268,6 +2268,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; From 478e4c2f0067d57d7c17059caafab026ca32084a Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:16 +0200 Subject: [PATCH 0937/1322] net: mroute: Check if rule is a default rule When the ipmr starts, it adds one default FIB rule that matches all packets and sends them to the DEFAULT (multicast) FIB table. A more complex rule can be added by user to specify that for a specific interface, a packet should be look up at either an arbitrary table or according to the l3mdev of the interface. For drivers willing to offload the ipmr logic into a hardware but don't want to offload all the FIB rules functionality, provide a function that can indicate whether the FIB rule is the default multicast rule, thus only one routing table is needed. This way, a driver can register to the FIB notification chain, get notifications about FIB rules added and trigger some kind of an internal abort mechanism when a non default rule is added by the user. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/mroute.h | 7 +++++++ net/ipv4/ipmr.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 5566580811ce..b072a84fbe1c 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); +bool ipmr_rule_default(const struct fib_rule *rule); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, char __user *optval, unsigned int optlen) @@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt) { return 0; } + +static inline bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} #endif struct vif_device { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2a795d2c0502..292a8e80bdfa 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -274,6 +274,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -318,6 +324,12 @@ static unsigned int ipmr_rules_seq_read(struct net *net) { return 0; } + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, From c011ec1bbfd69e091ca8d77e13fc251a07be57dc Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:17 +0200 Subject: [PATCH 0938/1322] mlxsw: spectrum: Add the multicast routing offloading logic Add the multicast router offloading logic, which is in charge of handling the VIF and MFC notifications and translating it to the hardware logic API. The offloading logic has to overcome several obstacles in order to safely comply with the kernel multicast router user API: - It must keep track of the mapping between VIFs to netdevices. The user can add an MFC cache entry pointing to a VIF, delete the VIF and add re-add it with a different netdevice. The offloading logic has to handle this in order to be compatible with the kernel logic. - It must keep track of the mapping between netdevices to spectrum RIFs, as the current hardware implementation assume having a RIF for every port in a multicast router. - It must handle routes pointing to pimreg device to be trapped to the kernel, as the packet should be delivered to userspace. - It must handle routes pointing tunnel VIFs. The current implementation does not support multicast forwarding to tunnels, thus routes that point to a tunnel should be trapped to the kernel. - It must be aware of proxy multicast routes, which include both (*,*) routes and duplicate routes. Currently proxy routes are not offloaded and trigger the abort mechanism: removal of all routes from hardware and triggering the traffic to go through the kernel. The multicast routing offloading logic also updates the counters of the offloaded MFC routes in a periodic work. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/Makefile | 3 +- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../net/ethernet/mellanox/mlxsw/spectrum_mr.c | 1014 +++++++++++++++++ .../net/ethernet/mellanox/mlxsw/spectrum_mr.h | 133 +++ 4 files changed, 1150 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 4b88158173f3..9b29764905f3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -17,7 +17,8 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_kvdl.o spectrum_acl_tcam.o \ spectrum_acl.o spectrum_flower.o \ spectrum_cnt.o spectrum_fid.o \ - spectrum_ipip.o spectrum_acl_flex_actions.o + spectrum_ipip.o spectrum_acl_flex_actions.o \ + spectrum_mr.o mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB) += spectrum_dcb.o mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o obj-$(CONFIG_MLXSW_MINIMAL) += mlxsw_minimal.o diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 9355d914a4c8..44c5259e5548 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -153,6 +153,7 @@ struct mlxsw_sp { struct mlxsw_sp_sb *sb; struct mlxsw_sp_bridge *bridge; struct mlxsw_sp_router *router; + struct mlxsw_sp_mr *mr; struct mlxsw_afa *afa; struct mlxsw_sp_acl *acl; struct mlxsw_sp_fid_core *fid_core; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c new file mode 100644 index 000000000000..09120259a45d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -0,0 +1,1014 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "spectrum_mr.h" +#include "spectrum_router.h" + +struct mlxsw_sp_mr { + const struct mlxsw_sp_mr_ops *mr_ops; + void *catchall_route_priv; + struct delayed_work stats_update_dw; + struct list_head table_list; +#define MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL 5000 /* ms */ + unsigned long priv[0]; + /* priv has to be always the last item */ +}; + +struct mlxsw_sp_mr_vif { + struct net_device *dev; + const struct mlxsw_sp_rif *rif; + unsigned long vif_flags; + + /* A list of route_vif_entry structs that point to routes that the VIF + * instance is used as one of the egress VIFs + */ + struct list_head route_evif_list; + + /* A list of route_vif_entry structs that point to routes that the VIF + * instance is used as an ingress VIF + */ + struct list_head route_ivif_list; +}; + +struct mlxsw_sp_mr_route_vif_entry { + struct list_head vif_node; + struct list_head route_node; + struct mlxsw_sp_mr_vif *mr_vif; + struct mlxsw_sp_mr_route *mr_route; +}; + +struct mlxsw_sp_mr_table { + struct list_head node; + enum mlxsw_sp_l3proto proto; + struct mlxsw_sp *mlxsw_sp; + u32 vr_id; + struct mlxsw_sp_mr_vif vifs[MAXVIFS]; + struct list_head route_list; + struct rhashtable route_ht; + char catchall_route_priv[0]; + /* catchall_route_priv has to be always the last item */ +}; + +struct mlxsw_sp_mr_route { + struct list_head node; + struct rhash_head ht_node; + struct mlxsw_sp_mr_route_key key; + enum mlxsw_sp_mr_route_action route_action; + u16 min_mtu; + struct mfc_cache *mfc4; + void *route_priv; + const struct mlxsw_sp_mr_table *mr_table; + /* A list of route_vif_entry structs that point to the egress VIFs */ + struct list_head evif_list; + /* A route_vif_entry struct that point to the ingress VIF */ + struct mlxsw_sp_mr_route_vif_entry ivif; +}; + +static const struct rhashtable_params mlxsw_sp_mr_route_ht_params = { + .key_len = sizeof(struct mlxsw_sp_mr_route_key), + .key_offset = offsetof(struct mlxsw_sp_mr_route, key), + .head_offset = offsetof(struct mlxsw_sp_mr_route, ht_node), + .automatic_shrinking = true, +}; + +static bool mlxsw_sp_mr_vif_regular(const struct mlxsw_sp_mr_vif *vif) +{ + return !(vif->vif_flags & (VIFF_TUNNEL | VIFF_REGISTER)); +} + +static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif) +{ + return mlxsw_sp_mr_vif_regular(vif) && vif->dev && vif->rif; +} + +static bool mlxsw_sp_mr_vif_rif_invalid(const struct mlxsw_sp_mr_vif *vif) +{ + return mlxsw_sp_mr_vif_regular(vif) && vif->dev && !vif->rif; +} + +static bool +mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route) +{ + vifi_t ivif; + + switch (mr_route->mr_table->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + ivif = mr_route->mfc4->mfc_parent; + return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255; + case MLXSW_SP_L3_PROTO_IPV6: + /* fall through */ + default: + WARN_ON_ONCE(1); + } + return false; +} + +static int +mlxsw_sp_mr_route_valid_evifs_num(const struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr_route_vif_entry 
*rve; + int valid_evifs; + + valid_evifs = 0; + list_for_each_entry(rve, &mr_route->evif_list, route_node) + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) + valid_evifs++; + return valid_evifs; +} + +static bool mlxsw_sp_mr_route_starg(const struct mlxsw_sp_mr_route *mr_route) +{ + switch (mr_route->mr_table->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + return mr_route->key.source_mask.addr4 == INADDR_ANY; + case MLXSW_SP_L3_PROTO_IPV6: + /* fall through */ + default: + WARN_ON_ONCE(1); + } + return false; +} + +static enum mlxsw_sp_mr_route_action +mlxsw_sp_mr_route_action(const struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + + /* If the ingress port is not regular and resolved, trap the route */ + if (!mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + + /* The kernel does not match a (*,G) route that the ingress interface is + * not one of the egress interfaces, so trap these kind of routes. + */ + if (mlxsw_sp_mr_route_starg(mr_route) && + !mlxsw_sp_mr_route_ivif_in_evifs(mr_route)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + + /* If the route has no valid eVIFs, trap it. */ + if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + + /* If either one of the eVIFs is not regular (VIF of type pimreg or + * tunnel) or one of the VIFs has no matching RIF, trap the packet. + */ + list_for_each_entry(rve, &mr_route->evif_list, route_node) { + if (!mlxsw_sp_mr_vif_regular(rve->mr_vif) || + mlxsw_sp_mr_vif_rif_invalid(rve->mr_vif)) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP; + } + return MLXSW_SP_MR_ROUTE_ACTION_FORWARD; +} + +static enum mlxsw_sp_mr_route_prio +mlxsw_sp_mr_route_prio(const struct mlxsw_sp_mr_route *mr_route) +{ + return mlxsw_sp_mr_route_starg(mr_route) ? + MLXSW_SP_MR_ROUTE_PRIO_STARG : MLXSW_SP_MR_ROUTE_PRIO_SG; +} + +static void mlxsw_sp_mr_route4_key(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_key *key, + const struct mfc_cache *mfc) +{ + bool starg = (mfc->mfc_origin == INADDR_ANY); + + memset(key, 0, sizeof(*key)); + key->vrid = mr_table->vr_id; + key->proto = mr_table->proto; + key->group.addr4 = mfc->mfc_mcastgrp; + key->group_mask.addr4 = 0xffffffff; + key->source.addr4 = mfc->mfc_origin; + key->source_mask.addr4 = starg ? 
0 : 0xffffffff; +} + +static int mlxsw_sp_mr_route_evif_link(struct mlxsw_sp_mr_route *mr_route, + struct mlxsw_sp_mr_vif *mr_vif) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + + rve = kzalloc(sizeof(*rve), GFP_KERNEL); + if (!rve) + return -ENOMEM; + rve->mr_route = mr_route; + rve->mr_vif = mr_vif; + list_add_tail(&rve->route_node, &mr_route->evif_list); + list_add_tail(&rve->vif_node, &mr_vif->route_evif_list); + return 0; +} + +static void +mlxsw_sp_mr_route_evif_unlink(struct mlxsw_sp_mr_route_vif_entry *rve) +{ + list_del(&rve->route_node); + list_del(&rve->vif_node); + kfree(rve); +} + +static void mlxsw_sp_mr_route_ivif_link(struct mlxsw_sp_mr_route *mr_route, + struct mlxsw_sp_mr_vif *mr_vif) +{ + mr_route->ivif.mr_route = mr_route; + mr_route->ivif.mr_vif = mr_vif; + list_add_tail(&mr_route->ivif.vif_node, &mr_vif->route_ivif_list); +} + +static void mlxsw_sp_mr_route_ivif_unlink(struct mlxsw_sp_mr_route *mr_route) +{ + list_del(&mr_route->ivif.vif_node); +} + +static int +mlxsw_sp_mr_route_info_create(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route, + struct mlxsw_sp_mr_route_info *route_info) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + u16 *erif_indices; + u16 irif_index; + u16 erif = 0; + + erif_indices = kmalloc_array(MAXVIFS, sizeof(*erif_indices), + GFP_KERNEL); + if (!erif_indices) + return -ENOMEM; + + list_for_each_entry(rve, &mr_route->evif_list, route_node) { + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) { + u16 rifi = mlxsw_sp_rif_index(rve->mr_vif->rif); + + erif_indices[erif++] = rifi; + } + } + + if (mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif)) + irif_index = mlxsw_sp_rif_index(mr_route->ivif.mr_vif->rif); + else + irif_index = 0; + + route_info->irif_index = irif_index; + route_info->erif_indices = erif_indices; + route_info->min_mtu = mr_route->min_mtu; + route_info->route_action = mr_route->route_action; + route_info->erif_num = erif; + return 0; +} + +static void +mlxsw_sp_mr_route_info_destroy(struct mlxsw_sp_mr_route_info *route_info) +{ + kfree(route_info->erif_indices); +} + +static int mlxsw_sp_mr_route_write(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route, + bool replace) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr_route_info route_info; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + int err; + + err = mlxsw_sp_mr_route_info_create(mr_table, mr_route, &route_info); + if (err) + return err; + + if (!replace) { + struct mlxsw_sp_mr_route_params route_params; + + mr_route->route_priv = kzalloc(mr->mr_ops->route_priv_size, + GFP_KERNEL); + if (!mr_route->route_priv) { + err = -ENOMEM; + goto out; + } + + route_params.key = mr_route->key; + route_params.value = route_info; + route_params.prio = mlxsw_sp_mr_route_prio(mr_route); + err = mr->mr_ops->route_create(mlxsw_sp, mr->priv, + mr_route->route_priv, + &route_params); + if (err) + kfree(mr_route->route_priv); + } else { + err = mr->mr_ops->route_update(mlxsw_sp, mr_route->route_priv, + &route_info); + } +out: + mlxsw_sp_mr_route_info_destroy(&route_info); + return err; +} + +static void mlxsw_sp_mr_route_erase(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, mr_route->route_priv); + kfree(mr_route->route_priv); +} + +static struct mlxsw_sp_mr_route * +mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, + struct mfc_cache *mfc) +{ + struct 
mlxsw_sp_mr_route_vif_entry *rve, *tmp; + struct mlxsw_sp_mr_route *mr_route; + int err; + int i; + + /* Allocate and init a new route and fill it with parameters */ + mr_route = kzalloc(sizeof(*mr_route), GFP_KERNEL); + if (!mr_route) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&mr_route->evif_list); + mlxsw_sp_mr_route4_key(mr_table, &mr_route->key, mfc); + + /* Find min_mtu and link iVIF and eVIFs */ + mr_route->min_mtu = ETH_MAX_MTU; + ipmr_cache_hold(mfc); + mr_route->mfc4 = mfc; + mr_route->mr_table = mr_table; + for (i = 0; i < MAXVIFS; i++) { + if (mfc->mfc_un.res.ttls[i] != 255) { + err = mlxsw_sp_mr_route_evif_link(mr_route, + &mr_table->vifs[i]); + if (err) + goto err; + if (mr_table->vifs[i].dev && + mr_table->vifs[i].dev->mtu < mr_route->min_mtu) + mr_route->min_mtu = mr_table->vifs[i].dev->mtu; + } + } + mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]); + if (err) + goto err; + + mr_route->route_action = mlxsw_sp_mr_route_action(mr_route); + return mr_route; +err: + ipmr_cache_put(mfc); + list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node) + mlxsw_sp_mr_route_evif_unlink(rve); + kfree(mr_route); + return ERR_PTR(err); +} + +static void mlxsw_sp_mr_route4_destroy(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr_route_vif_entry *rve, *tmp; + + mlxsw_sp_mr_route_ivif_unlink(mr_route); + ipmr_cache_put(mr_route->mfc4); + list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node) + mlxsw_sp_mr_route_evif_unlink(rve); + kfree(mr_route); +} + +static void mlxsw_sp_mr_route_destroy(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + switch (mr_table->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_sp_mr_route4_destroy(mr_table, mr_route); + break; + case MLXSW_SP_L3_PROTO_IPV6: + /* fall through */ + default: + WARN_ON_ONCE(1); + } +} + +static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route, + bool offload) +{ + switch (mr_route->mr_table->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + if (offload) + mr_route->mfc4->mfc_flags |= MFC_OFFLOAD; + else + mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD; + break; + case MLXSW_SP_L3_PROTO_IPV6: + /* fall through */ + default: + WARN_ON_ONCE(1); + } +} + +static void mlxsw_sp_mr_mfc_offload_update(struct mlxsw_sp_mr_route *mr_route) +{ + bool offload; + + offload = mr_route->route_action != MLXSW_SP_MR_ROUTE_ACTION_TRAP; + mlxsw_sp_mr_mfc_offload_set(mr_route, offload); +} + +static void __mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route *mr_route) +{ + mlxsw_sp_mr_mfc_offload_set(mr_route, false); + mlxsw_sp_mr_route_erase(mr_table, mr_route); + rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node, + mlxsw_sp_mr_route_ht_params); + list_del(&mr_route->node); + mlxsw_sp_mr_route_destroy(mr_table, mr_route); +} + +int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table, + struct mfc_cache *mfc, bool replace) +{ + struct mlxsw_sp_mr_route *mr_orig_route = NULL; + struct mlxsw_sp_mr_route *mr_route; + int err; + + /* If the route is a (*,*) route, abort, as these kind of routes are + * used for proxy routes. 
+ */ + if (mfc->mfc_origin == INADDR_ANY && mfc->mfc_mcastgrp == INADDR_ANY) { + dev_warn(mr_table->mlxsw_sp->bus_info->dev, + "Offloading proxy routes is not supported.\n"); + return -EINVAL; + } + + /* Create a new route */ + mr_route = mlxsw_sp_mr_route4_create(mr_table, mfc); + if (IS_ERR(mr_route)) + return PTR_ERR(mr_route); + + /* Find any route with a matching key */ + mr_orig_route = rhashtable_lookup_fast(&mr_table->route_ht, + &mr_route->key, + mlxsw_sp_mr_route_ht_params); + if (replace) { + /* On replace case, make the route point to the new route_priv. + */ + if (WARN_ON(!mr_orig_route)) { + err = -ENOENT; + goto err_no_orig_route; + } + mr_route->route_priv = mr_orig_route->route_priv; + } else if (mr_orig_route) { + /* On non replace case, if another route with the same key was + * found, abort, as duplicate routes are used for proxy routes. + */ + dev_warn(mr_table->mlxsw_sp->bus_info->dev, + "Offloading proxy routes is not supported.\n"); + err = -EINVAL; + goto err_duplicate_route; + } + + /* Put it in the table data-structures */ + list_add_tail(&mr_route->node, &mr_table->route_list); + err = rhashtable_insert_fast(&mr_table->route_ht, + &mr_route->ht_node, + mlxsw_sp_mr_route_ht_params); + if (err) + goto err_rhashtable_insert; + + /* Write the route to the hardware */ + err = mlxsw_sp_mr_route_write(mr_table, mr_route, replace); + if (err) + goto err_mr_route_write; + + /* Destroy the original route */ + if (replace) { + rhashtable_remove_fast(&mr_table->route_ht, + &mr_orig_route->ht_node, + mlxsw_sp_mr_route_ht_params); + list_del(&mr_orig_route->node); + mlxsw_sp_mr_route4_destroy(mr_table, mr_orig_route); + } + + mlxsw_sp_mr_mfc_offload_update(mr_route); + return 0; + +err_mr_route_write: + rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node, + mlxsw_sp_mr_route_ht_params); +err_rhashtable_insert: + list_del(&mr_route->node); +err_no_orig_route: +err_duplicate_route: + mlxsw_sp_mr_route4_destroy(mr_table, mr_route); + return err; +} + +void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table, + struct mfc_cache *mfc) +{ + struct mlxsw_sp_mr_route *mr_route; + struct mlxsw_sp_mr_route_key key; + + mlxsw_sp_mr_route4_key(mr_table, &key, mfc); + mr_route = rhashtable_lookup_fast(&mr_table->route_ht, &key, + mlxsw_sp_mr_route_ht_params); + if (mr_route) + __mlxsw_sp_mr_route_del(mr_table, mr_route); +} + +/* Should be called after the VIF struct is updated */ +static int +mlxsw_sp_mr_route_ivif_resolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + enum mlxsw_sp_mr_route_action route_action; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u16 irif_index; + int err; + + route_action = mlxsw_sp_mr_route_action(rve->mr_route); + if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return 0; + + /* rve->mr_vif->rif is guaranteed to be valid at this stage */ + irif_index = mlxsw_sp_rif_index(rve->mr_vif->rif); + err = mr->mr_ops->route_irif_update(mlxsw_sp, rve->mr_route->route_priv, + irif_index); + if (err) + return err; + + err = mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + route_action); + if (err) + /* No need to rollback here because the iRIF change only takes + * place after the action has been updated. 
+ */ + return err; + + rve->mr_route->route_action = route_action; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); + return 0; +} + +static void +mlxsw_sp_mr_route_ivif_unresolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv, + MLXSW_SP_MR_ROUTE_ACTION_TRAP); + rve->mr_route->route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); +} + +/* Should be called after the RIF struct is updated */ +static int +mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + enum mlxsw_sp_mr_route_action route_action; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u16 erif_index = 0; + int err; + + /* Update the route action, as the new eVIF can be a tunnel or a pimreg + * device which will require updating the action. + */ + route_action = mlxsw_sp_mr_route_action(rve->mr_route); + if (route_action != rve->mr_route->route_action) { + err = mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + route_action); + if (err) + return err; + } + + /* Add the eRIF */ + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) { + erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif); + err = mr->mr_ops->route_erif_add(mlxsw_sp, + rve->mr_route->route_priv, + erif_index); + if (err) + goto err_route_erif_add; + } + + /* Update the minimum MTU */ + if (rve->mr_vif->dev->mtu < rve->mr_route->min_mtu) { + rve->mr_route->min_mtu = rve->mr_vif->dev->mtu; + err = mr->mr_ops->route_min_mtu_update(mlxsw_sp, + rve->mr_route->route_priv, + rve->mr_route->min_mtu); + if (err) + goto err_route_min_mtu_update; + } + + rve->mr_route->route_action = route_action; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); + return 0; + +err_route_min_mtu_update: + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) + mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, + erif_index); +err_route_erif_add: + if (route_action != rve->mr_route->route_action) + mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + rve->mr_route->route_action); + return err; +} + +/* Should be called before the RIF struct is updated */ +static void +mlxsw_sp_mr_route_evif_unresolve(struct mlxsw_sp_mr_table *mr_table, + struct mlxsw_sp_mr_route_vif_entry *rve) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + enum mlxsw_sp_mr_route_action route_action; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u16 rifi; + + /* If the unresolved RIF was not valid, no need to delete it */ + if (!mlxsw_sp_mr_vif_valid(rve->mr_vif)) + return; + + /* Update the route action: if there is only one valid eVIF in the + * route, set the action to trap as the VIF deletion will lead to zero + * valid eVIFs. On any other case, use the mlxsw_sp_mr_route_action to + * determine the route action. 
+ */ + if (mlxsw_sp_mr_route_valid_evifs_num(rve->mr_route) == 1) + route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP; + else + route_action = mlxsw_sp_mr_route_action(rve->mr_route); + if (route_action != rve->mr_route->route_action) + mr->mr_ops->route_action_update(mlxsw_sp, + rve->mr_route->route_priv, + route_action); + + /* Delete the erif from the route */ + rifi = mlxsw_sp_rif_index(rve->mr_vif->rif); + mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, rifi); + rve->mr_route->route_action = route_action; + mlxsw_sp_mr_mfc_offload_update(rve->mr_route); +} + +static int mlxsw_sp_mr_vif_resolve(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, + struct mlxsw_sp_mr_vif *mr_vif, + unsigned long vif_flags, + const struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_mr_route_vif_entry *irve, *erve; + int err; + + /* Update the VIF */ + mr_vif->dev = dev; + mr_vif->rif = rif; + mr_vif->vif_flags = vif_flags; + + /* Update all routes where this VIF is used as an unresolved iRIF */ + list_for_each_entry(irve, &mr_vif->route_ivif_list, vif_node) { + err = mlxsw_sp_mr_route_ivif_resolve(mr_table, irve); + if (err) + goto err_irif_unresolve; + } + + /* Update all routes where this VIF is used as an unresolved eRIF */ + list_for_each_entry(erve, &mr_vif->route_evif_list, vif_node) { + err = mlxsw_sp_mr_route_evif_resolve(mr_table, erve); + if (err) + goto err_erif_unresolve; + } + return 0; + +err_erif_unresolve: + list_for_each_entry_from_reverse(erve, &mr_vif->route_evif_list, + vif_node) + mlxsw_sp_mr_route_evif_unresolve(mr_table, erve); +err_irif_unresolve: + list_for_each_entry_from_reverse(irve, &mr_vif->route_ivif_list, + vif_node) + mlxsw_sp_mr_route_ivif_unresolve(mr_table, irve); + mr_vif->rif = NULL; + return err; +} + +static void mlxsw_sp_mr_vif_unresolve(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, + struct mlxsw_sp_mr_vif *mr_vif) +{ + struct mlxsw_sp_mr_route_vif_entry *rve; + + /* Update all routes where this VIF is used as an unresolved eRIF */ + list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) + mlxsw_sp_mr_route_evif_unresolve(mr_table, rve); + + /* Update all routes where this VIF is used as an unresolved iRIF */ + list_for_each_entry(rve, &mr_vif->route_ivif_list, vif_node) + mlxsw_sp_mr_route_ivif_unresolve(mr_table, rve); + + /* Update the VIF */ + mr_vif->dev = dev; + mr_vif->rif = NULL; +} + +int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, vifi_t vif_index, + unsigned long vif_flags, const struct mlxsw_sp_rif *rif) +{ + struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index]; + + if (WARN_ON(vif_index >= MAXVIFS)) + return -EINVAL; + if (mr_vif->dev) + return -EEXIST; + return mlxsw_sp_mr_vif_resolve(mr_table, dev, mr_vif, vif_flags, rif); +} + +void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index) +{ + struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index]; + + if (WARN_ON(vif_index >= MAXVIFS)) + return; + if (WARN_ON(!mr_vif->dev)) + return; + mlxsw_sp_mr_vif_unresolve(mr_table, NULL, mr_vif); +} + +struct mlxsw_sp_mr_vif * +mlxsw_sp_mr_dev_vif_lookup(struct mlxsw_sp_mr_table *mr_table, + const struct net_device *dev) +{ + vifi_t vif_index; + + for (vif_index = 0; vif_index < MAXVIFS; vif_index++) + if (mr_table->vifs[vif_index].dev == dev) + return &mr_table->vifs[vif_index]; + return NULL; +} + +int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif) +{ + const struct net_device *rif_dev = 
mlxsw_sp_rif_dev(rif); + struct mlxsw_sp_mr_vif *mr_vif; + + if (!rif_dev) + return 0; + + mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev); + if (!mr_vif) + return 0; + return mlxsw_sp_mr_vif_resolve(mr_table, mr_vif->dev, mr_vif, + mr_vif->vif_flags, rif); +} + +void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif) +{ + const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif); + struct mlxsw_sp_mr_vif *mr_vif; + + if (!rif_dev) + return; + + mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev); + if (!mr_vif) + return; + mlxsw_sp_mr_vif_unresolve(mr_table, mr_vif->dev, mr_vif); +} + +void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif, int mtu) +{ + const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif); + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr_route_vif_entry *rve; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + struct mlxsw_sp_mr_vif *mr_vif; + + if (!rif_dev) + return; + + /* Search for a VIF that use that RIF */ + mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev); + if (!mr_vif) + return; + + /* Update all the routes that uses that VIF as eVIF */ + list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) { + if (mtu < rve->mr_route->min_mtu) { + rve->mr_route->min_mtu = mtu; + mr->mr_ops->route_min_mtu_update(mlxsw_sp, + rve->mr_route->route_priv, + mtu); + } + } +} + +struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp, + u32 vr_id, + enum mlxsw_sp_l3proto proto) +{ + struct mlxsw_sp_mr_route_params catchall_route_params = { + .prio = MLXSW_SP_MR_ROUTE_PRIO_CATCHALL, + .key = { + .vrid = vr_id, + }, + .value = { + .route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP, + } + }; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + struct mlxsw_sp_mr_table *mr_table; + int err; + int i; + + mr_table = kzalloc(sizeof(*mr_table) + mr->mr_ops->route_priv_size, + GFP_KERNEL); + if (!mr_table) + return ERR_PTR(-ENOMEM); + + mr_table->vr_id = vr_id; + mr_table->mlxsw_sp = mlxsw_sp; + mr_table->proto = proto; + INIT_LIST_HEAD(&mr_table->route_list); + + err = rhashtable_init(&mr_table->route_ht, + &mlxsw_sp_mr_route_ht_params); + if (err) + goto err_route_rhashtable_init; + + for (i = 0; i < MAXVIFS; i++) { + INIT_LIST_HEAD(&mr_table->vifs[i].route_evif_list); + INIT_LIST_HEAD(&mr_table->vifs[i].route_ivif_list); + } + + err = mr->mr_ops->route_create(mlxsw_sp, mr->priv, + mr_table->catchall_route_priv, + &catchall_route_params); + if (err) + goto err_ops_route_create; + list_add_tail(&mr_table->node, &mr->table_list); + return mr_table; + +err_ops_route_create: + rhashtable_destroy(&mr_table->route_ht); +err_route_rhashtable_init: + kfree(mr_table); + return ERR_PTR(err); +} + +void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table) +{ + struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp; + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + WARN_ON(!mlxsw_sp_mr_table_empty(mr_table)); + list_del(&mr_table->node); + mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, + &mr_table->catchall_route_priv); + rhashtable_destroy(&mr_table->route_ht); + kfree(mr_table); +} + +void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table) +{ + struct mlxsw_sp_mr_route *mr_route, *tmp; + int i; + + list_for_each_entry_safe(mr_route, tmp, &mr_table->route_list, node) + __mlxsw_sp_mr_route_del(mr_table, mr_route); + + for (i = 0; i < MAXVIFS; i++) { + mr_table->vifs[i].dev = NULL; + mr_table->vifs[i].rif = NULL; + } +} + +bool mlxsw_sp_mr_table_empty(const struct 
mlxsw_sp_mr_table *mr_table) +{ + int i; + + for (i = 0; i < MAXVIFS; i++) + if (mr_table->vifs[i].dev) + return false; + return list_empty(&mr_table->route_list); +} + +static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_route *mr_route) +{ + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + u64 packets, bytes; + + if (mr_route->route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return; + + mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets, + &bytes); + + switch (mr_route->mr_table->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + if (mr_route->mfc4->mfc_un.res.pkt != packets) + mr_route->mfc4->mfc_un.res.lastuse = jiffies; + mr_route->mfc4->mfc_un.res.pkt = packets; + mr_route->mfc4->mfc_un.res.bytes = bytes; + break; + case MLXSW_SP_L3_PROTO_IPV6: + /* fall through */ + default: + WARN_ON_ONCE(1); + } +} + +static void mlxsw_sp_mr_stats_update(struct work_struct *work) +{ + struct mlxsw_sp_mr *mr = container_of(work, struct mlxsw_sp_mr, + stats_update_dw.work); + struct mlxsw_sp_mr_table *mr_table; + struct mlxsw_sp_mr_route *mr_route; + unsigned long interval; + + rtnl_lock(); + list_for_each_entry(mr_table, &mr->table_list, node) + list_for_each_entry(mr_route, &mr_table->route_list, node) + mlxsw_sp_mr_route_stats_update(mr_table->mlxsw_sp, + mr_route); + rtnl_unlock(); + + interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL); + mlxsw_core_schedule_dw(&mr->stats_update_dw, interval); +} + +int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_mr_ops *mr_ops) +{ + struct mlxsw_sp_mr *mr; + unsigned long interval; + int err; + + mr = kzalloc(sizeof(*mr) + mr_ops->priv_size, GFP_KERNEL); + if (!mr) + return -ENOMEM; + mr->mr_ops = mr_ops; + mlxsw_sp->mr = mr; + INIT_LIST_HEAD(&mr->table_list); + + err = mr_ops->init(mlxsw_sp, mr->priv); + if (err) + goto err; + + /* Create the delayed work for counter updates */ + INIT_DELAYED_WORK(&mr->stats_update_dw, mlxsw_sp_mr_stats_update); + interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL); + mlxsw_core_schedule_dw(&mr->stats_update_dw, interval); + return 0; +err: + kfree(mr); + return err; +} + +void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_mr *mr = mlxsw_sp->mr; + + cancel_delayed_work_sync(&mr->stats_update_dw); + mr->mr_ops->fini(mr->priv); + kfree(mr); +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h new file mode 100644 index 000000000000..c851b237d253 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h @@ -0,0 +1,133 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_MCROUTER_H +#define _MLXSW_SPECTRUM_MCROUTER_H + +#include +#include "spectrum_router.h" +#include "spectrum.h" + +enum mlxsw_sp_mr_route_action { + MLXSW_SP_MR_ROUTE_ACTION_FORWARD, + MLXSW_SP_MR_ROUTE_ACTION_TRAP, +}; + +enum mlxsw_sp_mr_route_prio { + MLXSW_SP_MR_ROUTE_PRIO_SG, + MLXSW_SP_MR_ROUTE_PRIO_STARG, + MLXSW_SP_MR_ROUTE_PRIO_CATCHALL, + __MLXSW_SP_MR_ROUTE_PRIO_MAX +}; + +#define MLXSW_SP_MR_ROUTE_PRIO_MAX (__MLXSW_SP_MR_ROUTE_PRIO_MAX - 1) + +struct mlxsw_sp_mr_route_key { + int vrid; + enum mlxsw_sp_l3proto proto; + union mlxsw_sp_l3addr group; + union mlxsw_sp_l3addr group_mask; + union mlxsw_sp_l3addr source; + union mlxsw_sp_l3addr source_mask; +}; + +struct mlxsw_sp_mr_route_info { + enum mlxsw_sp_mr_route_action route_action; + u16 irif_index; + u16 *erif_indices; + size_t erif_num; + u16 min_mtu; +}; + +struct mlxsw_sp_mr_route_params { + struct mlxsw_sp_mr_route_key key; + struct mlxsw_sp_mr_route_info value; + enum mlxsw_sp_mr_route_prio prio; +}; + +struct mlxsw_sp_mr_ops { + int priv_size; + int route_priv_size; + int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv); + int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv, + void *route_priv, + struct mlxsw_sp_mr_route_params *route_params); + int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + struct mlxsw_sp_mr_route_info *route_info); + int (*route_stats)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u64 *packets, u64 *bytes); + int (*route_action_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + enum mlxsw_sp_mr_route_action route_action); + int (*route_min_mtu_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 min_mtu); + int (*route_irif_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 irif_index); + int (*route_erif_add)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 erif_index); + int (*route_erif_del)(struct mlxsw_sp *mlxsw_sp, void *route_priv, + u16 erif_index); + void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv, + void *route_priv); + void (*fini)(void *priv); +}; + +struct mlxsw_sp_mr; +struct mlxsw_sp_mr_table; + +int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp, + const struct mlxsw_sp_mr_ops *mr_ops); +void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp); +int mlxsw_sp_mr_route4_add(struct mlxsw_sp_mr_table *mr_table, + struct mfc_cache *mfc, bool replace); +void mlxsw_sp_mr_route4_del(struct mlxsw_sp_mr_table *mr_table, + struct mfc_cache *mfc); +int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table, + struct net_device *dev, vifi_t vif_index, + 
unsigned long vif_flags, + const struct mlxsw_sp_rif *rif); +void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index); +int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif); +void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif); +void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table, + const struct mlxsw_sp_rif *rif, int mtu); +struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp, + u32 tb_id, + enum mlxsw_sp_l3proto proto); +void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table); +void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table); +bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table); + +#endif From 0e14c7777acb6d58250cb746685dde0a74d60fe8 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:18 +0200 Subject: [PATCH 0939/1322] mlxsw: spectrum: Add the multicast routing hardware logic Implement the multicast routing hardware API introduced in previous patch for the specific spectrum hardware. The spectrum hardware multicast routes are written using the RMFT2 register and point to an ACL flexible action set. The actions used for multicast routes are: - Counter action, which allows counting bytes and packets on multicast routes. - Multicast route action, which provide RPF check and do the actual packet duplication to a list of RIFs. - Trap action, in the case the route action specified by the called is trap. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/Makefile | 2 +- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 + .../mellanox/mlxsw/spectrum_mr_tcam.c | 828 ++++++++++++++++++ .../mellanox/mlxsw/spectrum_mr_tcam.h | 43 + 4 files changed, 873 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 9b29764905f3..4816504419fc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -18,7 +18,7 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_acl.o spectrum_flower.o \ spectrum_cnt.o spectrum_fid.o \ spectrum_ipip.o spectrum_acl_flex_actions.o \ - spectrum_mr.o + spectrum_mr.o spectrum_mr_tcam.o mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB) += spectrum_dcb.o mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o obj-$(CONFIG_MLXSW_MINIMAL) += mlxsw_minimal.o diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 44c5259e5548..ae67e6046098 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -139,6 +139,7 @@ struct mlxsw_sp_port_mall_tc_entry { struct mlxsw_sp_sb; struct mlxsw_sp_bridge; struct mlxsw_sp_router; +struct mlxsw_sp_mr; struct mlxsw_sp_acl; struct mlxsw_sp_counter_pool; struct mlxsw_sp_fid_core; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c new file mode 100644 index 000000000000..cda9e9ad10e3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c @@ -0,0 +1,828 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c + * Copyright (c) 2017 Mellanox 
Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include "reg.h" +#include "spectrum.h" +#include "core_acl_flex_actions.h" +#include "spectrum_mr.h" + +struct mlxsw_sp_mr_tcam_region { + struct mlxsw_sp *mlxsw_sp; + enum mlxsw_reg_rtar_key_type rtar_key_type; + struct parman *parman; + struct parman_prio *parman_prios; +}; + +struct mlxsw_sp_mr_tcam { + struct mlxsw_sp_mr_tcam_region ipv4_tcam_region; +}; + +/* This struct maps to one RIGR2 register entry */ +struct mlxsw_sp_mr_erif_sublist { + struct list_head list; + u32 rigr2_kvdl_index; + int num_erifs; + u16 erif_indices[MLXSW_REG_RIGR2_MAX_ERIFS]; + bool synced; +}; + +struct mlxsw_sp_mr_tcam_erif_list { + struct list_head erif_sublists; + u32 kvdl_index; +}; + +static bool +mlxsw_sp_mr_erif_sublist_full(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_erif_sublist *erif_sublist) +{ + int erif_list_entries = MLXSW_CORE_RES_GET(mlxsw_sp->core, + MC_ERIF_LIST_ENTRIES); + + return erif_sublist->num_erifs == erif_list_entries; +} + +static void +mlxsw_sp_mr_erif_list_init(struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + INIT_LIST_HEAD(&erif_list->erif_sublists); +} + +#define MLXSW_SP_KVDL_RIGR2_SIZE 1 + +static struct mlxsw_sp_mr_erif_sublist * +mlxsw_sp_mr_erif_sublist_create(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_sp_mr_erif_sublist *erif_sublist; + int err; + + erif_sublist = kzalloc(sizeof(*erif_sublist), GFP_KERNEL); + if (!erif_sublist) + return ERR_PTR(-ENOMEM); + err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_RIGR2_SIZE, + &erif_sublist->rigr2_kvdl_index); + if (err) { + kfree(erif_sublist); + return ERR_PTR(err); + } + + list_add_tail(&erif_sublist->list, &erif_list->erif_sublists); + return erif_sublist; +} + +static void 
+mlxsw_sp_mr_erif_sublist_destroy(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_erif_sublist *erif_sublist) +{ + list_del(&erif_sublist->list); + mlxsw_sp_kvdl_free(mlxsw_sp, erif_sublist->rigr2_kvdl_index); + kfree(erif_sublist); +} + +static int +mlxsw_sp_mr_erif_list_add(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list, + u16 erif_index) +{ + struct mlxsw_sp_mr_erif_sublist *sublist; + + /* If either there is no erif_entry or the last one is full, allocate a + * new one. + */ + if (list_empty(&erif_list->erif_sublists)) { + sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, erif_list); + if (IS_ERR(sublist)) + return PTR_ERR(sublist); + erif_list->kvdl_index = sublist->rigr2_kvdl_index; + } else { + sublist = list_last_entry(&erif_list->erif_sublists, + struct mlxsw_sp_mr_erif_sublist, + list); + sublist->synced = false; + if (mlxsw_sp_mr_erif_sublist_full(mlxsw_sp, sublist)) { + sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, + erif_list); + if (IS_ERR(sublist)) + return PTR_ERR(sublist); + } + } + + /* Add the eRIF to the last entry's last index */ + sublist->erif_indices[sublist->num_erifs++] = erif_index; + return 0; +} + +static void +mlxsw_sp_mr_erif_list_flush(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_sp_mr_erif_sublist *erif_sublist, *tmp; + + list_for_each_entry_safe(erif_sublist, tmp, &erif_list->erif_sublists, + list) + mlxsw_sp_mr_erif_sublist_destroy(mlxsw_sp, erif_sublist); +} + +static int +mlxsw_sp_mr_erif_list_commit(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_sp_mr_erif_sublist *curr_sublist; + char rigr2_pl[MLXSW_REG_RIGR2_LEN]; + int err; + int i; + + list_for_each_entry(curr_sublist, &erif_list->erif_sublists, list) { + if (curr_sublist->synced) + continue; + + /* If the sublist is not the last one, pack the next index */ + if (list_is_last(&curr_sublist->list, + &erif_list->erif_sublists)) { + mlxsw_reg_rigr2_pack(rigr2_pl, + curr_sublist->rigr2_kvdl_index, + false, 0); + } else { + struct mlxsw_sp_mr_erif_sublist *next_sublist; + + next_sublist = list_next_entry(curr_sublist, list); + mlxsw_reg_rigr2_pack(rigr2_pl, + curr_sublist->rigr2_kvdl_index, + true, + next_sublist->rigr2_kvdl_index); + } + + /* Pack all the erifs */ + for (i = 0; i < curr_sublist->num_erifs; i++) { + u16 erif_index = curr_sublist->erif_indices[i]; + + mlxsw_reg_rigr2_erif_entry_pack(rigr2_pl, i, true, + erif_index); + } + + /* Write the entry */ + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rigr2), + rigr2_pl); + if (err) + /* No need of a rollback here because this + * hardware entry should not be pointed yet. 
+ */ + return err; + curr_sublist->synced = true; + } + return 0; +} + +static void mlxsw_sp_mr_erif_list_move(struct mlxsw_sp_mr_tcam_erif_list *to, + struct mlxsw_sp_mr_tcam_erif_list *from) +{ + list_splice(&from->erif_sublists, &to->erif_sublists); + to->kvdl_index = from->kvdl_index; +} + +struct mlxsw_sp_mr_tcam_route { + struct mlxsw_sp_mr_tcam_erif_list erif_list; + struct mlxsw_afa_block *afa_block; + u32 counter_index; + struct parman_item parman_item; + struct parman_prio *parman_prio; + enum mlxsw_sp_mr_route_action action; + struct mlxsw_sp_mr_route_key key; + u16 irif_index; + u16 min_mtu; +}; + +static struct mlxsw_afa_block * +mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp, + enum mlxsw_sp_mr_route_action route_action, + u16 irif_index, u32 counter_index, + u16 min_mtu, + struct mlxsw_sp_mr_tcam_erif_list *erif_list) +{ + struct mlxsw_afa_block *afa_block; + int err; + + afa_block = mlxsw_afa_block_create(mlxsw_sp->afa); + if (IS_ERR(afa_block)) + return afa_block; + + err = mlxsw_afa_block_append_counter(afa_block, counter_index); + if (err) + goto err; + + switch (route_action) { + case MLXSW_SP_MR_ROUTE_ACTION_TRAP: + err = mlxsw_afa_block_append_trap(afa_block, + MLXSW_TRAP_ID_ACL1); + if (err) + goto err; + break; + case MLXSW_SP_MR_ROUTE_ACTION_FORWARD: + /* If we are about to append a multicast router action, commit + * the erif_list. + */ + err = mlxsw_sp_mr_erif_list_commit(mlxsw_sp, erif_list); + if (err) + goto err; + + err = mlxsw_afa_block_append_mcrouter(afa_block, irif_index, + min_mtu, false, + erif_list->kvdl_index); + if (err) + goto err; + break; + default: + err = -EINVAL; + goto err; + } + + err = mlxsw_afa_block_commit(afa_block); + if (err) + goto err; + return afa_block; +err: + mlxsw_afa_block_destroy(afa_block); + return ERR_PTR(err); +} + +static void +mlxsw_sp_mr_tcam_afa_block_destroy(struct mlxsw_afa_block *afa_block) +{ + mlxsw_afa_block_destroy(afa_block); +} + +static int mlxsw_sp_mr_tcam_route_replace(struct mlxsw_sp *mlxsw_sp, + struct parman_item *parman_item, + struct mlxsw_sp_mr_route_key *key, + struct mlxsw_afa_block *afa_block) +{ + char rmft2_pl[MLXSW_REG_RMFT2_LEN]; + + switch (key->proto) { + case MLXSW_SP_L3_PROTO_IPV4: + mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, true, parman_item->index, + key->vrid, + MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0, + ntohl(key->group.addr4), + ntohl(key->group_mask.addr4), + ntohl(key->source.addr4), + ntohl(key->source_mask.addr4), + mlxsw_afa_block_first_set(afa_block)); + break; + case MLXSW_SP_L3_PROTO_IPV6: + default: + WARN_ON_ONCE(1); + } + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl); +} + +static int mlxsw_sp_mr_tcam_route_remove(struct mlxsw_sp *mlxsw_sp, int vrid, + struct parman_item *parman_item) +{ + char rmft2_pl[MLXSW_REG_RMFT2_LEN]; + + mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, false, parman_item->index, vrid, + 0, 0, 0, 0, 0, 0, NULL); + + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl); +} + +static int +mlxsw_sp_mr_tcam_erif_populate(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_erif_list *erif_list, + struct mlxsw_sp_mr_route_info *route_info) +{ + int err; + int i; + + for (i = 0; i < route_info->erif_num; i++) { + u16 erif_index = route_info->erif_indices[i]; + + err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, erif_list, + erif_index); + if (err) + return err; + } + return 0; +} + +static int +mlxsw_sp_mr_tcam_route_parman_item_add(struct mlxsw_sp_mr_tcam *mr_tcam, + struct mlxsw_sp_mr_tcam_route *route, + enum mlxsw_sp_mr_route_prio 
prio) +{ + struct parman_prio *parman_prio = NULL; + int err; + + switch (route->key.proto) { + case MLXSW_SP_L3_PROTO_IPV4: + parman_prio = &mr_tcam->ipv4_tcam_region.parman_prios[prio]; + err = parman_item_add(mr_tcam->ipv4_tcam_region.parman, + parman_prio, &route->parman_item); + if (err) + return err; + break; + case MLXSW_SP_L3_PROTO_IPV6: + default: + WARN_ON_ONCE(1); + } + route->parman_prio = parman_prio; + return 0; +} + +static void +mlxsw_sp_mr_tcam_route_parman_item_remove(struct mlxsw_sp_mr_tcam *mr_tcam, + struct mlxsw_sp_mr_tcam_route *route) +{ + switch (route->key.proto) { + case MLXSW_SP_L3_PROTO_IPV4: + parman_item_remove(mr_tcam->ipv4_tcam_region.parman, + route->parman_prio, &route->parman_item); + break; + case MLXSW_SP_L3_PROTO_IPV6: + default: + WARN_ON_ONCE(1); + } +} + +static int +mlxsw_sp_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv, + void *route_priv, + struct mlxsw_sp_mr_route_params *route_params) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + int err; + + route->key = route_params->key; + route->irif_index = route_params->value.irif_index; + route->min_mtu = route_params->value.min_mtu; + route->action = route_params->value.route_action; + + /* Create the egress RIFs list */ + mlxsw_sp_mr_erif_list_init(&route->erif_list); + err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &route->erif_list, + &route_params->value); + if (err) + goto err_erif_populate; + + /* Create the flow counter */ + err = mlxsw_sp_flow_counter_alloc(mlxsw_sp, &route->counter_index); + if (err) + goto err_counter_alloc; + + /* Create the flexible action block */ + route->afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, + route->action, + route->irif_index, + route->counter_index, + route->min_mtu, + &route->erif_list); + if (IS_ERR(route->afa_block)) { + err = PTR_ERR(route->afa_block); + goto err_afa_block_create; + } + + /* Allocate place in the TCAM */ + err = mlxsw_sp_mr_tcam_route_parman_item_add(mr_tcam, route, + route_params->prio); + if (err) + goto err_parman_item_add; + + /* Write the route to the TCAM */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, route->afa_block); + if (err) + goto err_route_replace; + return 0; + +err_route_replace: + mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route); +err_parman_item_add: + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); +err_afa_block_create: + mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index); +err_erif_populate: +err_counter_alloc: + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); + return err; +} + +static void mlxsw_sp_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp, + void *priv, void *route_priv) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + + mlxsw_sp_mr_tcam_route_remove(mlxsw_sp, route->key.vrid, + &route->parman_item); + mlxsw_sp_mr_tcam_route_parman_item_remove(mr_tcam, route); + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index); + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); +} + +static int mlxsw_sp_mr_tcam_route_stats(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u64 *packets, + u64 *bytes) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + + return mlxsw_sp_flow_counter_get(mlxsw_sp, route->counter_index, + packets, bytes); +} + +static int +mlxsw_sp_mr_tcam_route_action_update(struct mlxsw_sp *mlxsw_sp, + void *route_priv, + enum 
mlxsw_sp_mr_route_action route_action) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_afa_block *afa_block; + int err; + + /* Create a new flexible action block */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route_action, + route->irif_index, + route->counter_index, + route->min_mtu, + &route->erif_list); + if (IS_ERR(afa_block)) + return PTR_ERR(afa_block); + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err; + + /* Delete the old one */ + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + route->afa_block = afa_block; + route->action = route_action; + return 0; +err: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); + return err; +} + +static int mlxsw_sp_mr_tcam_route_min_mtu_update(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 min_mtu) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_afa_block *afa_block; + int err; + + /* Create a new flexible action block */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, + route->action, + route->irif_index, + route->counter_index, + min_mtu, + &route->erif_list); + if (IS_ERR(afa_block)) + return PTR_ERR(afa_block); + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err; + + /* Delete the old one */ + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + route->afa_block = afa_block; + route->min_mtu = min_mtu; + return 0; +err: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); + return err; +} + +static int mlxsw_sp_mr_tcam_route_irif_update(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 irif_index) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + + if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return -EINVAL; + route->irif_index = irif_index; + return 0; +} + +static int mlxsw_sp_mr_tcam_route_erif_add(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 erif_index) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + int err; + + err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &route->erif_list, + erif_index); + if (err) + return err; + + /* Commit the action only if the route action is not TRAP */ + if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP) + return mlxsw_sp_mr_erif_list_commit(mlxsw_sp, + &route->erif_list); + return 0; +} + +static int mlxsw_sp_mr_tcam_route_erif_del(struct mlxsw_sp *mlxsw_sp, + void *route_priv, u16 erif_index) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_erif_sublist *erif_sublist; + struct mlxsw_sp_mr_tcam_erif_list erif_list; + struct mlxsw_afa_block *afa_block; + int err; + int i; + + /* Create a copy of the original erif_list without the deleted entry */ + mlxsw_sp_mr_erif_list_init(&erif_list); + list_for_each_entry(erif_sublist, &route->erif_list.erif_sublists, list) { + for (i = 0; i < erif_sublist->num_erifs; i++) { + u16 curr_erif = erif_sublist->erif_indices[i]; + + if (curr_erif == erif_index) + continue; + err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &erif_list, + curr_erif); + if (err) + goto err_erif_list_add; + } + } + + /* Create the flexible action block pointing to the new erif_list */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route->action, + route->irif_index, + route->counter_index, + route->min_mtu, + &erif_list); + if (IS_ERR(afa_block)) { + err = PTR_ERR(afa_block); + goto err_afa_block_create; + } + + /* Update the TCAM route entry */ + err = 
mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err_route_write; + + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); + route->afa_block = afa_block; + mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list); + return 0; + +err_route_write: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); +err_afa_block_create: +err_erif_list_add: + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list); + return err; +} + +static int +mlxsw_sp_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp, void *route_priv, + struct mlxsw_sp_mr_route_info *route_info) +{ + struct mlxsw_sp_mr_tcam_route *route = route_priv; + struct mlxsw_sp_mr_tcam_erif_list erif_list; + struct mlxsw_afa_block *afa_block; + int err; + + /* Create a new erif_list */ + mlxsw_sp_mr_erif_list_init(&erif_list); + err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &erif_list, route_info); + if (err) + goto err_erif_populate; + + /* Create the flexible action block pointing to the new erif_list */ + afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, + route_info->route_action, + route_info->irif_index, + route->counter_index, + route_info->min_mtu, + &erif_list); + if (IS_ERR(afa_block)) { + err = PTR_ERR(afa_block); + goto err_afa_block_create; + } + + /* Update the TCAM route entry */ + err = mlxsw_sp_mr_tcam_route_replace(mlxsw_sp, &route->parman_item, + &route->key, afa_block); + if (err) + goto err_route_write; + + mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block); + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list); + route->afa_block = afa_block; + mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list); + route->action = route_info->route_action; + route->irif_index = route_info->irif_index; + route->min_mtu = route_info->min_mtu; + return 0; + +err_route_write: + mlxsw_sp_mr_tcam_afa_block_destroy(afa_block); +err_afa_block_create: +err_erif_populate: + mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list); + return err; +} + +#define MLXSW_SP_MR_TCAM_REGION_BASE_COUNT 16 +#define MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP 16 + +static int +mlxsw_sp_mr_tcam_region_alloc(struct mlxsw_sp_mr_tcam_region *mr_tcam_region) +{ + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rtar_pl[MLXSW_REG_RTAR_LEN]; + + mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_ALLOCATE, + mr_tcam_region->rtar_key_type, + MLXSW_SP_MR_TCAM_REGION_BASE_COUNT); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl); +} + +static void +mlxsw_sp_mr_tcam_region_free(struct mlxsw_sp_mr_tcam_region *mr_tcam_region) +{ + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rtar_pl[MLXSW_REG_RTAR_LEN]; + + mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_DEALLOCATE, + mr_tcam_region->rtar_key_type, 0); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl); +} + +static int mlxsw_sp_mr_tcam_region_parman_resize(void *priv, + unsigned long new_count) +{ + struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv; + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rtar_pl[MLXSW_REG_RTAR_LEN]; + u64 max_tcam_rules; + + max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES); + if (new_count > max_tcam_rules) + return -EINVAL; + mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_RESIZE, + mr_tcam_region->rtar_key_type, new_count); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl); +} + +static void mlxsw_sp_mr_tcam_region_parman_move(void *priv, + unsigned long from_index, + unsigned 
long to_index, + unsigned long count) +{ + struct mlxsw_sp_mr_tcam_region *mr_tcam_region = priv; + struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp; + char rrcr_pl[MLXSW_REG_RRCR_LEN]; + + mlxsw_reg_rrcr_pack(rrcr_pl, MLXSW_REG_RRCR_OP_MOVE, + from_index, count, + mr_tcam_region->rtar_key_type, to_index); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rrcr), rrcr_pl); +} + +static const struct parman_ops mlxsw_sp_mr_tcam_region_parman_ops = { + .base_count = MLXSW_SP_MR_TCAM_REGION_BASE_COUNT, + .resize_step = MLXSW_SP_MR_TCAM_REGION_RESIZE_STEP, + .resize = mlxsw_sp_mr_tcam_region_parman_resize, + .move = mlxsw_sp_mr_tcam_region_parman_move, + .algo = PARMAN_ALGO_TYPE_LSORT, +}; + +static int +mlxsw_sp_mr_tcam_region_init(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_mr_tcam_region *mr_tcam_region, + enum mlxsw_reg_rtar_key_type rtar_key_type) +{ + struct parman_prio *parman_prios; + struct parman *parman; + int err; + int i; + + mr_tcam_region->rtar_key_type = rtar_key_type; + mr_tcam_region->mlxsw_sp = mlxsw_sp; + + err = mlxsw_sp_mr_tcam_region_alloc(mr_tcam_region); + if (err) + return err; + + parman = parman_create(&mlxsw_sp_mr_tcam_region_parman_ops, + mr_tcam_region); + if (!parman) { + err = -ENOMEM; + goto err_parman_create; + } + mr_tcam_region->parman = parman; + + parman_prios = kmalloc_array(MLXSW_SP_MR_ROUTE_PRIO_MAX + 1, + sizeof(*parman_prios), GFP_KERNEL); + if (!parman_prios) + goto err_parman_prios_alloc; + mr_tcam_region->parman_prios = parman_prios; + + for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++) + parman_prio_init(mr_tcam_region->parman, + &mr_tcam_region->parman_prios[i], i); + return 0; + +err_parman_prios_alloc: + parman_destroy(parman); +err_parman_create: + mlxsw_sp_mr_tcam_region_free(mr_tcam_region); + return err; +} + +static void +mlxsw_sp_mr_tcam_region_fini(struct mlxsw_sp_mr_tcam_region *mr_tcam_region) +{ + int i; + + for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++) + parman_prio_fini(&mr_tcam_region->parman_prios[i]); + kfree(mr_tcam_region->parman_prios); + parman_destroy(mr_tcam_region->parman); + mlxsw_sp_mr_tcam_region_free(mr_tcam_region); +} + +static int mlxsw_sp_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) +{ + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + + if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MC_ERIF_LIST_ENTRIES) || + !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_TCAM_RULES)) + return -EIO; + + return mlxsw_sp_mr_tcam_region_init(mlxsw_sp, + &mr_tcam->ipv4_tcam_region, + MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST); +} + +static void mlxsw_sp_mr_tcam_fini(void *priv) +{ + struct mlxsw_sp_mr_tcam *mr_tcam = priv; + + mlxsw_sp_mr_tcam_region_fini(&mr_tcam->ipv4_tcam_region); +} + +const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops = { + .priv_size = sizeof(struct mlxsw_sp_mr_tcam), + .route_priv_size = sizeof(struct mlxsw_sp_mr_tcam_route), + .init = mlxsw_sp_mr_tcam_init, + .route_create = mlxsw_sp_mr_tcam_route_create, + .route_update = mlxsw_sp_mr_tcam_route_update, + .route_stats = mlxsw_sp_mr_tcam_route_stats, + .route_action_update = mlxsw_sp_mr_tcam_route_action_update, + .route_min_mtu_update = mlxsw_sp_mr_tcam_route_min_mtu_update, + .route_irif_update = mlxsw_sp_mr_tcam_route_irif_update, + .route_erif_add = mlxsw_sp_mr_tcam_route_erif_add, + .route_erif_del = mlxsw_sp_mr_tcam_route_erif_del, + .route_destroy = mlxsw_sp_mr_tcam_route_destroy, + .fini = mlxsw_sp_mr_tcam_fini, +}; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h new file mode 
100644 index 000000000000..f9b59ee25406 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h @@ -0,0 +1,43 @@ +/* + * drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Yotam Gigi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MLXSW_SPECTRUM_MCROUTER_TCAM_H +#define _MLXSW_SPECTRUM_MCROUTER_TCAM_H + +#include "spectrum.h" +#include "spectrum_mr.h" + +extern const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops; + +#endif From 7e50d435759accec4e17764a8d5a1ef63b79ffd6 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:19 +0200 Subject: [PATCH 0940/1322] mlxsw: spectrum: router: Squash the default route table to main Currently, the mlxsw Spectrum driver offloads only either the RT_TABLE_MAIN FIB table or the VRF tables, so the RT_TABLE_LOCAL table is squashed to the RT_TABLE_MAIN table to allow local routes to be offloaded too. By default, multicast MFC routes which are not assigned to any user requested table are put in the RT_TABLE_DEFAULT table. Due to the fact that offloading multicast MFC routes support in Spectrum router logic is going to be introduced soon, squash the default table to MAIN too. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 321f7356073c..28c0c84bc966 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -693,8 +693,8 @@ static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp, static u32 mlxsw_sp_fix_tb_id(u32 tb_id) { - /* For our purpose, squash main and local table into one */ - if (tb_id == RT_TABLE_LOCAL) + /* For our purpose, squash main, default and local tables into one */ + if (tb_id == RT_TABLE_LOCAL || tb_id == RT_TABLE_DEFAULT) tb_id = RT_TABLE_MAIN; return tb_id; } From d42b0965b1d4fe0808a2103a3f7c015515b1112e Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:20 +0200 Subject: [PATCH 0941/1322] mlxsw: spectrum_router: Add multicast routes notification handling functionality Add functionality for calling the multicast routing offloading logic upon MFC and VIF add and delete notifications. In addition, call the multicast routing upon RIF addition and deletion events. As the multicast routing offload logic may sleep, the actual calls are done in a deferred work. To ensure the MFC object is not freed in that interval, a reference is held to it. In case of a failure, the abort mechanism is used, which ejects all the routes from the hardware and triggers the traffic to flow through the kernel. Note: At that stage, the FIB notifications are still ignored, and will be enabled in a further patch. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 187 +++++++++++++++++- 1 file changed, 185 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 28c0c84bc966..77584422ed08 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -65,6 +65,8 @@ #include "spectrum_cnt.h" #include "spectrum_dpipe.h" #include "spectrum_ipip.h" +#include "spectrum_mr.h" +#include "spectrum_mr_tcam.h" #include "spectrum_router.h" struct mlxsw_sp_vr; @@ -459,6 +461,7 @@ struct mlxsw_sp_vr { unsigned int rif_count; struct mlxsw_sp_fib *fib4; struct mlxsw_sp_fib *fib6; + struct mlxsw_sp_mr_table *mr4_table; }; static const struct rhashtable_params mlxsw_sp_fib_ht_params; @@ -653,7 +656,7 @@ static void mlxsw_sp_lpm_fini(struct mlxsw_sp *mlxsw_sp) static bool mlxsw_sp_vr_is_used(const struct mlxsw_sp_vr *vr) { - return !!vr->fib4 || !!vr->fib6; + return !!vr->fib4 || !!vr->fib6 || !!vr->mr4_table; } static struct mlxsw_sp_vr *mlxsw_sp_vr_find_unused(struct mlxsw_sp *mlxsw_sp) @@ -744,9 +747,18 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp, err = PTR_ERR(vr->fib6); goto err_fib6_create; } + vr->mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, + MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(vr->mr4_table)) { + err = PTR_ERR(vr->mr4_table); + goto err_mr_table_create; + } vr->tb_id = tb_id; return vr; +err_mr_table_create: + mlxsw_sp_fib_destroy(vr->fib6); + vr->fib6 = NULL; err_fib6_create: mlxsw_sp_fib_destroy(vr->fib4); vr->fib4 = NULL; @@ -755,6 +767,8 @@ err_fib6_create: static void mlxsw_sp_vr_destroy(struct mlxsw_sp_vr *vr) { + mlxsw_sp_mr_table_destroy(vr->mr4_table); + 
vr->mr4_table = NULL; mlxsw_sp_fib_destroy(vr->fib6); vr->fib6 = NULL; mlxsw_sp_fib_destroy(vr->fib4); @@ -775,7 +789,8 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id) static void mlxsw_sp_vr_put(struct mlxsw_sp_vr *vr) { if (!vr->rif_count && list_empty(&vr->fib4->node_list) && - list_empty(&vr->fib6->node_list)) + list_empty(&vr->fib6->node_list) && + mlxsw_sp_mr_table_empty(vr->mr4_table)) mlxsw_sp_vr_destroy(vr); } @@ -4731,6 +4746,75 @@ static int __mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp, return 0; } +static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp, + struct mfc_entry_notifier_info *men_info, + bool replace) +{ + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return 0; + + vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id); + if (IS_ERR(vr)) + return PTR_ERR(vr); + + return mlxsw_sp_mr_route4_add(vr->mr4_table, men_info->mfc, replace); +} + +static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp, + struct mfc_entry_notifier_info *men_info) +{ + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return; + + vr = mlxsw_sp_vr_find(mlxsw_sp, men_info->tb_id); + if (WARN_ON(!vr)) + return; + + mlxsw_sp_mr_route4_del(vr->mr4_table, men_info->mfc); + mlxsw_sp_vr_put(vr); +} + +static int +mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp, + struct vif_entry_notifier_info *ven_info) +{ + struct mlxsw_sp_rif *rif; + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return 0; + + vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id); + if (IS_ERR(vr)) + return PTR_ERR(vr); + + rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, ven_info->dev); + return mlxsw_sp_mr_vif_add(vr->mr4_table, ven_info->dev, + ven_info->vif_index, + ven_info->vif_flags, rif); +} + +static void +mlxsw_sp_router_fibmr_vif_del(struct mlxsw_sp *mlxsw_sp, + struct vif_entry_notifier_info *ven_info) +{ + struct mlxsw_sp_vr *vr; + + if (mlxsw_sp->router->aborted) + return; + + vr = mlxsw_sp_vr_find(mlxsw_sp, ven_info->tb_id); + if (WARN_ON(!vr)) + return; + + mlxsw_sp_mr_vif_del(vr->mr4_table, ven_info->vif_index); + mlxsw_sp_vr_put(vr); +} + static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp) { enum mlxsw_reg_ralxx_protocol proto = MLXSW_REG_RALXX_PROTOCOL_IPV4; @@ -4741,6 +4825,10 @@ static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp) if (err) return err; + /* The multicast router code does not need an abort trap as by default, + * packets that don't match any routes are trapped to the CPU. 
+ */ + proto = MLXSW_REG_RALXX_PROTOCOL_IPV6; return __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto, MLXSW_SP_LPM_TREE_MIN + 1); @@ -4822,6 +4910,8 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp) if (!mlxsw_sp_vr_is_used(vr)) continue; + + mlxsw_sp_mr_table_flush(vr->mr4_table); mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4); /* If virtual router was only used for IPv4, then it's no @@ -4854,6 +4944,8 @@ struct mlxsw_sp_fib_event_work { struct fib_entry_notifier_info fen_info; struct fib_rule_notifier_info fr_info; struct fib_nh_notifier_info fnh_info; + struct mfc_entry_notifier_info men_info; + struct vif_entry_notifier_info ven_info; }; struct mlxsw_sp *mlxsw_sp; unsigned long event; @@ -4940,6 +5032,55 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) kfree(fib_work); } +static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work) +{ + struct mlxsw_sp_fib_event_work *fib_work = + container_of(work, struct mlxsw_sp_fib_event_work, work); + struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; + struct fib_rule *rule; + bool replace; + int err; + + rtnl_lock(); + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_ADD: + replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; + + err = mlxsw_sp_router_fibmr_add(mlxsw_sp, &fib_work->men_info, + replace); + if (err) + mlxsw_sp_router_fib_abort(mlxsw_sp); + ipmr_cache_put(fib_work->men_info.mfc); + break; + case FIB_EVENT_ENTRY_DEL: + mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info); + ipmr_cache_put(fib_work->men_info.mfc); + break; + case FIB_EVENT_VIF_ADD: + err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp, + &fib_work->ven_info); + if (err) + mlxsw_sp_router_fib_abort(mlxsw_sp); + dev_put(fib_work->ven_info.dev); + break; + case FIB_EVENT_VIF_DEL: + mlxsw_sp_router_fibmr_vif_del(mlxsw_sp, + &fib_work->ven_info); + dev_put(fib_work->ven_info.dev); + break; + case FIB_EVENT_RULE_ADD: /* fall through */ + case FIB_EVENT_RULE_DEL: + rule = fib_work->fr_info.rule; + if (!ipmr_rule_default(rule) && !rule->l3mdev) + mlxsw_sp_router_fib_abort(mlxsw_sp); + fib_rule_put(rule); + break; + } + rtnl_unlock(); + kfree(fib_work); +} + static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work, struct fib_notifier_info *info) { @@ -4985,6 +5126,30 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work, } } +static void +mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work, + struct fib_notifier_info *info) +{ + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: /* fall through */ + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info)); + ipmr_cache_hold(fib_work->men_info.mfc); + break; + case FIB_EVENT_VIF_ADD: /* fall through */ + case FIB_EVENT_VIF_DEL: + memcpy(&fib_work->ven_info, info, sizeof(fib_work->ven_info)); + dev_hold(fib_work->ven_info.dev); + break; + case FIB_EVENT_RULE_ADD: /* fall through */ + case FIB_EVENT_RULE_DEL: + memcpy(&fib_work->fr_info, info, sizeof(fib_work->fr_info)); + fib_rule_get(fib_work->fr_info.rule); + break; + } +} + /* Called with rcu_read_lock() */ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, unsigned long event, void *ptr) @@ -5014,6 +5179,10 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, INIT_WORK(&fib_work->work, mlxsw_sp_router_fib6_event_work); mlxsw_sp_router_fib6_event(fib_work, info); break; + case 
RTNL_FAMILY_IPMR: + INIT_WORK(&fib_work->work, mlxsw_sp_router_fibmr_event_work); + mlxsw_sp_router_fibmr_event(fib_work, info); + break; } mlxsw_core_schedule_work(&fib_work->work); @@ -5227,12 +5396,18 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, if (err) goto err_configure; + err = mlxsw_sp_mr_rif_add(vr->mr4_table, rif); + if (err) + goto err_mr_rif_add; + mlxsw_sp_rif_counters_alloc(rif); mlxsw_sp->router->rifs[rif_index] = rif; vr->rif_count++; return rif; +err_mr_rif_add: + ops->deconfigure(rif); err_configure: if (fid) mlxsw_sp_fid_put(fid); @@ -5257,6 +5432,7 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) vr->rif_count--; mlxsw_sp->router->rifs[rif->rif_index] = NULL; mlxsw_sp_rif_counters_free(rif); + mlxsw_sp_mr_rif_del(vr->mr4_table, rif); ops->deconfigure(rif); if (fid) /* Loopback RIFs are not associated with a FID. */ @@ -6120,6 +6296,10 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) if (err) goto err_lpm_init; + err = mlxsw_sp_mr_init(mlxsw_sp, &mlxsw_sp_mr_tcam_ops); + if (err) + goto err_mr_init; + err = mlxsw_sp_vrs_init(mlxsw_sp); if (err) goto err_vrs_init; @@ -6141,6 +6321,8 @@ err_register_fib_notifier: err_neigh_init: mlxsw_sp_vrs_fini(mlxsw_sp); err_vrs_init: + mlxsw_sp_mr_fini(mlxsw_sp); +err_mr_init: mlxsw_sp_lpm_fini(mlxsw_sp); err_lpm_init: rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht); @@ -6162,6 +6344,7 @@ void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) unregister_fib_notifier(&mlxsw_sp->router->fib_nb); mlxsw_sp_neigh_fini(mlxsw_sp); mlxsw_sp_vrs_fini(mlxsw_sp); + mlxsw_sp_mr_fini(mlxsw_sp); mlxsw_sp_lpm_fini(mlxsw_sp); rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht); rhashtable_destroy(&mlxsw_sp->router->nexthop_ht); From fd890fe98f8b026642d39011d03d22fb4aa66b0f Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:21 +0200 Subject: [PATCH 0942/1322] mlxsw: spectrum: Notify multicast router on RIF MTU changes Due to the fact that multicast routes hold the minimum MTU of all the egress RIFs and trap packets that don't meet it, notify the mulitcast router code on RIF MTU changes. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 77584422ed08..dbd9c196fcfb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5773,6 +5773,17 @@ int mlxsw_sp_netdevice_router_port_event(struct net_device *dev) if (err) goto err_rif_fdb_op; + if (rif->mtu != dev->mtu) { + struct mlxsw_sp_vr *vr; + + /* The RIF is relevant only to its mr_table instance, as unlike + * unicast routing, in multicast routing a RIF cannot be shared + * between several multicast routing tables. + */ + vr = &mlxsw_sp->router->vrs[rif->vr_id]; + mlxsw_sp_mr_rif_mtu_update(vr->mr4_table, rif, dev->mtu); + } + ether_addr_copy(rif->addr, dev->dev_addr); rif->mtu = dev->mtu; From 664375e9567b5eeece8d9ebf85eaf5107cab382d Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:22 +0200 Subject: [PATCH 0943/1322] mlxsw: spectrum: router: Don't ignore IPMR notifications Make the Spectrum router logic not ignore the RTNL_FAMILY_IPMR FIB notifications. Past commits added the IPMR VIF and MFC add/del notifications via the fib_notifier chain. 
In addition, code for handling these notifications in the Spectrum router logic was added. Make the Spectrum router logic not ignore these notifications and forward the requests to the Spectrum multicast router offloading logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index dbd9c196fcfb..ef4b86b3aa9b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5159,7 +5159,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, struct mlxsw_sp_router *router; if (!net_eq(info->net, &init_net) || - (info->family != AF_INET && info->family != AF_INET6)) + (info->family != AF_INET && info->family != AF_INET6 && + info->family != RTNL_FAMILY_IPMR)) return NOTIFY_DONE; fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); From aaf2c2fb0f51f91c699039440862b6ae9c25c10e Mon Sep 17 00:00:00 2001 From: Tyler Baicar Date: Mon, 28 Aug 2017 10:53:41 -0600 Subject: [PATCH 0944/1322] ACPI / APEI: clear error status before acknowledging the error Currently we acknowledge errors before clearing the error status. This could cause a new error to be populated by firmware in-between the error acknowledgment and the error status clearing, which would cause the second error's status to be cleared without being handled. So, clear the error status before acknowledging the errors. Also, make sure to acknowledge the error if the error status read fails. Signed-off-by: Tyler Baicar Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 077f9bad6f44..3c3a37b8503b 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -743,17 +743,19 @@ static int ghes_proc(struct ghes *ghes) } ghes_do_proc(ghes, ghes->estatus); +out: + ghes_clear_estatus(ghes); + + if (rc == -ENOENT) + return rc; + /* * GHESv2 type HEST entries introduce support for error acknowledgment, * so only acknowledge the error if this support is present. */ - if (is_hest_type_generic_v2(ghes)) { - rc = ghes_ack_error(ghes->generic_v2); - if (rc) - return rc; - } -out: - ghes_clear_estatus(ghes); + if (is_hest_type_generic_v2(ghes)) + return ghes_ack_error(ghes->generic_v2); + return rc; } From 2e08d20d777e997bf37806b22b471f98fbe6b693 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 27 Sep 2017 16:34:59 -0500 Subject: [PATCH 0945/1322] percpu: fix starting offset for chunk statistics traversal This patch fixes the starting offset used when scanning chunks to compute the chunk statistics. The values start_offset (and end_offset) are managed in bytes while the traversal occurs over bits. Thus for the reserved and dynamic chunk, it may incorrectly skip over the initial allocations.
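To see the unit mismatch concretely, here is a standalone sketch (illustration only, assuming the usual PCPU_MIN_ALLOC_SIZE of 4 bytes, i.e. one allocation-map bit per 4 bytes of chunk space): a start_offset of 64 bytes must translate to bit 16 before the traversal begins, not bit 64.

/* sketch only: rescale a byte offset into an allocation-map bit index */
#include <stdio.h>

#define PCPU_MIN_ALLOC_SIZE 4	/* assumption: bytes covered by one map bit */

int main(void)
{
	unsigned int start_offset = 64;	/* chunk->start_offset, in bytes */
	unsigned int start_bit = start_offset / PCPU_MIN_ALLOC_SIZE;

	printf("traversal starts at bit %u\n", start_bit);	/* prints 16 */
	return 0;
}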
Signed-off-by: Dennis Zhou Signed-off-by: Tejun Heo --- mm/percpu-stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 6142484e88f7..7a58460bfd27 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -73,7 +73,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, last_alloc + 1 : 0; as_len = 0; - start = chunk->start_offset; + start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; /* * If a bit is set in the allocation map, the bound_map identifies From 8aba2333904f9b1c1ea038df261bf7ae8fefb98e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 02:08:43 +0200 Subject: [PATCH 0946/1322] cpufreq: docs: Drop intel-pstate.txt from index.txt Commit 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) dropped the intel-pstate.txt file from Documentation/cpu-freq/, but it did not update the index.txt file in there accordingly, so do that now. Fixes: 33fc30b47098 (cpufreq: intel_pstate: Document the current behavior and user interface) Signed-off-by: Rafael J. Wysocki --- Documentation/cpu-freq/index.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt index 03a7cee6ac73..c15e75386a05 100644 --- a/Documentation/cpu-freq/index.txt +++ b/Documentation/cpu-freq/index.txt @@ -32,8 +32,6 @@ cpufreq-stats.txt - General description of sysfs cpufreq stats. index.txt - File index, Mailing list and Links (this document) -intel-pstate.txt - Intel pstate cpufreq driver specific file. - pcc-cpufreq.txt - PCC cpufreq driver specific file. From d1b490939d8c117a06dfc562c41d933f71d30289 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Tue, 19 Sep 2017 12:11:55 -0300 Subject: [PATCH 0947/1322] scsi: aacraid: Add a small delay after IOP reset Commit 0e9973ed3382 ("scsi: aacraid: Add periodic checks to see IOP reset status") changed the way driver checks if a reset succeeded. Now, after an IOP reset, aacraid immediately start polling a register to verify the reset is complete. This behavior cause regressions on the reset path in PowerPC (at least). Since the delay after the IOP reset was removed by the aforementioned patch, the fact driver just starts to read a register instantly after the reset was issued (by writing in another register) "corrupts" the reset procedure, which ends up failing all the time. The issue highly impacted kdump on PowerPC, since on kdump path we proactively issue a reset in adapter (through the reset_devices kernel parameter). This patch (re-)adds a delay right after IOP reset is issued. Empirically we measured that 3 seconds is enough, but for safety reasons we delay for 5s (and since it was 30s before, 5s is still a small amount). For reference, without this patch we observe the following messages on kdump kernel boot process: [ 76.294] aacraid 0003:01:00.0: IOP reset failed [ 76.294] aacraid 0003:01:00.0: ARC Reset attempt failed [ 86.524] aacraid 0003:01:00.0: adapter kernel panic'd ff. [ 86.524] aacraid 0003:01:00.0: Controller reset type is 3 [ 86.524] aacraid 0003:01:00.0: Issuing IOP reset [146.534] aacraid 0003:01:00.0: IOP reset failed [146.534] aacraid 0003:01:00.0: ARC Reset attempt failed Fixes: 0e9973ed3382 ("scsi: aacraid: Add periodic checks to see IOP reset status") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Guilherme G. Piccoli Acked-by: Dave Carroll Signed-off-by: Martin K. 
Petersen --- drivers/scsi/aacraid/src.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/aacraid/src.c b/drivers/scsi/aacraid/src.c index 48c2b2b34b72..0c9361c87ec8 100644 --- a/drivers/scsi/aacraid/src.c +++ b/drivers/scsi/aacraid/src.c @@ -740,6 +740,8 @@ static void aac_send_iop_reset(struct aac_dev *dev) aac_set_intx_mode(dev); src_writel(dev, MUnit.IDR, IOP_SRC_RESET_MASK); + + msleep(5000); } static void aac_send_hardware_soft_reset(struct aac_dev *dev) From d0b7a9095c0730b92a0a2eecaba2e6b77ed87339 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 27 Sep 2017 14:44:19 +0200 Subject: [PATCH 0948/1322] scsi: ILLEGAL REQUEST + ASC==27 => target failure ASC 0x27 is "WRITE PROTECTED". This error code is returned e.g. by Fujitsu ETERNUS systems under certain conditions for WRITE SAME 16 commands with UNMAP bit set. It should not be treated as a path error. In general, it makes sense to assume that being write protected is a target rather than a path property. Signed-off-by: Martin Wilck Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 38942050b265..dab876c65473 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -580,7 +580,8 @@ int scsi_check_sense(struct scsi_cmnd *scmd) if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x24 || /* Invalid field in cdb */ - sshdr.asc == 0x26) { /* Parameter value invalid */ + sshdr.asc == 0x26 || /* Parameter value invalid */ + sshdr.asc == 0x27) { /* Write protected */ set_host_byte(scmd, DID_TARGET_FAILURE); } return SUCCESS; From 393debc23c7820211d1c8253dd6a8408a7628fe7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 10:23:35 -0700 Subject: [PATCH 0949/1322] md: separate request handling With commit cc27b0c78c79, pers->make_request could bail out without handling the bio. If that happens, we should retry. The commit fixes md_make_request but not other call sites. Separate the request handling part, so other call sites can use it. Reported-by: Nate Dailey Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 58 ++++++++++++++++++++++++++++--------------------- drivers/md/md.h | 1 + 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 08fcaebc61bd..1db1a22ed835 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,6 +266,37 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. 
*/ +void md_handle_request(struct mddev *mddev, struct bio *bio) +{ +check_suspended: + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} +EXPORT_SYMBOL(md_handle_request); + static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); @@ -285,23 +316,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } -check_suspended: - rcu_read_lock(); - if (mddev->suspended) { - DEFINE_WAIT(__wait); - for (;;) { - prepare_to_wait(&mddev->sb_wait, &__wait, - TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) - break; - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - } - finish_wait(&mddev->sb_wait, &__wait); - } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); /* * save the sectors now since our bio can @@ -310,20 +324,14 @@ check_suspended: sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); - goto check_suspended; - } + + md_handle_request(mddev, bio); cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); part_stat_unlock(); - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); - return BLK_QC_T_NONE; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 561d22b9a9a8..d8287d3cd1bf 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -692,6 +692,7 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); +extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, From 79bf31a3b2a7ca467cfec8ff97d359a77065d01f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 09:55:28 -0700 Subject: [PATCH 0950/1322] md: fix a race condition for flush request handling md_submit_flush_data calls pers->make_request, which missed the suspend check. Fix it with the new md_handle_request API. 
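The call-site pattern is the same here and in the dm-raid fix that follows: stop invoking the personality hook directly and go through the common entry point so the suspend check cannot be bypassed. A minimal before/after sketch, not the full patch, with names as introduced by the previous change:

	/* before: calls the personality directly, skipping the suspend check */
	mddev->pers->make_request(mddev, bio);

	/* after: md_handle_request() waits while the array is suspended and
	 * retries if the personality could not take the bio
	 */
	md_handle_request(mddev, bio);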
Reported-by: Nate Dailey Tested-by: Nate Dailey Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1db1a22ed835..0ff1bbf6c90e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -447,16 +447,22 @@ static void md_submit_flush_data(struct work_struct *ws) struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct bio *bio = mddev->flush_bio; + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); } - - mddev->flush_bio = NULL; - wake_up(&mddev->sb_wait); } void md_flush_request(struct mddev *mddev, struct bio *bio) From c4d6a1b8e8ea79c439a4871cba540443c9eb13b9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 10:29:22 -0700 Subject: [PATCH 0951/1322] dm-raid: fix a race condition in request handling raid_map calls pers->make_request, which missed the suspend check. Fix it with the new md_handle_request API. Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) Cc: Heinz Mauelshagen Cc: Mike Snitzer Cc: stable@vger.kernel.org Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5bfe285ea9d1..1ac58c5651b7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3238,7 +3238,7 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - mddev->pers->make_request(mddev, bio); + md_handle_request(mddev, bio); return DM_MAPIO_SUBMITTED; } From 7d5d7b5058fbd638914e42504677141a69f43011 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Sep 2017 11:03:52 -0700 Subject: [PATCH 0952/1322] md/raid5: cap worker count static checker reports a potential integer overflow. Cap the worker count to avoid the overflow. 
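Taken on its own, the hardening pattern is simply to parse into the narrower unsigned type and reject anything above an explicit limit; a rough sketch of the sysfs store path follows (the 8192 cap mirrors the value chosen in the patch below):

	unsigned int new;

	if (kstrtouint(page, 10, &new))
		return -EINVAL;
	/* 8192 should be big enough for any sane worker-group count */
	if (new > 8192)
		return -EINVAL;
	/* ... continue allocating worker groups using the bounded value ... */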
Reported-by: Dan Carpenter Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 076409455b60..928e24a07133 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6575,14 +6575,17 @@ static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf; - unsigned long new; + unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; - if (kstrtoul(page, 10, &new)) + if (kstrtouint(page, 10, &new)) + return -EINVAL; + /* 8192 should be big enough */ + if (new > 8192) return -EINVAL; err = mddev_lock(mddev); From 38e8a5c040d3ec99a8351c688dcdf0f549611565 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 21 Aug 2017 12:04:50 +0300 Subject: [PATCH 0953/1322] net/mlx5e: IPoIB, Fix access to invalid memory address When cleaning rdma netdevice we need to save the mdev pointer because priv is released when we release netdev. This bug was found using the kernel address sanitizer (KASAN). use-after-free in mlx5_rdma_netdev_free+0xe3/0x100 [mlx5_core] Fixes: 48935bbb7ae8 ("net/mlx5e: IPoIB, Add netdevice profile skeleton") Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 85298051a3e4..145e392ab849 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -572,12 +572,13 @@ void mlx5_rdma_netdev_free(struct net_device *netdev) { struct mlx5e_priv *priv = mlx5i_epriv(netdev); const struct mlx5e_profile *profile = priv->profile; + struct mlx5_core_dev *mdev = priv->mdev; mlx5e_detach_netdev(priv); profile->cleanup(priv); destroy_workqueue(priv->wq); free_netdev(netdev); - mlx5e_destroy_mdev_resources(priv->mdev); + mlx5e_destroy_mdev_resources(mdev); } EXPORT_SYMBOL(mlx5_rdma_netdev_free); From 99d3cd27f755d63fd6cf85169eaa873d90769aa5 Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Thu, 24 Aug 2017 17:21:44 +0300 Subject: [PATCH 0954/1322] net/mlx5: Fix FPGA capability location Currently, the FPGA capability is located in (mdev)->caps.hca_cur; change the location to (mdev)->caps.fpga, since hca_cur is reserved for HCA device capabilities.
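The accessor change is only about where the queried capability words are stored; a rough before/after sketch of the lookup, mirroring the MLX5_CAP_FPGA macro change in the diff below:

	/* before: FPGA capability words borrowed a slot in the HCA capability array */
	MLX5_GET(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap);

	/* after: FPGA capability words live in their own dev->caps.fpga storage */
	MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap);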
Fixes: e29341fb3a5b ("net/mlx5: FPGA, Add basic support for Innova") Signed-off-by: Inbar Karmy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 3 +-- include/linux/mlx5/device.h | 5 ++--- include/linux/mlx5/driver.h | 1 + 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c index e37453d838db..c0fd2212e890 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c @@ -71,11 +71,11 @@ int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, return 0; } -int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps) +int mlx5_fpga_caps(struct mlx5_core_dev *dev) { u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0}; - return mlx5_core_access_reg(dev, in, sizeof(in), caps, + return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga, MLX5_ST_SZ_BYTES(fpga_cap), MLX5_REG_FPGA_CAP, 0, 0); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h index 94bdfd47c3f0..d05233c9b4f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h @@ -65,7 +65,7 @@ struct mlx5_fpga_qp_counters { u64 rx_total_drop; }; -int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps); +int mlx5_fpga_caps(struct mlx5_core_dev *dev); int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query); int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op); int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c index 9034e9960a76..dc8970346521 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c @@ -139,8 +139,7 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) if (err) goto out; - err = mlx5_fpga_caps(fdev->mdev, - fdev->mdev->caps.hca_cur[MLX5_CAP_FPGA]); + err = mlx5_fpga_caps(fdev->mdev); if (err) goto out; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index eaf4ad209c8f..e32dbc4934db 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -980,7 +980,6 @@ enum mlx5_cap_type { MLX5_CAP_RESERVED, MLX5_CAP_VECTOR_CALC, MLX5_CAP_QOS, - MLX5_CAP_FPGA, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1110,10 +1109,10 @@ enum mlx5_mcam_feature_groups { MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) #define MLX5_CAP_FPGA(mdev, cap) \ - MLX5_GET(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap) + MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap) #define MLX5_CAP64_FPGA(mdev, cap) \ - MLX5_GET64(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap) + MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap) enum { MLX5_CMD_STAT_OK = 0x0, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 02ff700e4f30..401c8972cc3a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -774,6 +774,7 @@ struct mlx5_core_dev { u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; u32 pcam[MLX5_ST_SZ_DW(pcam_reg)]; u32 mcam[MLX5_ST_SZ_DW(mcam_reg)]; + u32 fpga[MLX5_ST_SZ_DW(fpga_cap)]; } caps; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; From 
16f1c5bb3ed75b3cf3ced537db40f7e1a244debe Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Sun, 30 Jul 2017 11:02:51 +0300 Subject: [PATCH 0955/1322] net/mlx5: Check device capability for maximum flow counters Added check for the maximal number of flow counters attached to rule (FTE). Fixes: bd5251dbf156b ('net/mlx5_core: Introduce flow steering destination of type counter') Signed-off-by: Raed Salem Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 8 ++++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 11 +++++++++++ include/linux/mlx5/mlx5_ifc.h | 3 ++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index e0d0efd903bc..36ecc2b2e187 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -293,6 +293,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, } if (fte->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev, + log_max_flow_counter, + ft->type)); int list_size = 0; list_for_each_entry(dst, &fte->node.children, node.list) { @@ -305,12 +308,17 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, in_dests += MLX5_ST_SZ_BYTES(dest_format_struct); list_size++; } + if (list_size > max_list_size) { + err = -EINVAL; + goto err_out; + } MLX5_SET(flow_context, in_flow_context, flow_counter_list_size, list_size); } err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +err_out: kvfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 5509a752f98e..48dd78975062 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -52,6 +52,7 @@ enum fs_flow_table_type { FS_FT_FDB = 0X4, FS_FT_SNIFFER_RX = 0X5, FS_FT_SNIFFER_TX = 0X6, + FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX, }; enum fs_flow_table_op_mod { @@ -260,4 +261,14 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev); #define fs_for_each_dst(pos, fte) \ fs_list_for_each_entry(pos, &(fte)->node.children) +#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) ( \ + (type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) : \ + (type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) : \ + (type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) : \ + (type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) : \ + (type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) : \ + (type == FS_FT_SNIFFER_TX) ? 
MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) : \ + (BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\ + ) + #endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a528b35a022e..69772347f866 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -327,7 +327,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_80[0x18]; u8 log_max_destination[0x8]; - u8 reserved_at_a0[0x18]; + u8 log_max_flow_counter[0x8]; + u8 reserved_at_a8[0x10]; u8 log_max_flow[0x8]; u8 reserved_at_c0[0x40]; From ace743214ea205c7d433562c5fa24e33bdfda7ab Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 5 Sep 2017 15:05:51 +0300 Subject: [PATCH 0956/1322] net/mlx5e: Fix erroneous freeing of encap header buffer In case the neighbour for the tunnel destination isn't valid, we send a neighbour update request but we free the encap header buffer. This is wrong, because we still need it for allocating a HW encap entry once the neighbour is available. Fix that by skipping freeing it if we wait for neighbour. Fixes: 232c001398ae ('net/mlx5e: Add support to neighbour update flow') Signed-off-by: Paul Blakey Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index da503e6411da..4e2fc016bdd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1564,7 +1564,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, break; default: err = -EOPNOTSUPP; - goto out; + goto free_encap; } fl4.flowi4_tos = tun_key->tos; fl4.daddr = tun_key->u.ipv4.dst; @@ -1573,7 +1573,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, &fl4, &n, &ttl); if (err) - goto out; + goto free_encap; /* used by mlx5e_detach_encap to lookup a neigh hash table * entry in the neigh hash table when a user deletes a rule @@ -1590,7 +1590,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, */ err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e); if (err) - goto out; + goto free_encap; read_lock_bh(&n->lock); nud_state = n->nud_state; @@ -1630,8 +1630,9 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, destroy_neigh_entry: mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); -out: +free_encap: kfree(encap_header); +out: if (n) neigh_release(n); return err; @@ -1668,7 +1669,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, break; default: err = -EOPNOTSUPP; - goto out; + goto free_encap; } fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label); @@ -1678,7 +1679,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, &fl6, &n, &ttl); if (err) - goto out; + goto free_encap; /* used by mlx5e_detach_encap to lookup a neigh hash table * entry in the neigh hash table when a user deletes a rule @@ -1695,7 +1696,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, */ err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e); if (err) - goto out; + goto free_encap; read_lock_bh(&n->lock); nud_state = n->nud_state; @@ -1736,8 +1737,9 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, destroy_neigh_entry: 
mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); -out: +free_encap: kfree(encap_header); +out: if (n) neigh_release(n); return err; From bdd66ac0aeed971d1cb42b3aa0d11b0ea3842e09 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 11 Jun 2017 21:13:25 +0300 Subject: [PATCH 0957/1322] net/mlx5e: Disallow TC offloading of unsupported match/action combinations When offloading header re-write, the HW may need to adjust checksums along the packet. For IP traffic, and a case where we are asked to modify fields in the IP header, current HW supports that only for TCP and UDP. Enforce it, in this case fail the offloading attempt for non TCP/UDP packets. Fixes: d7e75a325cb2 ('net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions') Fixes: 2f4fe4cab073 ('net/mlx5e: Add offloading of NIC TC pedit (header re-write) actions') Signed-off-by: Or Gerlitz Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 4e2fc016bdd6..d3786005fba7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1317,6 +1317,69 @@ static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 upda return true; } +static bool modify_header_match_supported(struct mlx5_flow_spec *spec, + struct tcf_exts *exts) +{ + const struct tc_action *a; + bool modify_ip_header; + LIST_HEAD(actions); + u8 htype, ip_proto; + void *headers_v; + u16 ethertype; + int nkeys, i; + + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + + /* for non-IP we only re-write MACs, so we're okay */ + if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) + goto out_ok; + + modify_ip_header = false; + tcf_exts_to_list(exts, &actions); + list_for_each_entry(a, &actions, list) { + if (!is_tcf_pedit(a)) + continue; + + nkeys = tcf_pedit_nkeys(a); + for (i = 0; i < nkeys; i++) { + htype = tcf_pedit_htype(a, i); + if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 || + htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6) { + modify_ip_header = true; + break; + } + } + } + + ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol); + if (modify_ip_header && ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) { + pr_info("can't offload re-write of ip proto %d\n", ip_proto); + return false; + } + +out_ok: + return true; +} + +static bool actions_match_supported(struct mlx5e_priv *priv, + struct tcf_exts *exts, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow) +{ + u32 actions; + + if (flow->flags & MLX5E_TC_FLOW_ESWITCH) + actions = flow->esw_attr->action; + else + actions = flow->nic_attr->action; + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return modify_header_match_supported(&parse_attr->spec, exts); + + return true; +} + static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, struct mlx5e_tc_flow_parse_attr *parse_attr, struct mlx5e_tc_flow *flow) @@ -1378,6 +1441,9 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, return -EINVAL; } + if (!actions_match_supported(priv, exts, parse_attr, flow)) + return -EOPNOTSUPP; + return 0; } @@ -1936,6 +2002,10 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, return -EINVAL; } + + if 
(!actions_match_supported(priv, exts, parse_attr, flow)) + return -EOPNOTSUPP; + return err; } From b281208911a549e391d92ee6cb680dcd3d71783b Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 8 Aug 2017 11:45:28 +0300 Subject: [PATCH 0958/1322] net/mlx5e: Check encap entry state when offloading tunneled flows Encap entries cached by the driver could be invalidated due to tunnel destination neighbour state changes. When attempting to offload a flow that uses a cached encap entry, we must check the entry validity and defer the offloading if the entry exists but not valid. When EAGAIN is returned, the flow offloading to hardware takes place by the neigh update code when the tunnel destination neighbour becomes connected. Fixes: 232c001398ae ("net/mlx5e: Add support to neighbour update flow") Signed-off-by: Vlad Buslov Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index d3786005fba7..1aa2028ed995 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1859,6 +1859,7 @@ vxlan_encap_offload_err: } } + /* must verify if encap is valid or not */ if (found) goto attach_flow; @@ -1885,6 +1886,8 @@ attach_flow: *encap_dev = e->out_dev; if (e->flags & MLX5_ENCAP_ENTRY_VALID) attr->encap_id = e->encap_id; + else + err = -EAGAIN; return err; From b20eab15a1d5091e45022401e75b49948e8be33f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 12 Sep 2017 17:51:12 +0300 Subject: [PATCH 0959/1322] net/mlx5e: Print netdev features correctly in error message Use the correct formatting for netdev features. Fixes: 0e405443e803 ("net/mlx5e: Improve set features ndo resiliency") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dfc29720ab77..84b013dc62e9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3333,8 +3333,8 @@ static int mlx5e_handle_feature(struct net_device *netdev, err = feature_handler(netdev, enable); if (err) { - netdev_err(netdev, "%s feature 0x%llx failed err %d\n", - enable ? "Enable" : "Disable", feature, err); + netdev_err(netdev, "%s feature %pNF failed, err %d\n", + enable ? "Enable" : "Disable", &feature, err); return err; } From 1456f69ff5fbba48ed5bc86e858e945e693ba0b7 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 10 Sep 2017 10:36:06 +0300 Subject: [PATCH 0960/1322] net/mlx5e: Don't add/remove 802.1ad rules when changing 802.1Q VLAN filter Toggling of C-tag VLAN filter should not affect the "any S-tag" steering rule. 
Fixes: 8a271746a264 ("net/mlx5e: Receive s-tagged packets in promiscuous mode") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index f11fd07ac4dd..850cdc980ab5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -291,7 +291,7 @@ void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv) priv->fs.vlan.filter_disabled = false; if (priv->netdev->flags & IFF_PROMISC) return; - mlx5e_del_any_vid_rules(priv); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); } void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv) @@ -302,7 +302,7 @@ void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv) priv->fs.vlan.filter_disabled = true; if (priv->netdev->flags & IFF_PROMISC) return; - mlx5e_add_any_vid_rules(priv); + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); } int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, From 603e1f5bd3ca76f16688e10040545594d2e91ba4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 13 Sep 2017 15:37:50 +0300 Subject: [PATCH 0961/1322] net/mlx5e: Fix calculated checksum offloads counters Instead of calculating the offloads counters, count them explicitly. The calculations done for these counters would result in bugs in some cases, for example: When running TCP traffic over a VXLAN tunnel with TSO enabled the following counters would increase: tx_csum_partial: 1,333,284 tx_csum_partial_inner: 29,286 tx4_csum_partial_inner: 384 tx7_csum_partial_inner: 8 tx9_csum_partial_inner: 34 tx10_csum_partial_inner: 26,807 tx11_csum_partial_inner: 287 tx12_csum_partial_inner: 27 tx16_csum_partial_inner: 6 tx25_csum_partial_inner: 1,733 Seems like tx_csum_partial increased out of nowhere. The issue is in the following calculation in mlx5e_update_sw_counters: s->tx_csum_partial = s->tx_packets - tx_offload_none - s->tx_csum_partial_inner; While tx_packets increases by the number of GSO segments for each SKB, tx_csum_partial_inner will only increase by one, resulting in wrong tx_csum_partial counter. 
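To see why the old derivation drifts, a rough worked example helps (the numbers below are invented for illustration and are not taken from the report above):

    /*
     * Illustrative arithmetic only -- not driver code.
     * One TSO skb sent over a VXLAN tunnel, segmented into 10 packets
     * by the hardware:
     *
     *   tx_packets            += 10  (one per GSO segment)
     *   tx_csum_partial_inner +=  1  (counted once per skb)
     *   csum_none             +=  0
     *
     * Derived counter (old code):
     *   tx_csum_partial = tx_packets - csum_none - tx_csum_partial_inner
     *                   = 10 - 0 - 1 = 9   <-- phantom increments
     *
     * Counting csum_partial explicitly per skb, as this patch does,
     * leaves it at 0 here, matching what the stack actually requested.
     */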
Fixes: bfe6d8d1d433 ("net/mlx5e: Reorganize ethtool statistics") Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +++------ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 1 + 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 84b013dc62e9..cc11bbbd0309 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -184,7 +184,6 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv) struct mlx5e_sw_stats temp, *s = &temp; struct mlx5e_rq_stats *rq_stats; struct mlx5e_sq_stats *sq_stats; - u64 tx_offload_none = 0; int i, j; memset(s, 0, sizeof(*s)); @@ -199,6 +198,7 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv) s->rx_lro_bytes += rq_stats->lro_bytes; s->rx_csum_none += rq_stats->csum_none; s->rx_csum_complete += rq_stats->csum_complete; + s->rx_csum_unnecessary += rq_stats->csum_unnecessary; s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; s->rx_xdp_drop += rq_stats->xdp_drop; s->rx_xdp_tx += rq_stats->xdp_tx; @@ -229,14 +229,11 @@ static void mlx5e_update_sw_counters(struct mlx5e_priv *priv) s->tx_queue_dropped += sq_stats->dropped; s->tx_xmit_more += sq_stats->xmit_more; s->tx_csum_partial_inner += sq_stats->csum_partial_inner; - tx_offload_none += sq_stats->csum_none; + s->tx_csum_none += sq_stats->csum_none; + s->tx_csum_partial += sq_stats->csum_partial; } } - /* Update calculated offload counters */ - s->tx_csum_partial = s->tx_packets - tx_offload_none - s->tx_csum_partial_inner; - s->rx_csum_unnecessary = s->rx_packets - s->rx_csum_none - s->rx_csum_complete; - s->link_down_events_phy = MLX5_GET(ppcnt_reg, priv->stats.pport.phy_counters, counter_set.phys_layer_cntrs.link_down_events); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f1dd638384d3..15a1687483cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -627,6 +627,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, if (lro) { skb->ip_summed = CHECKSUM_UNNECESSARY; + rq->stats.csum_unnecessary++; return; } @@ -644,7 +645,9 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, skb->csum_level = 1; skb->encapsulation = 1; rq->stats.csum_unnecessary_inner++; + return; } + rq->stats.csum_unnecessary++; return; } csum_none: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 6d199ffb1c0b..f8637213afc0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -68,6 +68,7 @@ struct mlx5e_sw_stats { u64 rx_xdp_drop; u64 rx_xdp_tx; u64 rx_xdp_tx_full; + u64 tx_csum_none; u64 tx_csum_partial; u64 tx_csum_partial_inner; u64 tx_queue_stopped; @@ -108,6 +109,7 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_none) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) }, { 
MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) }, @@ -339,6 +341,7 @@ struct mlx5e_rq_stats { u64 packets; u64 bytes; u64 csum_complete; + u64 csum_unnecessary; u64 csum_unnecessary_inner; u64 csum_none; u64 lro_packets; @@ -363,6 +366,7 @@ static const struct counter_desc rq_stats_desc[] = { { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, @@ -392,6 +396,7 @@ struct mlx5e_sq_stats { u64 tso_bytes; u64 tso_inner_packets; u64 tso_inner_bytes; + u64 csum_partial; u64 csum_partial_inner; u64 nop; /* less likely accessed in data path */ @@ -408,6 +413,7 @@ static const struct counter_desc sq_stats_desc[] = { { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_bytes) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) }, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index fee43e40fa16..1d6925d4369a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -193,6 +193,7 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct sq->stats.csum_partial_inner++; } else { eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; + sq->stats.csum_partial++; } } else sq->stats.csum_none++; From 480df991b869eff02a004e8fe7707900437cfcd4 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 Aug 2017 18:52:14 +0300 Subject: [PATCH 0962/1322] net/mlx5: Fix static checker warning on steering tracepoints code Fix this sparse complaint: drivers/net/ethernet/mellanox/mlx5/core/./diag/fs_tracepoint.h:172:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1) Fixes: d9fea79171ee ('net/mlx5: Add tracepoints') Signed-off-by: Matan Barak Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h index 1e3a6c3e4132..80eef4163f52 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h @@ -139,7 +139,7 @@ TRACE_EVENT(mlx5_fs_del_fg, {MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"} TRACE_EVENT(mlx5_fs_set_fte, - TP_PROTO(const struct fs_fte *fte, bool new_fte), + TP_PROTO(const struct fs_fte *fte, int new_fte), TP_ARGS(fte, new_fte), TP_STRUCT__entry( __field(const struct fs_fte *, fte) @@ -149,7 +149,7 @@ TRACE_EVENT(mlx5_fs_set_fte, __field(u32, action) __field(u32, flow_tag) __field(u8, mask_enable) - __field(bool, new_fte) + __field(int, new_fte) __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) __array(u32, mask_inner, 
MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) From 353f59f4d41e9c5798a15c5c52958f25b579a3d5 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 24 Sep 2017 09:54:00 +0200 Subject: [PATCH 0963/1322] net/mlx5: Fix wrong indentation in enable SRIOV code Smatch is screaming: drivers/net/ethernet/mellanox/mlx5/core/sriov.c:112 mlx5_device_enable_sriov() warn: inconsistent indenting fix that. Fixes: 7ecf6d8ff154 ('IB/mlx5: Restore IB guid/policy for virtual functions') Signed-off-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 6c48e9959b65..2a8b529ce6dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -109,7 +109,7 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) mlx5_core_warn(dev, "failed to restore VF %d settings, err %d\n", vf, err); - continue; + continue; } } mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); From da541b20021c781f8b65492eeaee824e729599eb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Sep 2017 17:34:23 -0500 Subject: [PATCH 0964/1322] objtool: Skip unreachable warnings for GCC 4.4 and older The kbuild bot occasionally reports warnings like: drivers/scsi/pcmcia/aha152x_core.o: warning: objtool: seldo_run()+0x130: unreachable instruction These warnings are always with GCC 4.4. That version of GCC sometimes places unreachable instructions after calls to noreturn functions. The unreachable warnings aren't very important anyway. Just ignore them for old versions of GCC. 
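A minimal sketch of the code shape behind these reports may help; the function name is made up. Newer compilers drop everything after a call to a noreturn function, but GCC 4.4 can still emit a few instructions there, which objtool then flags:

    /* Illustrative only; do_fatal_error() is a stand-in for any noreturn callee. */
    void do_fatal_error(void) __attribute__((noreturn));

    int check(int x)
    {
            if (x >= 0)
                    return x;
            do_fatal_error();
            /* Nothing can execute past this call, yet GCC 4.4 sometimes
             * emits trailing instructions here, producing objtool's
             * "unreachable instruction" warning. */
    }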
Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bc89b807d965b98ec18a0bb94f96a594bd58f2f2.1506551639.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- scripts/Makefile.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 2e3a10e79ca9..061d0c3a420a 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -265,6 +265,8 @@ objtool_args += --no-fp endif ifdef CONFIG_GCOV_KERNEL objtool_args += --no-unreachable +else +objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable) endif # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory From 607a4029d439cdfa258aff5da32bb9cd6ed1a66d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Sep 2017 10:36:38 -0500 Subject: [PATCH 0965/1322] objtool: Support unoptimized frame pointer setup Arnd Bergmann reported a bunch of warnings like: crypto/jitterentropy.o: warning: objtool: jent_fold_time()+0x3b: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_stuck()+0x1d: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_unbiased_bit()+0x15: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_read_entropy()+0x32: call without frame pointer save/setup crypto/jitterentropy.o: warning: objtool: jent_entropy_collector_free()+0x19: call without frame pointer save/setup and arch/x86/events/core.o: warning: objtool: collect_events uses BP as a scratch register arch/x86/events/core.o: warning: objtool: events_ht_sysfs_show()+0x22: call without frame pointer save/setup With certain rare configurations, GCC sometimes sets up the frame pointer with: lea (%rsp),%rbp instead of: mov %rsp,%rbp The instructions are equivalent, so treat the former like the latter. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a468af8b28a69b83fffc6d7668be9b6fcc873699.1506526584.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- tools/objtool/arch/x86/decode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0f22768c0d4d..34a579f806e3 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -284,11 +284,16 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, case 0x8d: if (sib == 0x24 && rex_w && !rex_b && !rex_x) { - /* lea disp(%rsp), reg */ *type = INSN_STACK; - op->src.type = OP_SRC_ADD; + if (!insn.displacement.value) { + /* lea (%rsp), reg */ + op->src.type = OP_SRC_REG; + } else { + /* lea disp(%rsp), reg */ + op->src.type = OP_SRC_ADD; + op->src.offset = insn.displacement.value; + } op->src.reg = CFI_SP; - op->src.offset = insn.displacement.value; op->dest.type = OP_DEST_REG; op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r]; From 66a733ea6b611aecf0119514d2dddab5f9d6c01e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: [PATCH 0966/1322] seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. 
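The hazard and the fix can be pictured with a simplified sketch (helper names follow the patch; locking details and error handling are omitted):

    /* Simplified sketch, not the literal kernel code. */
    filter = task->seccomp.filter;          /* read once under ->siglock */
    __get_seccomp_filter(filter);           /* pin this specific object  */
    spin_unlock_irq(&task->sighand->siglock);

    /* task->seccomp.filter may now be replaced (e.g. via TSYNC), but the
     * copy_to_user() still works on the object pinned above ...         */

    __put_seccomp_filter(filter);           /* ... and that same object is
                                               released, not whatever the
                                               task points at now        */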
Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Cc: stable@vger.kernel.org # needs s/refcount_/atomic_/ for v4.12 and earlier Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c24579dfa7a1..bb3a38005b9c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + refcount_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - refcount_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -491,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -503,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { memset(info, 0, sizeof(*info)); @@ -1025,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: From fe659bcc9b173bcfdd958ce2aec75e47651e74e1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:22 -0400 Subject: [PATCH 0967/1322] USB: dummy-hcd: fix connection failures (wrong speed) The dummy-hcd UDC driver is not careful about the way it handles connection speeds. It ignores the module parameter that is supposed to govern the maximum connection speed and it doesn't set the HCD flags properly for the case where it ends up running at full speed. The result is that in many cases, gadget enumeration over dummy-hcd fails because the bMaxPacketSize byte in the device descriptor is set incorrectly. For example, the default settings call for a high-speed connection, but the maxpacket value for ep0 ends up being set for a Super-Speed connection. This patch fixes the problem by initializing the gadget's max_speed and the HCD flags correctly. 
Signed-off-by: Alan Stern CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index b1e21b3be6e1..d515ec31afe4 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -1036,7 +1036,12 @@ static int dummy_udc_probe(struct platform_device *pdev) memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; - dum->gadget.max_speed = USB_SPEED_SUPER; + if (mod_data.is_super_speed) + dum->gadget.max_speed = USB_SPEED_SUPER; + else if (mod_data.is_high_speed) + dum->gadget.max_speed = USB_SPEED_HIGH; + else + dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); @@ -2560,8 +2565,6 @@ static struct hc_driver dummy_hcd = { .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), - .flags = HCD_USB3 | HCD_SHARED, - .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, @@ -2590,8 +2593,12 @@ static int dummy_hcd_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); - if (!mod_data.is_super_speed) + if (mod_data.is_super_speed) + dummy_hcd.flags = HCD_USB3 | HCD_SHARED; + else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; + else + dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; From 0173a68bfb0ad1c72a6ee39cc485aa2c97540b98 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:40 -0400 Subject: [PATCH 0968/1322] USB: dummy-hcd: fix infinite-loop resubmission bug The dummy-hcd HCD/UDC emulator tries not to do too much work during each timer interrupt. But it doesn't try very hard; currently all it does is limit the total amount of bulk data transferred. Other transfer types aren't limited, and URBs that transfer no data (because of an error, perhaps) don't count toward the limit, even though on a real USB bus they would consume at least a minimum overhead. This means it's possible to get the driver stuck in an infinite loop, for example, if the host class driver resubmits an URB every time it completes (which is common for interrupt URBs). Each time the URB is resubmitted it gets added to the end of the pending-URBs list, and dummy-hcd doesn't stop until that list is empty. Andrey Konovalov was able to trigger this failure mode using the syzkaller fuzzer. This patch fixes the infinite-loop problem by restricting the URBs handled during each timer interrupt to those that were already on the pending list when the interrupt routine started. Newly added URBs won't be processed until the next timer interrupt. The problem of properly accounting for non-bulk bandwidth (as well as packet and transaction overhead) is not addressed here. 
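The boundary idea can be sketched roughly as follows (the field and list names follow the driver, but the surrounding code is heavily simplified):

    /* Rough sketch, not the literal driver code. */

    /* enqueue path: remember the first URB queued after the handler started */
    if (!dum_hcd->next_frame_urbp)
            dum_hcd->next_frame_urbp = urbp;

    /* timer handler: reset the marker, then stop the walk when it is hit */
    dum_hcd->next_frame_urbp = NULL;
    list_for_each_entry_safe(urbp, tmp, &dum_hcd->urbp_list, urbp_list) {
            if (urbp == dum_hcd->next_frame_urbp)
                    break;          /* queued after this pass began */
            /* ... process urbp; its completion may resubmit the URB,
             * which re-queues it behind the marker for the next tick ... */
    }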
Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index d515ec31afe4..b2ab9cc33fec 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -237,6 +237,8 @@ struct dummy_hcd { struct usb_device *udev; struct list_head urbp_list; + struct urbp *next_frame_urbp; + u32 stream_en_ep; u8 num_stream[30 / 2]; @@ -1250,6 +1252,8 @@ static int dummy_urb_enqueue( list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; + if (!dum_hcd->next_frame_urbp) + dum_hcd->next_frame_urbp = urbp; if (usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ @@ -1766,6 +1770,7 @@ static void dummy_timer(unsigned long _dum_hcd) spin_unlock_irqrestore(&dum->lock, flags); return; } + dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) @@ -1782,6 +1787,10 @@ restart: int type; int status = -EINPROGRESS; + /* stop when we reach URBs queued after the timer interrupt */ + if (urbp == dum_hcd->next_frame_urbp) + break; + urb = urbp->urb; if (urb->unlinked) goto return_urb; From 7dbd8f4cabd96db5a50513de9d83a8105a5ffc81 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:49 -0400 Subject: [PATCH 0969/1322] USB: dummy-hcd: Fix erroneous synchronization change A recent change to the synchronization in dummy-hcd was incorrect. The issue was that dummy_udc_stop() contained no locking and therefore could race with various gadget driver callbacks, and the fix was to add locking and issue the callbacks with the private spinlock held. UDC drivers aren't supposed to do this. Gadget driver callback routines are allowed to invoke functions in the UDC driver, and these functions will generally try to acquire the private spinlock. This would deadlock the driver. The correct solution is to drop the spinlock before issuing callbacks, and avoid races by emulating the synchronize_irq() call that all real UDC drivers must perform in their ->udc_stop() routines after disabling interrupts. This involves adding a flag to dummy-hcd's private structure to keep track of whether interrupts are supposed to be enabled, and adding a counter to keep track of ongoing callbacks so that dummy_udc_stop() can wait for them all to finish. A real UDC driver won't receive disconnect, reset, suspend, resume, or setup events once it has disabled interrupts. dummy-hcd will receive them but won't try to issue any gadget driver callbacks, which should be just as good. 
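The scheme can be condensed into a short sketch (field names follow the patch; only the essential steps are shown): every site that calls into the gadget driver bumps a counter and drops the private lock around the callback, and the stop path disables further callbacks and then polls the counter until it drains, playing the role of the synchronize_irq() a real UDC driver would perform:

    /* Condensed sketch of the pattern added by this patch. */

    /* around any gadget driver callback: */
    ++dum->callback_usage;
    spin_unlock(&dum->lock);
    dum->driver->disconnect(&dum->gadget);  /* one example callback */
    spin_lock(&dum->lock);
    --dum->callback_usage;

    /* in dummy_udc_stop(): */
    dum->ints_enabled = 0;                  /* no new callbacks will be issued */
    while (dum->callback_usage > 0) {       /* wait out the in-flight ones */
            spin_unlock_irq(&dum->lock);
            usleep_range(1000, 2000);
            spin_lock_irq(&dum->lock);
    }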
Signed-off-by: Alan Stern Fixes: f16443a034c7 ("USB: gadgetfs, dummy-hcd, net2280: fix locking for callbacks") CC: Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/dummy_hcd.c | 32 ++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index b2ab9cc33fec..b17618a55f1b 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -255,11 +255,13 @@ struct dummy { */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; + int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; + unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; @@ -441,18 +443,27 @@ static void set_link_state(struct dummy_hcd *dum_hcd) (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ - if (dum->driver && (disconnect || reset)) { + if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); + ++dum->callback_usage; + spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } - } else if (dum_hcd->active != dum_hcd->old_active) { + } else if (dum_hcd->active != dum_hcd->old_active && + dum->ints_enabled) { + ++dum->callback_usage; + spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; @@ -973,8 +984,11 @@ static int dummy_udc_start(struct usb_gadget *g, * can't enumerate without help from the driver we're binding. */ + spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; + dum->ints_enabled = 1; + spin_unlock_irq(&dum->lock); return 0; } @@ -985,6 +999,16 @@ static int dummy_udc_stop(struct usb_gadget *g) struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); + dum->ints_enabled = 0; + stop_activity(dum); + + /* emulate synchronize_irq(): wait for callbacks to finish */ + while (dum->callback_usage > 0) { + spin_unlock_irq(&dum->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dum->lock); + } + dum->driver = NULL; spin_unlock_irq(&dum->lock); @@ -1529,6 +1553,8 @@ static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? dum->ss_hcd : dum->hs_hcd))) return NULL; + if (!dum->ints_enabled) + return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { @@ -1870,10 +1896,12 @@ restart: * until setup() returns; no reentrancy issues etc. */ if (value > 0) { + ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); + --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ From 4dcf4bab4a409e81284b8202137e4a85b96b34de Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:23 +0900 Subject: [PATCH 0970/1322] usb: gadget: udc: renesas_usb3: fix for no-data control transfer When bRequestType & USB_DIR_IN is false and req.length is 0 in control transfer, since it means non-data, this driver should not set the mode as control write. So, this patch fixes it. 
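For background, the three shapes of a control transfer can be told apart from the SETUP packet alone; the sketch below is generic USB logic with hypothetical names, not code from renesas_usb3.c, and the last branch is the case this patch stops programming as a control write:

    /* Generic illustration; the CTRL_* names are hypothetical. */
    if (ctrl->bRequestType & USB_DIR_IN)
            stage = CTRL_READ;              /* data stage: device -> host */
    else if (le16_to_cpu(ctrl->wLength))
            stage = CTRL_WRITE;             /* data stage: host -> device */
    else
            stage = CTRL_NO_DATA;           /* no data stage; the status
                                               stage follows immediately  */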
Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: # v4.5+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index df37c1e6e9d5..555c105e82df 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1150,7 +1150,8 @@ static void usb3_start_pipe0(struct renesas_usb3_ep *usb3_ep, usb3_set_p0_con_for_ctrl_read_data(usb3); } else { usb3_clear_bit(usb3, P0_MOD_DIR, USB3_P0_MOD); - usb3_set_p0_con_for_ctrl_write_data(usb3); + if (usb3_req->req.length) + usb3_set_p0_con_for_ctrl_write_data(usb3); } usb3_p0_xfer(usb3_ep, usb3_req); From 73f2f5745f18b4ccfe9484deac4e84a1378d19fd Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:24 +0900 Subject: [PATCH 0971/1322] usb: gadget: udc: renesas_usb3: fix Pn_RAMMAP.Pn_MPKT value According to the datasheet of R-Car Gen3, the Pn_RAMMAP.Pn_MPKT should be set to one of 8, 16, 32, 64, 512 and 1024. Otherwise, when a gadget driver uses an interrupt endpoint, unexpected behavior happens. So, this patch fixes it. Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: # v4.5+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 555c105e82df..7e0c53492356 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -2054,7 +2054,16 @@ static u32 usb3_calc_ramarea(int ram_size) static u32 usb3_calc_rammap_val(struct renesas_usb3_ep *usb3_ep, const struct usb_endpoint_descriptor *desc) { - return usb3_ep->rammap_val | PN_RAMMAP_MPKT(usb_endpoint_maxp(desc)); + int i; + const u32 max_packet_array[] = {8, 16, 32, 64, 512}; + u32 mpkt = PN_RAMMAP_MPKT(1024); + + for (i = 0; i < ARRAY_SIZE(max_packet_array); i++) { + if (usb_endpoint_maxp(desc) <= max_packet_array[i]) + mpkt = PN_RAMMAP_MPKT(max_packet_array[i]); + } + + return usb3_ep->rammap_val | mpkt; } static int usb3_enable_pipe_n(struct renesas_usb3_ep *usb3_ep, From 447b8a01b84f048d93d43bfe1fcaa4fcc56595cc Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 25 Sep 2017 17:01:25 +0900 Subject: [PATCH 0972/1322] usb: gadget: udc: renesas_usb3: Fix return value of usb3_write_pipe() This patch fixes an issue that this driver cannot go status stage in control read when the req.zero is set to 1 and the len in usb3_write_pipe() is set to 0. Otherwise, if we use g_ncm driver, usb enumeration takes long time (5 seconds or more). 
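The zero-length case can be summarized in a small sketch (condensed from the change itself): when nothing is left to send, for instance the terminating zero-length packet issued because req.zero is set after an exact multiple of max-packet, the write must still be marked as the final one so the controller can advance to the status stage:

    /* Condensed view of the is_last decision in this patch. */
    bool is_last = !len;    /* zero-length write: it terminates the data stage */

    /* ... copy len bytes into the FIFO ... */

    if (!is_last)
            is_last = usb3_is_transfer_complete(usb3_ep, usb3_req);

    usb3_set_px_con_send(usb3_ep, len, is_last);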
Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: # v4.5+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/renesas_usb3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 7e0c53492356..63a206122058 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -1038,7 +1038,7 @@ static int usb3_write_pipe(struct renesas_usb3_ep *usb3_ep, usb3_ep->ep.maxpacket); u8 *buf = usb3_req->req.buf + usb3_req->req.actual; u32 tmp = 0; - bool is_last; + bool is_last = !len ? true : false; if (usb3_wait_pipe_status(usb3_ep, PX_STA_BUFSTS) < 0) return -EBUSY; @@ -1059,7 +1059,8 @@ static int usb3_write_pipe(struct renesas_usb3_ep *usb3_ep, usb3_write(usb3, tmp, fifo_reg); } - is_last = usb3_is_transfer_complete(usb3_ep, usb3_req); + if (!is_last) + is_last = usb3_is_transfer_complete(usb3_ep, usb3_req); /* Send the data */ usb3_set_px_con_send(usb3_ep, len, is_last); From 6124607acc88fffeaadf3aacfeb3cc1304c87387 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:12 +0900 Subject: [PATCH 0973/1322] usb: renesas_usbhs: fix the BCLR setting condition for non-DCP pipe This patch fixes an issue that the driver sets the BCLR bit of {C,Dn}FIFOCTR register to 1 even when it's non-DCP pipe and the FRDY bit of {C,Dn}FIFOCTR register is set to 1. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Cc: # v3.1+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/renesas_usbhs/fifo.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index d1af831f43eb..03cac07f57b5 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -282,11 +282,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_fifo *fifo) { struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); + int ret = 0; if (!usbhs_pipe_is_dcp(pipe)) - usbhsf_fifo_barrier(priv, fifo); + ret = usbhsf_fifo_barrier(priv, fifo); - usbhs_write(priv, fifo->ctr, BCLR); + /* + * if non-DCP pipe, this driver should set BCLR when + * usbhsf_fifo_barrier() returns 0. + */ + if (!ret) + usbhs_write(priv, fifo->ctr, BCLR); } static int usbhsf_fifo_rcv_len(struct usbhs_priv *priv, From 0a2ce62b61f2c76d0213edf4e37aaf54a8ddf295 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:13 +0900 Subject: [PATCH 0974/1322] usb: renesas_usbhs: fix usbhsf_fifo_clear() for RX direction This patch fixes an issue that the usbhsf_fifo_clear() is possible to cause 10 msec delay if the pipe is RX direction and empty because the FRDY bit will never be set to 1 in such case. 
Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Cc: # v3.1+ Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/renesas_usbhs/fifo.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index 03cac07f57b5..68f26904c316 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -284,8 +284,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); int ret = 0; - if (!usbhs_pipe_is_dcp(pipe)) - ret = usbhsf_fifo_barrier(priv, fifo); + if (!usbhs_pipe_is_dcp(pipe)) { + /* + * This driver checks the pipe condition first to avoid -EBUSY + * from usbhsf_fifo_barrier() with about 10 msec delay in + * the interrupt handler if the pipe is RX direction and empty. + */ + if (usbhs_pipe_is_dir_in(pipe)) + ret = usbhs_pipe_is_accessible(pipe); + if (!ret) + ret = usbhsf_fifo_barrier(priv, fifo); + } /* * if non-DCP pipe, this driver should set BCLR when From addfc5823dbf3e6ed400e98e49c7e64b10e191d6 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 12 Sep 2017 10:24:40 +0100 Subject: [PATCH 0975/1322] usb: gadget: ffs: handle I/O completion in-order By submitting completed transfers to the system workqueue there is no guarantee that completion events will be queued up in the correct order, as in multi-processor systems there is a thread running for each processor and the work items are not bound to a particular core. This means that several completions are in the queue at the same time, they may be processed in parallel and complete out of order, resulting in data appearing corrupt when read by userspace. Create a single-threaded workqueue for FunctionFS so that data completed requests is passed to userspace in the order in which they complete. Acked-by: Michal Nazarewicz Signed-off-by: John Keeping Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 17 +++++++++++++---- drivers/usb/gadget/function/u_fs.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 9990944a7245..8b342587f8ad 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -46,7 +46,8 @@ static void ffs_data_get(struct ffs_data *ffs); static void ffs_data_put(struct ffs_data *ffs); /* Creates new ffs_data object. */ -static struct ffs_data *__must_check ffs_data_new(void) __attribute__((malloc)); +static struct ffs_data *__must_check ffs_data_new(const char *dev_name) + __attribute__((malloc)); /* Opened counter handling. 
*/ static void ffs_data_opened(struct ffs_data *ffs); @@ -780,11 +781,12 @@ static void ffs_epfile_async_io_complete(struct usb_ep *_ep, struct usb_request *req) { struct ffs_io_data *io_data = req->context; + struct ffs_data *ffs = io_data->ffs; ENTER(); INIT_WORK(&io_data->work, ffs_user_copy_worker); - schedule_work(&io_data->work); + queue_work(ffs->io_completion_wq, &io_data->work); } static void __ffs_epfile_read_buffer_free(struct ffs_epfile *epfile) @@ -1500,7 +1502,7 @@ ffs_fs_mount(struct file_system_type *t, int flags, if (unlikely(ret < 0)) return ERR_PTR(ret); - ffs = ffs_data_new(); + ffs = ffs_data_new(dev_name); if (unlikely(!ffs)) return ERR_PTR(-ENOMEM); ffs->file_perms = data.perms; @@ -1610,6 +1612,7 @@ static void ffs_data_put(struct ffs_data *ffs) BUG_ON(waitqueue_active(&ffs->ev.waitq) || waitqueue_active(&ffs->ep0req_completion.wait) || waitqueue_active(&ffs->wait)); + destroy_workqueue(ffs->io_completion_wq); kfree(ffs->dev_name); kfree(ffs); } @@ -1642,7 +1645,7 @@ static void ffs_data_closed(struct ffs_data *ffs) ffs_data_put(ffs); } -static struct ffs_data *ffs_data_new(void) +static struct ffs_data *ffs_data_new(const char *dev_name) { struct ffs_data *ffs = kzalloc(sizeof *ffs, GFP_KERNEL); if (unlikely(!ffs)) @@ -1650,6 +1653,12 @@ static struct ffs_data *ffs_data_new(void) ENTER(); + ffs->io_completion_wq = alloc_ordered_workqueue("%s", 0, dev_name); + if (!ffs->io_completion_wq) { + kfree(ffs); + return NULL; + } + refcount_set(&ffs->ref, 1); atomic_set(&ffs->opened, 0); ffs->state = FFS_READ_DESCRIPTORS; diff --git a/drivers/usb/gadget/function/u_fs.h b/drivers/usb/gadget/function/u_fs.h index 540f1c48c1a8..79f70ebf85dc 100644 --- a/drivers/usb/gadget/function/u_fs.h +++ b/drivers/usb/gadget/function/u_fs.h @@ -279,6 +279,7 @@ struct ffs_data { } file_perms; struct eventfd_ctx *ffs_eventfd; + struct workqueue_struct *io_completion_wq; bool no_disconnect; struct work_struct reset_work; From 6baeda120d90aa637b08f7604de104ab00ce9126 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 31 Aug 2017 14:51:40 +0200 Subject: [PATCH 0976/1322] usb: gadget: udc: atmel: set vbus irqflags explicitly The driver triggers actions on both edges of the vbus signal. The former PIO controller was triggering IRQs on both falling and rising edges by default. Newer PIO controller don't, so it's better to set it explicitly to IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING. Without this patch we may trigger the connection with host but only on some bouncing signal conditions and thus lose connecting events. 
Acked-by: Ludovic Desroches Signed-off-by: Nicolas Ferre Cc: stable # v4.4+ Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/atmel_usba_udc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index 98d71400f8a1..a884c022df7a 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -29,6 +29,8 @@ #include #include "atmel_usba_udc.h" +#define USBA_VBUS_IRQFLAGS (IRQF_ONESHOT \ + | IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING) #ifdef CONFIG_USB_GADGET_DEBUG_FS #include @@ -2361,7 +2363,7 @@ static int usba_udc_probe(struct platform_device *pdev) IRQ_NOAUTOEN); ret = devm_request_threaded_irq(&pdev->dev, gpio_to_irq(udc->vbus_pin), NULL, - usba_vbus_irq_thread, IRQF_ONESHOT, + usba_vbus_irq_thread, USBA_VBUS_IRQFLAGS, "atmel_usba_udc", udc); if (ret) { udc->vbus_pin = -ENODEV; From c3cdce45f8d3dfa5c3467894aa89798314920328 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 30 Aug 2017 19:03:52 +0800 Subject: [PATCH 0977/1322] usb: dwc3: of-simple: Add compatible for Spreadtrum SC9860 platform Add compatible string to use this generic glue layer to support Spreadtrum SC9860 platform's dwc3 controller. Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-of-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/dwc3-of-simple.c b/drivers/usb/dwc3/dwc3-of-simple.c index 4cef7d4f9cd0..a26d1fde0f5e 100644 --- a/drivers/usb/dwc3/dwc3-of-simple.c +++ b/drivers/usb/dwc3/dwc3-of-simple.c @@ -177,6 +177,7 @@ static const struct of_device_id of_dwc3_simple_match[] = { { .compatible = "rockchip,rk3399-dwc3" }, { .compatible = "xlnx,zynqmp-dwc3" }, { .compatible = "cavium,octeon-7130-usb-uctl" }, + { .compatible = "sprd,sc9860-dwc3" }, { /* Sentinel */ } }; MODULE_DEVICE_TABLE(of, of_dwc3_simple_match); From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Thu, 28 Sep 2017 12:37:31 +0800 Subject: [PATCH 0978/1322] irq/generic-chip: Don't replace domain's name When generic irq chips are allocated for an irq domain the domain name is set to the irq chip name. That was done to have named domains before the recent changes which enforce domain naming were done. Since then the overwrite causes a memory leak when the domain name is dynamically allocated and even worse it would cause the domain free code to free the wrong name pointer, which might point to a constant. Remove the name assignment to prevent this. 
Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com --- kernel/irq/generic-chip.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } - d->name = name; return 0; } EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); From 8c28ef3f1c1c57b6f468343d5959e5125b30334d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 25 Sep 2017 02:01:01 -0600 Subject: [PATCH 0979/1322] xen-pciback: relax BAR sizing write value check Just like done in d2bd05d88d ("xen-pciback: return proper values during BAR sizing") for the ROM BAR, ordinary ones also shouldn't compare the written value directly against ~0, but consider the r/o bits at the bottom (if any). Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xen-pciback/conf_space_header.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index 5fbfd9cfb6d6..5b3d57fc82d3 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -169,6 +169,9 @@ static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) { struct pci_bar_info *bar = data; + unsigned int pos = (offset - PCI_BASE_ADDRESS_0) / 4; + const struct resource *res = dev->resource; + u32 mask; if (unlikely(!bar)) { pr_warn(DRV_NAME ": driver data not found for %s\n", @@ -179,7 +182,13 @@ static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) /* A write to obtain the length must happen as a 32-bit write. * This does not (yet) support writing individual bytes */ - if (value == ~0) + if (res[pos].flags & IORESOURCE_IO) + mask = ~PCI_BASE_ADDRESS_IO_MASK; + else if (pos && (res[pos - 1].flags & IORESOURCE_MEM_64)) + mask = 0; + else + mask = ~PCI_BASE_ADDRESS_MEM_MASK; + if ((value | mask) == ~0U) bar->which = 1; else { u32 tmpval; From 0d805ee70a69eabd38160dc199e183ac2f13fe4b Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 27 Sep 2017 02:41:25 -0700 Subject: [PATCH 0980/1322] xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables mapping When bootup a PVM guest with large memory(Ex.240GB), XEN provided initial mapping overlaps with kernel module virtual space. When mapping in this space is cleared by xen_cleanhighmap(), in certain case there could be an 2MB mapping left. This is due to XEN initialize 4MB aligned mapping but xen_cleanhighmap() finish at 2MB boundary. When module loading is just on top of the 2MB space, got below warning: WARNING: at mm/vmalloc.c:106 vmap_pte_range+0x14e/0x190() Call Trace: [] warn_alloc_failed+0xf3/0x160 [] __vmalloc_area_node+0x182/0x1c0 [] ? module_alloc_update_bounds+0x1e/0x80 [] __vmalloc_node_range+0xa7/0x110 [] ? module_alloc_update_bounds+0x1e/0x80 [] module_alloc+0x64/0x70 [] ? 
module_alloc_update_bounds+0x1e/0x80 [] module_alloc_update_bounds+0x1e/0x80 [] move_module+0x27/0x150 [] layout_and_allocate+0x120/0x1b0 [] load_module+0x78/0x640 [] ? security_file_permission+0x8b/0x90 [] sys_init_module+0x62/0x1e0 [] system_call_fastpath+0x16/0x1b Then the mapping of 2MB is cleared, finally oops when the page in that space is accessed. BUG: unable to handle kernel paging request at ffff880022600000 IP: [] clear_page_c_e+0x7/0x10 PGD 1788067 PUD 178c067 PMD 22434067 PTE 0 Oops: 0002 [#1] SMP Call Trace: [] ? prep_new_page+0x127/0x1c0 [] get_page_from_freelist+0x1e2/0x550 [] ? ii_iovec_copy_to_user+0x90/0x140 [] __alloc_pages_nodemask+0x12d/0x230 [] alloc_pages_vma+0xc6/0x1a0 [] ? pte_mfn_to_pfn+0x7d/0x100 [] do_anonymous_page+0x16b/0x350 [] handle_pte_fault+0x1e4/0x200 [] ? xen_pmd_val+0xe/0x10 [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e [] handle_mm_fault+0x15b/0x270 [] do_page_fault+0x140/0x470 [] page_fault+0x25/0x30 Call xen_cleanhighmap() with 4MB aligned for page tables mapping to fix it. The unnecessory call of xen_cleanhighmap() in DEBUG mode is also removed. -v2: add comment about XEN alignment from Juergen. References: https://lists.xen.org/archives/html/xen-devel/2012-07/msg01562.html Signed-off-by: Zhenzhong Duan Reviewed-by: Juergen Gross [boris: added 'xen/mmu' tag to commit subject] Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 509f560bd0c6..58b09fcadbaa 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1238,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void) * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if + * As Xen is aligning the memory end to a 4MB boundary, for good + * measure we also round up to PMD_SIZE * 2 - which means that if * anybody is using __ka address to the initial boot-stack - and try * to use it - they are going to crash. The xen_start_info has been * taken care of already in xen_setup_kernel_pagetable. */ addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + size = xen_start_info->nr_pt_frames * PAGE_SIZE; - xen_cleanhighmap(addr, addr + size); + xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2)); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. */ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif } #endif From 686fef928bba6be13cabe639f154af7d72b63120 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Sep 2017 06:38:17 -0700 Subject: [PATCH 0981/1322] timer: Prepare to change timer callback argument type Modern kernel callback systems pass the structure associated with a given callback to the callback function. The timer callback remains one of the legacy cases where an arbitrary unsigned long argument continues to be passed as the callback argument. This has several problems: - This bloats the timer_list structure with a normally redundant .data field. 
- No type checking is being performed, forcing callbacks to do explicit type casts of the unsigned long argument into the object that was passed, rather than using container_of(), as done in most of the other callback infrastructure. - Neighboring buffer overflows can overwrite both the .function and the .data field, providing attackers with a way to elevate from a buffer overflow into a simplistic ROP-like mechanism that allows calling arbitrary functions with a controlled first argument. - For future Control Flow Integrity work, this creates a unique function prototype for timer callbacks, instead of allowing them to continue to be clustered with other void functions that take a single unsigned long argument. This adds a new timer initialization API, which will ultimately replace the existing setup_timer(), setup_{deferrable,pinned,etc}_timer() family, named timer_setup() (to mirror hrtimer_setup(), making instances of its use much easier to grep for). In order to support the migration of existing timers into the new callback arguments, timer_setup() casts its arguments to the existing legacy types, and explicitly passes the timer pointer as the legacy data argument. Once all setup_*timer() callers have been replaced with timer_setup(), the casts can be removed, and the data argument can be dropped with the timer expiration code changed to just pass the timer to the callback directly. Since the regular pattern of using container_of() during local variable declaration repeats the need for the variable type declaration to be included, this adds a helper modeled after other from_*() helpers that wrap container_of(), named from_timer(). This helper uses typeof(*variable), removing the type redundancy and minimizing the need for line wraps in forthcoming conversions from "unsigned data long" to "struct timer_list *" in the timer callbacks: -void callback(unsigned long data) +void callback(struct timer_list *t) { - struct some_data_structure *local = (struct some_data_structure *)data; + struct some_data_structure *local = from_timer(local, t, timer); Finally, in order to support the handful of timer users that perform open-coded assignments of the .function (and .data) fields, provide cast macros (TIMER_FUNC_TYPE and TIMER_DATA_TYPE) that can be used temporarily. Once conversion has been completed, these can be globally trivially removed. Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20170928133817.GA113410@beast --- include/linux/timer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/timer.h b/include/linux/timer.h index e6789b8757d5..6383c528b148 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -168,6 +168,20 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) +#define TIMER_DATA_TYPE unsigned long +#define TIMER_FUNC_TYPE void (*)(TIMER_DATA_TYPE) + +static inline void timer_setup(struct timer_list *timer, + void (*callback)(struct timer_list *), + unsigned int flags) +{ + __setup_timer(timer, (TIMER_FUNC_TYPE)callback, + (TIMER_DATA_TYPE)timer, flags); +} + +#define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) + /** * timer_pending - is a timer pending? 
* @timer: the timer in question From 1fa4df3e688902d033dfda796eb83ae6ad8d0488 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 27 Sep 2017 16:35:00 -0500 Subject: [PATCH 0982/1322] percpu: fix iteration to prevent skipping over block The iterator functions pcpu_next_md_free_region and pcpu_next_fit_region use the block offset to determine if they have checked the area in the prior iteration. However, this causes an issue when the block offset is greater than subsequent block contig hints. If within the iterator it moves to check subsequent blocks, it may fail in the second predicate due to the block offset not being cleared. Thus, this causes the allocator to skip over blocks leading to false failures when allocating from the reserved chunk. While this happens in the general case as well, it will only fail if it cannot allocate a new chunk. This patch resets the block offset to 0 to pass the second predicate when checking subseqent blocks within the iterator function. Signed-off-by: Dennis Zhou Reported-and-tested-by: Luis Henriques Signed-off-by: Tejun Heo --- mm/percpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/percpu.c b/mm/percpu.c index 59d44d61f5f1..aa121cef76de 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -353,6 +353,8 @@ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, block->contig_hint_start); return; } + /* reset to satisfy the second predicate above */ + block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; @@ -407,6 +409,8 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, *bit_off = pcpu_block_off_to_off(i, block->first_free); return; } + /* reset to satisfy the second predicate above */ + block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); From 2580c4c17aee3ad58e9751012bad278dd074ccae Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 28 Sep 2017 11:32:37 +0200 Subject: [PATCH 0983/1322] tun: bail out from tun_get_user() if the skb is empty KMSAN (https://github.com/google/kmsan) reported accessing uninitialized skb->data[0] in the case the skb is empty (i.e. skb->len is 0): ================================================ BUG: KMSAN: use of uninitialized memory in tun_get_user+0x19ba/0x3770 CPU: 0 PID: 3051 Comm: probe Not tainted 4.13.0+ #3140 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: ... __msan_warning_32+0x66/0xb0 mm/kmsan/kmsan_instr.c:477 tun_get_user+0x19ba/0x3770 drivers/net/tun.c:1301 tun_chr_write_iter+0x19f/0x300 drivers/net/tun.c:1365 call_write_iter ./include/linux/fs.h:1743 new_sync_write fs/read_write.c:457 __vfs_write+0x6c3/0x7f0 fs/read_write.c:470 vfs_write+0x3e4/0x770 fs/read_write.c:518 SYSC_write+0x12f/0x2b0 fs/read_write.c:565 SyS_write+0x55/0x80 fs/read_write.c:557 do_syscall_64+0x242/0x330 arch/x86/entry/common.c:284 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:245 ... origin: ... 
kmsan_poison_shadow+0x6e/0xc0 mm/kmsan/kmsan.c:211 slab_alloc_node mm/slub.c:2732 __kmalloc_node_track_caller+0x351/0x370 mm/slub.c:4351 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x26a/0x810 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:903 alloc_skb_with_frags+0x1d7/0xc80 net/core/skbuff.c:4756 sock_alloc_send_pskb+0xabf/0xfe0 net/core/sock.c:2037 tun_alloc_skb drivers/net/tun.c:1144 tun_get_user+0x9a8/0x3770 drivers/net/tun.c:1274 tun_chr_write_iter+0x19f/0x300 drivers/net/tun.c:1365 call_write_iter ./include/linux/fs.h:1743 new_sync_write fs/read_write.c:457 __vfs_write+0x6c3/0x7f0 fs/read_write.c:470 vfs_write+0x3e4/0x770 fs/read_write.c:518 SYSC_write+0x12f/0x2b0 fs/read_write.c:565 SyS_write+0x55/0x80 fs/read_write.c:557 do_syscall_64+0x242/0x330 arch/x86/entry/common.c:284 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:245 ================================================ Make sure tun_get_user() doesn't touch skb->data[0] unless there is actual data. C reproducer below: ========================== // autogenerated by syzkaller (http://github.com/google/syzkaller) #define _GNU_SOURCE #include #include #include #include #include #include int main() { int sock = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); int tun_fd = open("/dev/net/tun", O_RDWR); struct ifreq req; memset(&req, 0, sizeof(struct ifreq)); strcpy((char*)&req.ifr_name, "gre0"); req.ifr_flags = IFF_UP | IFF_MULTICAST; ioctl(tun_fd, TUNSETIFF, &req); ioctl(sock, SIOCSIFFLAGS, "gre0"); write(tun_fd, "hi", 0); return 0; } ========================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller --- drivers/net/tun.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3c9985f29950..5ce580f413b9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1496,11 +1496,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { - switch (skb->data[0] & 0xf0) { - case 0x40: + u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; + + switch (ip_version) { + case 4: pi.proto = htons(ETH_P_IP); break; - case 0x60: + case 6: pi.proto = htons(ETH_P_IPV6); break; default: From c0a1666bcb2a33e84187a15eabdcd54056be9a97 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Sep 2017 17:58:41 +0200 Subject: [PATCH 0984/1322] KVM: VMX: use cmpxchg64 This fixes a compilation failure on 32-bit systems. 
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b9d2140eb212..7f62c94196d1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2238,8 +2238,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.ndst = (dest << 8) & 0xFF00; new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -11730,8 +11730,8 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); @@ -11800,8 +11800,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); /* We should not block the vCPU if an interrupt is posted for it. */ if (pi_test_on(pi_desc) == 1) From b28503a3fe6400757817e4460090f96bc1b9d6e7 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 15 Sep 2017 09:14:03 +0200 Subject: [PATCH 0985/1322] perf test: Fix vmlinux failure on s390x On s390x perf test 1 failed. It turned out that commit 4a084ecfc821 ("perf report: Fix module symbol adjustment for s390x") was incorrect. The previous implementation in dso__load_sym() is also suitable for s390x. Therefore this patch undoes commit 4a084ecfc821. 
Signed-off-by: Thomas-Mich Richter Cc: Hendrik Brueckner Cc: Zvonko Kosic Fixes: 4a084ecfc821 ("perf report: Fix module symbol adjustment for s390x") LPU-Reference: 20170915071404.58398-1-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-5ani7ly57zji7s0hmzkx416l@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/sym-handling.c | 8 -------- tools/perf/util/symbol-elf.c | 8 +------- tools/perf/util/symbol.h | 3 --- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c index e103f6e46afe..581d4c5a896b 100644 --- a/tools/perf/arch/s390/util/sym-handling.c +++ b/tools/perf/arch/s390/util/sym-handling.c @@ -18,12 +18,4 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) return false; return ehdr.e_type == ET_REL || ehdr.e_type == ET_DYN; } - -void arch__adjust_sym_map_offset(GElf_Sym *sym, - GElf_Shdr *shdr __maybe_unused, - struct map *map) -{ - if (map->type == MAP__FUNCTION) - sym->st_value += map->start; -} #endif diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 5c39f420111e..9cf781f0d8a2 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -810,12 +810,6 @@ static u64 ref_reloc(struct kmap *kmap) void __weak arch__sym_update(struct symbol *s __maybe_unused, GElf_Sym *sym __maybe_unused) { } -void __weak arch__adjust_sym_map_offset(GElf_Sym *sym, GElf_Shdr *shdr, - struct map *map __maybe_unused) -{ - sym->st_value -= shdr->sh_addr - shdr->sh_offset; -} - int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, int kmodule) { @@ -996,7 +990,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, /* Adjust symbol to map to file offset */ if (adjust_kernel_syms) - arch__adjust_sym_map_offset(&sym, &shdr, map); + sym.st_value -= shdr.sh_addr - shdr.sh_offset; if (strcmp(section_name, (curr_dso->short_name + diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 2bd6a1f01a1c..aad99e7e179b 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -344,9 +344,6 @@ int setup_intlist(struct intlist **list, const char *list_str, #ifdef HAVE_LIBELF_SUPPORT bool elf__needs_adjust_symbols(GElf_Ehdr ehdr); void arch__sym_update(struct symbol *s, GElf_Sym *sym); -void arch__adjust_sym_map_offset(GElf_Sym *sym, - GElf_Shdr *shdr __maybe_unused, - struct map *map __maybe_unused); #endif #define SYMBOL_A 0 From 5357413f5c067f60933e4b8d79d483fbe62b2bb5 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 15 Sep 2017 09:14:04 +0200 Subject: [PATCH 0986/1322] perf test: Fix vmlinux failure on s390x part 2 On s390x perf test 1 failed. It turned out that commit cf6383f73cf2 ("perf report: Fix kernel symbol adjustment for s390x") was incorrect. The previous implementation in dso__load_sym() is also suitable for s390x. 
Therefore this patch undoes commit cf6383f73cf2 Signed-off-by: Thomas-Mich Richter Cc: Zvonko Kosic Cc: Hendrik Brueckner Fixes: cf6383f73cf2 ("perf report: Fix kernel symbol adjustment for s390x") LPU-Reference: 20170915071404.58398-2-tmricht@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-v101o8k25vuja2ogosgf15yy@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/util/Build | 1 - tools/perf/arch/s390/util/sym-handling.c | 21 --------------------- 2 files changed, 22 deletions(-) delete mode 100644 tools/perf/arch/s390/util/sym-handling.c diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build index bd518b623d7a..5bd7b9260cc0 100644 --- a/tools/perf/arch/s390/util/Build +++ b/tools/perf/arch/s390/util/Build @@ -1,5 +1,4 @@ libperf-y += header.o -libperf-y += sym-handling.o libperf-y += kvm-stat.o libperf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c deleted file mode 100644 index 581d4c5a896b..000000000000 --- a/tools/perf/arch/s390/util/sym-handling.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Architecture specific ELF symbol handling and relocation mapping. - * - * Copyright 2017 IBM Corp. - * Author(s): Thomas Richter - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - */ - -#include "symbol.h" - -#ifdef HAVE_LIBELF_SUPPORT -bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) -{ - if (ehdr.e_type == ET_EXEC) - return false; - return ehdr.e_type == ET_REL || ehdr.e_type == ET_DYN; -} -#endif From bd86e32059526e2d0d13ca1e4447dfbbddb6e5cc Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Wed, 27 Sep 2017 20:28:57 +0800 Subject: [PATCH 0987/1322] dm crypt: fix memory leak in crypt_ctr_cipher_old() Fix memory leak of cipher_api. Fixes: 33d2f09fcb35 (dm crypt: introduce new format of cipher with "capi:" prefix) Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Jeffy Chen Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a55ffd4f5933..75341fdca4b6 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2466,6 +2466,7 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key kfree(cipher_api); return ret; } + kfree(cipher_api); return 0; bad_mem: From aff3da39211105a42b2108b8af79bf8e16f670fd Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Mon, 25 Sep 2017 14:59:46 +0200 Subject: [PATCH 0988/1322] net: mvpp2: fix parsing fragmentation detection Parsing fragmentation detection failed due to wrong configured parser TCAM entry's. Some traffic was marked as fragmented in RX descriptor, even it wasn't IP fragmented. The hardware also failed to calculate checksums which lead to use software checksum and caused performance degradation. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Antoine Tenart Signed-off-by: Stefan Chulski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index dd0ee2691c86..da04939a2748 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -676,6 +676,7 @@ enum mvpp2_tag_type { #define MVPP2_PRS_RI_L3_MCAST BIT(15) #define MVPP2_PRS_RI_L3_BCAST (BIT(15) | BIT(16)) #define MVPP2_PRS_RI_IP_FRAG_MASK 0x20000 +#define MVPP2_PRS_RI_IP_FRAG_TRUE BIT(17) #define MVPP2_PRS_RI_UDF3_MASK 0x300000 #define MVPP2_PRS_RI_UDF3_RX_SPECIAL BIT(21) #define MVPP2_PRS_RI_L4_PROTO_MASK 0x1c00000 @@ -2315,7 +2316,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, (proto != IPPROTO_IGMP)) return -EINVAL; - /* Fragmented packet */ + /* Not fragmented packet */ tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, MVPP2_PE_LAST_FREE_TID); if (tid < 0) @@ -2334,8 +2335,12 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, MVPP2_PRS_SRAM_OP_SEL_UDF_ADD); mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT, MVPP2_PRS_IPV4_DIP_AI_BIT); - mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_MASK, - ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK); + mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK); + + mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, + MVPP2_PRS_TCAM_PROTO_MASK_L); + mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, + MVPP2_PRS_TCAM_PROTO_MASK); mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK); mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT); @@ -2346,7 +2351,7 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4); mvpp2_prs_hw_write(priv, &pe); - /* Not fragmented packet */ + /* Fragmented packet */ tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, MVPP2_PE_LAST_FREE_TID); if (tid < 0) @@ -2358,8 +2363,11 @@ static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto, pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0; mvpp2_prs_sram_ri_update(&pe, ri, ri_mask); - mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, MVPP2_PRS_TCAM_PROTO_MASK_L); - mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, MVPP2_PRS_TCAM_PROTO_MASK); + mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE, + ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK); + + mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0); + mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0); /* Update shadow table and hw entry */ mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4); From 6bf69a1d6334bed776875c5ca852594ab4e5b209 Mon Sep 17 00:00:00 2001 From: Yan Markman Date: Mon, 25 Sep 2017 14:59:47 +0200 Subject: [PATCH 0989/1322] net: mvpp2: fix port list indexing The private port_list array has a list of pointers to mvpp2_port instances. This list is allocated given the number of ports enabled in the device tree, but the pointers are set using the port-id property. If on a single port is enabled, the port_list array will be of size 1, but when registering the port, if its id is not 0 the driver will crash. Other crashes were encountered in various situations. This fixes the issue by using an index not equal to the value of the port-id property. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Antoine Tenart Signed-off-by: Yan Markman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvpp2.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index da04939a2748..b2f99df81e9c 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -7504,7 +7504,7 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv, /* Ports initialization */ static int mvpp2_port_probe(struct platform_device *pdev, struct device_node *port_node, - struct mvpp2 *priv) + struct mvpp2 *priv, int index) { struct device_node *phy_node; struct phy *comphy; @@ -7678,7 +7678,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, } netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr); - priv->port_list[id] = port; + priv->port_list[index] = port; return 0; err_free_port_pcpu: @@ -8013,10 +8013,12 @@ static int mvpp2_probe(struct platform_device *pdev) } /* Initialize ports */ + i = 0; for_each_available_child_of_node(dn, port_node) { - err = mvpp2_port_probe(pdev, port_node, priv); + err = mvpp2_port_probe(pdev, port_node, priv, i); if (err < 0) goto err_mg_clk; + i++; } platform_set_drvdata(pdev, priv); From c7dfc8c848a48f176096f66a14879fb3333a460f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 25 Sep 2017 14:59:48 +0200 Subject: [PATCH 0990/1322] net: mvpp2: do not select the internal source clock This patch stops the internal MAC Tx clock from being enabled as the internal clock isn't used. The definition used for the bit controlling this behaviour is renamed as well as it was wrongly named (bit 4 of GMAC_CTRL_2_REG). Fixes: 3919357fb0bb ("net: mvpp2: initialize the GMAC when using a port") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index b2f99df81e9c..161055564720 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -333,7 +333,7 @@ #define MVPP2_GMAC_INBAND_AN_MASK BIT(0) #define MVPP2_GMAC_FLOW_CTRL_MASK GENMASK(2, 1) #define MVPP2_GMAC_PCS_ENABLE_MASK BIT(3) -#define MVPP2_GMAC_PORT_RGMII_MASK BIT(4) +#define MVPP2_GMAC_INTERNAL_CLK_MASK BIT(4) #define MVPP2_GMAC_DISABLE_PADDING BIT(5) #define MVPP2_GMAC_PORT_RESET_MASK BIT(6) #define MVPP2_GMAC_AUTONEG_CONFIG 0xc @@ -4599,7 +4599,6 @@ static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port) val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK; } else if (phy_interface_mode_is_rgmii(port->phy_interface)) { val &= ~MVPP2_GMAC_PCS_ENABLE_MASK; - val |= MVPP2_GMAC_PORT_RGMII_MASK; } writel(val, port->base + MVPP2_GMAC_CTRL_2_REG); From 35f493b87ec072c5a2497ffbee243095ef725827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Sep 2017 08:40:02 -0700 Subject: [PATCH 0991/1322] inetpeer: fix RCU lookup() again My prior fix was not complete, as we were dereferencing a pointer three times per node, not twice as I initially thought. Fixes: 4cc5b44b29a9 ("inetpeer: fix RCU lookup()") Fixes: b145425f269a ("inetpeer: remove AVL implementation in favor of RB tree") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/inetpeer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..b20c8ac64081 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; From 8f1975e31d8ed0c021a1993a4d9123dd39c740ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Sep 2017 09:14:14 -0700 Subject: [PATCH 0992/1322] inetpeer: speed up inetpeer_invalidate_tree() As measured in my prior patch ("sch_netem: faster rb tree removal"), rbtree_postorder_for_each_entry_safe() is nice looking but much slower than using rb_next() directly, except when tree is small enough to fit in CPU caches (then the cost is the same) From: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..6e5626cc366c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); From 76cf546c2802f6e25113ba481d7e85d0298768c6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 25 Sep 2017 10:13:49 -0700 Subject: [PATCH 0993/1322] net_sched: use idr to allocate bpf filter handles Instead of calling cls_bpf_get() in a loop to find a unused handle, just switch to idr API to allocate new handles. Cc: Daniel Borkmann Cc: Chris Mi Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_bpf.c | 57 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 36671b0fb125..6c6b21f6ba62 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; - u32 hgen; + struct idr handle_idr; struct rcu_head rcu; }; @@ -238,6 +239,7 @@ static int cls_bpf_init(struct tcf_proto *tp) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; @@ -264,6 +266,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) { + struct cls_bpf_head *head = rtnl_dereference(tp->root); + + idr_remove_ext(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); @@ -287,6 +292,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog); + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -421,27 +427,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, return 0; } -static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, - struct cls_bpf_head *head) -{ - unsigned int i = 0x80000000; - u32 handle; - - do { - if (++head->hgen == 0x7FFFFFFF) - head->hgen = 1; - } while (--i > 0 && cls_bpf_get(tp, head->hgen)); - - if (unlikely(i == 0)) { - pr_err("Insufficient number of handles\n"); - handle = 0; - } else { - handle = head->hgen; - } - - return handle; -} - static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -451,6 +436,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; struct cls_bpf_prog *prog; + unsigned long idr_index; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -476,21 +462,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, } } - if (handle == 0) - prog->handle = cls_bpf_grab_new_handle(tp, head); - else + if (handle == 0) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (ret) + goto errout; + prog->handle = idr_index; + } else { + if (!oldprog) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (ret) + goto errout; + } prog->handle = handle; - if (prog->handle == 0) { - ret = -EINVAL; - goto errout; } ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); if (ret < 0) - goto errout; + goto errout_idr; ret = cls_bpf_offload(tp, prog, oldprog); if (ret) { + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); __cls_bpf_delete_prog(prog); return ret; } @@ -499,6 +494,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (oldprog) { + idr_replace_ext(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); @@ -509,6 +505,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_idr: + if (!oldprog) + idr_remove_ext(&head->handle_idr, 
prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); From 1d8134fea2eb460698a281f91c03c400f7e546ce Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 25 Sep 2017 10:13:50 -0700 Subject: [PATCH 0994/1322] net_sched: use idr to allocate basic filter handles Instead of calling basic_get() in a loop to find a unused handle, just switch to idr API to allocate new handles. Cc: Chris Mi Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d89ebafd2239..cfeb6f158566 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -17,13 +17,14 @@ #include #include #include +#include #include #include #include struct basic_head { - u32 hgenerator; struct list_head flist; + struct idr handle_idr; struct rcu_head rcu; }; @@ -78,6 +79,7 @@ static int basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } @@ -99,8 +101,10 @@ static void basic_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); } + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -111,6 +115,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); *last = list_empty(&head->flist); return 0; @@ -154,6 +159,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *fold = (struct basic_filter *) *arg; struct basic_filter *fnew; + unsigned long idr_index; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; @@ -179,30 +185,31 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, err = -EINVAL; if (handle) { fnew->handle = handle; - } else if (fold) { - fnew->handle = fold->handle; - } else { - unsigned int i = 0x80000000; - do { - if (++head->hgenerator == 0x7FFFFFFF) - head->hgenerator = 1; - } while (--i > 0 && basic_get(tp, head->hgenerator)); - - if (i <= 0) { - pr_err("Insufficient number of handles\n"); - goto errout; + if (!fold) { + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (err) + goto errout; } - - fnew->handle = head->hgenerator; + } else { + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (err) + goto errout; + fnew->handle = idr_index; } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); - if (err < 0) + if (err < 0) { + if (!fold) + idr_remove_ext(&head->handle_idr, fnew->handle); goto errout; + } *arg = fnew; if (fold) { + idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, basic_delete_filter); From e7614370d6f04711c4e4b48f7055e5008fa4ed42 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 25 Sep 2017 10:13:51 -0700 Subject: [PATCH 0995/1322] net_sched: use idr to allocate u32 filter handles Instead of calling u32_lookup_ht() in a loop to find a unused handle, just switch to idr API to allocate new handles. 
u32 filters are special as the handle could contain a hash table id and a key id, so we need two IDR to allocate each of them. Cc: Chris Mi Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 106 +++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 10b8d851fc6b..094d224411a9 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -46,6 +46,7 @@ #include #include #include +#include struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -82,6 +83,7 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; + struct idr handle_idr; struct rcu_head rcu; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. @@ -93,7 +95,7 @@ struct tc_u_common { struct tc_u_hnode __rcu *hlist; struct Qdisc *q; int refcnt; - u32 hgenerator; + struct idr handle_idr; struct hlist_node hnode; struct rcu_head rcu; }; @@ -311,19 +313,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle) return u32_lookup_key(ht, handle); } -static u32 gen_new_htid(struct tc_u_common *tp_c) +static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { - int i = 0x800; + unsigned long idr_index; + int err; - /* hgenerator only used inside rtnl lock it is safe to increment + /* This is only used inside rtnl lock it is safe to increment * without read _copy_ update semantics */ - do { - if (++tp_c->hgenerator == 0x7FF) - tp_c->hgenerator = 1; - } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); - - return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; + err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index, + 1, 0x7FF, GFP_KERNEL); + if (err) + return 0; + return (u32)(idr_index | 0x800) << 20; } static struct hlist_head *tc_u_common_hash; @@ -366,8 +368,9 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; root_ht->refcnt++; - root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + root_ht->handle = tp_c ? 
gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); @@ -377,6 +380,7 @@ static int u32_init(struct tcf_proto *tp) } tp_c->q = tp->q; INIT_HLIST_NODE(&tp_c->hnode); + idr_init(&tp_c->handle_idr); h = tc_u_hash(tp); hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); @@ -565,6 +569,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n->handle); + idr_remove_ext(&ht->handle_idr, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -586,6 +591,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht); + idr_destroy(&ht->handle_idr); + idr_remove_ext(&tp_c->handle_idr, ht->handle); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -633,6 +640,7 @@ static void u32_destroy(struct tcf_proto *tp) kfree_rcu(ht, rcu); } + idr_destroy(&tp_c->handle_idr); kfree(tp_c); } @@ -701,27 +709,21 @@ ret: return ret; } -#define NR_U32_NODE (1<<12) -static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { - struct tc_u_knode *n; - unsigned long i; - unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), - GFP_KERNEL); - if (!bitmap) - return handle | 0xFFF; + unsigned long idr_index; + u32 start = htid | 0x800; + u32 max = htid | 0xFFF; + u32 min = htid; - for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); - n; - n = rtnl_dereference(n->next)) - set_bit(TC_U32_NODE(n->handle), bitmap); + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + start, max + 1, GFP_KERNEL)) { + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + min + 1, max + 1, GFP_KERNEL)) + return max; + } - i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); - if (i >= NR_U32_NODE) - i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); - - kfree(bitmap); - return handle | (i >= NR_U32_NODE ? 
0xFFF : i); + return (u32)idr_index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -806,6 +808,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, if (pins->handle == n->handle) break; + idr_replace_ext(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } @@ -937,22 +940,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; if (TC_U32_KEY(handle)) return -EINVAL; - if (handle == 0) { - handle = gen_new_htid(tp->data); - if (handle == 0) - return -ENOMEM; - } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; + if (handle == 0) { + handle = gen_new_htid(tp->data, ht); + if (handle == 0) { + kfree(ht); + return -ENOMEM; + } + } else { + err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL, + handle, handle + 1, GFP_KERNEL); + if (err) { + kfree(ht); + return err; + } + } ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; + idr_init(&ht->handle_idr); err = u32_replace_hw_hnode(tp, ht, flags); if (err) { + idr_remove_ext(&tp_c->handle_idr, handle); kfree(ht); return err; } @@ -986,24 +1000,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) return -EINVAL; handle = htid | TC_U32_NODE(handle); + err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, + handle, handle + 1, + GFP_KERNEL); + if (err) + return err; } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL] == NULL) - return -EINVAL; + if (tb[TCA_U32_SEL] == NULL) { + err = -EINVAL; + goto erridr; + } s = nla_data(tb[TCA_U32_SEL]); n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); - if (n == NULL) - return -ENOBUFS; + if (n == NULL) { + err = -ENOBUFS; + goto erridr; + } #ifdef CONFIG_CLS_U32_PERF size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); if (!n->pf) { - kfree(n); - return -ENOBUFS; + err = -ENOBUFS; + goto errfree; } #endif @@ -1066,9 +1089,12 @@ errhw: errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF +errfree: free_percpu(n->pf); #endif kfree(n); +erridr: + idr_remove_ext(&ht->handle_idr, handle); return err; } From db06ae41945b14feb7f696dcafe8048cc37e8a20 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 25 Sep 2017 23:32:20 +0200 Subject: [PATCH 0996/1322] net: dsa: mv88e6xxx: Allow dsa and cpu ports in multiple vlans Ports with the same VLAN must all be in the same bridge. However the CPU and DSA ports need to be in multiple VLANs spread over multiple bridges. So exclude them when performing this test. Fixes: b2f81d304cee ("net: dsa: add CPU and DSA ports as VLAN members") Signed-off-by: Andrew Lunn Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index c6678aa9b4ef..674dab71d71c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1100,6 +1100,10 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, }; int i, err; + /* DSA and CPU ports have to be members of multiple vlans */ + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + return 0; + if (!vid_begin) return -EOPNOTSUPP; From e804441cfe0b60f6c430901946a69c01eac09df1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 25 Sep 2017 15:55:53 -0700 Subject: [PATCH 0997/1322] net: dsa: Fix network device registration order We cannot be registering the network device first, then setting its carrier off and finally connecting it to a PHY, doing that leaves a window during which the carrier is at best inconsistent, and at worse the device is not usable without a down/up sequence since the network device is visible to user space with possibly no PHY device attached. Re-order steps so that they make logical sense. This fixes some devices where the port was not usable after e.g: an unbind then bind of the driver. Fixes: 0071f56e46da ("dsa: Register netdev before phy") Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..865e29e62bad 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1301,28 +1301,33 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_duplex = -1; port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(p->phy); + if (of_phy_is_fixed_link(p->dp->dn)) + of_phy_deregister_fixed_link(p->dp->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) From 61f26d92510ea1e30f9b69097b34b6a522daa1d2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 07:40:42 +0200 Subject: [PATCH 0998/1322] selftests: rtnetlink.sh: add rudimentary vrf test Acked-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/rtnetlink.sh | 42 ++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 4b48de565cae..a048f7a5f94c 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -291,6 +291,47 @@ kci_test_ifalias() echo "PASS: set ifalias $namewant for $devdummy" } +kci_test_vrf() +{ + vrfname="test-vrf" + ret=0 + + ip link show type vrf 2>/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: vrf: iproute2 too old" + return 0 + fi + + ip link add "$vrfname" type vrf table 10 + check_err $? + if [ $ret -ne 0 ];then + echo "FAIL: can't add vrf interface, skipping test" + return 0 + fi + + ip -br link show type vrf | grep -q "$vrfname" + check_err $? + if [ $ret -ne 0 ];then + echo "FAIL: created vrf device not found" + return 1 + fi + + ip link set dev "$vrfname" up + check_err $? + + ip link set dev "$devdummy" master "$vrfname" + check_err $? + ip link del dev "$vrfname" + check_err $? + + if [ $ret -ne 0 ];then + echo "FAIL: vrf" + return 1 + fi + + echo "PASS: vrf" +} + kci_test_rtnl() { kci_add_dummy @@ -306,6 +347,7 @@ kci_test_rtnl() kci_test_bridge kci_test_addrlabel kci_test_ifalias + kci_test_vrf kci_del_dummy } From 06d7a1b932c26afe2c0a1f4520ddd417d8eeda79 Mon Sep 17 00:00:00 2001 From: Ed Blake Date: Tue, 26 Sep 2017 11:43:46 +0100 Subject: [PATCH 0999/1322] net: stmmac: dwc-qos: Add suspend / resume support Add hook to stmmac_pltfr_pm_ops for suspend / resume handling. Signed-off-by: Ed Blake Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index dd6a2f9791cc..5efef8001edf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -511,6 +511,7 @@ static struct platform_driver dwc_eth_dwmac_driver = { .remove = dwc_eth_dwmac_remove, .driver = { .name = "dwc-eth-dwmac", + .pm = &stmmac_pltfr_pm_ops, .of_match_table = dwc_eth_dwmac_match, }, }; From 1579f678fb4397f9e439d2e373d4ade036c673b4 Mon Sep 17 00:00:00 2001 From: Ed Blake Date: Tue, 26 Sep 2017 11:44:53 +0100 Subject: [PATCH 1000/1322] net: stmmac: dwmac4: Re-enable MAC Rx before powering down Re-enable the MAC receiver by setting CONFIG_RE before powering down, as instructed in section 6.3.5.1 of [1]. Without this the MAC fails to receive WoL packets and never wakes up. [1] DWC Ethernet QoS Databook 4.10a October 2014 Signed-off-by: Ed Blake Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index c4407e8e39a3..2f7d7ec59962 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -296,6 +296,7 @@ static void dwmac4_pmt(struct mac_device_info *hw, unsigned long mode) { void __iomem *ioaddr = hw->pcsr; unsigned int pmt = 0; + u32 config; if (mode & WAKE_MAGIC) { pr_debug("GMAC: WOL Magic frame\n"); @@ -306,6 +307,12 @@ static void dwmac4_pmt(struct mac_device_info *hw, unsigned long mode) pmt |= power_down | global_unicast | wake_up_frame_en; } + if (pmt) { + /* The receiver must be enabled for WOL before powering down */ + config = readl(ioaddr + GMAC_CONFIG); + config |= GMAC_CONFIG_RE; + writel(config, ioaddr + GMAC_CONFIG); + } writel(pmt, ioaddr + GMAC_PMT); } From 79110a0426d8179a51bf3cb698a84f6ec98ca60c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:40 +0200 Subject: [PATCH 1001/1322] rtnetlink: add helper to put master and link ifindexes rtnl_fill_ifinfo currently requires caller to hold the rtnl mutex. Unfortunately the function is quite large which makes it harder to see which spots require the lock, which spots assume it and which ones could do without. Add helpers to factor out the ifindex dumping, one can use rcu to avoid rtnl dependency. Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..c801212ee40e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1307,6 +1307,31 @@ static u32 rtnl_get_event(unsigned long event) return rtnl_event_type; } +static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device *upper_dev; + int ret = 0; + + rcu_read_lock(); + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + + rcu_read_unlock(); + return ret; +} + +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) +{ + int ifindex = dev_get_iflink(dev); + + if (dev->ifindex == ifindex) + return 0; + + return nla_put_u32(skb, IFLA_LINK, ifindex); +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, @@ -1316,7 +1341,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *af_spec; struct rtnl_af_ops *af_ops; - struct net_device *upper_dev = netdev_master_upper_dev_get(dev); ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1345,10 +1369,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || - (upper_dev && - nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + nla_put_iflink(skb, dev) || + put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || From 250fc3dfdbd3e8b5c751ce7b5a26176ec62ca0f8 Mon Sep 17 
00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:41 +0200 Subject: [PATCH 1002/1322] rtnetlink: add helpers to dump vf information similar to earlier patches, split out more parts of this function to better see what is happening and where we assume rtnl is locked. Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 50 +++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c801212ee40e..d504e78cd01f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1211,6 +1211,36 @@ nla_put_vfinfo_failure: return -EMSGSIZE; } +static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, + struct net_device *dev, + u32 ext_filter_mask) +{ + struct nlattr *vfinfo; + int i, num_vfs; + + if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) + return 0; + + num_vfs = dev_num_vf(dev->dev.parent); + if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) + return -EMSGSIZE; + + if (!dev->netdev_ops->ndo_get_vf_config) + return 0; + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + return -EMSGSIZE; + + for (i = 0; i < num_vfs; i++) { + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + return -EMSGSIZE; + } + + nla_nest_end(skb, vfinfo); + return 0; +} + static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; @@ -1407,27 +1437,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && - nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) + if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && - ext_filter_mask & RTEXT_FILTER_VF) { - int i; - struct nlattr *vfinfo; - int num_vfs = dev_num_vf(dev->dev.parent); - - vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); - if (!vfinfo) - goto nla_put_failure; - for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) - goto nla_put_failure; - } - - nla_nest_end(skb, vfinfo); - } - if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; From b1e66b9a67d67d0e73091b04b51e524581c8c887 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:42 +0200 Subject: [PATCH 1003/1322] rtnetlink: add helpers to dump netnsid information Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d504e78cd01f..d524609c587c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1362,6 +1362,23 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) return nla_put_u32(skb, IFLA_LINK, ifindex); } +static int rtnl_fill_link_netnsid(struct sk_buff *skb, + const struct net_device *dev) +{ + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { + struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); + + if (!net_eq(dev_net(dev), link_net)) { + int id = peernet2id_alloc(dev_net(dev), link_net); + + if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) + return -EMSGSIZE; + } + } + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, @@ -1451,17 +1468,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - if (dev->rtnl_link_ops && - dev->rtnl_link_ops->get_link_net) { - struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); - - if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(dev_net(dev), link_net); - - if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) - goto nla_put_failure; - } - } + if (rtnl_fill_link_netnsid(skb, dev)) + goto nla_put_failure; if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) goto nla_put_failure; From 4c82a95e523721637d44e0e8d879fa11fa825eec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Sep 2017 13:58:43 +0200 Subject: [PATCH 1004/1322] rtnetlink: rtnl_have_link_slave_info doesn't need rtnl it can be switched to rcu. Reviewed-by: David Ahern Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d524609c587c..e6955da0d58d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -522,11 +522,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev, static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; + bool ret = false; - master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) - return true; - return false; + ret = true; + rcu_read_unlock(); + return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, From 4d8806fd14e1492cd4fb2021f709b163ea3364ad Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 26 Sep 2017 16:14:09 +0100 Subject: [PATCH 1005/1322] cxgb4: make function ch_flower_stats_cb, fixes warning The function ch_flower_stats_cb is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warnings: symbol 'ch_flower_stats_cb' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index a36bd66d2834..92a311767381 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -366,7 +366,7 @@ err: return ret; } -void ch_flower_stats_cb(unsigned long data) +static void ch_flower_stats_cb(unsigned long data) { struct adapter *adap = (struct adapter *)data; struct ch_tc_flower_entry *flower_entry; From 2b7c6ba945fd3c10ca3e030be402098aff2f89d3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:13 +0100 Subject: [PATCH 1006/1322] bpf/verifier: improve disassembly of BPF_END instructions print_bpf_insn() was treating all BPF_ALU[64] the same, but BPF_END has a different structure: it has a size in insn->imm (even if it's BPF_X) and uses the BPF_SRC (X or K) to indicate which endianness to use. So it needs different code to print it. Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f849eca36052..e8d7bb8e6b98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -332,26 +332,40 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; +static void print_bpf_end_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + static void print_bpf_insn(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_SRC(insn->code) == BPF_X) + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose("BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(env, insn); + } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); - else + } else { verbose("(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->imm); + } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", From 73c864b38383f4abc9b559025cd663f4a81afa89 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:29 +0100 Subject: [PATCH 1007/1322] bpf/verifier: improve disassembly of BPF_NEG instructions BPF_NEG takes only one operand, unlike the bulk of BPF_ALU[64] which are compound-assignments. So give it its own format in print_bpf_insn(). Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e8d7bb8e6b98..4cf9b72c59a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -351,6 +351,11 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, verbose("BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose("(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", From 4971613c1639d8e5f102c4e797c3bf8f83a5a69e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:19:37 -0400 Subject: [PATCH 1008/1322] packet: in packet_do_bind, test fanout with bind_lock held Once a socket has po->fanout set, it remains a member of the group until it is destroyed. The prot_hook must be constant and identical across sockets in the group. If fanout_add races with packet_do_bind between the test of po->fanout and taking the lock, the bind call may make type or dev inconsistent with that of the fanout group. Hold po->bind_lock when testing po->fanout to avoid this race. I had to introduce artificial delay (local_bh_enable) to actually observe the race. Fixes: dc99f600698d ("packet: Add fanout support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/packet/af_packet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d288f52c53f7..a10c2836465c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3069,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { From da7c9561015e93d10fe6aab73e9288e0d09d65a6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 26 Sep 2017 12:20:17 -0400 Subject: [PATCH 1009/1322] packet: only test po->has_vnet_hdr once in packet_snd Packet socket option po->has_vnet_hdr can be updated concurrently with other operations if no ring is attached. Do not test the option twice in packet_snd, as the value may change in between calls. A race on setsockopt disable may cause a packet > mtu to be sent without having GSO options set. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a10c2836465c..bec01a3daf5b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2840,6 +2840,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2883,6 +2884,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2941,7 +2943,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; From 352f58b0d9f26d283b10f4c9f21e8717141c1334 Mon Sep 17 00:00:00 2001 From: Aviad Krawczyk Date: Wed, 27 Sep 2017 01:57:50 +0800 Subject: [PATCH 1010/1322] net-next/hinic: Set Rxq irq to specific cpu for NUMA Set Rxq irq to specific cpu for allocating and receiving the skb from the same node. Signed-off-by: Aviad Krawczyk Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_rx.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c index 1d4f712b15a8..e2e5cdc7119c 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "hinic_common.h" @@ -171,11 +172,10 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq) struct hinic_sge sge; dma_addr_t dma_addr; struct sk_buff *skb; - int i, alloc_more; u16 prod_idx; + int i; free_wqebbs = hinic_get_rq_free_wqebbs(rxq->rq); - alloc_more = 0; /* Limit the allocation chunks */ if (free_wqebbs > nic_dev->rx_weight) @@ -185,7 +185,6 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq) skb = rx_alloc_skb(rxq, &dma_addr); if (!skb) { netdev_err(rxq->netdev, "Failed to alloc Rx skb\n"); - alloc_more = 1; goto skb_out; } @@ -195,7 +194,6 @@ static int rx_alloc_pkts(struct hinic_rxq *rxq) &prod_idx); if (!rq_wqe) { rx_free_skb(rxq, skb, dma_addr); - alloc_more = 1; goto skb_out; } @@ -211,9 +209,7 @@ skb_out: hinic_rq_update(rxq->rq, prod_idx); } - if (alloc_more) - tasklet_schedule(&rxq->rx_task); - + tasklet_schedule(&rxq->rx_task); return i; } @@ -357,7 +353,7 @@ static int rxq_recv(struct hinic_rxq *rxq, int budget) } if (pkts) - tasklet_schedule(&rxq->rx_task); /* hinic_rx_alloc_pkts */ + tasklet_schedule(&rxq->rx_task); /* rx_alloc_pkts */ u64_stats_update_begin(&rxq->rxq_stats.syncp); rxq->rxq_stats.pkts += pkts; @@ -417,6 +413,8 @@ static int rx_request_irq(struct hinic_rxq *rxq) struct hinic_dev *nic_dev = netdev_priv(rxq->netdev); struct hinic_hwdev *hwdev = nic_dev->hwdev; struct hinic_rq *rq = rxq->rq; + struct hinic_qp *qp; + struct cpumask mask; int err; rx_add_napi(rxq); @@ -432,7 +430,9 @@ static int rx_request_irq(struct hinic_rxq *rxq) return err; } - return 0; + qp = container_of(rq, struct hinic_qp, rq); + cpumask_set_cpu(qp->q_id % num_online_cpus(), &mask); + return irq_set_affinity_hint(rq->irq, &mask); } static void rx_free_irq(struct hinic_rxq *rxq) From 
bbdc9e687fb3c2920961d7716f1c5519ff7bc595 Mon Sep 17 00:00:00 2001 From: Aviad Krawczyk Date: Wed, 27 Sep 2017 02:11:33 +0800 Subject: [PATCH 1011/1322] net-next/hinic: Fix a case of Tx Queue is Stopped forever Fix the following scenario: 1. tx_free_poll is running on cpu X 2. xmit function is running on cpu Y and fails to get sq wqe 3. tx_free_poll frees wqes on cpu X and checks the queue is not stopped 4. xmit function stops the queue after failed to get sq wqe 5. The queue is stopped forever Signed-off-by: Aviad Krawczyk Signed-off-by: David S. Miller --- drivers/net/ethernet/huawei/hinic/hinic_tx.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c index abe3e38cd342..9128858479c4 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c @@ -212,10 +212,19 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) sq_wqe = hinic_sq_get_wqe(txq->sq, wqe_size, &prod_idx); if (!sq_wqe) { - tx_unmap_skb(nic_dev, skb, txq->sges); - netif_stop_subqueue(netdev, qp->q_id); + /* Check for the case free_tx_poll is called in another cpu + * and we stopped the subqueue after free_tx_poll check. + */ + sq_wqe = hinic_sq_get_wqe(txq->sq, wqe_size, &prod_idx); + if (sq_wqe) { + netif_wake_subqueue(nic_dev->netdev, qp->q_id); + goto process_sq_wqe; + } + + tx_unmap_skb(nic_dev, skb, txq->sges); + u64_stats_update_begin(&txq->txq_stats.syncp); txq->txq_stats.tx_busy++; u64_stats_update_end(&txq->txq_stats.syncp); @@ -223,6 +232,7 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) goto flush_skbs; } +process_sq_wqe: hinic_sq_prepare_wqe(txq->sq, prod_idx, sq_wqe, txq->sges, nr_sges); hinic_sq_write_wqe(txq->sq, prod_idx, sq_wqe, skb, wqe_size); From b32ca44a88def4bf92626d8777494c6f14638c42 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 26 Sep 2017 14:57:21 -0400 Subject: [PATCH 1012/1322] net: dsa: mv88e6xxx: lock mutex when freeing IRQs mv88e6xxx_g2_irq_free locks the registers mutex, but not mv88e6xxx_g1_irq_free, which results in a stack trace from assert_reg_lock when unloading the mv88e6xxx module. Fix this. Fixes: 3460a5770ce9 ("net: dsa: mv88e6xxx: Mask g1 interrupts and free interrupt") Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 674dab71d71c..d74c7335c512 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3951,7 +3951,9 @@ static void mv88e6xxx_remove(struct mdio_device *mdiodev) if (chip->irq > 0) { if (chip->info->g2_irqs > 0) mv88e6xxx_g2_irq_free(chip); + mutex_lock(&chip->reg_lock); mv88e6xxx_g1_irq_free(chip); + mutex_unlock(&chip->reg_lock); } } From 6ade97da601f8af793f6c7a861af754d0f0b6767 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 26 Sep 2017 23:12:28 +0300 Subject: [PATCH 1013/1322] arp: make arp_hdr_len() return unsigned int Negative ARP header length are not a thing. Constify arguments while I'm at it. Space savings: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) function old new delta arpt_do_table 1163 1160 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 3 ++- include/linux/if_arp.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c99dc59d729b..d2e94b8559f0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2491,7 +2491,8 @@ int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *curr_active_slave, *curr_arp_slave; unsigned char *arp_ptr; __be32 sip, tip; - int alen, is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); + int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); + unsigned int alen; if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 3355efc89781..6756fea18b69 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb) return (struct arphdr *)skb_network_header(skb); } -static inline int arp_hdr_len(struct net_device *dev) +static inline unsigned int arp_hdr_len(const struct net_device *dev) { switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) From 9d538fa60bad4f7b23193c89e843797a1cf71ef3 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 26 Sep 2017 17:38:50 -0700 Subject: [PATCH 1014/1322] net: Set sk_prot_creator when cloning sockets to the right proto sk->sk_prot and sk->sk_prot_creator can differ when the app uses IPV6_ADDRFORM (transforming an IPv6-socket to an IPv4-one). Which is why sk_prot_creator is there to make sure that sk_prot_free() does the kmem_cache_free() on the right kmem_cache slab. Now, if such a socket gets transformed back to a listening socket (using connect() with AF_UNSPEC) we will allocate an IPv4 tcp_sock through sk_clone_lock() when a new connection comes in. But sk_prot_creator will still point to the IPv6 kmem_cache (as everything got copied in sk_clone_lock()). When freeing, we will thus put this memory back into the IPv6 kmem_cache although it was allocated in the IPv4 cache. I have seen memory corruption happening because of this. With slub-debugging and MEMCG_KMEM enabled this gives the warning "cache_from_obj: Wrong slab cache. 
TCPv6 but object is from TCP" A C-program to trigger this: void main(void) { int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); int new_fd, newest_fd, client_fd; struct sockaddr_in6 bind_addr; struct sockaddr_in bind_addr4, client_addr1, client_addr2; struct sockaddr unsp; int val; memset(&bind_addr, 0, sizeof(bind_addr)); bind_addr.sin6_family = AF_INET6; bind_addr.sin6_port = ntohs(42424); memset(&client_addr1, 0, sizeof(client_addr1)); client_addr1.sin_family = AF_INET; client_addr1.sin_port = ntohs(42424); client_addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&client_addr2, 0, sizeof(client_addr2)); client_addr2.sin_family = AF_INET; client_addr2.sin_port = ntohs(42421); client_addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); memset(&unsp, 0, sizeof(unsp)); unsp.sa_family = AF_UNSPEC; bind(fd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)); listen(fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr1, sizeof(client_addr1)); new_fd = accept(fd, NULL, NULL); close(fd); val = AF_INET; setsockopt(new_fd, SOL_IPV6, IPV6_ADDRFORM, &val, sizeof(val)); connect(new_fd, &unsp, sizeof(unsp)); memset(&bind_addr4, 0, sizeof(bind_addr4)); bind_addr4.sin_family = AF_INET; bind_addr4.sin_port = ntohs(42421); bind(new_fd, (struct sockaddr *)&bind_addr4, sizeof(bind_addr4)); listen(new_fd, 5); client_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); connect(client_fd, (struct sockaddr *)&client_addr2, sizeof(client_addr2)); newest_fd = accept(new_fd, NULL, NULL); close(new_fd); close(client_fd); close(new_fd); } As far as I can see, this bug has been there since the beginning of the git-days. Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..7d55c05f449d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); From 9ffe79a9c2eec0f30687c2fd8b452bda5c8287b0 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:23 +0800 Subject: [PATCH 1015/1322] net: hns3: Support for dynamically assigning tx buffer to TC This patch add support of dynamically assigning tx buffer to TC when the TC is enabled. It will save buffer for rx direction to avoid packet loss. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- .../hisilicon/hns3/hns3pf/hclge_cmd.h | 1 + .../hisilicon/hns3/hns3pf/hclge_main.c | 64 ++++++++++++++++--- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 758cf3948131..a81c6cb93ed5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -311,6 +311,7 @@ struct hclge_tc_thrd { struct hclge_priv_buf { struct hclge_waterline wl; /* Waterline for low and high*/ u32 buf_size; /* TC private buffer size */ + u32 tx_buf_size; u32 enable; /* Enable TC private buffer or not */ }; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e0685e630afe..eaa3fc355568 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1324,7 +1324,7 @@ static int hclge_alloc_vport(struct hclge_dev *hdev) return 0; } -static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size) +static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev) { /* TX buffer size is unit by 128 byte */ #define HCLGE_BUF_SIZE_UNIT_SHIFT 7 @@ -1337,10 +1337,13 @@ static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size) req = (struct hclge_tx_buff_alloc *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0); - for (i = 0; i < HCLGE_TC_NUM; i++) + for (i = 0; i < HCLGE_TC_NUM; i++) { + u32 buf_size = hdev->priv_buf[i].tx_buf_size; + req->tx_pkt_buff[i] = cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) | HCLGE_BUF_SIZE_UPDATE_EN_MSK); + } ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { @@ -1352,9 +1355,9 @@ static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size) return 0; } -static int hclge_tx_buffer_alloc(struct hclge_dev *hdev, u32 buf_size) +static int hclge_tx_buffer_alloc(struct hclge_dev *hdev) { - int ret = hclge_cmd_alloc_tx_buff(hdev, buf_size); + int ret = hclge_cmd_alloc_tx_buff(hdev); if (ret) { dev_err(&hdev->pdev->dev, @@ -1433,6 +1436,16 @@ static u32 hclge_get_rx_priv_buff_alloced(struct hclge_dev *hdev) return rx_priv; } +static u32 hclge_get_tx_buff_alloced(struct hclge_dev *hdev) +{ + u32 i, total_tx_size = 0; + + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) + total_tx_size += hdev->priv_buf[i].tx_buf_size; + + return total_tx_size; +} + static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) { u32 shared_buf_min, shared_buf_tc, shared_std; @@ -1477,18 +1490,43 @@ static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) return true; } +static int hclge_tx_buffer_calc(struct hclge_dev *hdev) +{ + u32 i, total_size; + + total_size = hdev->pkt_buf_size; + + /* alloc tx buffer for all enabled tc */ + for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { + struct hclge_priv_buf *priv = &hdev->priv_buf[i]; + + if (total_size < HCLGE_DEFAULT_TX_BUF) + return -ENOMEM; + + if (hdev->hw_tc_map & BIT(i)) + priv->tx_buf_size = HCLGE_DEFAULT_TX_BUF; + else + priv->tx_buf_size = 0; + + total_size -= priv->tx_buf_size; + } + + return 0; +} + /* hclge_rx_buffer_calc: calculate the rx private buffer size for all TCs * @hdev: pointer to struct hclge_dev - * @tx_size: the allocated tx buffer for all TCs * @return: 0: calculate sucessful, negative: fail */ -int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size) +int hclge_rx_buffer_calc(struct hclge_dev *hdev) { - u32 rx_all = 
hdev->pkt_buf_size - tx_size; + u32 rx_all = hdev->pkt_buf_size; int no_pfc_priv_num, pfc_priv_num; struct hclge_priv_buf *priv; int i; + rx_all -= hclge_get_tx_buff_alloced(hdev); + /* When DCB is not supported, rx private * buffer is not allocated. */ @@ -1771,7 +1809,6 @@ static int hclge_common_wl_config(struct hclge_dev *hdev) int hclge_buffer_alloc(struct hclge_dev *hdev) { - u32 tx_buf_size = HCLGE_DEFAULT_TX_BUF; int ret; hdev->priv_buf = devm_kmalloc_array(&hdev->pdev->dev, HCLGE_MAX_TC_NUM, @@ -1780,14 +1817,21 @@ int hclge_buffer_alloc(struct hclge_dev *hdev) if (!hdev->priv_buf) return -ENOMEM; - ret = hclge_tx_buffer_alloc(hdev, tx_buf_size); + ret = hclge_tx_buffer_calc(hdev); + if (ret) { + dev_err(&hdev->pdev->dev, + "could not calc tx buffer size for all TCs %d\n", ret); + return ret; + } + + ret = hclge_tx_buffer_alloc(hdev); if (ret) { dev_err(&hdev->pdev->dev, "could not alloc tx buffers %d\n", ret); return ret; } - ret = hclge_rx_buffer_calc(hdev, tx_buf_size); + ret = hclge_rx_buffer_calc(hdev); if (ret) { dev_err(&hdev->pdev->dev, "could not calc rx priv buffer size for all TCs %d\n", From acf61ecd44feae2a78c13d0d7cb8e386741c5cf0 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:24 +0800 Subject: [PATCH 1016/1322] net: hns3: Add support for dynamically buffer reallocation Current buffer allocation can only happen at init, when doing buffer reallocation after init, care must be taken care of memory which priv_buf points to. This patch fixes it by using a dynamic allocated temporary memory. Because we only do buffer reallocation at init or when setting up the DCB parameter, and priv_buf is only used at buffer allocation process, so it is ok to use a dynamic allocated temporary memory. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- .../hisilicon/hns3/hns3pf/hclge_cmd.h | 5 + .../hisilicon/hns3/hns3pf/hclge_main.c | 150 ++++++++++-------- .../hisilicon/hns3/hns3pf/hclge_main.h | 2 - 3 files changed, 87 insertions(+), 70 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index a81c6cb93ed5..6b6d28eff664 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -322,6 +322,11 @@ struct hclge_shared_buf { u32 buf_size; }; +struct hclge_pkt_buf_alloc { + struct hclge_priv_buf priv_buf[HCLGE_MAX_TC_NUM]; + struct hclge_shared_buf s_buf; +}; + #define HCLGE_RX_COM_WL_EN_B 15 struct hclge_rx_com_wl_buf { __le16 high_wl; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index eaa3fc355568..61632feb8c4e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1324,7 +1324,8 @@ static int hclge_alloc_vport(struct hclge_dev *hdev) return 0; } -static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev) +static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { /* TX buffer size is unit by 128 byte */ #define HCLGE_BUF_SIZE_UNIT_SHIFT 7 @@ -1338,7 +1339,7 @@ static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev) hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0); for (i = 0; i < HCLGE_TC_NUM; i++) { - u32 buf_size = hdev->priv_buf[i].tx_buf_size; + u32 buf_size = buf_alloc->priv_buf[i].tx_buf_size; req->tx_pkt_buff[i] = cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) | @@ -1355,9 +1356,10 @@ static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev) return 0; } -static int hclge_tx_buffer_alloc(struct hclge_dev *hdev) +static int hclge_tx_buffer_alloc(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { - int ret = hclge_cmd_alloc_tx_buff(hdev); + int ret = hclge_cmd_alloc_tx_buff(hdev, buf_alloc); if (ret) { dev_err(&hdev->pdev->dev, @@ -1390,13 +1392,14 @@ static int hclge_get_pfc_enalbe_num(struct hclge_dev *hdev) } /* Get the number of pfc enabled TCs, which have private buffer */ -static int hclge_get_pfc_priv_num(struct hclge_dev *hdev) +static int hclge_get_pfc_priv_num(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { struct hclge_priv_buf *priv; int i, cnt = 0; for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; if ((hdev->tm_info.hw_pfc_map & BIT(i)) && priv->enable) cnt++; @@ -1406,13 +1409,14 @@ static int hclge_get_pfc_priv_num(struct hclge_dev *hdev) } /* Get the number of pfc disabled TCs, which have private buffer */ -static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev) +static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { struct hclge_priv_buf *priv; int i, cnt = 0; for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; if (hdev->hw_tc_map & BIT(i) && !(hdev->tm_info.hw_pfc_map & BIT(i)) && priv->enable) @@ -1422,31 +1426,33 @@ static int hclge_get_no_pfc_priv_num(struct hclge_dev *hdev) return cnt; } -static u32 hclge_get_rx_priv_buff_alloced(struct hclge_dev *hdev) +static u32 hclge_get_rx_priv_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc) { struct hclge_priv_buf *priv; u32 rx_priv = 0; int i; for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - 
priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; if (priv->enable) rx_priv += priv->buf_size; } return rx_priv; } -static u32 hclge_get_tx_buff_alloced(struct hclge_dev *hdev) +static u32 hclge_get_tx_buff_alloced(struct hclge_pkt_buf_alloc *buf_alloc) { u32 i, total_tx_size = 0; for (i = 0; i < HCLGE_MAX_TC_NUM; i++) - total_tx_size += hdev->priv_buf[i].tx_buf_size; + total_tx_size += buf_alloc->priv_buf[i].tx_buf_size; return total_tx_size; } -static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) +static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc, + u32 rx_all) { u32 shared_buf_min, shared_buf_tc, shared_std; int tc_num, pfc_enable_num; @@ -1467,30 +1473,31 @@ static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all) hdev->mps; shared_std = max_t(u32, shared_buf_min, shared_buf_tc); - rx_priv = hclge_get_rx_priv_buff_alloced(hdev); + rx_priv = hclge_get_rx_priv_buff_alloced(buf_alloc); if (rx_all <= rx_priv + shared_std) return false; shared_buf = rx_all - rx_priv; - hdev->s_buf.buf_size = shared_buf; - hdev->s_buf.self.high = shared_buf; - hdev->s_buf.self.low = 2 * hdev->mps; + buf_alloc->s_buf.buf_size = shared_buf; + buf_alloc->s_buf.self.high = shared_buf; + buf_alloc->s_buf.self.low = 2 * hdev->mps; for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { if ((hdev->hw_tc_map & BIT(i)) && (hdev->tm_info.hw_pfc_map & BIT(i))) { - hdev->s_buf.tc_thrd[i].low = hdev->mps; - hdev->s_buf.tc_thrd[i].high = 2 * hdev->mps; + buf_alloc->s_buf.tc_thrd[i].low = hdev->mps; + buf_alloc->s_buf.tc_thrd[i].high = 2 * hdev->mps; } else { - hdev->s_buf.tc_thrd[i].low = 0; - hdev->s_buf.tc_thrd[i].high = hdev->mps; + buf_alloc->s_buf.tc_thrd[i].low = 0; + buf_alloc->s_buf.tc_thrd[i].high = hdev->mps; } } return true; } -static int hclge_tx_buffer_calc(struct hclge_dev *hdev) +static int hclge_tx_buffer_calc(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { u32 i, total_size; @@ -1498,7 +1505,7 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev) /* alloc tx buffer for all enabled tc */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - struct hclge_priv_buf *priv = &hdev->priv_buf[i]; + struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i]; if (total_size < HCLGE_DEFAULT_TX_BUF) return -ENOMEM; @@ -1516,22 +1523,24 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev) /* hclge_rx_buffer_calc: calculate the rx private buffer size for all TCs * @hdev: pointer to struct hclge_dev + * @buf_alloc: pointer to buffer calculation data * @return: 0: calculate sucessful, negative: fail */ -int hclge_rx_buffer_calc(struct hclge_dev *hdev) +int hclge_rx_buffer_calc(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { u32 rx_all = hdev->pkt_buf_size; int no_pfc_priv_num, pfc_priv_num; struct hclge_priv_buf *priv; int i; - rx_all -= hclge_get_tx_buff_alloced(hdev); + rx_all -= hclge_get_tx_buff_alloced(buf_alloc); /* When DCB is not supported, rx private * buffer is not allocated. 
*/ if (!hnae3_dev_dcb_supported(hdev)) { - if (!hclge_is_rx_buf_ok(hdev, rx_all)) + if (!hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all)) return -ENOMEM; return 0; @@ -1539,7 +1548,7 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev) /* step 1, try to alloc private buffer for all enabled tc */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; if (hdev->hw_tc_map & BIT(i)) { priv->enable = 1; if (hdev->tm_info.hw_pfc_map & BIT(i)) { @@ -1560,14 +1569,14 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev) } } - if (hclge_is_rx_buf_ok(hdev, rx_all)) + if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all)) return 0; /* step 2, try to decrease the buffer size of * no pfc TC's private buffer */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; priv->enable = 0; priv->wl.low = 0; @@ -1590,18 +1599,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev) } } - if (hclge_is_rx_buf_ok(hdev, rx_all)) + if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all)) return 0; /* step 3, try to reduce the number of pfc disabled TCs, * which have private buffer */ /* get the total no pfc enable TC number, which have private buffer */ - no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev); + no_pfc_priv_num = hclge_get_no_pfc_priv_num(hdev, buf_alloc); /* let the last to be cleared first */ for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) { - priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; if (hdev->hw_tc_map & BIT(i) && !(hdev->tm_info.hw_pfc_map & BIT(i))) { @@ -1613,22 +1622,22 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev) no_pfc_priv_num--; } - if (hclge_is_rx_buf_ok(hdev, rx_all) || + if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) || no_pfc_priv_num == 0) break; } - if (hclge_is_rx_buf_ok(hdev, rx_all)) + if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all)) return 0; /* step 4, try to reduce the number of pfc enabled TCs * which have private buffer. 
*/ - pfc_priv_num = hclge_get_pfc_priv_num(hdev); + pfc_priv_num = hclge_get_pfc_priv_num(hdev, buf_alloc); /* let the last to be cleared first */ for (i = HCLGE_MAX_TC_NUM - 1; i >= 0; i--) { - priv = &hdev->priv_buf[i]; + priv = &buf_alloc->priv_buf[i]; if (hdev->hw_tc_map & BIT(i) && hdev->tm_info.hw_pfc_map & BIT(i)) { @@ -1640,17 +1649,18 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev) pfc_priv_num--; } - if (hclge_is_rx_buf_ok(hdev, rx_all) || + if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all) || pfc_priv_num == 0) break; } - if (hclge_is_rx_buf_ok(hdev, rx_all)) + if (hclge_is_rx_buf_ok(hdev, buf_alloc, rx_all)) return 0; return -ENOMEM; } -static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) +static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { struct hclge_rx_priv_buff *req; struct hclge_desc desc; @@ -1662,7 +1672,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) /* Alloc private buffer TCs */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - struct hclge_priv_buf *priv = &hdev->priv_buf[i]; + struct hclge_priv_buf *priv = &buf_alloc->priv_buf[i]; req->buf_num[i] = cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S); @@ -1671,7 +1681,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) } req->shared_buf = - cpu_to_le16((hdev->s_buf.buf_size >> HCLGE_BUF_UNIT_S) | + cpu_to_le16((buf_alloc->s_buf.buf_size >> HCLGE_BUF_UNIT_S) | (1 << HCLGE_TC0_PRI_BUF_EN_B)); ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -1686,7 +1696,8 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev) #define HCLGE_PRIV_ENABLE(a) ((a) > 0 ? 1 : 0) -static int hclge_rx_priv_wl_config(struct hclge_dev *hdev) +static int hclge_rx_priv_wl_config(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { struct hclge_rx_priv_wl_buf *req; struct hclge_priv_buf *priv; @@ -1706,7 +1717,9 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev) desc[i].flag &= ~cpu_to_le16(HCLGE_CMD_FLAG_NEXT); for (j = 0; j < HCLGE_TC_NUM_ONE_DESC; j++) { - priv = &hdev->priv_buf[i * HCLGE_TC_NUM_ONE_DESC + j]; + u32 idx = i * HCLGE_TC_NUM_ONE_DESC + j; + + priv = &buf_alloc->priv_buf[idx]; req->tc_wl[j].high = cpu_to_le16(priv->wl.high >> HCLGE_BUF_UNIT_S); req->tc_wl[j].high |= @@ -1731,9 +1744,10 @@ static int hclge_rx_priv_wl_config(struct hclge_dev *hdev) return 0; } -static int hclge_common_thrd_config(struct hclge_dev *hdev) +static int hclge_common_thrd_config(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { - struct hclge_shared_buf *s_buf = &hdev->s_buf; + struct hclge_shared_buf *s_buf = &buf_alloc->s_buf; struct hclge_rx_com_thrd *req; struct hclge_desc desc[2]; struct hclge_tc_thrd *tc; @@ -1777,9 +1791,10 @@ static int hclge_common_thrd_config(struct hclge_dev *hdev) return 0; } -static int hclge_common_wl_config(struct hclge_dev *hdev) +static int hclge_common_wl_config(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { - struct hclge_shared_buf *buf = &hdev->s_buf; + struct hclge_shared_buf *buf = &buf_alloc->s_buf; struct hclge_rx_com_wl *req; struct hclge_desc desc; int ret; @@ -1809,69 +1824,68 @@ static int hclge_common_wl_config(struct hclge_dev *hdev) int hclge_buffer_alloc(struct hclge_dev *hdev) { + struct hclge_pkt_buf_alloc *pkt_buf; int ret; - hdev->priv_buf = devm_kmalloc_array(&hdev->pdev->dev, HCLGE_MAX_TC_NUM, - sizeof(struct hclge_priv_buf), - GFP_KERNEL | __GFP_ZERO); - if (!hdev->priv_buf) + pkt_buf = kzalloc(sizeof(*pkt_buf), GFP_KERNEL); + if (!pkt_buf) return -ENOMEM; - ret 
= hclge_tx_buffer_calc(hdev); + ret = hclge_tx_buffer_calc(hdev, pkt_buf); if (ret) { dev_err(&hdev->pdev->dev, "could not calc tx buffer size for all TCs %d\n", ret); - return ret; + goto out; } - ret = hclge_tx_buffer_alloc(hdev); + ret = hclge_tx_buffer_alloc(hdev, pkt_buf); if (ret) { dev_err(&hdev->pdev->dev, "could not alloc tx buffers %d\n", ret); - return ret; + goto out; } - ret = hclge_rx_buffer_calc(hdev); + ret = hclge_rx_buffer_calc(hdev, pkt_buf); if (ret) { dev_err(&hdev->pdev->dev, "could not calc rx priv buffer size for all TCs %d\n", ret); - return ret; + goto out; } - ret = hclge_rx_priv_buf_alloc(hdev); + ret = hclge_rx_priv_buf_alloc(hdev, pkt_buf); if (ret) { dev_err(&hdev->pdev->dev, "could not alloc rx priv buffer %d\n", ret); - return ret; + goto out; } if (hnae3_dev_dcb_supported(hdev)) { - ret = hclge_rx_priv_wl_config(hdev); + ret = hclge_rx_priv_wl_config(hdev, pkt_buf); if (ret) { dev_err(&hdev->pdev->dev, "could not configure rx private waterline %d\n", ret); - return ret; + goto out; } - ret = hclge_common_thrd_config(hdev); + ret = hclge_common_thrd_config(hdev, pkt_buf); if (ret) { dev_err(&hdev->pdev->dev, "could not configure common threshold %d\n", ret); - return ret; + goto out; } } - ret = hclge_common_wl_config(hdev); - if (ret) { + ret = hclge_common_wl_config(hdev, pkt_buf); + if (ret) dev_err(&hdev->pdev->dev, "could not configure common waterline %d\n", ret); - return ret; - } - return 0; +out: + kfree(pkt_buf); + return ret; } static int hclge_init_roce_base_info(struct hclge_vport *vport) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 9fcfd9395424..4fc36f04c971 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -463,8 +463,6 @@ struct hclge_dev { u32 pkt_buf_size; /* Total pf buf size for tx/rx */ u32 mps; /* Max packet size */ - struct hclge_priv_buf *priv_buf; - struct hclge_shared_buf s_buf; enum hclge_mta_dmac_sel_type mta_mac_sel_type; bool enable_mta; /* Mutilcast filter enable */ From 9dc2145d910e94d1987fd165ac7643777fcf17c4 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:25 +0800 Subject: [PATCH 1017/1322] net: hns3: Add support for PFC setting in TM module This patch add a pfc_pause_en cmd, and use it to configure PFC option according to fc_mode in hdev->tm_info. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 68 +++++++++++++++++-- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 5 ++ 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 73a75d7cc551..0b4b5d9b0798 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -124,6 +124,20 @@ static int hclge_mac_pause_en_cfg(struct hclge_dev *hdev, bool tx, bool rx) return hclge_cmd_send(&hdev->hw, &desc, 1); } +static int hclge_pfc_pause_en_cfg(struct hclge_dev *hdev, u8 tx_rx_bitmap, + u8 pfc_bitmap) +{ + struct hclge_desc desc; + struct hclge_pfc_en_cmd *pfc = (struct hclge_pfc_en_cmd *)&desc.data; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PFC_PAUSE_EN, false); + + pfc->tx_rx_en_bitmap = tx_rx_bitmap; + pfc->pri_en_bitmap = pfc_bitmap; + + return hclge_cmd_send(&hdev->hw, &desc, 1); +} + static int hclge_fill_pri_array(struct hclge_dev *hdev, u8 *pri, u8 pri_id) { u8 tc; @@ -969,20 +983,64 @@ static int hclge_tm_schd_setup_hw(struct hclge_dev *hdev) return hclge_tm_schd_mode_hw(hdev); } +static int hclge_pfc_setup_hw(struct hclge_dev *hdev) +{ + u8 enable_bitmap = 0; + + if (hdev->tm_info.fc_mode == HCLGE_FC_PFC) + enable_bitmap = HCLGE_TX_MAC_PAUSE_EN_MSK | + HCLGE_RX_MAC_PAUSE_EN_MSK; + + return hclge_pfc_pause_en_cfg(hdev, enable_bitmap, + hdev->tm_info.hw_pfc_map); +} + +static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev) +{ + bool tx_en, rx_en; + + switch (hdev->tm_info.fc_mode) { + case HCLGE_FC_NONE: + tx_en = false; + rx_en = false; + break; + case HCLGE_FC_RX_PAUSE: + tx_en = false; + rx_en = true; + break; + case HCLGE_FC_TX_PAUSE: + tx_en = true; + rx_en = false; + break; + case HCLGE_FC_FULL: + tx_en = true; + rx_en = true; + break; + default: + tx_en = true; + rx_en = true; + } + + return hclge_mac_pause_en_cfg(hdev, tx_en, rx_en); +} + int hclge_pause_setup_hw(struct hclge_dev *hdev) { - bool en = hdev->tm_info.fc_mode != HCLGE_FC_PFC; int ret; u8 i; - ret = hclge_mac_pause_en_cfg(hdev, en, en); - if (ret) - return ret; + if (hdev->tm_info.fc_mode != HCLGE_FC_PFC) + return hclge_mac_pause_setup_hw(hdev); - /* Only DCB-supported dev supports qset back pressure setting */ + /* Only DCB-supported dev supports qset back pressure and pfc cmd */ if (!hnae3_dev_dcb_supported(hdev)) return 0; + /* When MAC is GE Mode, hdev does not support pfc setting */ + ret = hclge_pfc_setup_hw(hdev); + if (ret) + dev_warn(&hdev->pdev->dev, "set pfc pause failed:%d\n", ret); + for (i = 0; i < hdev->tm_info.num_tc; i++) { ret = hclge_tm_qs_bp_cfg(hdev, i); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 85158b0d73fe..8ecd83c50f47 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -94,6 +94,11 @@ struct hclge_bp_to_qs_map_cmd { u32 rsvd1; }; +struct hclge_pfc_en_cmd { + u8 tx_rx_en_bitmap; + u8 pri_en_bitmap; +}; + #define hclge_tm_set_field(dest, string, val) \ hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH), val) From 0a5677d39ef12739c9c10ef6e8e5f4b0805bfe71 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:26 +0800 Subject: [PATCH 1018/1322] net: hns3: Add support for port shaper setting in TM module This patch add a tm_port_shaper cmd and set port shaper 
to HCLGE_ETHER_MAX_RATE on TM initialization process. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 32 +++++++++++++++++++ .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 4 +++ 2 files changed, 36 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 0b4b5d9b0798..f79cebd7c95b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -301,6 +301,34 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev, return hclge_cmd_send(&hdev->hw, &desc, 1); } +static int hclge_tm_port_shaper_cfg(struct hclge_dev *hdev) +{ + struct hclge_port_shapping_cmd *shap_cfg_cmd; + struct hclge_desc desc; + u32 shapping_para = 0; + u8 ir_u, ir_b, ir_s; + int ret; + + ret = hclge_shaper_para_calc(HCLGE_ETHER_MAX_RATE, + HCLGE_SHAPER_LVL_PORT, + &ir_b, &ir_u, &ir_s); + if (ret) + return ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TM_PORT_SHAPPING, false); + shap_cfg_cmd = (struct hclge_port_shapping_cmd *)desc.data; + + hclge_tm_set_field(shapping_para, IR_B, ir_b); + hclge_tm_set_field(shapping_para, IR_U, ir_u); + hclge_tm_set_field(shapping_para, IR_S, ir_s); + hclge_tm_set_field(shapping_para, BS_B, HCLGE_SHAPER_BS_U_DEF); + hclge_tm_set_field(shapping_para, BS_S, HCLGE_SHAPER_BS_S_DEF); + + shap_cfg_cmd->port_shapping_para = cpu_to_le32(shapping_para); + + return hclge_cmd_send(&hdev->hw, &desc, 1); +} + static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev, enum hclge_shap_bucket bucket, u8 pri_id, u8 ir_b, u8 ir_u, u8 ir_s, @@ -864,6 +892,10 @@ static int hclge_tm_shaper_cfg(struct hclge_dev *hdev) { int ret; + ret = hclge_tm_port_shaper_cfg(hdev); + if (ret) + return ret; + ret = hclge_tm_pg_shaper_cfg(hdev); if (ret) return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 8ecd83c50f47..19a01e41c8b0 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -99,6 +99,10 @@ struct hclge_pfc_en_cmd { u8 pri_en_bitmap; }; +struct hclge_port_shapping_cmd { + __le32 port_shapping_para; +}; + #define hclge_tm_set_field(dest, string, val) \ hnae_set_field((dest), (HCLGE_TM_SHAP_##string##_MSK), \ (HCLGE_TM_SHAP_##string##_LSH), val) From cc9bb43ab394f14096a55ee6101af0e804c05f0f Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:27 +0800 Subject: [PATCH 1019/1322] net: hns3: Add tc-based TM support for sriov enabled port When sriov is enabled and TM is in tc-based mode, vf's TM parameters is not set in TM initialization process. This patch add the tc_based TM support for sriov enabled using the information in vport struct. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 49 ++++++++++++------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index f79cebd7c95b..ea94d23a79f7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -388,13 +388,13 @@ static int hclge_tm_pri_schd_mode_cfg(struct hclge_dev *hdev, u8 pri_id) return hclge_cmd_send(&hdev->hw, &desc, 1); } -static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id) +static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id, u8 mode) { struct hclge_desc desc; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TM_QS_SCH_MODE_CFG, false); - if (hdev->tm_info.tc_info[qs_id].tc_sch_mode == HCLGE_SCH_MODE_DWRR) + if (mode == HCLGE_SCH_MODE_DWRR) desc.data[1] = cpu_to_le32(HCLGE_TM_TX_SCHD_DWRR_MSK); else desc.data[1] = 0; @@ -638,17 +638,18 @@ static int hclge_tm_pri_q_qs_cfg(struct hclge_dev *hdev) { struct hclge_vport *vport = hdev->vport; int ret; - u32 i; + u32 i, k; if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) { /* Cfg qs -> pri mapping, one by one mapping */ - for (i = 0; i < hdev->tm_info.num_tc; i++) { - ret = hclge_tm_qs_to_pri_map_cfg(hdev, i, i); - if (ret) - return ret; - } + for (k = 0; k < hdev->num_alloc_vport; k++) + for (i = 0; i < hdev->tm_info.num_tc; i++) { + ret = hclge_tm_qs_to_pri_map_cfg( + hdev, vport[k].qs_offset + i, i); + if (ret) + return ret; + } } else if (hdev->tx_sch_mode == HCLGE_FLAG_VNET_BASE_SCH_MODE) { - int k; /* Cfg qs -> pri mapping, qs = tc, pri = vf, 8 qs -> 1 pri */ for (k = 0; k < hdev->num_alloc_vport; k++) for (i = 0; i < HNAE3_MAX_TC; i++) { @@ -797,10 +798,11 @@ static int hclge_tm_pri_shaper_cfg(struct hclge_dev *hdev) static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev) { + struct hclge_vport *vport = hdev->vport; struct hclge_pg_info *pg_info; u8 dwrr; int ret; - u32 i; + u32 i, k; for (i = 0; i < hdev->tm_info.num_tc; i++) { pg_info = @@ -811,9 +813,13 @@ static int hclge_tm_pri_tc_base_dwrr_cfg(struct hclge_dev *hdev) if (ret) return ret; - ret = hclge_tm_qs_weight_cfg(hdev, i, dwrr); - if (ret) - return ret; + for (k = 0; k < hdev->num_alloc_vport; k++) { + ret = hclge_tm_qs_weight_cfg( + hdev, vport[k].qs_offset + i, + vport[k].dwrr); + if (ret) + return ret; + } } return 0; @@ -944,7 +950,10 @@ static int hclge_tm_schd_mode_vnet_base_cfg(struct hclge_vport *vport) return ret; for (i = 0; i < kinfo->num_tc; i++) { - ret = hclge_tm_qs_schd_mode_cfg(hdev, vport->qs_offset + i); + u8 sch_mode = hdev->tm_info.tc_info[i].tc_sch_mode; + + ret = hclge_tm_qs_schd_mode_cfg(hdev, vport->qs_offset + i, + sch_mode); if (ret) return ret; } @@ -956,7 +965,7 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev) { struct hclge_vport *vport = hdev->vport; int ret; - u8 i; + u8 i, k; if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) { for (i = 0; i < hdev->tm_info.num_tc; i++) { @@ -964,9 +973,13 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev) if (ret) return ret; - ret = hclge_tm_qs_schd_mode_cfg(hdev, i); - if (ret) - return ret; + for (k = 0; k < hdev->num_alloc_vport; k++) { + ret = hclge_tm_qs_schd_mode_cfg( + hdev, vport[k].qs_offset + i, + HCLGE_SCH_MODE_DWRR); + if (ret) + return ret; + } } } else { for (i = 0; i < hdev->num_alloc_vport; i++) { From 77f255c1c695c72acb1d1c47d30323a273774ae6 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin 
Date: Wed, 27 Sep 2017 09:45:28 +0800 Subject: [PATCH 1020/1322] net: hns3: Add some interface for the support of DCB feature This patch add some interface and export some interface from hclge_tm and hclgc_main to support the upcoming DCB feature. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_main.c | 3 +- .../hisilicon/hns3/hns3pf/hclge_main.h | 3 ++ .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 48 +++++++++++++++++-- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 6 +++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 61632feb8c4e..644f7ff54081 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -30,7 +30,6 @@ #define HCLGE_64BIT_STATS_FIELD_OFF(f) (offsetof(struct hclge_64_bit_stats, f)) #define HCLGE_32BIT_STATS_FIELD_OFF(f) (offsetof(struct hclge_32_bit_stats, f)) -static int hclge_rss_init_hw(struct hclge_dev *hdev); static int hclge_set_mta_filter_mode(struct hclge_dev *hdev, enum hclge_mta_dmac_sel_type mta_mac_sel, bool enable); @@ -2655,7 +2654,7 @@ static int hclge_get_tc_size(struct hnae3_handle *handle) return hdev->rss_size_max; } -static int hclge_rss_init_hw(struct hclge_dev *hdev) +int hclge_rss_init_hw(struct hclge_dev *hdev) { const u8 hfunc = HCLGE_RSS_HASH_ALGO_TOEPLITZ; struct hclge_vport *vport = hdev->vport; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 4fc36f04c971..394b58788065 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -515,4 +515,7 @@ static inline int hclge_get_queue_id(struct hnae3_queue *queue) int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex); int hclge_set_vf_vlan_common(struct hclge_dev *vport, int vfid, bool is_kill, u16 vlan, u8 qos, __be16 proto); + +int hclge_buffer_alloc(struct hclge_dev *hdev); +int hclge_rss_init_hw(struct hclge_dev *hdev); #endif diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index ea94d23a79f7..8295684da5ba 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -883,10 +883,14 @@ static int hclge_tm_pri_dwrr_cfg(struct hclge_dev *hdev) return 0; } -static int hclge_tm_map_cfg(struct hclge_dev *hdev) +int hclge_tm_map_cfg(struct hclge_dev *hdev) { int ret; + ret = hclge_up_to_tc_map(hdev); + if (ret) + return ret; + ret = hclge_tm_pg_to_pri_map(hdev); if (ret) return ret; @@ -994,7 +998,7 @@ static int hclge_tm_lvl34_schd_mode_cfg(struct hclge_dev *hdev) return 0; } -static int hclge_tm_schd_mode_hw(struct hclge_dev *hdev) +int hclge_tm_schd_mode_hw(struct hclge_dev *hdev) { int ret; @@ -1092,7 +1096,45 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev) return ret; } - return hclge_up_to_tc_map(hdev); + return 0; +} + +int hclge_tm_prio_tc_info_update(struct hclge_dev *hdev, u8 *prio_tc) +{ + struct hclge_vport *vport = hdev->vport; + struct hnae3_knic_private_info *kinfo; + u32 i, k; + + for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) { + if (prio_tc[i] >= hdev->tm_info.num_tc) + return -EINVAL; + hdev->tm_info.prio_tc[i] = prio_tc[i]; + + for (k = 0; k < hdev->num_alloc_vport; k++) { + kinfo = &vport[k].nic.kinfo; + 
kinfo->prio_tc[i] = prio_tc[i]; + } + } + return 0; +} + +void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc) +{ + u8 i, bit_map = 0; + + hdev->tm_info.num_tc = num_tc; + + for (i = 0; i < hdev->tm_info.num_tc; i++) + bit_map |= BIT(i); + + if (!bit_map) { + bit_map = 1; + hdev->tm_info.num_tc = 1; + } + + hdev->hw_tc_map = bit_map; + + hclge_tm_schd_info_init(hdev); } int hclge_tm_init_hw(struct hclge_dev *hdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h index 19a01e41c8b0..bf59961918ab 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h @@ -112,4 +112,10 @@ struct hclge_port_shapping_cmd { int hclge_tm_schd_init(struct hclge_dev *hdev); int hclge_pause_setup_hw(struct hclge_dev *hdev); +int hclge_tm_schd_mode_hw(struct hclge_dev *hdev); +int hclge_tm_prio_tc_info_update(struct hclge_dev *hdev, u8 *prio_tc); +void hclge_tm_schd_info_update(struct hclge_dev *hdev, u8 num_tc); +int hclge_tm_dwrr_cfg(struct hclge_dev *hdev); +int hclge_tm_map_cfg(struct hclge_dev *hdev); +int hclge_tm_init_hw(struct hclge_dev *hdev); #endif From cacde272dd00496c2c1c36606a56b340cd967603 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:29 +0800 Subject: [PATCH 1021/1322] net: hns3: Add hclge_dcb module for the support of DCB feature The hclge_dcb module calls the interface from hclge_main/tm and provide interface for the dcb netlink interface. This patch also update Makefiles required to build the DCB supported code in HNS3 Ethernet driver and update the existing Kconfig file in the hisilicon folder. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/Kconfig | 9 + drivers/net/ethernet/hisilicon/hns3/hnae3.h | 17 + .../ethernet/hisilicon/hns3/hns3pf/Makefile | 2 + .../hisilicon/hns3/hns3pf/hclge_dcb.c | 304 ++++++++++++++++++ .../hisilicon/hns3/hns3pf/hclge_dcb.h | 21 ++ .../hisilicon/hns3/hns3pf/hclge_main.c | 25 +- .../hisilicon/hns3/hns3pf/hclge_main.h | 3 + 7 files changed, 375 insertions(+), 6 deletions(-) create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig index 91c7bdb9b43c..9d7cb0387bf7 100644 --- a/drivers/net/ethernet/hisilicon/Kconfig +++ b/drivers/net/ethernet/hisilicon/Kconfig @@ -103,4 +103,13 @@ config HNS3_ENET family of SoCs. This module depends upon HNAE3 driver to access the HNAE3 devices and their associated operations. +config HNS3_DCB + bool "Hisilicon HNS3 Data Center Bridge Support" + default n + depends on HNS3 && HNS3_HCLGE && DCB + ---help--- + Say Y here if you want to use Data Center Bridging (DCB) in the HNS3 driver. + + If unsure, say N. 
+ endif # NET_VENDOR_HISILICON diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 1a01cadfe5f3..c677530841cf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -28,6 +28,7 @@ */ #include +#include #include #include #include @@ -131,6 +132,7 @@ struct hnae3_client_ops { int (*init_instance)(struct hnae3_handle *handle); void (*uninit_instance)(struct hnae3_handle *handle, bool reset); void (*link_status_change)(struct hnae3_handle *handle, bool state); + int (*setup_tc)(struct hnae3_handle *handle, u8 tc); }; #define HNAE3_CLIENT_NAME_LENGTH 16 @@ -363,6 +365,20 @@ struct hnae3_ae_ops { u16 vlan, u8 qos, __be16 proto); }; +struct hnae3_dcb_ops { + /* IEEE 802.1Qaz std */ + int (*ieee_getets)(struct hnae3_handle *, struct ieee_ets *); + int (*ieee_setets)(struct hnae3_handle *, struct ieee_ets *); + int (*ieee_getpfc)(struct hnae3_handle *, struct ieee_pfc *); + int (*ieee_setpfc)(struct hnae3_handle *, struct ieee_pfc *); + + /* DCBX configuration */ + u8 (*getdcbx)(struct hnae3_handle *); + u8 (*setdcbx)(struct hnae3_handle *, u8); + + int (*map_update)(struct hnae3_handle *); +}; + struct hnae3_ae_algo { const struct hnae3_ae_ops *ops; struct list_head node; @@ -394,6 +410,7 @@ struct hnae3_knic_private_info { u16 num_tqps; /* total number of TQPs in this handle */ struct hnae3_queue **tqp; /* array base of all TQPs in this instance */ + const struct hnae3_dcb_ops *dcb_ops; }; struct hnae3_roce_private_info { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile index 162e8a42acd0..7023dc878086 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile @@ -7,5 +7,7 @@ ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3 obj-$(CONFIG_HNS3_HCLGE) += hclge.o hclge-objs = hclge_main.o hclge_cmd.o hclge_mdio.o hclge_tm.o +hclge-$(CONFIG_HNS3_DCB) += hclge_dcb.o + obj-$(CONFIG_HNS3_ENET) += hns3.o hns3-objs = hns3_enet.o hns3_ethtool.o diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c new file mode 100644 index 000000000000..1b30a6f966d8 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include "hclge_main.h" +#include "hclge_tm.h" +#include "hnae3.h" + +#define BW_PERCENT 100 + +static int hclge_ieee_ets_to_tm_info(struct hclge_dev *hdev, + struct ieee_ets *ets) +{ + u8 i; + + for (i = 0; i < HNAE3_MAX_TC; i++) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + hdev->tm_info.tc_info[i].tc_sch_mode = + HCLGE_SCH_MODE_SP; + hdev->tm_info.pg_info[0].tc_dwrr[i] = 0; + break; + case IEEE_8021QAZ_TSA_ETS: + hdev->tm_info.tc_info[i].tc_sch_mode = + HCLGE_SCH_MODE_DWRR; + hdev->tm_info.pg_info[0].tc_dwrr[i] = + ets->tc_tx_bw[i]; + break; + default: + /* Hardware only supports SP (strict priority) + * or ETS (enhanced transmission selection) + * algorithms, if we receive some other value + * from dcbnl, then throw an error. 
+ */ + return -EINVAL; + } + } + + return hclge_tm_prio_tc_info_update(hdev, ets->prio_tc); +} + +static void hclge_tm_info_to_ieee_ets(struct hclge_dev *hdev, + struct ieee_ets *ets) +{ + u32 i; + + memset(ets, 0, sizeof(*ets)); + ets->willing = 1; + ets->ets_cap = hdev->tc_max; + + for (i = 0; i < HNAE3_MAX_TC; i++) { + ets->prio_tc[i] = hdev->tm_info.prio_tc[i]; + ets->tc_tx_bw[i] = hdev->tm_info.pg_info[0].tc_dwrr[i]; + + if (hdev->tm_info.tc_info[i].tc_sch_mode == + HCLGE_SCH_MODE_SP) + ets->tc_tsa[i] = IEEE_8021QAZ_TSA_STRICT; + else + ets->tc_tsa[i] = IEEE_8021QAZ_TSA_ETS; + } +} + +/* IEEE std */ +static int hclge_ieee_getets(struct hnae3_handle *h, struct ieee_ets *ets) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + + hclge_tm_info_to_ieee_ets(hdev, ets); + + return 0; +} + +static int hclge_ets_validate(struct hclge_dev *hdev, struct ieee_ets *ets, + u8 *tc, bool *changed) +{ + u32 total_ets_bw = 0; + u8 max_tc = 0; + u8 i; + + for (i = 0; i < HNAE3_MAX_TC; i++) { + if (ets->prio_tc[i] >= hdev->tc_max || + i >= hdev->tc_max) + return -EINVAL; + + if (ets->prio_tc[i] != hdev->tm_info.prio_tc[i]) + *changed = true; + + if (ets->prio_tc[i] > max_tc) + max_tc = ets->prio_tc[i]; + + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_STRICT: + if (hdev->tm_info.tc_info[i].tc_sch_mode != + HCLGE_SCH_MODE_SP) + *changed = true; + break; + case IEEE_8021QAZ_TSA_ETS: + if (hdev->tm_info.tc_info[i].tc_sch_mode != + HCLGE_SCH_MODE_DWRR) + *changed = true; + + total_ets_bw += ets->tc_tx_bw[i]; + break; + default: + return -EINVAL; + } + } + + if (total_ets_bw != BW_PERCENT) + return -EINVAL; + + *tc = max_tc + 1; + if (*tc != hdev->tm_info.num_tc) + *changed = true; + + return 0; +} + +static int hclge_map_update(struct hnae3_handle *h) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + int ret; + + ret = hclge_tm_map_cfg(hdev); + if (ret) + return ret; + + ret = hclge_tm_schd_mode_hw(hdev); + if (ret) + return ret; + + ret = hclge_pause_setup_hw(hdev); + if (ret) + return ret; + + ret = hclge_buffer_alloc(hdev); + if (ret) + return ret; + + return hclge_rss_init_hw(hdev); +} + +static int hclge_client_setup_tc(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = hdev->vport; + struct hnae3_client *client; + struct hnae3_handle *handle; + int ret; + u32 i; + + for (i = 0; i < hdev->num_vmdq_vport + 1; i++) { + handle = &vport[i].nic; + client = handle->client; + + if (!client || !client->ops || !client->ops->setup_tc) + continue; + + ret = client->ops->setup_tc(handle, hdev->tm_info.num_tc); + if (ret) + return ret; + } + + return 0; +} + +static int hclge_ieee_setets(struct hnae3_handle *h, struct ieee_ets *ets) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + bool map_changed = false; + u8 num_tc = 0; + int ret; + + if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + ret = hclge_ets_validate(hdev, ets, &num_tc, &map_changed); + if (ret) + return ret; + + hclge_tm_schd_info_update(hdev, num_tc); + + ret = hclge_ieee_ets_to_tm_info(hdev, ets); + if (ret) + return ret; + + if (map_changed) { + ret = hclge_client_setup_tc(hdev); + if (ret) + return ret; + } + + return hclge_tm_dwrr_cfg(hdev); +} + +static int hclge_ieee_getpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + u8 i, j, pfc_map, *prio_tc; + + memset(pfc, 0, sizeof(*pfc)); + 
pfc->pfc_cap = hdev->pfc_max; + prio_tc = hdev->tm_info.prio_tc; + pfc_map = hdev->tm_info.hw_pfc_map; + + /* Pfc setting is based on TC */ + for (i = 0; i < hdev->tm_info.num_tc; i++) { + for (j = 0; j < HNAE3_MAX_USER_PRIO; j++) { + if ((prio_tc[j] == i) && (pfc_map & BIT(i))) + pfc->pfc_en |= BIT(j); + } + } + + return 0; +} + +static int hclge_ieee_setpfc(struct hnae3_handle *h, struct ieee_pfc *pfc) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + u8 i, j, pfc_map, *prio_tc; + + if (!(hdev->dcbx_cap & DCB_CAP_DCBX_VER_IEEE)) + return -EINVAL; + + prio_tc = hdev->tm_info.prio_tc; + pfc_map = 0; + + for (i = 0; i < hdev->tm_info.num_tc; i++) { + for (j = 0; j < HNAE3_MAX_USER_PRIO; j++) { + if ((prio_tc[j] == i) && (pfc->pfc_en & BIT(j))) { + pfc_map |= BIT(i); + break; + } + } + } + + if (pfc_map == hdev->tm_info.hw_pfc_map) + return 0; + + hdev->tm_info.hw_pfc_map = pfc_map; + + return hclge_pause_setup_hw(hdev); +} + +/* DCBX configuration */ +static u8 hclge_getdcbx(struct hnae3_handle *h) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + + return hdev->dcbx_cap; +} + +static u8 hclge_setdcbx(struct hnae3_handle *h, u8 mode) +{ + struct hclge_vport *vport = hclge_get_vport(h); + struct hclge_dev *hdev = vport->back; + + /* No support for LLD_MANAGED modes or CEE */ + if ((mode & DCB_CAP_DCBX_LLD_MANAGED) || + (mode & DCB_CAP_DCBX_VER_CEE) || + !(mode & DCB_CAP_DCBX_HOST)) + return 1; + + hdev->dcbx_cap = mode; + + return 0; +} + +static const struct hnae3_dcb_ops hns3_dcb_ops = { + .ieee_getets = hclge_ieee_getets, + .ieee_setets = hclge_ieee_setets, + .ieee_getpfc = hclge_ieee_getpfc, + .ieee_setpfc = hclge_ieee_setpfc, + .getdcbx = hclge_getdcbx, + .setdcbx = hclge_setdcbx, + .map_update = hclge_map_update, +}; + +void hclge_dcb_ops_set(struct hclge_dev *hdev) +{ + struct hclge_vport *vport = hdev->vport; + struct hnae3_knic_private_info *kinfo; + + /* Hdev does not support DCB or vport is + * not a pf, then dcb_ops is not set. + */ + if (!hnae3_dev_dcb_supported(hdev) || + vport->vport_id != 0) + return; + + kinfo = &vport->nic.kinfo; + kinfo->dcb_ops = &hns3_dcb_ops; + hdev->dcbx_cap = DCB_CAP_DCBX_VER_IEEE | DCB_CAP_DCBX_HOST; +} diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h new file mode 100644 index 000000000000..7d808ee96694 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2016~2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __HCLGE_DCB_H__ +#define __HCLGE_DCB_H__ + +#include "hclge_main.h" + +#ifdef CONFIG_HNS3_DCB +void hclge_dcb_ops_set(struct hclge_dev *hdev); +#else +static inline void hclge_dcb_ops_set(struct hclge_dev *hdev) {} +#endif + +#endif /* __HCLGE_DCB_H__ */ diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 644f7ff54081..dd220eab7f53 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -19,6 +19,7 @@ #include #include "hclge_cmd.h" +#include "hclge_dcb.h" #include "hclge_main.h" #include "hclge_mdio.h" #include "hclge_tm.h" @@ -1057,7 +1058,7 @@ static int hclge_configure(struct hclge_dev *hdev) hdev->hw.mac.phy_addr = cfg.phy_addr; hdev->num_desc = cfg.tqp_desc_num; hdev->tm_info.num_pg = 1; - hdev->tm_info.num_tc = cfg.tc_num; + hdev->tc_max = cfg.tc_num; hdev->tm_info.hw_pfc_map = 0; ret = hclge_parse_speed(cfg.default_speed, &hdev->hw.mac.speed); @@ -1066,15 +1067,25 @@ static int hclge_configure(struct hclge_dev *hdev) return ret; } - if ((hdev->tm_info.num_tc > HNAE3_MAX_TC) || - (hdev->tm_info.num_tc < 1)) { + if ((hdev->tc_max > HNAE3_MAX_TC) || + (hdev->tc_max < 1)) { dev_warn(&hdev->pdev->dev, "TC num = %d.\n", - hdev->tm_info.num_tc); - hdev->tm_info.num_tc = 1; + hdev->tc_max); + hdev->tc_max = 1; } + /* Dev does not support DCB */ + if (!hnae3_dev_dcb_supported(hdev)) { + hdev->tc_max = 1; + hdev->pfc_max = 0; + } else { + hdev->pfc_max = hdev->tc_max; + } + + hdev->tm_info.num_tc = hdev->tc_max; + /* Currently not support uncontiuous tc */ - for (i = 0; i < cfg.tc_num; i++) + for (i = 0; i < hdev->tm_info.num_tc; i++) hnae_set_bit(hdev->hw_tc_map, i, 1); if (!hdev->num_vmdq_vport && !hdev->num_req_vfs) @@ -4238,6 +4249,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } + hclge_dcb_ops_set(hdev); + setup_timer(&hdev->service_timer, hclge_service_timer, (unsigned long)hdev); INIT_WORK(&hdev->service_task, hclge_service_task); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 394b58788065..7c66c00e8a3e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -421,8 +421,11 @@ struct hclge_dev { #define HCLGE_FLAG_TC_BASE_SCH_MODE 1 #define HCLGE_FLAG_VNET_BASE_SCH_MODE 2 u8 tx_sch_mode; + u8 tc_max; + u8 pfc_max; u8 default_up; + u8 dcbx_cap; struct hclge_tm_info tm_info; u16 num_msi; From 986743dbf0a70211bba594b5abee33b6661feaa9 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:30 +0800 Subject: [PATCH 1022/1322] net: hns3: Add dcb netlink interface for the support of DCB feature This patch add dcb netlink interface by calling the interface from hclge_dcb module. This patch also update Makefile in order to build hns3_dcbnl module. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- .../ethernet/hisilicon/hns3/hns3pf/Makefile | 2 + .../hisilicon/hns3/hns3pf/hns3_dcbnl.c | 106 ++++++++++++++++++ .../hisilicon/hns3/hns3pf/hns3_enet.c | 2 + .../hisilicon/hns3/hns3pf/hns3_enet.h | 7 ++ 4 files changed, 117 insertions(+) create mode 100644 drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile index 7023dc878086..d2b20d01a58c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/Makefile @@ -11,3 +11,5 @@ hclge-$(CONFIG_HNS3_DCB) += hclge_dcb.o obj-$(CONFIG_HNS3_ENET) += hns3.o hns3-objs = hns3_enet.o hns3_ethtool.o + +hns3-$(CONFIG_HNS3_DCB) += hns3_dcbnl.o diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c new file mode 100644 index 000000000000..9832172bfb08 --- /dev/null +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2016-2017 Hisilicon Limited. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include "hnae3.h" +#include "hns3_enet.h" + +static +int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + + if (h->kinfo.dcb_ops->ieee_getets) + return h->kinfo.dcb_ops->ieee_getets(h, ets); + + return -EOPNOTSUPP; +} + +static +int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + + if (h->kinfo.dcb_ops->ieee_setets) + return h->kinfo.dcb_ops->ieee_setets(h, ets); + + return -EOPNOTSUPP; +} + +static +int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + + if (h->kinfo.dcb_ops->ieee_getpfc) + return h->kinfo.dcb_ops->ieee_getpfc(h, pfc); + + return -EOPNOTSUPP; +} + +static +int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + + if (h->kinfo.dcb_ops->ieee_setpfc) + return h->kinfo.dcb_ops->ieee_setpfc(h, pfc); + + return -EOPNOTSUPP; +} + +/* DCBX configuration */ +static u8 hns3_dcbnl_getdcbx(struct net_device *ndev) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + + if (h->kinfo.dcb_ops->getdcbx) + return h->kinfo.dcb_ops->getdcbx(h); + + return 0; +} + +/* return 0 if successful, otherwise fail */ +static u8 hns3_dcbnl_setdcbx(struct net_device *ndev, u8 mode) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + + if (h->kinfo.dcb_ops->setdcbx) + return h->kinfo.dcb_ops->setdcbx(h, mode); + + return 1; +} + +static const struct dcbnl_rtnl_ops hns3_dcbnl_ops = { + .ieee_getets = hns3_dcbnl_ieee_getets, + .ieee_setets = hns3_dcbnl_ieee_setets, + .ieee_getpfc = hns3_dcbnl_ieee_getpfc, + .ieee_setpfc = hns3_dcbnl_ieee_setpfc, + .getdcbx = hns3_dcbnl_getdcbx, + .setdcbx = hns3_dcbnl_setdcbx, +}; + +/* hclge_dcbnl_setup - DCBNL setup + * @handle: the corresponding vport handle + * Set up 
DCBNL + */ +void hns3_dcbnl_setup(struct hnae3_handle *handle) +{ + struct net_device *dev = handle->kinfo.netdev; + + if (!handle->kinfo.dcb_ops) + return; + + dev->dcbnl_ops = &hns3_dcbnl_ops; +} diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 35369e1c8036..11dab26f3543 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -2790,6 +2790,8 @@ static int hns3_client_init(struct hnae3_handle *handle) goto out_reg_netdev_fail; } + hns3_dcbnl_setup(handle); + /* MTU range: (ETH_MIN_MTU(kernel default) - 9706) */ netdev->max_mtu = HNS3_MAX_MTU - (ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h index 7e8746189747..481eada73e2d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h @@ -590,4 +590,11 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value) void hns3_ethtool_set_ops(struct net_device *netdev); int hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget); + +#ifdef CONFIG_HNS3_DCB +void hns3_dcbnl_setup(struct hnae3_handle *handle); +#else +static inline void hns3_dcbnl_setup(struct hnae3_handle *handle) {} +#endif + #endif From 7979a223305016625d211dd051569933c433f81e Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:31 +0800 Subject: [PATCH 1023/1322] net: hns3: Setting for fc_mode and dcb enable flag in TM module After the DCB feature is supported, fc_mode and dcb enable flag must be set according to the DCB parameter. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 8295684da5ba..359ee670d1e1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -486,7 +486,11 @@ static void hclge_tm_tc_info_init(struct hclge_dev *hdev) hdev->tm_info.prio_tc[i] = (i >= hdev->tm_info.num_tc) ? 0 : i; - hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; + /* DCB is enabled if we have more than 1 TC */ + if (hdev->tm_info.num_tc > 1) + hdev->flag |= HCLGE_FLAG_DCB_ENABLE; + else + hdev->flag &= ~HCLGE_FLAG_DCB_ENABLE; } static void hclge_tm_pg_info_init(struct hclge_dev *hdev) @@ -512,6 +516,24 @@ static void hclge_tm_pg_info_init(struct hclge_dev *hdev) } } +static void hclge_pfc_info_init(struct hclge_dev *hdev) +{ + if (!(hdev->flag & HCLGE_FLAG_DCB_ENABLE)) { + if (hdev->fc_mode_last_time == HCLGE_FC_PFC) + dev_warn(&hdev->pdev->dev, + "DCB is disable, but last mode is FC_PFC\n"); + + hdev->tm_info.fc_mode = hdev->fc_mode_last_time; + } else if (hdev->tm_info.fc_mode != HCLGE_FC_PFC) { + /* fc_mode_last_time record the last fc_mode when + * DCB is enabled, so that fc_mode can be set to + * the correct value when DCB is disabled. 
+ */ + hdev->fc_mode_last_time = hdev->tm_info.fc_mode; + hdev->tm_info.fc_mode = HCLGE_FC_PFC; + } +} + static int hclge_tm_schd_info_init(struct hclge_dev *hdev) { if ((hdev->tx_sch_mode != HCLGE_FLAG_TC_BASE_SCH_MODE) && @@ -524,8 +546,7 @@ static int hclge_tm_schd_info_init(struct hclge_dev *hdev) hclge_tm_vport_info_update(hdev); - hdev->tm_info.fc_mode = HCLGE_FC_NONE; - hdev->fc_mode_last_time = hdev->tm_info.fc_mode; + hclge_pfc_info_init(hdev); return 0; } @@ -1158,8 +1179,13 @@ int hclge_tm_init_hw(struct hclge_dev *hdev) int hclge_tm_schd_init(struct hclge_dev *hdev) { - int ret = hclge_tm_schd_info_init(hdev); + int ret; + /* fc_mode is HCLGE_FC_FULL on reset */ + hdev->tm_info.fc_mode = HCLGE_FC_FULL; + hdev->fc_mode_last_time = hdev->tm_info.fc_mode; + + ret = hclge_tm_schd_info_init(hdev); if (ret) return ret; From 9df8f79a4d2957fa3083e8fda0843d8c010351a7 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 27 Sep 2017 09:45:32 +0800 Subject: [PATCH 1024/1322] net: hns3: Add DCB support when interacting with network stack When using lldptool to configure DCB parameter, hclge_dcb module call the client_ops->setup_tc to tell network stack which queue and priority is using for specific tc. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hns3_enet.c | 102 +++++++++++++++--- 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 11dab26f3543..4a0890f98b70 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -196,6 +196,32 @@ static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector) tqp_vector->tx_group.flow_level = HNS3_FLOW_LOW; } +static int hns3_nic_set_real_num_queue(struct net_device *netdev) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + struct hnae3_handle *h = priv->ae_handle; + struct hnae3_knic_private_info *kinfo = &h->kinfo; + unsigned int queue_size = kinfo->rss_size * kinfo->num_tc; + int ret; + + ret = netif_set_real_num_tx_queues(netdev, queue_size); + if (ret) { + netdev_err(netdev, + "netif_set_real_num_tx_queues fail, ret=%d!\n", + ret); + return ret; + } + + ret = netif_set_real_num_rx_queues(netdev, queue_size); + if (ret) { + netdev_err(netdev, + "netif_set_real_num_rx_queues fail, ret=%d!\n", ret); + return ret; + } + + return 0; +} + static int hns3_nic_net_up(struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); @@ -232,26 +258,13 @@ out_start_err: static int hns3_nic_net_open(struct net_device *netdev) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; int ret; netif_carrier_off(netdev); - ret = netif_set_real_num_tx_queues(netdev, h->kinfo.num_tqps); - if (ret) { - netdev_err(netdev, - "netif_set_real_num_tx_queues fail, ret=%d!\n", - ret); + ret = hns3_nic_set_real_num_queue(netdev); + if (ret) return ret; - } - - ret = netif_set_real_num_rx_queues(netdev, h->kinfo.num_tqps); - if (ret) { - netdev_err(netdev, - "netif_set_real_num_rx_queues fail, ret=%d!\n", ret); - return ret; - } ret = hns3_nic_net_up(netdev); if (ret) { @@ -2848,10 +2861,69 @@ static void hns3_link_status_change(struct hnae3_handle *handle, bool linkup) } } +static int hns3_client_setup_tc(struct hnae3_handle *handle, u8 tc) +{ + struct hnae3_knic_private_info *kinfo = &handle->kinfo; + struct net_device *ndev = kinfo->netdev; + bool 
if_running = netif_running(ndev); + int ret; + u8 i; + + if (tc > HNAE3_MAX_TC) + return -EINVAL; + + if (!ndev) + return -ENODEV; + + ret = netdev_set_num_tc(ndev, tc); + if (ret) + return ret; + + if (if_running) { + (void)hns3_nic_net_stop(ndev); + msleep(100); + } + + ret = (kinfo->dcb_ops && kinfo->dcb_ops->map_update) ? + kinfo->dcb_ops->map_update(handle) : -EOPNOTSUPP; + if (ret) + goto err_out; + + if (tc <= 1) { + netdev_reset_tc(ndev); + goto out; + } + + for (i = 0; i < HNAE3_MAX_TC; i++) { + struct hnae3_tc_info *tc_info = &kinfo->tc_info[i]; + + if (tc_info->enable) + netdev_set_tc_queue(ndev, + tc_info->tc, + tc_info->tqp_count, + tc_info->tqp_offset); + } + + for (i = 0; i < HNAE3_MAX_USER_PRIO; i++) { + netdev_set_prio_tc_map(ndev, i, + kinfo->prio_tc[i]); + } + +out: + ret = hns3_nic_set_real_num_queue(ndev); + +err_out: + if (if_running) + (void)hns3_nic_net_open(ndev); + + return ret; +} + const struct hnae3_client_ops client_ops = { .init_instance = hns3_client_init, .uninit_instance = hns3_client_uninit, .link_status_change = hns3_link_status_change, + .setup_tc = hns3_client_setup_tc, }; /* hns3_init_module - Driver registration routine From e49aa15ef6c179f69e5578a271801f31a09e9a3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 28 Sep 2017 13:20:32 -0700 Subject: [PATCH 1025/1322] Revert "Bluetooth: Add option for disabling legacy ioctl interfaces" This reverts commit dbbccdc4ced015cdd4051299bd87fbe0254ad351. It turns out that the "legacy" users aren't so legacy at all, and that turning off the legacy ioctl will break the current Qt bluetooth stack for bluetooth LE devices that were released just a couple of months ago. So it's simply not true that this was a legacy interface that hasn't been needed and is only limited to old legacy BT devices. Because I actually read Kconfig help messages, and actively try to turn off features that I don't need, I turned the option off. Then I spent _way_ too much time debugging BLE issues until I realized that it wasn't the Qt and subsurface development that had broken one of my dive computer BLE downloads, but simply my broken kernel config. Maybe in a decade it will be true that this is a legacy interface. And maybe with a better help-text and correct dependencies, this kind of legacy removal might be acceptable. But as things are right now both the commit message and the Kconfig help text were misleading, and the Kconfig option had the wrong dependenencies. There's no reason to keep that broken Kconfig option in the tree. Cc: Marcel Holtmann Cc: Johan Hedberg Signed-off-by: Linus Torvalds --- net/bluetooth/Kconfig | 10 ---------- net/bluetooth/hci_sock.c | 6 ------ 2 files changed, 16 deletions(-) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. 
- source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, From 87cbde8d9081b91df86a21d0d743cd700e04890a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Sep 2017 01:45:10 +0200 Subject: [PATCH 1026/1322] PM / s2idle: Invoke the ->wake() platform callback earlier The role of the ->wake() platform callback for suspend-to-idle is to deal with possible spurious wakeups, among other things. The ACPI implementation of it, acpi_s2idle_wake(), additionally checks the conditions for entering the Low Power S0 Idle state by the platform and reports the ones that have not been met. However, the ->wake() platform callback is invoked after calling dpm_noirq_resume_devices(), which means that the power states of some devices may have changed since s2idle_enter() returned, so some unmet Low Power S0 Idle conditions may be reported incorrectly as a result of that. To avoid these false positives, reorder the invocations of the dpm_noirq_resume_devices() routine and the ->wake() platform callback in s2idle_loop(). Fixes: 726fb6b4f2a8 (ACPI / PM: Check low power idle constraints for debug only) Tested-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -120,22 +120,26 @@ static void s2idle_loop(void) * frozen processes + suspended devices + idle processors. * Thus s2idle_enter() should be called right after * all devices have been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, + * so prevent them from terminating the loop right away. */ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) s2idle_enter(); + else if (error == -EBUSY && pm_wakeup_pending()) + error = 0; - dpm_noirq_resume_devices(PMSG_RESUME); - if (error && (error != -EBUSY || !pm_wakeup_pending())) { - dpm_noirq_end(); - break; - } - - if (s2idle_ops && s2idle_ops->wake) + if (!error && s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); + if (error) + break; + if (s2idle_ops && s2idle_ops->sync) s2idle_ops->sync(); From 5af48b59f35cf712793badabe1a574a0d0ce3bd3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 27 Sep 2017 16:12:44 +0300 Subject: [PATCH 1027/1322] net: bridge: add per-port group_fwd_mask with less restrictions We need to be able to transparently forward most link-local frames via tunnels (e.g. vxlan, qinq). 
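As a simplified sketch of the semantics this change ends up with: a link-local group address 01-80-C2-00-00-XX (XX in 0..15) is eligible for forwarding when bit XX is set in the effective mask, which after this patch is the OR of the bridge-level mask and the new per-port mask. The helper below is illustrative only (the real br_handle_frame() keeps extra special cases for STP at ...00 and MAC PAUSE at ...01); the BR_GROUPFWD_* constants match the br_private.h hunk further down.

/* Illustrative only: the bit test behind the per-port group_fwd_mask. */
static bool group_addr_may_forward(u16 br_mask, u16 port_mask, u8 last_byte)
{
	u16 fwd_mask = br_mask | port_mask;	/* per-port bits are OR-ed in */

	/* last_byte is dest[5] of 01-80-C2-00-00-XX, i.e. 0..15 */
	return fwd_mask & BIT(last_byte);
}

For example, setting the per-port mask to BR_GROUPFWD_STP | BR_GROUPFWD_LACP lets STP (...00) and LACP (...02) frames cross a tunnel port, while MAC PAUSE (...01) can never be enabled because br_setport() and the sysfs store reject that bit.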
Currently the bridge's group_fwd_mask has a mask which restricts the forwarding of STP and LACP, but we need to be able to forward these over tunnels and control that forwarding on a per-port basis thus add a new per-port group_fwd_mask option which only disallows mac pause frames to be forwarded (they're always dropped anyway). The patch does not change the current default situation - all of the others are still restricted unless configured for forwarding. We have successfully tested this patch with LACP and STP forwarding over VxLAN and qinq tunnels. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_input.c | 1 + net/bridge/br_netlink.c | 14 +++++++++++++- net/bridge/br_private.h | 10 +++++++++- net/bridge/br_sysfs_if.c | 18 ++++++++++++++++++ 5 files changed, 42 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8d062c58d5cb..ea87bd708ee9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -325,6 +325,7 @@ enum { IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7637f58c1226..7cb613776b31 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -289,6 +289,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) * * Others reserved for future standardization */ + fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3bc890716c89..dea88a255d26 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + 0; } @@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & - BR_VLAN_TUNNEL))) + BR_VLAN_TUNNEL)) || + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, }; /* Change the state of the port and notify spanning tree */ @@ -773,6 +776,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } #endif + + if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = fwd_mask; + } + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e870cfc85b14..020c709a017f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -36,7 +36,14 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ -#define 
BR_GROUPFWD_RESTRICTED 0x0007u +enum { + BR_GROUPFWD_STP = BIT(0), + BR_GROUPFWD_MACPAUSE = BIT(1), + BR_GROUPFWD_LACP = BIT(2), +}; + +#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ + BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u @@ -268,6 +275,7 @@ struct net_bridge_port { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + u16 group_fwd_mask; }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 5d5d413a6cf8..9110d5e56085 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v) } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%#x\n", p->group_fwd_mask); +} + +static int store_group_fwd_mask(struct net_bridge_port *p, + unsigned long v) +{ + if (v & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = v; + + return 0; +} +static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, + store_group_fwd_mask); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -223,6 +240,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, + &brport_attr_group_fwd_mask, NULL }; From cf5d74b85ef40c202c76d90959db4d850f301b95 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Wed, 27 Sep 2017 18:30:58 +0200 Subject: [PATCH 1028/1322] tcp: fix under-evaluated ssthresh in TCP Vegas With the commit 76174004a0f19785 (tcp: do not slow start when cwnd equals ssthresh), the comparison to the reduced cwnd in tcp_vegas_ssthresh() would under-evaluate the ssthresh. Signed-off-by: Hoang Tran Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: [PATCH 1029/1322] bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject name that contains characters other than isalnum() and '_'. It will also reject name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. 
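A minimal user-space sketch of that in/out convention follows; it assumes the bpf_obj_get_info_by_fd() wrapper that the selftests further below already use, and the buffer size, includes and error handling are illustrative only.

/* Sketch: read the ids of the maps used by an already-loaded program.
 * Needs <stdio.h>, <string.h>, <linux/bpf.h> and tools/lib/bpf's bpf.h.
 */
static int print_used_map_ids(int prog_fd)
{
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info), ids[64], nr, i;

	/* 1st call: nr_map_ids == 0 and map_ids == NULL, so nothing is
	 * copied and the kernel reports the real count back.
	 */
	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
		return -1;
	nr = info.nr_map_ids;
	if (nr > 64)
		nr = 64;

	/* 2nd call: tell the kernel how big the buffer is and where it is. */
	memset(&info, 0, sizeof(info));
	info_len = sizeof(info);
	info.nr_map_ids = nr;
	info.map_ids = (__u64)(unsigned long)ids;
	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
		return -1;

	for (i = 0; i < nr; i++)
		printf("map id %u\n", ids[i]);

	return 0;
}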
On the output, nr_map_ids tells the exact user_map_cnt and it will only copy up to the userspace's map_ids buffer is allowed. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 8 +++++++ kernel/bpf/syscall.c | 51 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b672c50f160..33ccc474fb04 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,8 @@ struct bpf_prog_aux { struct bpf_map **used_maps; struct bpf_prog *prog; struct user_struct *user; + u64 load_time; /* ns since boottime */ + u8 name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e43491ac4823..bd6348269bf5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -175,6 +175,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -210,6 +212,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -812,6 +815,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..45970df3f820 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. 
+ */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + /* '\0' terminates dst */ + *dst = 0; + + return 0; +} + #define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 *user_map_ids = (u32 *)info.map_ids; + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: [PATCH 1030/1322] bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 7 ++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33ccc474fb04..252f4bc9eb25 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,6 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; + u8 name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd6348269bf5..6d2137b4cf38 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -190,6 +190,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). 
*/ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -829,6 +830,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 45970df3f820..11a7f82a55d1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD numa_node +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) From 88cda1c9da02c8aa31e1d5dcf22e8a35cc8c19f2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:54 -0700 Subject: [PATCH 1031/1322] bpf: libbpf: Provide basic API support to specify BPF obj name This patch extends the libbpf to provide API support to allow specifying BPF object name. In tools/lib/bpf/libbpf, the C symbol of the function and the map is used. Regarding section name, all maps are under the same section named "maps". Hence, section name is not a good choice for map's name. To be consistent with map, bpf_prog also follows and uses its function symbol as the prog's name. This patch adds logic to collect function's symbols in libbpf. There is existing codes to collect the map's symbols and no change is needed. The bpf_load_program_name() and bpf_map_create_name() are added to take the name argument. For the other bpf_map_create_xxx() variants, a name argument is directly added to them. In samples/bpf, bpf_load.c in particular, the symbol is also used as the map's name and the map symbols has already been collected in the existing code. For bpf_prog, bpf_load.c does not collect the function symbol name. We can consider to collect them later if there is a need to continue supporting the bpf_load.c. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/bpf_load.c | 2 + samples/bpf/map_perf_test_user.c | 1 + tools/include/uapi/linux/bpf.h | 10 ++ tools/lib/bpf/bpf.c | 57 +++++++--- tools/lib/bpf/bpf.h | 23 +++-- tools/lib/bpf/libbpf.c | 109 +++++++++++++++----- tools/testing/selftests/bpf/test_verifier.c | 2 +- 7 files changed, 157 insertions(+), 47 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 6aa50098dfb8..18b1c8dd0391 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -221,6 +221,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, inner_map_fd, maps[i].def.max_entries, @@ -228,6 +229,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, numa_node); } else { map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, maps[i].def.value_size, maps[i].def.max_entries, diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index a0310fc70057..519d9af4b04a 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu) inner_lru_map_fds[cpu] = bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], sizeof(uint32_t), sizeof(long), inner_lru_hash_size, 0, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e43491ac4823..6d2137b4cf38 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -175,6 +175,8 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +#define BPF_OBJ_NAME_LEN 16U + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -188,6 +190,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ + __u8 map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -210,6 +213,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; + __u8 prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -812,6 +816,11 @@ struct bpf_prog_info { __u32 xlated_prog_len; __aligned_u64 jited_prog_insns; __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -821,6 +830,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; + __u8 name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 1d6907d379c9..daf624e4c720 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -46,6 +46,8 @@ # endif #endif +#define min(x, y) ((x) < (y) ? 
(x) : (y)) + static inline __u64 ptr_to_u64(const void *ptr) { return (__u64) (unsigned long) ptr; @@ -57,10 +59,11 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, return syscall(__NR_bpf, cmd, attr, size); } -int bpf_create_map_node(enum bpf_map_type map_type, int key_size, - int value_size, int max_entries, __u32 map_flags, - int node) +int bpf_create_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, int max_entries, + __u32 map_flags, int node) { + __u32 name_len = name ? strlen(name) : 0; union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); @@ -70,6 +73,8 @@ int bpf_create_map_node(enum bpf_map_type map_type, int key_size, attr.value_size = value_size; attr.max_entries = max_entries; attr.map_flags = map_flags; + memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + if (node >= 0) { attr.map_flags |= BPF_F_NUMA_NODE; attr.numa_node = node; @@ -81,14 +86,23 @@ int bpf_create_map_node(enum bpf_map_type map_type, int key_size, int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags) { - return bpf_create_map_node(map_type, key_size, value_size, + return bpf_create_map_node(map_type, NULL, key_size, value_size, max_entries, map_flags, -1); } -int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size, - int inner_map_fd, int max_entries, +int bpf_create_map_name(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, int max_entries, + __u32 map_flags) +{ + return bpf_create_map_node(map_type, name, key_size, value_size, + max_entries, map_flags, -1); +} + +int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int inner_map_fd, int max_entries, __u32 map_flags, int node) { + __u32 name_len = name ? strlen(name) : 0; union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); @@ -99,6 +113,8 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size, attr.inner_map_fd = inner_map_fd; attr.max_entries = max_entries; attr.map_flags = map_flags; + memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + if (node >= 0) { attr.map_flags |= BPF_F_NUMA_NODE; attr.numa_node = node; @@ -107,19 +123,24 @@ int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size, return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } -int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, - int inner_map_fd, int max_entries, __u32 map_flags) +int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, + int key_size, int inner_map_fd, int max_entries, + __u32 map_flags) { - return bpf_create_map_in_map_node(map_type, key_size, inner_map_fd, - max_entries, map_flags, -1); + return bpf_create_map_in_map_node(map_type, name, key_size, + inner_map_fd, max_entries, map_flags, + -1); } -int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, size_t log_buf_sz) +int bpf_load_program_name(enum bpf_prog_type type, const char *name, + const struct bpf_insn *insns, + size_t insns_cnt, const char *license, + __u32 kern_version, char *log_buf, + size_t log_buf_sz) { int fd; union bpf_attr attr; + __u32 name_len = name ? 
strlen(name) : 0; bzero(&attr, sizeof(attr)); attr.prog_type = type; @@ -130,6 +151,7 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, attr.log_size = 0; attr.log_level = 0; attr.kern_version = kern_version; + memcpy(attr.prog_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); if (fd >= 0 || !log_buf || !log_buf_sz) @@ -143,6 +165,15 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } +int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, + size_t insns_cnt, const char *license, + __u32 kern_version, char *log_buf, + size_t log_buf_sz) +{ + return bpf_load_program_name(type, NULL, insns, insns_cnt, license, + kern_version, log_buf, log_buf_sz); +} + int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, int strict_alignment, const char *license, __u32 kern_version, diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index b8ea5843c39e..118d00535a0d 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -24,19 +24,28 @@ #include #include -int bpf_create_map_node(enum bpf_map_type map_type, int key_size, - int value_size, int max_entries, __u32 map_flags, - int node); +int bpf_create_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, int max_entries, + __u32 map_flags, int node); +int bpf_create_map_name(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, int max_entries, + __u32 map_flags); int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags); -int bpf_create_map_in_map_node(enum bpf_map_type map_type, int key_size, - int inner_map_fd, int max_entries, +int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, + int key_size, int inner_map_fd, int max_entries, __u32 map_flags, int node); -int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size, - int inner_map_fd, int max_entries, __u32 map_flags); +int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, + int key_size, int inner_map_fd, int max_entries, + __u32 map_flags); /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE 65536 +int bpf_load_program_name(enum bpf_prog_type type, const char *name, + const struct bpf_insn *insns, + size_t insns_cnt, const char *license, + __u32 kern_version, char *log_buf, + size_t log_buf_sz); int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, const char *license, __u32 kern_version, char *log_buf, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 35f6dfcdc565..4f402dcdf372 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -171,6 +171,7 @@ int libbpf_strerror(int err, char *buf, size_t size) struct bpf_program { /* Index in elf obj file, for relocation use. 
*/ int idx; + char *name; char *section_name; struct bpf_insn *insns; size_t insns_cnt; @@ -283,6 +284,7 @@ static void bpf_program__exit(struct bpf_program *prog) prog->clear_priv = NULL; bpf_program__unload(prog); + zfree(&prog->name); zfree(&prog->section_name); zfree(&prog->insns); zfree(&prog->reloc_desc); @@ -293,26 +295,27 @@ static void bpf_program__exit(struct bpf_program *prog) } static int -bpf_program__init(void *data, size_t size, char *name, int idx, - struct bpf_program *prog) +bpf_program__init(void *data, size_t size, char *section_name, int idx, + struct bpf_program *prog) { if (size < sizeof(struct bpf_insn)) { - pr_warning("corrupted section '%s'\n", name); + pr_warning("corrupted section '%s'\n", section_name); return -EINVAL; } bzero(prog, sizeof(*prog)); - prog->section_name = strdup(name); + prog->section_name = strdup(section_name); if (!prog->section_name) { - pr_warning("failed to alloc name for prog %s\n", - name); + pr_warning("failed to alloc name for prog under section %s\n", + section_name); goto errout; } prog->insns = malloc(size); if (!prog->insns) { - pr_warning("failed to alloc insns for %s\n", name); + pr_warning("failed to alloc insns for prog under section %s\n", + section_name); goto errout; } prog->insns_cnt = size / sizeof(struct bpf_insn); @@ -331,12 +334,12 @@ errout: static int bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, - char *name, int idx) + char *section_name, int idx) { struct bpf_program prog, *progs; int nr_progs, err; - err = bpf_program__init(data, size, name, idx, &prog); + err = bpf_program__init(data, size, section_name, idx, &prog); if (err) return err; @@ -350,8 +353,8 @@ bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, * is still valid, so don't need special treat for * bpf_close_object(). 
*/ - pr_warning("failed to alloc a new program '%s'\n", - name); + pr_warning("failed to alloc a new program under section '%s'\n", + section_name); bpf_program__exit(&prog); return -ENOMEM; } @@ -364,6 +367,54 @@ bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, return 0; } +static int +bpf_object__init_prog_names(struct bpf_object *obj) +{ + Elf_Data *symbols = obj->efile.symbols; + struct bpf_program *prog; + size_t pi, si; + + for (pi = 0; pi < obj->nr_programs; pi++) { + char *name = NULL; + + prog = &obj->programs[pi]; + + for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name; + si++) { + GElf_Sym sym; + + if (!gelf_getsym(symbols, si, &sym)) + continue; + if (sym.st_shndx != prog->idx) + continue; + + name = elf_strptr(obj->efile.elf, + obj->efile.strtabidx, + sym.st_name); + if (!name) { + pr_warning("failed to get sym name string for prog %s\n", + prog->section_name); + return -LIBBPF_ERRNO__LIBELF; + } + } + + if (!name) { + pr_warning("failed to find sym for prog %s\n", + prog->section_name); + return -EINVAL; + } + + prog->name = strdup(name); + if (!prog->name) { + pr_warning("failed to allocate memory for prog sym %s\n", + name); + return -ENOMEM; + } + } + + return 0; +} + static struct bpf_object *bpf_object__new(const char *path, void *obj_buf, size_t obj_buf_sz) @@ -766,8 +817,12 @@ static int bpf_object__elf_collect(struct bpf_object *obj) pr_warning("Corrupted ELF file: index of strtab invalid\n"); return LIBBPF_ERRNO__FORMAT; } - if (obj->efile.maps_shndx >= 0) + if (obj->efile.maps_shndx >= 0) { err = bpf_object__init_maps(obj); + if (err) + goto out; + } + err = bpf_object__init_prog_names(obj); out: return err; } @@ -870,11 +925,12 @@ bpf_object__create_maps(struct bpf_object *obj) struct bpf_map_def *def = &obj->maps[i].def; int *pfd = &obj->maps[i].fd; - *pfd = bpf_create_map(def->type, - def->key_size, - def->value_size, - def->max_entries, - 0); + *pfd = bpf_create_map_name(def->type, + obj->maps[i].name, + def->key_size, + def->value_size, + def->max_entries, + 0); if (*pfd < 0) { size_t j; int err = *pfd; @@ -982,7 +1038,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) } static int -load_program(enum bpf_prog_type type, struct bpf_insn *insns, +load_program(enum bpf_prog_type type, const char *name, struct bpf_insn *insns, int insns_cnt, char *license, u32 kern_version, int *pfd) { int ret; @@ -995,8 +1051,8 @@ load_program(enum bpf_prog_type type, struct bpf_insn *insns, if (!log_buf) pr_warning("Alloc log buffer for bpf loader error, continue without log\n"); - ret = bpf_load_program(type, insns, insns_cnt, license, - kern_version, log_buf, BPF_LOG_BUF_SIZE); + ret = bpf_load_program_name(type, name, insns, insns_cnt, license, + kern_version, log_buf, BPF_LOG_BUF_SIZE); if (ret >= 0) { *pfd = ret; @@ -1021,9 +1077,9 @@ load_program(enum bpf_prog_type type, struct bpf_insn *insns, if (type != BPF_PROG_TYPE_KPROBE) { int fd; - fd = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns, - insns_cnt, license, kern_version, - NULL, 0); + fd = bpf_load_program_name(BPF_PROG_TYPE_KPROBE, name, + insns, insns_cnt, license, + kern_version, NULL, 0); if (fd >= 0) { close(fd); ret = -LIBBPF_ERRNO__PROGTYPE; @@ -1067,8 +1123,8 @@ bpf_program__load(struct bpf_program *prog, pr_warning("Program '%s' is inconsistent: nr(%d) != 1\n", prog->section_name, prog->instances.nr); } - err = load_program(prog->type, prog->insns, prog->insns_cnt, - license, kern_version, &fd); + err = load_program(prog->type, prog->name, prog->insns, + 
prog->insns_cnt, license, kern_version, &fd); if (!err) prog->instances.fds[0] = fd; goto out; @@ -1096,7 +1152,8 @@ bpf_program__load(struct bpf_program *prog, continue; } - err = load_program(prog->type, result.new_insn_ptr, + err = load_program(prog->type, prog->name, + result.new_insn_ptr, result.new_insn_cnt, license, kern_version, &fd); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index a0426147523d..290d5056c165 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6939,7 +6939,7 @@ static int create_map_in_map(void) return inner_map_fd; } - outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, + outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL, sizeof(int), inner_map_fd, 1, 0); if (outer_map_fd < 0) printf("Failed to create array of maps '%s'!\n", From 6e525d06674a21468b4e728ef880770a0ca5c60d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:55 -0700 Subject: [PATCH 1032/1322] bpf: Swap the order of checking prog_info and map_info This patch swaps the checking order. It now checks the map_info first and then prog_info. It is a prep work for adding test to the newly added fields (the map_ids of prog_info field in particular). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_progs.c | 58 ++++++++++++------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 11ee25cea227..31ae27dc8d04 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -316,6 +316,36 @@ static void test_bpf_obj_id(void) error_cnt++; assert(!err); + /* Insert a magic value to the map */ + map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id"); + assert(map_fds[i] >= 0); + err = bpf_map_update_elem(map_fds[i], &array_key, + &array_magic_value, 0); + assert(!err); + + /* Check getting map info */ + info_len = sizeof(struct bpf_map_info) * 2; + bzero(&map_infos[i], info_len); + err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i], + &info_len); + if (CHECK(err || + map_infos[i].type != BPF_MAP_TYPE_ARRAY || + map_infos[i].key_size != sizeof(__u32) || + map_infos[i].value_size != sizeof(__u64) || + map_infos[i].max_entries != 1 || + map_infos[i].map_flags != 0 || + info_len != sizeof(struct bpf_map_info), + "get-map-info(fd)", + "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X\n", + err, errno, + map_infos[i].type, BPF_MAP_TYPE_ARRAY, + info_len, sizeof(struct bpf_map_info), + map_infos[i].key_size, + map_infos[i].value_size, + map_infos[i].max_entries, + map_infos[i].map_flags)) + goto done; + /* Check getting prog info */ info_len = sizeof(struct bpf_prog_info) * 2; bzero(&prog_infos[i], info_len); @@ -347,34 +377,6 @@ static void test_bpf_obj_id(void) !!memcmp(xlated_insns, zeros, sizeof(zeros)))) goto done; - map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id"); - assert(map_fds[i] >= 0); - err = bpf_map_update_elem(map_fds[i], &array_key, - &array_magic_value, 0); - assert(!err); - - /* Check getting map info */ - info_len = sizeof(struct bpf_map_info) * 2; - bzero(&map_infos[i], info_len); - err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i], - &info_len); - if (CHECK(err || - map_infos[i].type != BPF_MAP_TYPE_ARRAY || 
- map_infos[i].key_size != sizeof(__u32) || - map_infos[i].value_size != sizeof(__u64) || - map_infos[i].max_entries != 1 || - map_infos[i].map_flags != 0 || - info_len != sizeof(struct bpf_map_info), - "get-map-info(fd)", - "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X\n", - err, errno, - map_infos[i].type, BPF_MAP_TYPE_ARRAY, - info_len, sizeof(struct bpf_map_info), - map_infos[i].key_size, - map_infos[i].value_size, - map_infos[i].max_entries, - map_infos[i].map_flags)) - goto done; } /* Check bpf_prog_get_next_id() */ From 3a8ad560a9674235d1dd2e174434f540924e249f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:56 -0700 Subject: [PATCH 1033/1322] bpf: Test new fields in bpf_attr and bpf_{prog, map}_info This patch tests newly added fields of the bpf_attr, bpf_prog_info and bpf_map_info. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_progs.c | 143 +++++++++++++++++++++-- 1 file changed, 132 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 31ae27dc8d04..69427531408d 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include typedef __u16 __sum16; @@ -19,6 +20,8 @@ typedef __u16 __sum16; #include #include #include +#include +#include #include #include @@ -273,16 +276,26 @@ static void test_bpf_obj_id(void) const int nr_iters = 2; const char *file = "./test_obj_id.o"; const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable"; + const char *expected_prog_name = "test_obj_id"; + const char *expected_map_name = "test_map_id"; + const __u64 nsec_per_sec = 1000000000; struct bpf_object *objs[nr_iters]; int prog_fds[nr_iters], map_fds[nr_iters]; /* +1 to test for the info_len returned by kernel */ struct bpf_prog_info prog_infos[nr_iters + 1]; struct bpf_map_info map_infos[nr_iters + 1]; + /* Each prog only uses one map. +1 to test nr_map_ids + * returned by kernel. + */ + __u32 map_ids[nr_iters + 1]; char jited_insns[128], xlated_insns[128], zeros[128]; __u32 i, next_id, info_len, nr_id_found, duration = 0; + struct timespec real_time_ts, boot_time_ts; int sysctl_fd, jit_enabled = 0, err = 0; __u64 array_value; + uid_t my_uid = getuid(); + time_t now, load_time; sysctl_fd = open(jit_sysctl, 0, O_RDONLY); if (sysctl_fd != -1) { @@ -307,6 +320,7 @@ static void test_bpf_obj_id(void) /* Check bpf_obj_get_info_by_fd() */ bzero(zeros, sizeof(zeros)); for (i = 0; i < nr_iters; i++) { + now = time(NULL); err = bpf_prog_load(file, BPF_PROG_TYPE_SOCKET_FILTER, &objs[i], &prog_fds[i]); /* test_obj_id.o is a dumb prog. 
It should never fail @@ -334,16 +348,18 @@ static void test_bpf_obj_id(void) map_infos[i].value_size != sizeof(__u64) || map_infos[i].max_entries != 1 || map_infos[i].map_flags != 0 || - info_len != sizeof(struct bpf_map_info), + info_len != sizeof(struct bpf_map_info) || + strcmp((char *)map_infos[i].name, expected_map_name), "get-map-info(fd)", - "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X\n", + "err %d errno %d type %d(%d) info_len %u(%lu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n", err, errno, map_infos[i].type, BPF_MAP_TYPE_ARRAY, info_len, sizeof(struct bpf_map_info), map_infos[i].key_size, map_infos[i].value_size, map_infos[i].max_entries, - map_infos[i].map_flags)) + map_infos[i].map_flags, + map_infos[i].name, expected_map_name)) goto done; /* Check getting prog info */ @@ -355,8 +371,16 @@ static void test_bpf_obj_id(void) prog_infos[i].jited_prog_len = sizeof(jited_insns); prog_infos[i].xlated_prog_insns = ptr_to_u64(xlated_insns); prog_infos[i].xlated_prog_len = sizeof(xlated_insns); + prog_infos[i].map_ids = ptr_to_u64(map_ids + i); + prog_infos[i].nr_map_ids = 2; + err = clock_gettime(CLOCK_REALTIME, &real_time_ts); + assert(!err); + err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts); + assert(!err); err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i], &info_len); + load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + + (prog_infos[i].load_time / nsec_per_sec); if (CHECK(err || prog_infos[i].type != BPF_PROG_TYPE_SOCKET_FILTER || info_len != sizeof(struct bpf_prog_info) || @@ -364,9 +388,14 @@ static void test_bpf_obj_id(void) (jit_enabled && !memcmp(jited_insns, zeros, sizeof(zeros))) || !prog_infos[i].xlated_prog_len || - !memcmp(xlated_insns, zeros, sizeof(zeros)), + !memcmp(xlated_insns, zeros, sizeof(zeros)) || + load_time < now - 60 || load_time > now + 60 || + prog_infos[i].created_by_uid != my_uid || + prog_infos[i].nr_map_ids != 1 || + *(int *)prog_infos[i].map_ids != map_infos[i].id || + strcmp((char *)prog_infos[i].name, expected_prog_name), "get-prog-info(fd)", - "err %d errno %d i %d type %d(%d) info_len %u(%lu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d\n", + "err %d errno %d i %d type %d(%d) info_len %u(%lu) jit_enabled %d jited_prog_len %u xlated_prog_len %u jited_prog %d xlated_prog %d load_time %lu(%lu) uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) name %s(%s)\n", err, errno, i, prog_infos[i].type, BPF_PROG_TYPE_SOCKET_FILTER, info_len, sizeof(struct bpf_prog_info), @@ -374,9 +403,13 @@ static void test_bpf_obj_id(void) prog_infos[i].jited_prog_len, prog_infos[i].xlated_prog_len, !!memcmp(jited_insns, zeros, sizeof(zeros)), - !!memcmp(xlated_insns, zeros, sizeof(zeros)))) + !!memcmp(xlated_insns, zeros, sizeof(zeros)), + load_time, now, + prog_infos[i].created_by_uid, my_uid, + prog_infos[i].nr_map_ids, 1, + *(int *)prog_infos[i].map_ids, map_infos[i].id, + prog_infos[i].name, expected_prog_name)) goto done; - } /* Check bpf_prog_get_next_id() */ @@ -384,6 +417,7 @@ static void test_bpf_obj_id(void) next_id = 0; while (!bpf_prog_get_next_id(next_id, &next_id)) { struct bpf_prog_info prog_info = {}; + __u32 saved_map_id; int prog_fd; info_len = sizeof(prog_info); @@ -406,16 +440,33 @@ static void test_bpf_obj_id(void) nr_id_found++; + /* Negative test: + * prog_info.nr_map_ids = 1 + * prog_info.map_ids = NULL + */ + prog_info.nr_map_ids = 1; + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + if (CHECK(!err || errno != 
EFAULT, + "get-prog-fd-bad-nr-map-ids", "err %d errno %d(%d)", + err, errno, EFAULT)) + break; + bzero(&prog_info, sizeof(prog_info)); + info_len = sizeof(prog_info); + + saved_map_id = *(int *)(prog_infos[i].map_ids); + prog_info.map_ids = prog_infos[i].map_ids; + prog_info.nr_map_ids = 2; err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); prog_infos[i].jited_prog_insns = 0; prog_infos[i].xlated_prog_insns = 0; CHECK(err || info_len != sizeof(struct bpf_prog_info) || - memcmp(&prog_info, &prog_infos[i], info_len), + memcmp(&prog_info, &prog_infos[i], info_len) || + *(int *)prog_info.map_ids != saved_map_id, "get-prog-info(next_id->fd)", - "err %d errno %d info_len %u(%lu) memcmp %d\n", + "err %d errno %d info_len %u(%lu) memcmp %d map_id %u(%u)\n", err, errno, info_len, sizeof(struct bpf_prog_info), - memcmp(&prog_info, &prog_infos[i], info_len)); - + memcmp(&prog_info, &prog_infos[i], info_len), + *(int *)prog_info.map_ids, saved_map_id); close(prog_fd); } CHECK(nr_id_found != nr_iters, @@ -497,6 +548,75 @@ static void test_pkt_md_access(void) bpf_object__close(obj); } +static void test_obj_name(void) +{ + struct { + const char *name; + int success; + int expected_errno; + } tests[] = { + { "", 1, 0 }, + { "_123456789ABCDE", 1, 0 }, + { "_123456789ABCDEF", 0, EINVAL }, + { "_123456789ABCD\n", 0, EINVAL }, + }; + struct bpf_insn prog[] = { + BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + __u32 duration = 0; + int i; + + for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) { + size_t name_len = strlen(tests[i].name) + 1; + union bpf_attr attr; + size_t ncopy; + int fd; + + /* test different attr.prog_name during BPF_PROG_LOAD */ + ncopy = name_len < sizeof(attr.prog_name) ? + name_len : sizeof(attr.prog_name); + bzero(&attr, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SCHED_CLS; + attr.insn_cnt = 2; + attr.insns = ptr_to_u64(prog); + attr.license = ptr_to_u64(""); + memcpy(attr.prog_name, tests[i].name, ncopy); + + fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + CHECK((tests[i].success && fd < 0) || + (!tests[i].success && fd != -1) || + (!tests[i].success && errno != tests[i].expected_errno), + "check-bpf-prog-name", + "fd %d(%d) errno %d(%d)\n", + fd, tests[i].success, errno, tests[i].expected_errno); + + if (fd != -1) + close(fd); + + /* test different attr.map_name during BPF_MAP_CREATE */ + ncopy = name_len < sizeof(attr.map_name) ? + name_len : sizeof(attr.map_name); + bzero(&attr, sizeof(attr)); + attr.map_type = BPF_MAP_TYPE_ARRAY; + attr.key_size = 4; + attr.value_size = 4; + attr.max_entries = 1; + attr.map_flags = 0; + memcpy(attr.map_name, tests[i].name, ncopy); + fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); + CHECK((tests[i].success && fd < 0) || + (!tests[i].success && fd != -1) || + (!tests[i].success && errno != tests[i].expected_errno), + "check-bpf-map-name", + "fd %d(%d) errno %d(%d)\n", + fd, tests[i].success, errno, tests[i].expected_errno); + + if (fd != -1) + close(fd); + } +} + int main(void) { struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; @@ -509,6 +629,7 @@ int main(void) test_tcp_estats(); test_bpf_obj_id(); test_pkt_md_access(); + test_obj_name(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; From c7c3e5913bf18eda3cf38932bebdce48351baac9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 19:08:00 -0700 Subject: [PATCH 1034/1322] net: ipv4: remove fib_weight fib_weight in fib_info is set but not used. 
Remove it and the helpers for setting it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 --- net/ipv4/fib_semantics.c | 9 --------- 2 files changed, 12 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1a7f7e424320..f80524396c06 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -122,9 +122,6 @@ struct fib_info { #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_weight; -#endif struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 57a5d48acee8..be0874620ecc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); From fa8fefaa678ea390b873195d19c09930da84a4bb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 20:41:59 -0700 Subject: [PATCH 1035/1322] net: ipv4: remove fib_info arg to fib_check_nh fib_check_nh does not use the fib_info arg; remove t. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index be0874620ecc..467b3c15395f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -766,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1250,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) From 441430eb54a00586f95f1aefc48e0801bbd6a923 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 6 Sep 2017 19:08:11 +0300 Subject: [PATCH 1036/1322] perf/aux: Only update ->aux_wakeup in non-overwrite mode The following commit: d9a50b0256 ("perf/aux: Ensure aux_wakeup represents most recent wakeup index") changed the AUX wakeup position calculation to rounddown(), which causes a division-by-zero in AUX overwrite mode (aka "snapshot mode"). The zero denominator results from the fact that perf record doesn't set aux_watermark to anything, in which case the kernel will set it to half the AUX buffer size, but only for non-overwrite mode. In the overwrite mode aux_watermark stays zero. 
The good news is that, AUX overwrite mode, wakeups don't happen and related bookkeeping is not relevant, so we can simply forego the whole wakeup updates. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/20170906160811.16510-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } From 69b73e95982649a1f2dc63b8f08f2113d28f7fed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Sep 2017 10:07:44 +0200 Subject: [PATCH 1037/1322] um/time: Fixup namespace collision The new timer_setup() function for struct timer_list collides with a private um function. Rename it. Fixes: 686fef928bba ("timer: Prepare to change timer callback argument type") Signed-off-by: Thomas Gleixner Cc: Richard Weinberger Cc: Jeff Dike Cc: user-mode-linux-devel@lists.sourceforge.net Cc: Kees Cook --- arch/um/kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 0b034ebbda2a..7f69d17de354 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -98,7 +98,7 @@ static struct clocksource timer_clocksource = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void __init timer_setup(void) +static void __init um_timer_setup(void) { int err; @@ -132,5 +132,5 @@ void read_persistent_clock(struct timespec *ts) void __init time_init(void) { timer_set_signal_handler(); - late_time_init = timer_setup; + late_time_init = um_timer_setup; } From 1593baab910da72480d651ea7bf2ce6e3a25a484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:09:26 +0200 Subject: [PATCH 1038/1322] sched/debug: Implement consistent task-state printing Currently get_task_state() and task_state_to_char() report different states, create a number of common helpers and unify the reported state space. 
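As a rough user-space sketch of how the unified helpers are meant to compose, the fls()-style index feeds both the long /proc string and the single-letter state, so the two consumers can no longer drift apart. The TASK_REPORT value, the tables and the stand-in state variable below are assumptions copied from this point in the series, not the real task_struct plumbing:

  #include <stdio.h>

  #define TASK_REPORT 0x3f	/* running..zombie, as defined at this point */

  static const char * const task_state_array[] = {
          "R (running)", "S (sleeping)", "D (disk sleep)",
          "T (stopped)", "t (tracing stop)", "X (dead)", "Z (zombie)",
  };
  static const char state_char[] = "RSDTtXZ";

  /* same idea as __get_task_state(): fls() over the reportable bits */
  static unsigned int state_index(unsigned int state)
  {
          unsigned int idx = 0;

          state &= TASK_REPORT;
          while (state) {
                  state >>= 1;
                  idx++;
          }
          return idx;
  }

  int main(void)
  {
          unsigned int state = 0x0002;	/* TASK_UNINTERRUPTIBLE */
          unsigned int idx = state_index(state);

          /* both consumers now report from the same state space */
          printf("/proc says \"%s\", ps-style char is '%c'\n",
                 task_state_array[idx], state_char[idx]);
          return 0;
  }

Parked tasks still alias to TASK_INTERRUPTIBLE at this point; the explicit "P" and "I" states only appear later in the series.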
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 15 ++------------- include/linux/sched.h | 26 +++++++++++++++++++------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 525157ca25cb..01196d3ad452 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -130,19 +130,8 @@ static const char * const task_state_array[] = { static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - - /* - * Parked tasks do not run; they sit in __kthread_parkme(). - * Without this check, we would report them as running, which is - * clearly wrong, so we report them as sleeping instead. - */ - if (tsk->state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - - return task_state_array[fls(state)]; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + return task_state_array[__get_task_state(tsk)]; } static inline int get_task_umask(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 92fb8dd5a9e4..163a0b738908 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,17 +1243,29 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } -static inline char task_state_to_char(struct task_struct *task) +static inline unsigned int __get_task_state(struct task_struct *tsk) { - const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - unsigned long state = task->state; + unsigned int tsk_state = READ_ONCE(tsk->state); + unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; - state = state ? __ffs(state) + 1 : 0; + if (tsk_state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + return fls(state); +} - return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; +static inline char __task_state_to_char(unsigned int state) +{ + static const char state_char[] = "RSDTtXZ"; + + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); + + return state_char[state]; +} + +static inline char task_state_to_char(struct task_struct *tsk) +{ + return __task_state_to_char(__get_task_state(tsk)); } /** From 92c4bc9f9cd92a8581e36bc5105f03b569f37e36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:13:36 +0200 Subject: [PATCH 1039/1322] sched/debug: Convert TASK_state to hex Bit patterns are easier in hex. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 163a0b738908..69bed5339ffa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,23 +65,23 @@ struct task_group; */ /* Used in tsk->state: */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 +#define TASK_RUNNING 0x0000 +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 +#define __TASK_STOPPED 0x0004 +#define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 16 -#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 0x0010 +#define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_NOLOAD 1024 -#define TASK_NEW 2048 -#define TASK_STATE_MAX 4096 +#define TASK_DEAD 0x0040 +#define TASK_WAKEKILL 0x0080 +#define TASK_WAKING 0x0100 +#define TASK_PARKED 0x0200 +#define TASK_NOLOAD 0x0400 +#define TASK_NEW 0x0800 +#define TASK_STATE_MAX 0x1000 #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" From 65d5dc47fe8530e17e318722fb4df676536c2bfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:14:08 +0200 Subject: [PATCH 1040/1322] sched/debug: Remove unused variable Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { From efb40f588b4370ffaeffafbd50f6ff213d954254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:19:53 +0200 Subject: [PATCH 1041/1322] sched/tracing: Fix trace_sched_switch task-state printing Convert trace_sched_switch to use the common task-state helpers and fix the "X" and "Z" order, possibly they ended up in the wrong order because TASK_REPORT has them in the wrong order too. 
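To see why the order matters, here is a tiny __print_flags()-style decoder run over the old and the reordered tables. It is only an illustration with the bit values written in hex; the real decoding is done by __print_flags() in the tracing code:

  #include <stdio.h>

  struct trace_flag { unsigned long mask; const char *name; };

  static void decode(const char *label, unsigned long state,
                     const struct trace_flag *tbl, int n)
  {
          int i, printed = 0;

          printf("%s: ", label);
          for (i = 0; i < n; i++) {
                  if (state & tbl[i].mask) {
                          printf("%s%s", printed ? "|" : "", tbl[i].name);
                          printed = 1;
                  }
          }
          printf("%s\n", printed ? "" : "R");
  }

  int main(void)
  {
          /* old table: 0x10 -> "Z", 0x20 -> "X" */
          static const struct trace_flag old_tbl[] = {
                  { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, { 0x08, "t" },
                  { 0x10, "Z" }, { 0x20, "X" },
          };
          /* new table: matches EXIT_DEAD (0x10) and EXIT_ZOMBIE (0x20) */
          static const struct trace_flag new_tbl[] = {
                  { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, { 0x08, "t" },
                  { 0x10, "X" }, { 0x20, "Z" },
          };
          unsigned long zombie = 0x20;	/* EXIT_ZOMBIE */

          decode("old table", zombie, old_tbl, 6);	/* prints "X", wrong */
          decode("new table", zombie, new_tbl, 6);	/* prints "Z", right */
          return 0;
  }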
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- include/trace/events/sched.h | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 69bed5339ffa..a2fe636b6825 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,7 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ae1409ffe99a..c63e20c9ef24 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -114,7 +114,10 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ - return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; + if (preempt) + return TASK_STATE_MAX; + + return __get_task_state(p); } #endif /* CREATE_TRACE_POINTS */ @@ -152,12 +155,13 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->prev_state & (TASK_STATE_MAX-1) ? - __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", - { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, - { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "K" }, { 256, "W" }, { 512, "P" }, - { 1024, "N" }) : "R", + + (__entry->prev_state & TASK_REPORT) ? + __print_flags(__entry->prev_state & TASK_REPORT, "|", + { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + "R", + __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); From 9c29c31830a4eca724e137a9339137204bbb31be Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Thu, 7 Sep 2017 20:00:58 +0530 Subject: [PATCH 1042/1322] locking/rwsem-xadd: Fix missed wakeup due to reordering of load If a spinner is present, there is a chance that the load of rwsem_has_spinner() in rwsem_wake() can be reordered with respect to decrement of rwsem count in __up_write() leading to wakeup being missed: spinning writer up_write caller --------------- ----------------------- [S] osq_unlock() [L] osq spin_lock(wait_lock) sem->count=0xFFFFFFFF00000001 +0xFFFFFFFF00000000 count=sem->count MB sem->count=0xFFFFFFFE00000001 -0xFFFFFFFF00000001 spin_trylock(wait_lock) return rwsem_try_write_lock(count) spin_unlock(wait_lock) schedule() Reordering of atomic_long_sub_return_release() in __up_write() and rwsem_has_spinner() in rwsem_wake() can cause missing of wakeup in up_write() context. In spinning writer, sem->count and local variable count is 0XFFFFFFFE00000001. It would result in rwsem_try_write_lock() failing to acquire rwsem and spinning writer going to sleep in rwsem_down_write_failed(). The smp_rmb() will make sure that the spinner state is consulted after sem->count is updated in up_write context. 
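The litmus test quoted in the new comment can be rendered as a small C11 program: t0 models the spinning writer (osq_unlock() followed by the fully ordered counter update), t1 models the __up_write()/rwsem_wake() side where the smp_rmb() is added. This is only an illustration of the required ordering, it assumes a toolchain that provides <threads.h>, the acquire fence stands in for smp_rmb(), and running it will not reliably reproduce a reordering:

  #include <stdatomic.h>
  #include <stdio.h>
  #include <threads.h>

  static atomic_int X, Y;	/* X ~ osq state, Y ~ sem->count */
  static int r0, r1;

  static int t0(void *arg)	/* spinning writer */
  {
          atomic_store_explicit(&X, 1, memory_order_release); /* [S.rel] X = 1 */
          atomic_fetch_add(&Y, 1);                            /* MB; [RmW] Y += 1 */
          return 0;
  }

  static int t1(void *arg)	/* __up_write() -> rwsem_wake() */
  {
          r0 = atomic_fetch_add(&Y, 0);              /* [RmW] r0 = (Y += 0) */
          atomic_thread_fence(memory_order_acquire); /* RMB: the new smp_rmb() */
          r1 = atomic_load_explicit(&X, memory_order_relaxed); /* [L] r1 = X */
          return 0;
  }

  int main(void)
  {
          thrd_t a, b;

          thrd_create(&a, t0, NULL);
          thrd_create(&b, t1, NULL);
          thrd_join(a, NULL);
          thrd_join(b, NULL);
          /* forbidden outcome once the fence is present */
          printf("r0=%d r1=%d (r0==1 && r1==0 must not happen)\n", r0, r1);
          return 0;
  }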
Signed-off-by: Prateek Sood Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: longman@redhat.com Cc: parri.andrea@gmail.com Cc: sramana@codeaurora.org Link: http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prsood@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -612,6 +612,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); + /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize From 5f6ad26ea353fdf3dad2328052cbee49e0b9c5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:23:31 +0200 Subject: [PATCH 1043/1322] sched/tracing: Use common task-state helpers Remove yet another task-state char instance. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/trace/trace_output.c | 21 ++++++--------------- kernel/trace/trace_sched_wakeup.c | 8 ++++---- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a2fe636b6825..bc7807933415 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -83,8 +83,6 @@ struct task_group; #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" - /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? 
state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) From 06eb61844d841d0032a9950ce7f8e783ee49c0d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:30:40 +0200 Subject: [PATCH 1044/1322] sched/debug: Add explicit TASK_IDLE printing Markus reported that kthreads that idle using TASK_IDLE instead of TASK_INTERRUPTIBLE are reported in as TASK_UNINTERRUPTIBLE and things like htop mark those red. This is undesirable, so add an explicit state for TASK_IDLE. Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 21 +++++++++++++-------- include/linux/sched.h | 12 ++++++++++-- include/trace/events/sched.h | 7 ++++--- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 01196d3ad452..a120a4549d48 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -119,18 +119,23 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. 
*/ static const char * const task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "t (tracing stop)", /* 8 */ - "X (dead)", /* 16 */ - "Z (zombie)", /* 32 */ + + /* states in TASK_REPORT: */ + "R (running)", /* 0x00 */ + "S (sleeping)", /* 0x01 */ + "D (disk sleep)", /* 0x02 */ + "T (stopped)", /* 0x04 */ + "t (tracing stop)", /* 0x08 */ + "X (dead)", /* 0x10 */ + "Z (zombie)", /* 0x20 */ + + /* states beyond TASK_REPORT: */ + "I (idle)", /* 0x40 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array) - 1); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); return task_state_array[__get_task_state(tsk)]; } diff --git a/include/linux/sched.h b/include/linux/sched.h index bc7807933415..286fc1117046 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,22 +1241,30 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + static inline unsigned int __get_task_state(struct task_struct *tsk) { unsigned int tsk_state = READ_ONCE(tsk->state); unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); + if (tsk_state == TASK_PARKED) state = TASK_INTERRUPTIBLE; + if (tsk_state == TASK_IDLE) + state = TASK_REPORT_IDLE; + return fls(state); } static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZ"; + static const char state_char[] = "RSDTtXZI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 2); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c63e20c9ef24..b371ef8206e1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -156,10 +156,11 @@ TRACE_EVENT(sched_switch, TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - (__entry->prev_state & TASK_REPORT) ? - __print_flags(__entry->prev_state & TASK_REPORT, "|", + (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? + __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, - { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }) : + { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, + { 0x40, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", From 5d68cc95fb24b2f58060cc5340dd7402a11f054e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:32:41 +0200 Subject: [PATCH 1045/1322] sched/debug: Ignore TASK_IDLE for SysRq-W Markus reported that tasks in TASK_IDLE state are reported by SysRq-W, which results in undesirable clutter. 
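A stand-alone sketch of the filter shows the intended behaviour for the SysRq-W case (state_filter == TASK_UNINTERRUPTIBLE): plain D-state and killable tasks are still listed, idle kthreads are not. The constants and the fake task structure are stand-ins copied from this point in the series, not the scheduler code itself:

  #include <stdbool.h>
  #include <stdio.h>

  #define TASK_UNINTERRUPTIBLE	0x0002
  #define TASK_WAKEKILL		0x0080
  #define TASK_NOLOAD		0x0400
  #define TASK_KILLABLE		(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
  #define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

  struct fake_task { const char *comm; unsigned long state; };

  static bool state_filter_match(unsigned long state_filter, struct fake_task *p)
  {
          if (!state_filter)			/* no filter: everything matches */
                  return true;
          if (!(p->state & state_filter))	/* filter, but no match */
                  return false;
          /* looking for D-state: skip TASK_IDLE, keep TASK_KILLABLE */
          if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
                  return false;
          return true;
  }

  int main(void)
  {
          struct fake_task tasks[] = {
                  { "d-state",      TASK_UNINTERRUPTIBLE },
                  { "killable",     TASK_KILLABLE },
                  { "idle-kthread", TASK_IDLE },
          };
          unsigned int i;

          for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
                  printf("%-12s listed by SysRq-W: %s\n", tasks[i].comm,
                         state_filter_match(TASK_UNINTERRUPTIBLE,
                                            &tasks[i]) ? "yes" : "no");
          return 0;
  }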
Reported-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p) put_task_stack(p); } +static inline bool +state_filter_match(unsigned long state_filter, struct task_struct *p) +{ + /* no filter, everything matches */ + if (!state_filter) + return true; + + /* filter, but doesn't match */ + if (!(p->state & state_filter)) + return false; + + /* + * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows + * TASK_KILLABLE). + */ + if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + return false; + + return true; +} + + void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + if (state_filter_match(state_filter, p)) sched_show_task(p); } From 8ef9925b02c23e3838d5e593c5cf37984141150f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Sep 2017 18:37:28 +0200 Subject: [PATCH 1046/1322] sched/debug: Add explicit TASK_PARKED printing Currently TASK_PARKED is masqueraded as TASK_INTERRUPTIBLE, give it its own print state because it will not in fact get woken by regular wakeups and is a long-term state. This requires moving TASK_PARKED into the TASK_REPORT mask, and since that latter needs to be a contiguous bitmask, we need to shuffle the bits around a bit. 
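The contiguity requirement can be spelled out with a small self-check in the spirit of the BUILD_BUG_ON()s in the patch. The constants are copied from the new layout, the fls() helper is open-coded, and the whole thing is a user-space illustration rather than kernel code:

  #include <assert.h>
  #include <stdio.h>
  #include <string.h>

  #define TASK_RUNNING		0x0000
  #define TASK_INTERRUPTIBLE	0x0001
  #define TASK_UNINTERRUPTIBLE	0x0002
  #define __TASK_STOPPED		0x0004
  #define __TASK_TRACED		0x0008
  #define EXIT_DEAD		0x0010
  #define EXIT_ZOMBIE		0x0020
  #define TASK_PARKED		0x0040	/* now inside TASK_REPORT */

  #define TASK_REPORT		(TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                   TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
                                   __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
                                   TASK_PARKED)
  #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
  #define TASK_REPORT_MAX	(TASK_REPORT_IDLE << 1)

  static int fls_ul(unsigned long x)
  {
          int r = 0;

          while (x) {
                  x >>= 1;
                  r++;
          }
          return r;
  }

  int main(void)
  {
          static const char state_char[] = "RSDTtXZPI";

          /* only works because the reportable states fill the low bits */
          assert((TASK_REPORT_MAX & (TASK_REPORT_MAX - 1)) == 0);
          assert(fls_ul(TASK_REPORT_MAX) == (int)strlen(state_char));

          printf("parked tasks now print as '%c'\n",
                 state_char[fls_ul(TASK_PARKED)]);
          return 0;
  }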
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 3 ++- include/linux/sched.h | 16 +++++++--------- include/trace/events/sched.h | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index a120a4549d48..77a8eacbe032 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -128,9 +128,10 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 0x08 */ "X (dead)", /* 0x10 */ "Z (zombie)", /* 0x20 */ + "P (parked)", /* 0x40 */ /* states beyond TASK_REPORT: */ - "I (idle)", /* 0x40 */ + "I (idle)", /* 0x80 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index 286fc1117046..26a7df4e558c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -75,10 +75,10 @@ struct task_group; #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ -#define TASK_DEAD 0x0040 -#define TASK_WAKEKILL 0x0080 -#define TASK_WAKING 0x0100 -#define TASK_PARKED 0x0200 +#define TASK_PARKED 0x0040 +#define TASK_DEAD 0x0080 +#define TASK_WAKEKILL 0x0100 +#define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 @@ -97,7 +97,8 @@ struct task_group; /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE) + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) @@ -1251,9 +1252,6 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_PARKED) - state = TASK_INTERRUPTIBLE; - if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; @@ -1262,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) static inline char __task_state_to_char(unsigned int state) { - static const char state_char[] = "RSDTtXZI"; + static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index b371ef8206e1..3c8b7f625670 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -160,7 +160,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" }, { 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" }, - { 0x40, "I" }) : + { 0x40, "P" }, { 0x80, "I" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", From 520a13c530aeb5f63e011d668c42db1af19ed349 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 28 Sep 2017 16:58:26 -0500 Subject: [PATCH 1047/1322] x86/asm: Fix inline asm call constraints for GCC 4.4 The kernel test bot (run by Xiaolong Ye) reported that the following commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") is causing double faults in a kernel compiled with GCC 4.4. Linus subsequently diagnosed the crash pattern and the buggy commit and found that the issue is with this code: register unsigned int __asm_call_sp asm("esp"); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) Even on a 64-bit kernel, it's using ESP instead of RSP. 
That causes GCC to produce the following bogus code: ffffffff8147461d: 89 e0 mov %esp,%eax ffffffff8147461f: 4c 89 f7 mov %r14,%rdi ffffffff81474622: 4c 89 fe mov %r15,%rsi ffffffff81474625: ba 20 00 00 00 mov $0x20,%edx ffffffff8147462a: 89 c4 mov %eax,%esp ffffffff8147462c: e8 bf 52 05 00 callq ffffffff814c98f0 Despite the absurdity of it backing up and restoring the stack pointer for no reason, the bug is actually the fact that it's only backing up and restoring the lower 32 bits of the stack pointer. The upper 32 bits are getting cleared out, corrupting the stack pointer. So change the '__asm_call_sp' register variable to be associated with the actual full-size stack pointer. This also requires changing the __ASM_SEL() macro to be based on the actual compiled arch size, rather than the CONFIG value, because CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso). Otherwise Clang fails to build the kernel because it complains about the use of a 64-bit register (RSP) in a 32-bit file. Reported-and-Bisected-and-Tested-by: kernel test robot Diagnosed-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: LKP Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index c1eadbaf1115..30c3c9ac784a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -11,10 +11,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif @@ -139,7 +141,7 @@ * gets set up by the containing function. If you forget to do this, objtool * may print a "call without frame pointer save/setup" warning. */ -register unsigned int __asm_call_sp asm("esp"); +register unsigned long __asm_call_sp asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) #endif From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Mon, 4 Sep 2017 13:59:34 +0800 Subject: [PATCH 1048/1322] sched/sysctl: Check user input value of sysctl_sched_time_avg System will hang if user set sysctl_sched_time_avg to 0: [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0 Stack traceback for pid 0 0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3 ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000 0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8 ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08 Call Trace: [] ? update_group_capacity+0x110/0x200 [] ? update_sd_lb_stats+0x109/0x600 [] ? find_busiest_group+0x47/0x530 [] ? load_balance+0x194/0x900 [] ? update_rq_clock.part.83+0x1a/0xe0 [] ? rebalance_domains+0x152/0x290 [] ? run_rebalance_domains+0xdc/0x1d0 [] ? __do_softirq+0xfb/0x320 [] ? irq_exit+0x125/0x130 [] ? scheduler_ipi+0x97/0x160 [] ? smp_reschedule_interrupt+0x29/0x30 [] ? reschedule_interrupt+0x6e/0x80 [] ? cpuidle_enter_state+0xcc/0x230 [] ? cpuidle_enter_state+0x9c/0x230 [] ? 
cpuidle_enter+0x17/0x20 [] ? cpu_startup_entry+0x38c/0x420 [] ? start_secondary+0x173/0x1e0 Because divide-by-zero error happens in function: update_group_capacity() update_cpu_capacity() scale_rt_capacity() { ... total = sched_avg_period() + delta; used = div_u64(avg, total); ... } To fix this issue, check user input value of sysctl_sched_time_avg, keep it unchanged when hitting invalid input, and set the minimum limit of sysctl_sched_time_avg to 1 ms. Reported-by: James Puthukattukaran Signed-off-by: Ethan Zhao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: ethan.kernel@gmail.com Cc: keescook@chromium.org Cc: mcgrof@kernel.org Cc: Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..423554ad3610 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, #ifdef CONFIG_SCHEDSTATS { From 305d0ab4764d36a02c8e7cddb67099aca65351ce Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 28 Sep 2017 18:16:44 -0700 Subject: [PATCH 1049/1322] KVM: nVMX: Fix nested #PF intends to break L1's vmlauch/vmresume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------[ cut here ]------------ WARNING: CPU: 4 PID: 5280 at /home/kernel/linux/arch/x86/kvm//vmx.c:11394 nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel] CPU: 4 PID: 5280 Comm: qemu-system-x86 Tainted: G W OE 4.13.0+ #17 RIP: 0010:nested_vmx_vmexit+0xc2b/0xd70 [kvm_intel] Call Trace: ? emulator_read_emulated+0x15/0x20 [kvm] ? segmented_read+0xae/0xf0 [kvm] vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel] ? vmx_inject_page_fault_nested+0x60/0x70 [kvm_intel] x86_emulate_instruction+0x733/0x810 [kvm] vmx_handle_exit+0x2f4/0xda0 [kvm_intel] ? kvm_arch_vcpu_ioctl_run+0xd2f/0x1c60 [kvm] kvm_arch_vcpu_ioctl_run+0xdab/0x1c60 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc2 A nested #PF is triggered during L0 emulating instruction for L2. However, it doesn't consider we should not break L1's vmlauch/vmresme. This patch fixes it by queuing the #PF exception instead ,requesting an immediate VM exit from L2 and keeping the exception for L1 pending for a subsequent nested VM exit. This should actually work all the time, making vmx_inject_page_fault_nested totally unnecessary. However, that's not working yet, so this patch can work around the issue in the meanwhile. 
Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7f62c94196d1..5bfa353f6354 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9845,7 +9845,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { vmcs12->vm_exit_intr_error_code = fault->error_code; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 29 Sep 2017 19:01:45 +0800 Subject: [PATCH 1050/1322] kvm/x86: Handle async PF in RCU read-side critical sections Sasha Levin reported a WARNING: | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 | rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 ... | CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS | 1.10.1-1ubuntu1 04/01/2014 | Call Trace: ... | RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] | RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 | RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002 | RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000 | RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410 | RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001 | R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08 | R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040 | __schedule+0x201/0x2240 kernel/sched/core.c:3292 | schedule+0x113/0x460 kernel/sched/core.c:3421 | kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158 | do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271 | async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069 | RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996 | RSP: 0018:ffff88003b2df520 EFLAGS: 00010283 | RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670 | RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140 | RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000 | R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8 | R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000 | vsnprintf+0x173/0x1700 lib/vsprintf.c:2136 | sprintf+0xbe/0xf0 lib/vsprintf.c:2386 | proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23 | get_link fs/namei.c:1047 [inline] | link_path_walk+0x1041/0x1490 fs/namei.c:2127 ... This happened when the host hit a page fault, and delivered it as in an async page fault, while the guest was in an RCU read-side critical section. The guest then tries to reschedule in kvm_async_pf_task_wait(), but rcu_preempt_note_context_switch() would treat the reschedule as a sleep in RCU read-side critical section, which is not allowed (even in preemptible RCU). Thus the WARN. To cure this, make kvm_async_pf_task_wait() go to the halt path if the PF happens in a RCU read-side critical section. Reported-by: Sasha Levin Cc: "Paul E. 
McKenney" Cc: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Boqun Feng Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index aa60a08b65b1..e675704fa6f7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || preempt_count() > 1 || + rcu_preempt_depth(); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); From f069faba688701c4d56b6c3452a130f97bf02e95 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 11:29:55 +0100 Subject: [PATCH 1051/1322] arm64: mm: Use READ_ONCE when dereferencing pointer to pte table On kernels built with support for transparent huge pages, different CPUs can access the PMD concurrently due to e.g. fast GUP or page_vma_mapped_walk and they must take care to use READ_ONCE to avoid value tearing or caching of stale values by the compiler. Unfortunately, these functions call into our pgtable macros, which don't use READ_ONCE, and compiler caching has been observed to cause the following crash during ext4 writeback: PC is at check_pte+0x20/0x170 LR is at page_vma_mapped_walk+0x2e0/0x540 [...] Process doio (pid: 2463, stack limit = 0xffff00000f2e8000) Call trace: [] check_pte+0x20/0x170 [] page_vma_mapped_walk+0x2e0/0x540 [] page_mkclean_one+0xac/0x278 [] rmap_walk_file+0xf0/0x238 [] rmap_walk+0x64/0xa0 [] page_mkclean+0x90/0xa8 [] clear_page_dirty_for_io+0x84/0x2a8 [] mpage_submit_page+0x34/0x98 [] mpage_process_page_bufs+0x164/0x170 [] mpage_prepare_extent_to_map+0x134/0x2b8 [] ext4_writepages+0x484/0xe30 [] do_writepages+0x44/0xe8 [] __filemap_fdatawrite_range+0xbc/0x110 [] file_write_and_wait_range+0x48/0xd8 [] ext4_sync_file+0x80/0x4b8 [] vfs_fsync_range+0x64/0xc0 [] SyS_msync+0x194/0x1e8 This is because page_vma_mapped_walk loads the PMD twice before calling pte_offset_map: the first time without READ_ONCE (where it gets all zeroes due to a concurrent pmdp_invalidate) and the second time with READ_ONCE (where it sees a valid table pointer due to a concurrent pmd_populate). However, the compiler inlines everything and caches the first value in a register, which is subsequently used in pte_offset_phys which returns a junk pointer that is later dereferenced when attempting to access the relevant pte. This patch fixes the issue by using READ_ONCE in pte_offset_phys to ensure that a stale value is not used. Whilst this is a point fix for a known failure (and simple to backport), a full fix moving all of our page table accessors over to {READ,WRITE}_ONCE and consistently using READ_ONCE in page_vma_mapped_walk is in the works for a future kernel release. Cc: Jon Masters Cc: Timur Tabi Cc: Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()") Tested-by: Richard Ruigrok Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bc4e92337d16..b46e54c2399b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -401,7 +401,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) /* Find an entry in the third-level page table. 
*/ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) From 760bfb47c36a07741a089bf6a28e854ffbee7dc9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 12:27:41 +0100 Subject: [PATCH 1052/1322] arm64: fault: Route pte translation faults via do_translation_fault We currently route pte translation faults via do_page_fault, which elides the address check against TASK_SIZE before invoking the mm fault handling code. However, this can cause issues with the path walking code in conjunction with our word-at-a-time implementation because load_unaligned_zeropad can end up faulting in kernel space if it reads across a page boundary and runs into a page fault (e.g. by attempting to read from a guard region). In the case of such a fault, load_unaligned_zeropad has registered a fixup to shift the valid data and pad with zeroes, however the abort is reported as a level 3 translation fault and we dispatch it straight to do_page_fault, despite it being a kernel address. This results in calling a sleeping function from atomic context: BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313 in_atomic(): 0, irqs_disabled(): 0, pid: 10290 Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [...] [] ___might_sleep+0x134/0x144 [] __might_sleep+0x7c/0x8c [] do_page_fault+0x140/0x330 [] do_mem_abort+0x54/0xb0 Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0) [...] [] el1_da+0x18/0x78 [] path_parentat+0x44/0x88 [] filename_parentat+0x5c/0xd8 [] filename_create+0x4c/0x128 [] SyS_mkdirat+0x50/0xc8 [] el0_svc_naked+0x24/0x28 Code: 36380080 d5384100 f9400800 9402566d (d4210000) ---[ end trace 2d01889f2bca9b9f ]--- Fix this by dispatching all translation faults to do_translation_faults, which avoids invoking the page fault logic for faults on kernel addresses. Cc: Reported-by: Ankit Jain Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 89993c4be1be..2069e9bc0fca 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -651,7 +651,7 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, - { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_bad, SIGBUS, 0, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, From bc829ee36e0ec92383c9d9b88fe08f00d4d592f8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 29 Sep 2017 11:24:19 -0500 Subject: [PATCH 1053/1322] x86/mm: Disable branch profiling in mem_encrypt.c Some routines in mem_encrypt.c are called very early in the boot process, e.g. sme_encrypt_kernel(). 
When CONFIG_TRACE_BRANCH_PROFILING=y is defined the resulting branch profiling associated with the check to see if SME is active results in a kernel crash. Disable branch profiling for mem_encrypt.c by defining DISABLE_BRANCH_PROFILING before including any header files. Reported-by: kernel test robot Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929162419.6016.53390.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3fcc8e01683b..16c5f37933a2 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -10,6 +10,8 @@ * published by the Free Software Foundation. */ +#define DISABLE_BRANCH_PROFILING + #include #include #include From 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2017 17:15:36 +0300 Subject: [PATCH 1054/1322] x86/asm: Use register variable to get stack pointer value Currently we use current_stack_pointer() function to get the value of the stack pointer register. Since commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") ... we have a stack register variable declared. It can be used instead of current_stack_pointer() function which allows to optimize away some excessive "mov %rsp, %" instructions: -mov %rsp,%rdx -sub %rdx,%rax -cmp $0x3fff,%rax -ja ffffffff810722fd +sub %rsp,%rax +cmp $0x3fff,%rax +ja ffffffff810722fa Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer and use it instead of the removed function. Signed-off-by: Andrey Ryabinin Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 4 ++-- arch/x86/include/asm/thread_info.h | 11 ----------- arch/x86/kernel/irq_32.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- arch/x86/mm/tlb.c | 2 +- 5 files changed, 7 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 30c3c9ac784a..b0dc91f4bedc 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -141,8 +141,8 @@ * gets set up by the containing function. If you forget to do this, objtool * may print a "call without frame pointer save/setup" warning. */ -register unsigned long __asm_call_sp asm(_ASM_SP); -#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 5161da1a0fa0..89e7eeb5cec1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -158,17 +158,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - /* * Walks up the stack frames to make sure that the specified object is * entirely contained by a single stack frame. 
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1f38d9a4d9de..d4eb450144fd 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -64,7 +64,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -139,7 +139,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 34ea3651362e..67db4f43309e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -142,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93fe97cce581..49d9778376d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -191,7 +191,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer()); + unsigned int index = pgd_index(current_stack_pointer); pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Sep 2017 13:43:15 -0400 Subject: [PATCH 1055/1322] fix infoleak in waitid(2) kernel_waitid() can return a PID, an error or 0. rusage is filled in the first case and waitid(2) rusage should've been copied out exactly in that case, *not* whenever kernel_waitid() has not returned an error. Compat variant shares that braino; none of kernel_wait4() callers do, so the below ought to fix it. Reported-and-tested-by: Alexander Potapenko Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland") Cc: stable@vger.kernel.org # v4.13 Signed-off-by: Al Viro --- kernel/exit.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f2cd53e92147 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? 
&r : NULL); int signo = 0; + if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } @@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err && uru) { - /* kernel_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - err = copy_to_user(uru, &ru, sizeof(ru)); - else - err = put_compat_rusage(&ru, uru); - if (err) - return -EFAULT; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } } if (!infop) From 2b634bb0686e43a6338fe779fbabd72b6b928fdc Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Jul 2017 09:10:14 -0400 Subject: [PATCH 1056/1322] i40e/i40evf: rename bytes_per_int to bytes_per_usec This value is not calculating bytes_per_int, which would actually just be bytes/ITR_COUNTDOWN_START, but rather it's calculating bytes/usecs. Rename the variable for clarity so that future developers understand what the value is actually calculating. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 12 ++++++------ drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index f426762bd83a..d9fdf69bbc6e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -960,14 +960,14 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc) { enum i40e_latency_range new_latency_range = rc->latency_range; u32 new_itr = rc->itr; - int bytes_per_int; + int bytes_per_usec; unsigned int usecs, estimated_usecs; if (rc->total_packets == 0 || !rc->itr) return false; usecs = (rc->itr << 1) * ITR_COUNTDOWN_START; - bytes_per_int = rc->total_bytes / usecs; + bytes_per_usec = rc->total_bytes / usecs; /* The calculations in this algorithm depend on interrupts actually * firing at the ITR rate. 
This may not happen if the packet rate is @@ -993,18 +993,18 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc) */ switch (new_latency_range) { case I40E_LOWEST_LATENCY: - if (bytes_per_int > 10) + if (bytes_per_usec > 10) new_latency_range = I40E_LOW_LATENCY; break; case I40E_LOW_LATENCY: - if (bytes_per_int > 20) + if (bytes_per_usec > 20) new_latency_range = I40E_BULK_LATENCY; - else if (bytes_per_int <= 10) + else if (bytes_per_usec <= 10) new_latency_range = I40E_LOWEST_LATENCY; break; case I40E_BULK_LATENCY: default: - if (bytes_per_int <= 20) + if (bytes_per_usec <= 20) new_latency_range = I40E_LOW_LATENCY; break; } diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index c32c62462c84..37e1de886d48 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -358,14 +358,14 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc) { enum i40e_latency_range new_latency_range = rc->latency_range; u32 new_itr = rc->itr; - int bytes_per_int; + int bytes_per_usec; unsigned int usecs, estimated_usecs; if (rc->total_packets == 0 || !rc->itr) return false; usecs = (rc->itr << 1) * ITR_COUNTDOWN_START; - bytes_per_int = rc->total_bytes / usecs; + bytes_per_usec = rc->total_bytes / usecs; /* The calculations in this algorithm depend on interrupts actually * firing at the ITR rate. This may not happen if the packet rate is @@ -391,18 +391,18 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc) */ switch (new_latency_range) { case I40E_LOWEST_LATENCY: - if (bytes_per_int > 10) + if (bytes_per_usec > 10) new_latency_range = I40E_LOW_LATENCY; break; case I40E_LOW_LATENCY: - if (bytes_per_int > 20) + if (bytes_per_usec > 20) new_latency_range = I40E_BULK_LATENCY; - else if (bytes_per_int <= 10) + else if (bytes_per_usec <= 10) new_latency_range = I40E_LOWEST_LATENCY; break; case I40E_BULK_LATENCY: default: - if (bytes_per_int <= 20) + if (bytes_per_usec <= 20) new_latency_range = I40E_LOW_LATENCY; break; } From 16badf758b25bd00528246ab9af938296b9d368d Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Fri, 14 Jul 2017 09:10:15 -0400 Subject: [PATCH 1057/1322] i40e: Fix unqualified module message while bringing link up In current driver, when ifconfig ethx up is done, the link state doesn't transition to UP inside i40e_open(). It changes after AQ command response is handled in i40e_handle_link_event(). When pf->hw.phy.link_info.link_info is DOWN inside i40e_open(), The state is transient and invalid. So log message gets printed based on incorrect info (i.e link_info and an_info). This commit removes check for unqualified module inside i40e_up_complete(). The existing check in i40e_handle_link_event() logs the error message based on correct link state information. 
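For context, the shape of the qualified-module test involved is sketched below (illustrative only, using the field and flag names visible in the diff that follows; it is not the exact code path in i40e_handle_link_event()). The point of the patch is that this test is only meaningful after an admin-queue link event has refreshed pf->hw.phy.link_info, which is not yet the case inside i40e_open():

    /* Illustrative sketch: warn about an unqualified module only once
     * link_info has been updated by a link event, never from the transient
     * state seen during i40e_open().
     */
    if ((pf->hw.phy.link_info.link_info & I40E_AQ_MEDIA_AVAILABLE) &&
        !(pf->hw.phy.link_info.an_info & I40E_AQ_QUALIFIED_MODULE))
            netdev_err(vsi->netdev,
                       "the driver failed to link because an unqualified module was detected.\n");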
Signed-off-by: Sudheer Mogilappagari Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6498da8806cb..b235a27232a8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5470,15 +5470,6 @@ static int i40e_up_complete(struct i40e_vsi *vsi) i40e_print_link_message(vsi, true); netif_tx_start_all_queues(vsi->netdev); netif_carrier_on(vsi->netdev); - } else if (vsi->netdev) { - i40e_print_link_message(vsi, false); - /* need to check for qualified module here*/ - if ((pf->hw.phy.link_info.link_info & - I40E_AQ_MEDIA_AVAILABLE) && - (!(pf->hw.phy.link_info.an_info & - I40E_AQ_QUALIFIED_MODULE))) - netdev_err(vsi->netdev, - "the driver failed to link because an unqualified module was detected."); } /* replay FDIR SB filters */ From 9a03449d3ea0f6b497ff3a3bf6203a5e72c7e6be Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Fri, 14 Jul 2017 09:10:16 -0400 Subject: [PATCH 1058/1322] i40e: Fix link down message when interface is brought up i40e_print_link_message() is intended to compare new link state with current link state and print log message only if the new state is different from current state. However in current driver the new state does not get updated when link is going down because of the if condition. When an interface is brought down, vsi->state is set to I40E_VSI_DOWN in i40e_vsi_close() and later i40e_print_link_message() does not get invoked in i40e_link_event due to if condition. Hence link down message doesn't appear when link is going down. The down state is seen later during i40e_open() and old state gets printed. The actual link state doesn't get updated in i40e_close() or i40e_open() but when i40e_handle_link_event is called inside i40e_clean_adminq_subtask. This change allows i40e_print_link_message() to be called when interface is going down and keeps the state information updated. Signed-off-by: Sudheer Mogilappagari Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b235a27232a8..2e8fe6186b38 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6420,8 +6420,7 @@ static void i40e_link_event(struct i40e_pf *pf) new_link == netif_carrier_ok(vsi->netdev))) return; - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) - i40e_print_link_message(vsi, new_link); + i40e_print_link_message(vsi, new_link); /* Notify the base of the switch tree connected to * the link. Floating VEBs are not notified. From 3fded4663b07f8fa99b9424ca3d5c46b79f6b27e Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Fri, 14 Jul 2017 09:10:18 -0400 Subject: [PATCH 1059/1322] i40e: simplify member variable accesses This commit replaces usage of vsi->back in i40e_print_link_message() (which is actually a PF pointer) with temp variable. 
Signed-off-by: Sudheer Mogilappagari Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 2e8fe6186b38..3c650917b54f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5346,13 +5346,14 @@ out: void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) { enum i40e_aq_link_speed new_speed; + struct i40e_pf *pf = vsi->back; char *speed = "Unknown"; char *fc = "Unknown"; char *fec = ""; char *req_fec = ""; char *an = ""; - new_speed = vsi->back->hw.phy.link_info.link_speed; + new_speed = pf->hw.phy.link_info.link_speed; if ((vsi->current_isup == isup) && (vsi->current_speed == new_speed)) return; @@ -5366,13 +5367,13 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) /* Warn user if link speed on NPAR enabled partition is not at * least 10GB */ - if (vsi->back->hw.func_caps.npar_enable && - (vsi->back->hw.phy.link_info.link_speed == I40E_LINK_SPEED_1GB || - vsi->back->hw.phy.link_info.link_speed == I40E_LINK_SPEED_100MB)) + if (pf->hw.func_caps.npar_enable && + (pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_1GB || + pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_100MB)) netdev_warn(vsi->netdev, "The partition detected link speed that is less than 10Gbps\n"); - switch (vsi->back->hw.phy.link_info.link_speed) { + switch (pf->hw.phy.link_info.link_speed) { case I40E_LINK_SPEED_40GB: speed = "40 G"; break; @@ -5395,7 +5396,7 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) break; } - switch (vsi->back->hw.fc.current_mode) { + switch (pf->hw.fc.current_mode) { case I40E_FC_FULL: fc = "RX/TX"; break; @@ -5410,18 +5411,18 @@ void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) break; } - if (vsi->back->hw.phy.link_info.link_speed == I40E_LINK_SPEED_25GB) { + if (pf->hw.phy.link_info.link_speed == I40E_LINK_SPEED_25GB) { req_fec = ", Requested FEC: None"; fec = ", FEC: None"; an = ", Autoneg: False"; - if (vsi->back->hw.phy.link_info.an_info & I40E_AQ_AN_COMPLETED) + if (pf->hw.phy.link_info.an_info & I40E_AQ_AN_COMPLETED) an = ", Autoneg: True"; - if (vsi->back->hw.phy.link_info.fec_info & + if (pf->hw.phy.link_info.fec_info & I40E_AQ_CONFIG_FEC_KR_ENA) fec = ", FEC: CL74 FC-FEC/BASE-R"; - else if (vsi->back->hw.phy.link_info.fec_info & + else if (pf->hw.phy.link_info.fec_info & I40E_AQ_CONFIG_FEC_RS_ENA) fec = ", FEC: CL108 RS-FEC"; From e04ea00217904fc3f6fddac0b74e74e5ac488fda Mon Sep 17 00:00:00 2001 From: Mariusz Stachura Date: Fri, 14 Jul 2017 09:10:19 -0400 Subject: [PATCH 1060/1322] i40e: relax warning message in case of version mismatch Fortville and Fort Park devices are often on different firmware release schedules. This change relaxes the minor version warning message, so it is only displayed for older FW warning version for old firmware Fortville 3 or earlier. 
Signed-off-by: Mariusz Stachura Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3c650917b54f..a887087d08cd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11374,8 +11374,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw->aq.api_min_ver > I40E_FW_API_VERSION_MINOR) dev_info(&pdev->dev, "The driver for the device detected a newer version of the NVM image than expected. Please install the most recent version of the network driver.\n"); - else if (hw->aq.api_maj_ver < I40E_FW_API_VERSION_MAJOR || - hw->aq.api_min_ver < (I40E_FW_API_VERSION_MINOR - 1)) + else if (hw->aq.api_maj_ver == 1 && hw->aq.api_min_ver < 4) dev_info(&pdev->dev, "The driver for the device detected an older version of the NVM image than expected. Please update the NVM image.\n"); From 0dc8692e914ac49931d69b5217d5fe0171fc026e Mon Sep 17 00:00:00 2001 From: Mariusz Stachura Date: Fri, 14 Jul 2017 09:27:00 -0400 Subject: [PATCH 1061/1322] i40e: fix for flow director counters not wrapping as expected An errata with GLQF_PCNT causes it to not wrap as expected. This can cause an error in flow director statistics. This patch resets affected counters just after reading. Signed-off-by: Mariusz Stachura Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 35 ++++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index a887087d08cd..638f5bad0bd7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -599,6 +599,20 @@ static void i40e_stat_update32(struct i40e_hw *hw, u32 reg, *stat = (u32)((new_data + BIT_ULL(32)) - *offset); } +/** + * i40e_stat_update_and_clear32 - read and clear hw reg, update a 32 bit stat + * @hw: ptr to the hardware info + * @reg: the hw reg to read and clear + * @stat: ptr to the stat + **/ +static void i40e_stat_update_and_clear32(struct i40e_hw *hw, u32 reg, u64 *stat) +{ + u32 new_data = rd32(hw, reg); + + wr32(hw, reg, 1); /* must write a nonzero value to clear register */ + *stat += new_data; +} + /** * i40e_update_eth_stats - Update VSI-specific ethernet statistics counters. 
* @vsi: the VSI to be updated @@ -1040,18 +1054,15 @@ static void i40e_update_pf_stats(struct i40e_pf *pf) &osd->rx_jabber, &nsd->rx_jabber); /* FDIR stats */ - i40e_stat_update32(hw, - I40E_GLQF_PCNT(I40E_FD_ATR_STAT_IDX(pf->hw.pf_id)), - pf->stat_offsets_loaded, - &osd->fd_atr_match, &nsd->fd_atr_match); - i40e_stat_update32(hw, - I40E_GLQF_PCNT(I40E_FD_SB_STAT_IDX(pf->hw.pf_id)), - pf->stat_offsets_loaded, - &osd->fd_sb_match, &nsd->fd_sb_match); - i40e_stat_update32(hw, - I40E_GLQF_PCNT(I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id)), - pf->stat_offsets_loaded, - &osd->fd_atr_tunnel_match, &nsd->fd_atr_tunnel_match); + i40e_stat_update_and_clear32(hw, + I40E_GLQF_PCNT(I40E_FD_ATR_STAT_IDX(hw->pf_id)), + &nsd->fd_atr_match); + i40e_stat_update_and_clear32(hw, + I40E_GLQF_PCNT(I40E_FD_SB_STAT_IDX(hw->pf_id)), + &nsd->fd_sb_match); + i40e_stat_update_and_clear32(hw, + I40E_GLQF_PCNT(I40E_FD_ATR_TUNNEL_STAT_IDX(hw->pf_id)), + &nsd->fd_atr_tunnel_match); val = rd32(hw, I40E_PRTPM_EEE_STAT); nsd->tx_lpi_status = From 905770fa3e6f30b393829ba1c238554e7f238aee Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Fri, 14 Jul 2017 09:27:01 -0400 Subject: [PATCH 1062/1322] i40evf: lower message level We see this message regularly on VF reset or unload (which invokes a reset). It's essentially meaningless unless it's happening constantly. To prevent consternation, lower the log level to debug so it's not seen under normal circumstance. Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 85876f4fb1fb..2bb0fe00361f 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -52,7 +52,7 @@ static int i40evf_send_pf_msg(struct i40evf_adapter *adapter, err = i40e_aq_send_msg_to_pf(hw, op, 0, msg, len, NULL); if (err) - dev_err(&adapter->pdev->dev, "Unable to send opcode %d to PF, err %s, aq_err %s\n", + dev_dbg(&adapter->pdev->dev, "Unable to send opcode %d to PF, err %s, aq_err %s\n", op, i40evf_stat_str(hw, err), i40evf_aq_str(hw, hw->aq.asq_last_status)); return err; From c17401a1dd210a5f22ab1ec7c7366037c158a14c Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Jul 2017 09:27:02 -0400 Subject: [PATCH 1063/1322] i40e: use separate state bit for miscellaneous IRQ setup We currently (mis)use the __I40E_RECOVERY_PENDING bit to determine when we should actually request a new IRQ in i40e_setup_misc_vector(). This led to a design mistake where we open-coded the re-setup of the miscellaneous vector in i40e_resume() instead of using the function provided. If we did not open-code this and instead tried to use the i40e_setup_misc_vector() function, it would lead to never reallocating the IRQ. This would lead to a second i40e_suspend() call failing to free the vector due to a NULL pointer dereference. A future patch is going to re-work how the i40e_suspend() and i40e_resume() flows work to clear all IRQ vectors, which would require us to use i40e_setup_misc_vector() directly. Since during this time the __I40E_RECOVERY_PENDING bit is set, we'll never re-allocate the vector. Rather than leaving the open-coded setup in i40e_resume() lets just fix the problem properly in i40e_setup_misc_vector(). 
Introduce a new state bit which indicates when the IRQ has been assigned, which will be set when i40e_setup_misc_vector is first called. This ultimately resolves the issue of re-requesting the vector, without overloading the __I40E_RECOVERY_PENDING state. This ensures that the suspend/resume cycle can use the setup function instead of open-coding the re-request during resume. Additionally, since the only callers of i40e_stop_misc_vector also want to free it, move this code directly into the function to avoid duplication. Due to the new functionality, rename it to i40e_free_misc_vector(). This lets us drop the extra calls to free and re-enable the vector during i40e_suspend() and i40e_resume(). We don't need to call i40e_setup_misc_Vector() in i40e_resume() because it gets called by the i40e_rebuild() call. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 39 ++++++++------------- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index d0c1bf5441d8..b7a539cdca00 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -136,6 +136,7 @@ enum i40e_state_t { __I40E_MDD_EVENT_PENDING, __I40E_VFLR_EVENT_PENDING, __I40E_RESET_RECOVERY_PENDING, + __I40E_MISC_IRQ_REQUESTED, __I40E_RESET_INTR_RECEIVED, __I40E_REINIT_REQUESTED, __I40E_PF_RESET_REQUESTED, diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 638f5bad0bd7..3ea4f8b942c3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3604,14 +3604,20 @@ static int i40e_vsi_enable_irq(struct i40e_vsi *vsi) } /** - * i40e_stop_misc_vector - Stop the vector that handles non-queue events + * i40e_free_misc_vector - Free the vector that handles non-queue events * @pf: board private structure **/ -static void i40e_stop_misc_vector(struct i40e_pf *pf) +static void i40e_free_misc_vector(struct i40e_pf *pf) { /* Disable ICR 0 */ wr32(&pf->hw, I40E_PFINT_ICR0_ENA, 0); i40e_flush(&pf->hw); + + if (pf->flags & I40E_FLAG_MSIX_ENABLED && pf->msix_entries) { + synchronize_irq(pf->msix_entries[0].vector); + free_irq(pf->msix_entries[0].vector, pf); + clear_bit(__I40E_MISC_IRQ_REQUESTED, pf->state); + } } /** @@ -4466,11 +4472,7 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf) { int i; - i40e_stop_misc_vector(pf); - if (pf->flags & I40E_FLAG_MSIX_ENABLED && pf->msix_entries) { - synchronize_irq(pf->msix_entries[0].vector); - free_irq(pf->msix_entries[0].vector, pf); - } + i40e_free_misc_vector(pf); i40e_put_lump(pf->irq_pile, pf->iwarp_base_vector, I40E_IWARP_IRQ_PILE_ID); @@ -8365,13 +8367,12 @@ static int i40e_setup_misc_vector(struct i40e_pf *pf) struct i40e_hw *hw = &pf->hw; int err = 0; - /* Only request the irq if this is the first time through, and - * not when we're rebuilding after a Reset - */ - if (!test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state)) { + /* Only request the IRQ once, the first time through. 
*/ + if (!test_and_set_bit(__I40E_MISC_IRQ_REQUESTED, pf->state)) { err = request_irq(pf->msix_entries[0].vector, i40e_intr, 0, pf->int_name, pf); if (err) { + clear_bit(__I40E_MISC_IRQ_REQUESTED, pf->state); dev_info(&pf->pdev->dev, "request_irq for %s failed: %d\n", pf->int_name, err); @@ -12069,11 +12070,8 @@ static int i40e_suspend(struct pci_dev *pdev, pm_message_t state) wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); - i40e_stop_misc_vector(pf); - if (pf->msix_entries) { - synchronize_irq(pf->msix_entries[0].vector); - free_irq(pf->msix_entries[0].vector, pf); - } + i40e_free_misc_vector(pf); + retval = pci_save_state(pdev); if (retval) return retval; @@ -12113,15 +12111,6 @@ static int i40e_resume(struct pci_dev *pdev) /* handling the reset will rebuild the device state */ if (test_and_clear_bit(__I40E_SUSPENDED, pf->state)) { clear_bit(__I40E_DOWN, pf->state); - if (pf->msix_entries) { - err = request_irq(pf->msix_entries[0].vector, - i40e_intr, 0, pf->int_name, pf); - if (err) { - dev_err(&pf->pdev->dev, - "request_irq for %s failed: %d\n", - pf->int_name, err); - } - } i40e_reset_and_rebuild(pf, false, false); } From 0e5d3da400558b7d30586a2cc1afe02276445636 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Jul 2017 09:27:03 -0400 Subject: [PATCH 1064/1322] i40e: use newer generic PM support instead of legacy PM callbacks Stop using the old legacy PM support, since we now have stable support for the newer generic PM callbacks. This has several advantages. First, we no longer have to manage our own pci_save_state() and power changes, as it's preferred to have the PCI stack do this. Second, these routines get called for both hibernate and suspend to ram, so we can have the driver properly handle all the suspend/resume flows that it needs to. 
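For readers less familiar with the generic PM interface, here is a minimal sketch of the pattern the driver is converted to (hypothetical foo_* names; the real callbacks are the reworked i40e_suspend()/i40e_resume() in the diff below). The key difference is that the callbacks take a struct device and no longer manage PCI power state or config-space save/restore themselves:

    #include <linux/pci.h>
    #include <linux/pm.h>

    static int foo_suspend(struct device *dev)
    {
            /* Quiesce the hardware only; the PCI core saves config space
             * and performs the D-state transition.
             */
            return 0;
    }

    static int foo_resume(struct device *dev)
    {
            /* Re-initialise the hardware; config space is already restored. */
            return 0;
    }

    static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

    static struct pci_driver foo_driver = {
            .name   = "foo",
            /* .id_table, .probe, .remove omitted for brevity */
            .driver = {
                    .pm = &foo_pm_ops,
            },
            /* no legacy .suspend/.resume hooks, no pci_save_state() calls */
    };

Because SIMPLE_DEV_PM_OPS wires the same pair of callbacks into the suspend, freeze, and poweroff slots, they are invoked for both suspend-to-RAM and hibernation, which is what the commit message refers to.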
Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 54 +++++++-------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3ea4f8b942c3..c82360437024 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12050,14 +12050,14 @@ static void i40e_shutdown(struct pci_dev *pdev) #ifdef CONFIG_PM /** - * i40e_suspend - PCI callback for moving to D3 - * @pdev: PCI device information struct + * i40e_suspend - PM callback for moving to D3 + * @dev: generic device information structure **/ -static int i40e_suspend(struct pci_dev *pdev, pm_message_t state) +static int i40e_suspend(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); struct i40e_pf *pf = pci_get_drvdata(pdev); struct i40e_hw *hw = &pf->hw; - int retval = 0; set_bit(__I40E_SUSPENDED, pf->state); set_bit(__I40E_DOWN, pf->state); @@ -12072,41 +12072,17 @@ static int i40e_suspend(struct pci_dev *pdev, pm_message_t state) i40e_free_misc_vector(pf); - retval = pci_save_state(pdev); - if (retval) - return retval; - - pci_wake_from_d3(pdev, pf->wol_en); - pci_set_power_state(pdev, PCI_D3hot); - - return retval; + return 0; } /** - * i40e_resume - PCI callback for waking up from D3 - * @pdev: PCI device information struct + * i40e_resume - PM callback for waking up from D3 + * @dev: generic device information structure **/ -static int i40e_resume(struct pci_dev *pdev) +static int i40e_resume(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); struct i40e_pf *pf = pci_get_drvdata(pdev); - u32 err; - - pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); - /* pci_restore_state() clears dev->state_saves, so - * call pci_save_state() again to restore it. - */ - pci_save_state(pdev); - - err = pci_enable_device_mem(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n"); - return err; - } - pci_set_master(pdev); - - /* no wakeup events while running */ - pci_wake_from_d3(pdev, false); /* handling the reset will rebuild the device state */ if (test_and_clear_bit(__I40E_SUSPENDED, pf->state)) { @@ -12117,22 +12093,26 @@ static int i40e_resume(struct pci_dev *pdev) return 0; } -#endif +#endif /* CONFIG_PM */ + static const struct pci_error_handlers i40e_err_handler = { .error_detected = i40e_pci_error_detected, .slot_reset = i40e_pci_error_slot_reset, .resume = i40e_pci_error_resume, }; +static SIMPLE_DEV_PM_OPS(i40e_pm_ops, i40e_suspend, i40e_resume); + static struct pci_driver i40e_driver = { .name = i40e_driver_name, .id_table = i40e_pci_tbl, .probe = i40e_probe, .remove = i40e_remove, #ifdef CONFIG_PM - .suspend = i40e_suspend, - .resume = i40e_resume, -#endif + .driver = { + .pm = &i40e_pm_ops, + }, +#endif /* CONFIG_PM */ .shutdown = i40e_shutdown, .err_handler = &i40e_err_handler, .sriov_configure = i40e_pci_sriov_configure, From 401586c2b9bb16147f3dcc64d3596013625e2c44 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Jul 2017 09:27:04 -0400 Subject: [PATCH 1065/1322] i40e: don't clear suspended state until we finish resuming When handling suspend and resume callbacks we want to make sure that (a) we don't suspend again if we're already suspended and (b) we don't resume again if we're already resuming. 
Lets make sure we test_and_set the __I40E_SUSPENDED bit in i40e_suspend which ensures that a suspend call when already suspended will exit early. Additionally, if __I40E_SUSPENDED is not set when we begin resuming, exit early as well. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index c82360437024..494cafde6b26 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12059,7 +12059,10 @@ static int i40e_suspend(struct device *dev) struct i40e_pf *pf = pci_get_drvdata(pdev); struct i40e_hw *hw = &pf->hw; - set_bit(__I40E_SUSPENDED, pf->state); + /* If we're already suspended, then there is nothing to do */ + if (test_and_set_bit(__I40E_SUSPENDED, pf->state)) + return 0; + set_bit(__I40E_DOWN, pf->state); if (pf->wol_en && (pf->hw_features & I40E_HW_WOL_MC_MAGIC_PKT_WAKE)) @@ -12084,11 +12087,15 @@ static int i40e_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct i40e_pf *pf = pci_get_drvdata(pdev); - /* handling the reset will rebuild the device state */ - if (test_and_clear_bit(__I40E_SUSPENDED, pf->state)) { - clear_bit(__I40E_DOWN, pf->state); - i40e_reset_and_rebuild(pf, false, false); - } + /* If we're not suspended, then there is nothing to do */ + if (!test_bit(__I40E_SUSPENDED, pf->state)) + return 0; + + clear_bit(__I40E_DOWN, pf->state); + i40e_reset_and_rebuild(pf, false, false); + + /* Clear suspended state last after everything is recovered */ + clear_bit(__I40E_SUSPENDED, pf->state); return 0; } From 5c499228803a77bd4e878c7119fbd40a1dc6d773 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Jul 2017 09:27:05 -0400 Subject: [PATCH 1066/1322] i40e: prevent service task from running while we're suspended Although the service task does check the suspended status before running, it might already be part way through running when we go to suspend. Lets ensure that the service task is stopped and will not be restarted again until we finish resuming. This ensures that service task code does not cause strange interactions with the suspend/resume handlers. 
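A brief sketch of the ordering this relies on (the same two calls appear in the i40e_suspend() hunk below; comments added for illustration):

    /* Stop the periodic timer first so it cannot re-queue the work item,
     * then wait for any service-task invocation already in flight to
     * complete before continuing with suspend.
     */
    del_timer_sync(&pf->service_timer);
    cancel_work_sync(&pf->service_task);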
Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 494cafde6b26..368373459ad5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12065,6 +12065,10 @@ static int i40e_suspend(struct device *dev) set_bit(__I40E_DOWN, pf->state); + /* Ensure service task will not be running */ + del_timer_sync(&pf->service_timer); + cancel_work_sync(&pf->service_task); + if (pf->wol_en && (pf->hw_features & I40E_HW_WOL_MC_MAGIC_PKT_WAKE)) i40e_enable_mc_magic_wake(pf); @@ -12097,6 +12101,10 @@ static int i40e_resume(struct device *dev) /* Clear suspended state last after everything is recovered */ clear_bit(__I40E_SUSPENDED, pf->state); + /* Restart the service task */ + mod_timer(&pf->service_timer, + round_jiffies(jiffies + pf->service_timer_period)); + return 0; } From b980c0634fe56928a45cc3c0f688d96e36705403 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 14 Jul 2017 09:27:06 -0400 Subject: [PATCH 1067/1322] i40e: shutdown all IRQs and disable MSI-X when suspended On some platforms with a large number of CPUs, we will allocate many IRQ vectors. When hibernating, the system will attempt to migrate all of the vectors back to CPU0 when shutting down all the other CPUs. It is possible that we have so many vectors that it cannot re-assign them to CPU0. This is even more likely if we have many devices installed in one platform. The end result is failure to hibernate, as it is not possible to shutdown the CPUs. We can avoid this by disabling MSI-X and clearing our interrupt scheme when the device is suspended. A more ideal solution would be some method for the stack to properly handle this for all drivers, rather than on a case-by-case basis for each driver to fix itself. However, until this more ideal solution exists, we can do our part and shutdown our IRQs during suspend, which should allow systems with a large number of CPUs to safely suspend or hibernate. It may be worth investigating if we should shut down even further when we suspend as it may make the path cleaner, but this was the minimum fix for the hibernation issue mentioned here. Testing-hints: This affects systems with a large number of CPUs, and with multiple devices enabled. Without this change, those platforms are unable to hibernate at all. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 68 ++++++++++++++++++- .../net/ethernet/intel/i40evf/i40evf_main.c | 2 +- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 368373459ad5..8a44793d5390 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8354,6 +8354,57 @@ static int i40e_init_interrupt_scheme(struct i40e_pf *pf) return 0; } +#ifdef CONFIG_PM +/** + * i40e_restore_interrupt_scheme - Restore the interrupt scheme + * @pf: private board data structure + * + * Restore the interrupt scheme that was cleared when we suspended the + * device. This should be called during resume to re-allocate the q_vectors + * and reacquire IRQs. 
+ */ +static int i40e_restore_interrupt_scheme(struct i40e_pf *pf) +{ + int err, i; + + /* We cleared the MSI and MSI-X flags when disabling the old interrupt + * scheme. We need to re-enabled them here in order to attempt to + * re-acquire the MSI or MSI-X vectors + */ + pf->flags |= (I40E_FLAG_MSIX_ENABLED | I40E_FLAG_MSI_ENABLED); + + err = i40e_init_interrupt_scheme(pf); + if (err) + return err; + + /* Now that we've re-acquired IRQs, we need to remap the vectors and + * rings together again. + */ + for (i = 0; i < pf->num_alloc_vsi; i++) { + if (pf->vsi[i]) { + err = i40e_vsi_alloc_q_vectors(pf->vsi[i]); + if (err) + goto err_unwind; + i40e_vsi_map_rings_to_vectors(pf->vsi[i]); + } + } + + err = i40e_setup_misc_vector(pf); + if (err) + goto err_unwind; + + return 0; + +err_unwind: + while (i--) { + if (pf->vsi[i]) + i40e_vsi_free_q_vectors(pf->vsi[i]); + } + + return err; +} +#endif /* CONFIG_PM */ + /** * i40e_setup_misc_vector - Setup the misc vector to handle non queue events * @pf: board private structure @@ -12077,7 +12128,12 @@ static int i40e_suspend(struct device *dev) wr32(hw, I40E_PFPM_APM, (pf->wol_en ? I40E_PFPM_APM_APME_MASK : 0)); wr32(hw, I40E_PFPM_WUFC, (pf->wol_en ? I40E_PFPM_WUFC_MAG_MASK : 0)); - i40e_free_misc_vector(pf); + /* Clear the interrupt scheme and release our IRQs so that the system + * can safely hibernate even when there are a large number of CPUs. + * Otherwise hibernation might fail when mapping all the vectors back + * to CPU0. + */ + i40e_clear_interrupt_scheme(pf); return 0; } @@ -12090,11 +12146,21 @@ static int i40e_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct i40e_pf *pf = pci_get_drvdata(pdev); + int err; /* If we're not suspended, then there is nothing to do */ if (!test_bit(__I40E_SUSPENDED, pf->state)) return 0; + /* We cleared the interrupt scheme when we suspended, so we need to + * restore it now to resume device functionality. + */ + err = i40e_restore_interrupt_scheme(pf); + if (err) { + dev_err(&pdev->dev, "Cannot restore interrupt scheme: %d\n", + err); + } + clear_bit(__I40E_DOWN, pf->state); i40e_reset_and_rebuild(pf, false, false); diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index c243f9da95ae..80ade6510279 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -46,7 +46,7 @@ static const char i40evf_driver_string[] = #define DRV_VERSION_MAJOR 3 #define DRV_VERSION_MINOR 0 -#define DRV_VERSION_BUILD 0 +#define DRV_VERSION_BUILD 1 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ __stringify(DRV_VERSION_MINOR) "." \ __stringify(DRV_VERSION_BUILD) \ From c97fc9b6a798f4253c176231ba0aceda6b59b058 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Fri, 14 Jul 2017 09:27:07 -0400 Subject: [PATCH 1068/1322] i40evf: fix ring to vector mapping The current implementation for mapping queues to vectors is broken because it attempts to map each Tx and Rx ring to its own vector, however we use combined queues so we should actually be mapping the Tx/Rx rings together on one vector. Also in the current implementation, in the case where we have more queues than vectors, we attempt to group the queues together into 'chunks' and map each 'chunk' of queues to a vector. 
Chunking them together would be more ideal if, and only if, we only had RSS because of the way the hashing algorithm works but in the case of a future patch that enables VF ADq, round robin assignment is better and still works with RSS. This patch resolves both those issues and simplifies the code needed to accomplish this. Instead of treating the case where we have more queues than vectors as special, if we notice our vector index is greater than vectors, reset the vector index to zero and continue mapping. This should ensure that in both cases, whether we have enough vectors for each queue or not, the queues get appropriately mapped. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- .../net/ethernet/intel/i40evf/i40evf_main.c | 48 ++++--------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 80ade6510279..69ef6c1d5364 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -432,52 +432,24 @@ i40evf_map_vector_to_txq(struct i40evf_adapter *adapter, int v_idx, int t_idx) **/ static int i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter) { + int rings_remaining = adapter->num_active_queues; + int ridx = 0, vidx = 0; int q_vectors; - int v_start = 0; - int rxr_idx = 0, txr_idx = 0; - int rxr_remaining = adapter->num_active_queues; - int txr_remaining = adapter->num_active_queues; - int i, j; - int rqpv, tqpv; int err = 0; q_vectors = adapter->num_msix_vectors - NONQ_VECS; - /* The ideal configuration... - * We have enough vectors to map one per queue. - */ - if (q_vectors >= (rxr_remaining * 2)) { - for (; rxr_idx < rxr_remaining; v_start++, rxr_idx++) - i40evf_map_vector_to_rxq(adapter, v_start, rxr_idx); + for (; ridx < rings_remaining; ridx++) { + i40evf_map_vector_to_rxq(adapter, vidx, ridx); + i40evf_map_vector_to_txq(adapter, vidx, ridx); - for (; txr_idx < txr_remaining; v_start++, txr_idx++) - i40evf_map_vector_to_txq(adapter, v_start, txr_idx); - goto out; + /* In the case where we have more queues than vectors, continue + * round-robin on vectors until all queues are mapped. + */ + if (++vidx >= q_vectors) + vidx = 0; } - /* If we don't have enough vectors for a 1-to-1 - * mapping, we'll have to group them so there are - * multiple queues per vector. - * Re-adjusting *qpv takes care of the remainder. - */ - for (i = v_start; i < q_vectors; i++) { - rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - i); - for (j = 0; j < rqpv; j++) { - i40evf_map_vector_to_rxq(adapter, i, rxr_idx); - rxr_idx++; - rxr_remaining--; - } - } - for (i = v_start; i < q_vectors; i++) { - tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - i); - for (j = 0; j < tqpv; j++) { - i40evf_map_vector_to_txq(adapter, i, txr_idx); - txr_idx++; - txr_remaining--; - } - } - -out: adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS; return err; From a3f5aa907340b5d7b54223ddbaa90410f168864d Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Fri, 14 Jul 2017 09:27:08 -0400 Subject: [PATCH 1069/1322] i40e: Enable VF to negotiate number of allocated queues Currently the PF allocates a default number of queues for each VF and cannot be changed. This patch enables the VF to request a different number of queues allocated to it. This patch also adds a new virtchnl op and capability flag to facilitate this negotiation. After the PF receives a request message, it will set a requested number of queues for that VF. 
Then when the VF resets, its VSI will get a new number of queues allocated to it. This is a best effort request and since we only allocate a guaranteed default number, if the VF tries to ask for more than the guaranteed number, there may not be enough in HW to accommodate it unless other queues for other VFs are freed. It should also be noted decreasing the number queues allocated to a VF to below the default will NOT enable the allocation of more than 32 VFs per PF and will not free queues guaranteed to each VF by default. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 75 +++++++++++++++++++ .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 1 + include/linux/avf/virtchnl.h | 20 +++++ 4 files changed, 97 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index b7a539cdca00..439c63cb2a0c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -77,6 +77,7 @@ #define i40e_default_queues_per_vmdq(pf) \ (((pf)->hw_features & I40E_HW_RSS_AQ_CAPABLE) ? 4 : 1) #define I40E_DEFAULT_QUEUES_PER_VF 4 +#define I40E_MAX_VF_QUEUES 16 #define I40E_DEFAULT_QUEUES_PER_TC 1 /* should be a power of 2 */ #define i40e_pf_get_max_q_per_tc(pf) \ (((pf)->hw_features & I40E_HW_128_QP_RSS_CAPABLE) ? 128 : 64) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4d1e670f490e..a75396c157d9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -815,6 +815,14 @@ static void i40e_free_vf_res(struct i40e_vf *vf) */ clear_bit(I40E_VF_STATE_INIT, &vf->vf_states); + /* It's possible the VF had requeuested more queues than the default so + * do the accounting here when we're about to free them. + */ + if (vf->num_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) { + pf->queues_left += vf->num_queue_pairs - + I40E_DEFAULT_QUEUES_PER_VF; + } + /* free vsi & disconnect it from the parent uplink */ if (vf->lan_vsi_idx) { i40e_vsi_release(pf->vsi[vf->lan_vsi_idx]); @@ -868,12 +876,27 @@ static int i40e_alloc_vf_res(struct i40e_vf *vf) int total_queue_pairs = 0; int ret; + if (vf->num_req_queues && + vf->num_req_queues <= pf->queues_left + I40E_DEFAULT_QUEUES_PER_VF) + pf->num_vf_qps = vf->num_req_queues; + else + pf->num_vf_qps = I40E_DEFAULT_QUEUES_PER_VF; + /* allocate hw vsi context & associated resources */ ret = i40e_alloc_vsi_res(vf, I40E_VSI_SRIOV); if (ret) goto error_alloc; total_queue_pairs += pf->vsi[vf->lan_vsi_idx]->alloc_queue_pairs; + /* We account for each VF to get a default number of queue pairs. If + * the VF has now requested more, we need to account for that to make + * certain we never request more queues than we actually have left in + * HW. 
+ */ + if (total_queue_pairs > I40E_DEFAULT_QUEUES_PER_VF) + pf->queues_left -= + total_queue_pairs - I40E_DEFAULT_QUEUES_PER_VF; + if (vf->trusted) set_bit(I40E_VIRTCHNL_VF_CAP_PRIVILEGE, &vf->vf_caps); else @@ -1579,6 +1602,9 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_REQ_QUEUES) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; + vfres->num_vsis = num_vsis; vfres->num_queue_pairs = vf->num_queue_pairs; vfres->max_vectors = pf->hw.func_caps.num_msix_vectors_vf; @@ -1986,6 +2012,52 @@ error_param: aq_ret); } +/** + * i40e_vc_request_queues_msg + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * VFs get a default number of queues but can use this message to request a + * different number. Will respond with either the number requested or the + * maximum we can support. + **/ +static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg, int msglen) +{ + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + int req_pairs = vfres->num_queue_pairs; + int cur_pairs = vf->num_queue_pairs; + struct i40e_pf *pf = vf->pf; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) + return -EINVAL; + + if (req_pairs <= 0) { + dev_err(&pf->pdev->dev, + "VF %d tried to request %d queues. Ignoring.\n", + vf->vf_id, req_pairs); + } else if (req_pairs > I40E_MAX_VF_QUEUES) { + dev_err(&pf->pdev->dev, + "VF %d tried to request more than %d queues.\n", + vf->vf_id, + I40E_MAX_VF_QUEUES); + vfres->num_queue_pairs = I40E_MAX_VF_QUEUES; + } else if (req_pairs - cur_pairs > pf->queues_left) { + dev_warn(&pf->pdev->dev, + "VF %d requested %d more queues, but only %d left.\n", + vf->vf_id, + req_pairs - cur_pairs, + pf->queues_left); + vfres->num_queue_pairs = pf->queues_left + cur_pairs; + } else { + vf->num_req_queues = req_pairs; + } + + return i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_REQUEST_QUEUES, 0, + (u8 *)vfres, sizeof(vfres)); +} + /** * i40e_vc_get_stats_msg * @vf: pointer to the VF info @@ -2708,6 +2780,9 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: ret = i40e_vc_disable_vlan_stripping(vf, msg, msglen); break; + case VIRTCHNL_OP_REQUEST_QUEUES: + ret = i40e_vc_request_queues_msg(vf, msg, msglen); + break; case VIRTCHNL_OP_UNKNOWN: default: diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 1f4b0c504368..5111d05d5f2f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -97,6 +97,7 @@ struct i40e_vf { u16 lan_vsi_id; /* ID as used by firmware */ u8 num_queue_pairs; /* num of qps assigned to VF vsis */ + u8 num_req_queues; /* num of requested qps */ u64 num_mdd_events; /* num of mdd events detected */ /* num of continuous malformed or invalid msgs detected */ u64 num_invalid_msgs; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2b038442c352..60e5d90cb18a 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -135,6 +135,7 @@ enum virtchnl_ops { VIRTCHNL_OP_SET_RSS_HENA = 26, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, + VIRTCHNL_OP_REQUEST_QUEUES = 29, }; /* This macro is used to generate a compilation error if a structure @@ -235,6 +236,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define 
VIRTCHNL_VF_OFFLOAD_RSS_AQ 0x00000008 #define VIRTCHNL_VF_OFFLOAD_RSS_REG 0x00000010 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR 0x00000020 +#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES 0x00000040 #define VIRTCHNL_VF_OFFLOAD_VLAN 0x00010000 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING 0x00020000 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 0x00040000 @@ -325,6 +327,21 @@ struct virtchnl_vsi_queue_config_info { struct virtchnl_queue_pair_info qpair[1]; }; +/* VIRTCHNL_OP_REQUEST_QUEUES + * VF sends this message to request the PF to allocate additional queues to + * this VF. Each VF gets a guaranteed number of queues on init but asking for + * additional queues must be negotiated. This is a best effort request as it + * is possible the PF does not have enough queues left to support the request. + * If the PF cannot support the number requested it will respond with the + * maximum number it is able to support; otherwise it will respond with the + * number requested. + */ + +/* VF resource request */ +struct virtchnl_vf_res_request { + u16 num_queue_pairs; +}; + VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info); /* VIRTCHNL_OP_CONFIG_IRQ_MAP @@ -691,6 +708,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: break; + case VIRTCHNL_OP_REQUEST_QUEUES: + valid_len = sizeof(struct virtchnl_vf_res_request); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: From 22b96551f213d7e7d743442c923c266a10306b9b Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Fri, 14 Jul 2017 09:27:09 -0400 Subject: [PATCH 1070/1322] i40e: refactor FW version checking The i40e driver now supports two different devices with two different firmware versions. So be smart about how we handle these. Move the FW version macros to the appropriate header file, and add a convenience macro that checks the version based on the device. Then use this macro to check whether or not the driver can use the new link info API. Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 10 +++++++++- drivers/net/ethernet/intel/i40e/i40e_common.c | 6 ++++-- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h | 10 +++++++++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 5d5f422cbae5..e2a9ec80a623 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -34,7 +34,15 @@ */ #define I40E_FW_API_VERSION_MAJOR 0x0001 -#define I40E_FW_API_VERSION_MINOR 0x0005 +#define I40E_FW_API_VERSION_MINOR_X722 0x0005 +#define I40E_FW_API_VERSION_MINOR_X710 0x0007 + +#define I40E_FW_MINOR_VERSION(_h) ((_h)->mac.type == I40E_MAC_XL710 ? 
\ + I40E_FW_API_VERSION_MINOR_X710 : \ + I40E_FW_API_VERSION_MINOR_X722) + +/* API version 1.7 implements additional link and PHY-specific APIs */ +#define I40E_MINOR_VER_GET_LINK_INFO_XL710 0x0007 struct i40e_aq_desc { __le16 flags; diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 111426ba5fbc..7346d8850c8e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1593,8 +1593,10 @@ i40e_status i40e_aq_get_phy_capabilities(struct i40e_hw *hw, status = I40E_ERR_UNKNOWN_PHY; if (report_init) { - hw->phy.phy_types = le32_to_cpu(abilities->phy_type); - hw->phy.phy_types |= ((u64)abilities->phy_type_ext << 32); + if (hw->mac.type == I40E_MAC_XL710 && + hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710) + status = i40e_aq_get_link_info(hw, true, NULL, NULL); } return status; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 8a44793d5390..47f71d7c3ae0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11434,7 +11434,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) i40e_nvm_version_str(hw)); if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && - hw->aq.api_min_ver > I40E_FW_API_VERSION_MINOR) + hw->aq.api_min_ver > I40E_FW_MINOR_VERSION(hw)) dev_info(&pdev->dev, "The driver for the device detected a newer version of the NVM image than expected. Please install the most recent version of the network driver.\n"); else if (hw->aq.api_maj_ver == 1 && hw->aq.api_min_ver < 4) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h index 83e63e55c4b4..f9f48d1900b0 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h @@ -34,7 +34,15 @@ */ #define I40E_FW_API_VERSION_MAJOR 0x0001 -#define I40E_FW_API_VERSION_MINOR 0x0005 +#define I40E_FW_API_VERSION_MINOR_X722 0x0005 +#define I40E_FW_API_VERSION_MINOR_X710 0x0007 + +#define I40E_FW_MINOR_VERSION(_h) ((_h)->mac.type == I40E_MAC_XL710 ? \ + I40E_FW_API_VERSION_MINOR_X710 : \ + I40E_FW_API_VERSION_MINOR_X722) + +/* API version 1.7 implements additional link and PHY-specific APIs */ +#define I40E_MINOR_VER_GET_LINK_INFO_XL710 0x0007 struct i40e_aq_desc { __le16 flags; From 1f372c7bfb23286d2bf4ce0423ab488e86b74bb2 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Mon, 25 Sep 2017 22:01:36 +0100 Subject: [PATCH 1071/1322] net: ipv6: send NS for DAD when link operationally up The NS for DAD are sent on admin up as long as a valid qdisc is found. A race condition exists by which these packets will not egress the interface if the operational state of the lower device is not yet up. The solution is to delay DAD until the link is operationally up according to RFC2863. Rather than only doing this, follow the existing code checks by deferring IPv6 device initialization altogether. The fix allows DAD on devices like tunnels that are controlled by userspace control plane. The fix has no impact on regular deployments, but means that there is no IPv6 connectivity until the port has been opened in the case of port-based network access control, which should be desirable. Signed-off-by: Mike Manning Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 13c3b697f8c0..f553f72d0bee 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_policy = 0, }; -/* Check if a valid qdisc is available */ -static inline bool addrconf_qdisc_ok(const struct net_device *dev) +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool addrconf_link_ready(const struct net_device *dev) { - return !qdisc_tx_is_noop(dev); + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); } static void addrconf_del_rs_timer(struct inet6_dev *idev) @@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->token = in6addr_any; - if (netif_running(dev) && addrconf_qdisc_ok(dev)) + if (netif_running(dev) && addrconf_link_ready(dev)) ndev->if_flags |= IF_READY; ipv6_mc_init_dev(ndev); @@ -3403,7 +3403,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, /* restore routes for permanent addresses */ addrconf_permanent_addr(dev); - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); @@ -3418,7 +3418,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } } else if (event == NETDEV_CHANGE) { - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is still not ready. */ break; } From 84e14fe353de7624872e582887712079ba0b2d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 26 Sep 2017 21:32:42 -0700 Subject: [PATCH 1072/1322] net-ipv6: add support for sockopt(SOL_IPV6, IPV6_FREEBIND) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far we've been relying on sockopt(SOL_IP, IP_FREEBIND) being usable even on IPv6 sockets. However, it turns out it is perfectly reasonable to want to set freebind on an AF_INET6 SOCK_RAW socket - but there is no way to set any SOL_IP socket option on such a socket (they're all blindly errored out). One use case for this is to allow spoofing src ip on a raw socket via sendmsg cmsg. Tested: built, and booted # python >>> import socket >>> SOL_IP = socket.SOL_IP >>> SOL_IPV6 = socket.IPPROTO_IPV6 >>> IP_FREEBIND = 15 >>> IPV6_FREEBIND = 78 >>> s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM, 0) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 0 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 0 >>> s.setsockopt(SOL_IPV6, IPV6_FREEBIND, 1) >>> s.getsockopt(SOL_IP, IP_FREEBIND) 1 >>> s.getsockopt(SOL_IPV6, IPV6_FREEBIND) 1 Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. 
Miller --- include/uapi/linux/in6.h | 1 + net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 46444f8fbee4..4f8f3eb0699f 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -284,6 +284,7 @@ struct in6_flowlabel_req { #define IPV6_TRANSPARENT 75 #define IPV6_UNICAST_IF 76 #define IPV6_RECVFRAGSIZE 77 +#define IPV6_FREEBIND 78 /* * Multicast Routing: diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..b9404feabd78 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_FREEBIND: + if (optlen < sizeof(int)) + goto e_inval; + /* we also don't have a separate freebind bit for IPV6 */ + inet_sk(sk)->freebind = valbool; + retv = 0; + break; + case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; @@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = inet_sk(sk)->transparent; break; + case IPV6_FREEBIND: + val = inet_sk(sk)->freebind; + break; + case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; From 45c1fd61d5cead2ae23880e0677b8660ab9006a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 27 Sep 2017 22:45:13 +0100 Subject: [PATCH 1073/1322] mkiss: remove redundant check on len being zero The check on len is redundant as it is always greater than 1, so just remove it and make the printk less complex. Detected by CoverityScan, CID#1226729 ("Logically dead code") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/hamradio/mkiss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index aec6c26563cf..54bf8e6e4a09 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -477,7 +477,8 @@ static void ax_encaps(struct net_device *dev, unsigned char *icp, int len) cmd = 0; } ax->crcauto = (cmd ? 0 : 1); - printk(KERN_INFO "mkiss: %s: crc mode %s %d\n", ax->dev->name, (len) ? "set to" : "is", cmd); + printk(KERN_INFO "mkiss: %s: crc mode set to %d\n", + ax->dev->name, cmd); } spin_unlock_bh(&ax->buflock); netif_start_queue(dev); From fef0035c0f31322d417d1954bba5ab959bf91183 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 28 Sep 2017 00:41:44 +0200 Subject: [PATCH 1074/1322] netlink: do not proceed if dump's start() errs Drivers that use the start method for netlink dumping rely on dumpit not being called if start fails. For example, ila_xlat.c allocates memory and assigns it to cb->args[0] in its start() function. It might fail to do that and return -ENOMEM instead. However, even when returning an error, dumpit will be called, which, in the example above, quickly dereferences the memory in cb->args[0], which will OOPS the kernel. This is but one example of how this goes wrong. Since start() has always been a function with an int return type, it therefore makes sense to use it properly, rather than ignoring it. This patch thus returns early and does not call dumpit() when start() fails. Signed-off-by: Jason A. Donenfeld Cc: Johannes Berg Reviewed-by: Johannes Berg Signed-off-by: David S. 
Miller --- net/netlink/af_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..94c11cf0459d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2270,10 +2270,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) From d51711c0557d6dbd26c63144aef32c7b3ec264b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:31 +0800 Subject: [PATCH 1075/1322] ip_gre: ipgre_tap device should keep dst Without keeping dst, the tunnel will not update any mtu/pmtu info, since it does not have a dst on the skb. Reproducer: client(ipgre_tap1 - eth1) <-----> (eth1 - ipgre_tap1)server After reducing eth1's mtu on the client, performance became 0. This patch is to call netif_keep_dst in gre_tap_init, as ipgre does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..8b837f6f5532 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1223,6 +1223,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } From 2d40557cc702ed8e5edd9bd422233f86652d932e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:23:50 +0800 Subject: [PATCH 1076/1322] ip6_gre: ip6gre_tap device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed an issue that ipgre_tap mtu couldn't be updated in tx path. The same fix is needed for ip6gre_tap as well. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 20f66f4c9460..1602b491b281 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1311,6 +1311,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], From d41bb33ba33b8f8debe54ed36be6925eb496e354 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Sep 2017 13:24:07 +0800 Subject: [PATCH 1077/1322] ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path Now when updating mtu in tx path, it doesn't consider an ARPHRD_ETHER tunnel device, like ip6gre_tap tunnel, for which it should also subtract the ether header to get the correct mtu. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f2f21c24915f..a1c24443cd9e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? 
ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; From 7487449c86c65202b3b725c4524cb48dd65e4e6f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:36 +0200 Subject: [PATCH 1078/1322] IPv4: early demux can return an error code Currently no error is emitted, but this infrastructure will be used by the next patch to allow source address validation for mcast sockets. Since early demux can do a route lookup and an ipv4 route lookup can return an error code, this is consistent with the current ipv4 route infrastructure. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/protocol.h | 4 ++-- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/ipv4/ip_input.c | 25 +++++++++++++++---------- net/ipv4/tcp_ipv4.c | 9 +++++---- net/ipv4/udp.c | 11 ++++++----- 6 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/net/protocol.h b/include/net/protocol.h index 65ba335b0e7e..4fc75f7ae23b 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -39,8 +39,8 @@ /* This is used to register protocols. */ struct net_protocol { - void (*early_demux)(struct sk_buff *skb); - void (*early_demux_handler)(struct sk_buff *skb); + int (*early_demux)(struct sk_buff *skb); + int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3bc910a9bfc6..89974c5286d8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); -void tcp_v4_early_demux(struct sk_buff *skb); +int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); diff --git a/include/net/udp.h b/include/net/udp.h index 12dfbfe2e2d7..6c759c8594e2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); } -void udp_v4_early_demux(struct sk_buff *skb); +int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an 
L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..85164d4d3e53 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..9b30f821fe96 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2221,7 +2221,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct iphdr *iph; @@ -2234,7 +2234,7 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2244,14 +2244,14 @@ void udp_v4_early_demux(struct sk_buff *skb) struct in_device *in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return; + return 0; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2278,6 +2278,7 @@ void udp_v4_early_demux(struct sk_buff *skb) */ skb_dst_set_noref(skb, dst); } + return 0; } int udp_rcv(struct sk_buff *skb) From bc044e8db7962e727a75b591b9851ff2ac5cf846 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Sep 2017 15:51:37 +0200 
Subject: [PATCH 1079/1322] udp: perform source validation for mcast early demux The UDP early demux can leverage the rx dst cache even for multicast unconnected sockets. In such a scenario the ipv4 source address is validated only on the first packet in the given flow. After that, when we fetch the dst entry from the socket rx cache, we stop enforcing the rp_filter and we even start accepting any kind of martian addresses. Disabling the dst cache for unconnected multicast sockets would cause a large performance regression, nearly reducing by half the max ingress tput. Instead we factor out a route helper to completely validate an skb source address for multicast packets and we call it from the UDP early demux for mcast packets landing on unconnected sockets, after successfully fetching the related cached dst entry. This still gives a measurable, but limited performance regression: rp_filter = 0 rp_filter = 1 edmux disabled: 1182 Kpps 1127 Kpps edmux before: 2238 Kpps 2238 Kpps edmux after: 2037 Kpps 2019 Kpps The above figures are on top of current net tree. Applying the net-next commit 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") the delta with rp_filter == 0 will decrease even more. Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/route.h | 4 +++- net/ipv4/route.c | 46 +++++++++++++++++++++++++-------------------- net/ipv4/udp.c | 13 ++++++++++++- 3 files changed, 41 insertions(+), 22 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 57dfc6850d37..d538e6db1afe 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -175,7 +175,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } - +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin); int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..ac6fde5d45f1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. 
*/ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9b30f821fe96..5676237d2b0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2224,6 +2224,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2241,7 +2242,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; @@ -2272,11 +2273,21 @@ int udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } return 0; } From 5a59a3a0ef0e546626a762d49dc06feaa204bab3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 28 Sep 2017 17:57:58 +0200 Subject: [PATCH 1080/1322] ppp: fix __percpu annotation Move sparse annotation right after pointer type. Fixes sparse warning: drivers/net/ppp/ppp_generic.c:1422:13: warning: incorrect type in initializer (different address spaces) drivers/net/ppp/ppp_generic.c:1422:13: expected void const [noderef] *__vpp_verify drivers/net/ppp/ppp_generic.c:1422:13: got int * ... Fixes: e5dadc65f9e0 ("ppp: Fix false xmit recursion detect with two ppp devices") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- drivers/net/ppp/ppp_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a404552555d4..c3f77e3b7819 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -120,7 +120,7 @@ struct ppp { int n_channels; /* how many channels are attached 54 */ spinlock_t rlock; /* lock for receive side 58 */ spinlock_t wlock; /* lock for transmit side 5c */ - int *xmit_recursion __percpu; /* xmit recursion detect */ + int __percpu *xmit_recursion; /* xmit recursion detect */ int mru; /* max receive unit 60 */ unsigned int flags; /* control bits 64 */ unsigned int xstate; /* transmit state bits 68 */ From 21a2774ef5d4fde772fbcb9d0eabb76f585e5af5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 28 Sep 2017 11:19:06 -0700 Subject: [PATCH 1081/1322] Revert "net: dsa: bcm_sf2: Defer port enabling to calling port_enable" This reverts commit e85ec74ace29 ("net: dsa: bcm_sf2: Defer port enabling to calling port_enable") because this now makes an unbind followed by a bind fail to connect to the integrated PHY. What this patch missed is that we need the PHY to be enabled with bcm_sf2_gphy_enable_set() before probing it on the MDIO bus. This is correctly done in the ops->setup() function, but by the time ops->port_enable() runs, this is too late. Upon unbind we would power down the PHY, and so when we would bind again, the PHY would be left powered off. Fixes: e85ec74ace29 ("net: dsa: bcm_sf2: Defer port enabling to calling port_enable") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 898d5642b516..7aecc98d0a18 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -754,11 +754,14 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds) struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); unsigned int port; - /* Disable unused ports and configure IMP port */ + /* Enable all valid ports and disable those unused */ for (port = 0; port < priv->hw_params.num_ports; port++) { - if (dsa_is_cpu_port(ds, port)) + /* IMP port receives special treatment */ + if ((1 << port) & ds->enabled_port_mask) + bcm_sf2_port_setup(ds, port, NULL); + else if (dsa_is_cpu_port(ds, port)) bcm_sf2_imp_setup(ds, port); - else if (!((1 << port) & ds->enabled_port_mask)) + else bcm_sf2_port_disable(ds, port, NULL); } From e876a8a7e9dd89dc88c12ca2e81beb478dbe9897 Mon Sep 17 00:00:00 2001 From: Mick Tarsel Date: Thu, 28 Sep 2017 13:53:18 -0700 Subject: [PATCH 1082/1322] ibmvnic: Set state UP State is initially reported as UNKNOWN. Before registering the netdev, call netif_carrier_off(). Once the device is opened, call netif_carrier_on() in order to set the state to UP. Signed-off-by: Mick Tarsel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index cb8182f4fdfa..4bc14a901571 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -927,6 +927,7 @@ static int ibmvnic_open(struct net_device *netdev) } rc = __ibmvnic_open(netdev); + netif_carrier_on(netdev); mutex_unlock(&adapter->reset_lock); return rc; @@ -3899,6 +3900,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) if (rc) goto ibmvnic_init_fail; + netif_carrier_off(netdev); rc = register_netdev(netdev); if (rc) { dev_err(&dev->dev, "failed to register netdev rc=%d\n", rc); From aad06212d36cf34859428a0a279e5c14ee5c9e26 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Fri, 29 Sep 2017 10:02:54 +0200 Subject: [PATCH 1083/1322] tipc: use only positive error codes in messages In commit e3a77561e7d32 ("tipc: split up function tipc_msg_eval()"), we have updated the function tipc_msg_lookup_dest() to set the error codes to negative values at destination lookup failures. Thus when the function sets the error code to -TIPC_ERR_NO_NAME, it is inserted into the 4 bit error field of the message header as 0xf instead of TIPC_ERR_NO_NAME (1). The value 0xf is an unknown error code. In this commit, we set only positive error codes. Fixes: e3a77561e7d32 ("tipc: split up function tipc_msg_eval()") Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..121e59a1d0e7 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); From ef739d8aabaa26dd99f5661e38a86f4d85d8dfed Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 14:34:22 +0100 Subject: [PATCH 1084/1322] net: ipmr: make function ipmr_notifier_init static The function ipmr_notifier_init is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: warning: symbol 'ipmr_notifier_init' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 292a8e80bdfa..a844738b38bd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3224,7 +3224,7 @@ static const struct fib_notifier_ops ipmr_notifier_ops_template = { .owner = THIS_MODULE, }; -int __net_init ipmr_notifier_init(struct net *net) +static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; From b1c49d14200dae47544f7ea6ee9040ded01f8425 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 15:01:16 +0100 Subject: [PATCH 1085/1322] net_sched: remove redundant assignment to ret The assignment of -EINVAL to variable ret is redundant as it is being overwritten on the following error exit paths or to the return value from the following call to basic_set_parms. Fix this up by removing it. 
Cleans up clang warning message: net/sched/cls_basic.c:185:2: warning: Value stored to 'err' is never read Fixes: 1d8134fea2eb ("net_sched: use idr to allocate basic filter handles") Signed-off-by: Colin Ian King Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_basic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index cfeb6f158566..700b345b07f9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -182,7 +182,6 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; if (handle) { fnew->handle = handle; if (!fold) { From 721e08dad17e226ef68819d0a23dc53c25fe8ea5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 29 Sep 2017 10:52:17 -0700 Subject: [PATCH 1086/1322] bpf: Fix compiler warning on info.map_ids for 32bit platform This patch uses u64_to_user_ptr() to cast info.map_ids to a userspace ptr. It also tags the user_map_ids with '__user' for sparse check. Fixes: cb4d2b3f03d8 ("bpf: Add name, load_time, uid and map_ids to bpf_prog_info") Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 11a7f82a55d1..b927da66f653 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1405,7 +1405,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { - u32 *user_map_ids = (u32 *)info.map_ids; + u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) From 09af87d18f6ba05588e6316c47fdacf06e28cce8 Mon Sep 17 00:00:00 2001 From: Simon Xiao Date: Fri, 29 Sep 2017 11:39:46 -0700 Subject: [PATCH 1087/1322] hv_netvsc: report stop_queue and wake_queue Report the numbers of events for stop_queue and wake_queue in ethtool stats. Example: ethtool -S eth0 NIC statistics: ... stop_queue: 7 wake_queue: 7 ... Signed-off-by: Simon Xiao Reviewed-by: Haiyang Zhang Signed-off-by: David S. 
Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc.c | 12 ++++++++++-- drivers/net/hyperv/netvsc_drv.c | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 5176be76ca7d..6f550e15a41c 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -686,6 +686,8 @@ struct netvsc_ethtool_stats { unsigned long tx_busy; unsigned long tx_send_full; unsigned long rx_comp_busy; + unsigned long stop_queue; + unsigned long wake_queue; }; struct netvsc_vf_pcpu_stats { diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index b0d323e24978..6e5194916bbe 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -609,6 +609,7 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device, { struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->trans_id; struct net_device *ndev = hv_get_drvdata(device); + struct net_device_context *ndev_ctx = netdev_priv(ndev); struct vmbus_channel *channel = device->channel; u16 q_idx = 0; int queue_sends; @@ -643,8 +644,10 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device, if (netif_tx_queue_stopped(netdev_get_tx_queue(ndev, q_idx)) && (hv_ringbuf_avail_percent(&channel->outbound) > RING_AVAIL_PERCENT_HIWATER || - queue_sends < 1)) + queue_sends < 1)) { netif_tx_wake_queue(netdev_get_tx_queue(ndev, q_idx)); + ndev_ctx->eth_stats.wake_queue++; + } } static void netvsc_send_completion(struct netvsc_device *net_device, @@ -749,6 +752,7 @@ static inline int netvsc_send_pkt( &net_device->chan_table[packet->q_idx]; struct vmbus_channel *out_channel = nvchan->channel; struct net_device *ndev = hv_get_drvdata(device); + struct net_device_context *ndev_ctx = netdev_priv(ndev); struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx); u64 req_id; int ret; @@ -789,12 +793,16 @@ static inline int netvsc_send_pkt( if (ret == 0) { atomic_inc_return(&nvchan->queue_sends); - if (ring_avail < RING_AVAIL_PERCENT_LOWATER) + if (ring_avail < RING_AVAIL_PERCENT_LOWATER) { netif_tx_stop_queue(txq); + ndev_ctx->eth_stats.stop_queue++; + } } else if (ret == -EAGAIN) { netif_tx_stop_queue(txq); + ndev_ctx->eth_stats.stop_queue++; if (atomic_read(&nvchan->queue_sends) < 1) { netif_tx_wake_queue(txq); + ndev_ctx->eth_stats.wake_queue++; ret = -ENOSPC; } } else { diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index e9d54c9ee78c..f300ae61c6c6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1126,6 +1126,8 @@ static const struct { { "tx_busy", offsetof(struct netvsc_ethtool_stats, tx_busy) }, { "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) }, { "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) }, + { "stop_queue", offsetof(struct netvsc_ethtool_stats, stop_queue) }, + { "wake_queue", offsetof(struct netvsc_ethtool_stats, wake_queue) }, }, vf_stats[] = { { "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) }, { "vf_rx_bytes", offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) }, From 075cfdd659cb1e86f948f11ba577f27706f0756e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 29 Sep 2017 20:51:23 +0100 Subject: [PATCH 1088/1322] net: hns3: fix null pointer dereference before null check pointer ndev is being dereferenced with the call to netif_running before it is being null checked. Re-order the code to only dereference ndev after it has been null checked. 
Detected by CoverityScan, CID#1457206 ("Dereference before null check") Fixes: 9df8f79a4d29 ("net: hns3: Add DCB support when interacting with network stack") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index 4a0890f98b70..c31506514e5d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -2865,7 +2865,7 @@ static int hns3_client_setup_tc(struct hnae3_handle *handle, u8 tc) { struct hnae3_knic_private_info *kinfo = &handle->kinfo; struct net_device *ndev = kinfo->netdev; - bool if_running = netif_running(ndev); + bool if_running; int ret; u8 i; @@ -2875,6 +2875,8 @@ static int hns3_client_setup_tc(struct hnae3_handle *handle, u8 tc) if (!ndev) return -ENODEV; + if_running = netif_running(ndev); + ret = netdev_set_num_tc(ndev, tc); if (ret) return ret; From 3775b1b7f0c330e59c434d1852d7762ae0a9c164 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:15 -0400 Subject: [PATCH 1089/1322] net: dsa: add master helper to look up slaves The DSA tagging code does not need to know about the DSA architecture, it only needs to return the slave device corresponding to the source port index (and eventually the source device index for cascade-capable switches) parsed from the frame received on the master device. For this purpose, provide an inline dsa_master_get_slave helper which validates the device and port indexes and look up the slave device. This makes the tagging rcv functions more concise and robust, and also makes dsa_get_cpu_port obsolete. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/dsa/dsa_priv.h | 24 +++++++++++++++++++----- net/dsa/tag_brcm.c | 9 ++------- net/dsa/tag_dsa.c | 18 ++---------------- net/dsa/tag_edsa.c | 18 ++---------------- net/dsa/tag_ksz.c | 9 +++------ net/dsa/tag_lan9303.c | 20 ++------------------ net/dsa/tag_mtk.c | 16 +++------------- net/dsa/tag_qca.c | 17 +++-------------- net/dsa/tag_trailer.c | 9 +++------ 9 files changed, 39 insertions(+), 101 deletions(-) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index eccc62776283..d429505dc4e7 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -113,6 +113,25 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int dsa_master_ethtool_setup(struct net_device *dev); void dsa_master_ethtool_restore(struct net_device *dev); +static inline struct net_device *dsa_master_get_slave(struct net_device *dev, + int device, int port) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + + if (device < 0 || device >= DSA_MAX_SWITCHES) + return NULL; + + ds = dst->ds[device]; + if (!ds) + return NULL; + + if (port < 0 || port >= ds->num_ports) + return NULL; + + return ds->ports[port].netdev; +} + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); @@ -182,9 +201,4 @@ static inline struct net_device *dsa_master_netdev(struct dsa_slave_priv *p) return p->dp->cpu_dp->netdev; } -static inline struct dsa_port *dsa_get_cpu_port(struct dsa_switch_tree *dst) -{ - return dst->cpu_dp; -} - #endif diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index dbb016434ace..8e4bdb9d9ae3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -92,9 +92,6 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; int source_port; u8 *brcm_tag; @@ -117,8 +114,8 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; - /* Validate port against switch setup, either the port is totally */ - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; /* Remove Broadcom tag and update checksum */ @@ -129,8 +126,6 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index fbf9ca954773..c77218f173d1 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -67,8 +67,6 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *dsa_header; int source_device; int source_port; @@ -93,18 +91,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. 
- */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -153,8 +141,6 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 76367ba1b2e2..0b83cbe0c9e8 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -80,8 +80,6 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *edsa_header; int source_device; int source_port; @@ -106,18 +104,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. - */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -172,8 +160,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 010ca0a336c4..b241c990cde0 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -80,22 +80,19 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *tag; int source_port; tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; source_port = tag[0] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 0b9826105e42..4f211e56c81f 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -71,17 +71,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u16 *lan9303_tag; - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; unsigned int source_port; - ds = dst->ds[0]; - - if (unlikely(!ds)) { - dev_warn_ratelimited(&dev->dev, "Dropping packet, due to missing DSA switch device\n"); - return NULL; - } - if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull\n"); @@ -103,16 +94,12 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, source_port = ntohs(lan9303_tag[1]) & 0x3; - if (source_port >= ds->num_ports) { + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); return NULL; } - if 
(!ds->ports[source_port].netdev) { - dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n"); - return NULL; - } - /* remove the special VLAN tag between the MAC addresses * and the current ethertype field. */ @@ -120,9 +107,6 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - /* forward the packet to the dedicated interface */ - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index ec8ee5f43255..968586c5d40e 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -46,8 +46,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; int port; __be16 *phdr, hdr; @@ -68,20 +66,12 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - MTK_HDR_LEN, 2 * ETH_ALEN); - /* This protocol doesn't support cascading multiple - * switches so it's safe to assume the switch is first - * in the tree. - */ - ds = dst->ds[0]; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_get_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1d4c70711c0f..8d33d9ebf910 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -65,9 +65,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds; u8 ver; int port; __be16 *phdr, hdr; @@ -92,20 +89,12 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, ETH_HLEN - QCA_HDR_LEN); - /* This protocol doesn't support cascading multiple switches so it's - * safe to assume the switch is first in the tree - */ - ds = cpu_dp->ds; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - /* Update skb & forward the frame accordingly */ - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_get_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d2fd4923aa3e..61668be267f5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -58,9 +58,6 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *trailer; int source_port; @@ -73,13 +70,13 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; source_port = trailer[1] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; 
pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port].netdev; - return skb; } From 7ec764eef934409c4e15539440c31bca0b8de005 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:16 -0400 Subject: [PATCH 1090/1322] net: dsa: use cpu_dp in master code Make it clear that the master device is linked to a CPU port by using "cpu_dp" for the dsa_port variable in master.c instead of "port", then use a "port" variable to describe the port index, as usually seen in other places of DSA core. This will make the future patch touching dsa_ptr more readable. There is no functional changes. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/master.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/net/dsa/master.c b/net/dsa/master.c index 5e5147ec5a44..ef15d35f1574 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -17,9 +17,10 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, uint64_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dst->cpu_dp; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; int count = 0; if (ops && ops->get_sset_count && ops->get_ethtool_stats) { @@ -28,15 +29,15 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, } if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, port->index, data + count); + ds->ops->get_ethtool_stats(ds, port, data + count); } static int dsa_master_get_sset_count(struct net_device *dev, int sset) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dst->cpu_dp; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; int count = 0; if (ops && ops->get_sset_count) @@ -52,16 +53,17 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dst->cpu_dp; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; unsigned int i; uint8_t pfx[4]; uint8_t *ndata; - snprintf(pfx, sizeof(pfx), "p%.2d", port->index); + snprintf(pfx, sizeof(pfx), "p%.2d", port); /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; @@ -76,7 +78,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, * the output after to prepend our CPU port prefix we * constructed earlier */ - ds->ops->get_strings(ds, port->index, ndata); + ds->ops->get_strings(ds, port, ndata); count = ds->ops->get_sset_count(ds); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), @@ -89,17 +91,17 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; 
+ struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; - port->orig_ethtool_ops = dev->ethtool_ops; - if (port->orig_ethtool_ops) - memcpy(ops, port->orig_ethtool_ops, sizeof(*ops)); + cpu_dp->orig_ethtool_ops = dev->ethtool_ops; + if (cpu_dp->orig_ethtool_ops) + memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops)); ops->get_sset_count = dsa_master_get_sset_count; ops->get_ethtool_stats = dsa_master_get_ethtool_stats; @@ -113,8 +115,8 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; + struct dsa_port *cpu_dp = dst->cpu_dp; - dev->ethtool_ops = port->orig_ethtool_ops; - port->orig_ethtool_ops = NULL; + dev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } From 62fc95876298987c35e9fb10badd467f4787aae7 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:17 -0400 Subject: [PATCH 1091/1322] net: dsa: use temporary dsa_device_ops variable When resolving the DSA tagging protocol used by a CPU switch, use a temporary "tag_ops" variable to store the dsa_device_ops instead of using directly dst->tag_ops. This will make the future patches moving this pointer around easier to read. There is no functional changes. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 8 +++++--- net/dsa/legacy.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dcccaebde708..6a10c5c1639f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -485,6 +485,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -514,13 +515,14 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, ds->cpu_port_mask |= BIT(index); tag_protocol = ds->ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); ds->cpu_port_mask &= ~BIT(index); - return PTR_ERR(dst->tag_ops); + return PTR_ERR(tag_ops); } + dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; return 0; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ae505d8e4417..8e849013f69d 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -144,13 +144,15 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, * switch. 
*/ if (dst->cpu_dp->ds == ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) - return PTR_ERR(dst->tag_ops); + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) + return PTR_ERR(tag_ops); + dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; } From 152402483ed75b167d5628d414e876ffa7a6d4c4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:18 -0400 Subject: [PATCH 1092/1322] net: dsa: add tagging ops to port The DSA tagging protocol operations are specific to each CPU port, thus the dsa_device_ops pointer belongs to the dsa_port structure. >From now on assign a slave's xmit copy from its CPU port tagging operations. This will ease the future support for multiple CPU ports. Also keep the tag_ops at the beginning of the dsa_port structure so that we ensure copies for hot path are in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 2 +- net/dsa/legacy.c | 1 + net/dsa/slave.c | 3 +-- 5 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index 8dee216a5a9b..4d1df2f086e8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -175,6 +175,9 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* CPU port tagging operations used by master or slave devices */ + const struct dsa_device_ops *tag_ops; + struct dsa_switch *ds; unsigned int index; const char *name; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 6a10c5c1639f..9eac4726dc0c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -522,6 +522,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, return PTR_ERR(tag_ops); } + dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d429505dc4e7..9397291bb3aa 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -66,7 +66,7 @@ struct dsa_notifier_vlan_info { }; struct dsa_slave_priv { - /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ + /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 8e849013f69d..4d374541815a 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -152,6 +152,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (IS_ERR(tag_ops)) return PTR_ERR(tag_ops); + dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; dst->rcv = dst->tag_ops->rcv; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bf8800de13c1..4b634db05cee 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1117,7 +1117,6 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_switch *ds = port->ds; - struct dsa_switch_tree *dst = ds->dst; struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; @@ -1162,7 +1161,7 @@ int dsa_slave_create(struct dsa_port *port, const char *name) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = dst->tag_ops->xmit; + p->xmit = cpu_dp->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; From 3e41f93b358a8800194b87995ad076fc50919719 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:19 -0400 Subject: [PATCH 1093/1322] net: 
dsa: prepare master receive hot path In preparation to make DSA master devices point to their corresponding CPU port instead of the whole tree, add copies of dst and rcv in the dsa_port structure so that we keep fast access in the receive hot path. Also keep the copies at the beginning of the dsa_port structure in order to ensure they are available in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ net/dsa/dsa2.c | 4 ++++ net/dsa/legacy.c | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/include/net/dsa.h b/include/net/dsa.h index 4d1df2f086e8..6bda01fa5747 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -178,6 +178,11 @@ struct dsa_port { /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; + /* Copies for faster access in master receive hot path */ + struct dsa_switch_tree *dst; + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt); + struct dsa_switch *ds; unsigned int index; const char *name; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9eac4726dc0c..b71e3bb478e4 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -524,7 +524,11 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; + + /* Make a few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->dst = dst; return 0; } diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 4d374541815a..96c7e3f8b8bb 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -154,7 +154,11 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, dst->cpu_dp->tag_ops = tag_ops; dst->tag_ops = tag_ops; + + /* Few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->dst = dst; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); From 2f657a600409f1961d67642fe384a9d4be71d36a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:20 -0400 Subject: [PATCH 1094/1322] net: dsa: change dsa_ptr for a dsa_port With DSA, a master net device (CPU facing interface) has a dsa_ptr pointer to which hangs a dsa_switch_tree. This is not correct because a master interface is wired to a dedicated switch port, and because we can theoretically have several master interfaces pointing to several CPU ports of the same switch fabric. Change the master interface's dsa_ptr for the CPU dsa_port pointer. This is a step towards supporting multiple CPU ports. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++-- net/dsa/dsa.c | 6 +++--- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 3 ++- net/dsa/legacy.c | 2 +- net/dsa/master.c | 15 +++++---------- 6 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f535779d9dc1..e1d6ef130611 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -55,7 +55,7 @@ struct netpoll_info; struct device; struct phy_device; -struct dsa_switch_tree; +struct dsa_port; /* 802.11 specific */ struct wireless_dev; @@ -1752,7 +1752,7 @@ struct net_device { struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) - struct dsa_switch_tree *dsa_ptr; + struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 81c852e32821..51ca2a524a27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -160,12 +160,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; - if (unlikely(dst == NULL)) { + if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } @@ -174,7 +174,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b71e3bb478e4..62302558f38c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -438,7 +438,7 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. */ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst; + dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); if (err) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9397291bb3aa..2850077cc9cc 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -116,7 +116,8 @@ void dsa_master_ethtool_restore(struct net_device *dev); static inline struct net_device *dsa_master_get_slave(struct net_device *dev, int device, int port) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_switch *ds; if (device < 0 || device >= DSA_MAX_SWITCHES) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 96c7e3f8b8bb..71917505a5cc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -607,7 +607,7 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. 
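 * (Editor's note, added for this archive: after this series the master's
 * dsa_ptr is the CPU dsa_port itself, so the barrier below orders the
 * cpu_dp->tag_ops/rcv/dst assignments made in dsa_cpu_parse() against
 * publishing the port pointer that the receive hot path dereferences.)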
*/ wmb(); - dev->dsa_ptr = dst; + dev->dsa_ptr = dst->cpu_dp; return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } diff --git a/net/dsa/master.c b/net/dsa/master.c index ef15d35f1574..5f3f57e372e0 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -16,8 +16,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -34,8 +33,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, static int dsa_master_get_sset_count(struct net_device *dev, int sset) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int count = 0; @@ -52,8 +50,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; struct dsa_switch *ds = cpu_dp->ds; int port = cpu_dp->index; @@ -90,8 +87,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; @@ -114,8 +110,7 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; From aa193d9b1d7ea6893ce24a9d141f676950563987 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:21 -0400 Subject: [PATCH 1095/1322] net: dsa: remove tag ops from the switch tree Now that the dsa_ptr is a dsa_port instance, there is no need to keep the tag operations in the dsa_switch_tree structure. Remove it. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 11 ----------- net/dsa/dsa2.c | 2 -- net/dsa/legacy.c | 2 -- 3 files changed, 15 deletions(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index 6bda01fa5747..10dceccd9ce8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -130,11 +130,6 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* Copy of tag_ops->rcv for faster access in hot path */ - struct sk_buff * (*rcv)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt); - /* * The switch port to which the CPU is attached. */ @@ -144,12 +139,6 @@ struct dsa_switch_tree { * Data for the individual switch chips. */ struct dsa_switch *ds[DSA_MAX_SWITCHES]; - - /* - * Tagging protocol operations for adding and removing an - * encapsulation tag. 
- */ - const struct dsa_device_ops *tag_ops; }; /* TC matchall action types, only mirroring for now */ diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 62302558f38c..54ed054777bd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -523,11 +523,9 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, } dst->cpu_dp->tag_ops = tag_ops; - dst->tag_ops = tag_ops; /* Make a few copies for faster access in master receive hot path */ dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->rcv = dst->tag_ops->rcv; dst->cpu_dp->dst = dst; return 0; diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 71917505a5cc..19ff6e0a21dc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -153,11 +153,9 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, return PTR_ERR(tag_ops); dst->cpu_dp->tag_ops = tag_ops; - dst->tag_ops = tag_ops; /* Few copies for faster access in master receive hot path */ dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; - dst->rcv = dst->tag_ops->rcv; dst->cpu_dp->dst = dst; } From 9e66317d3c92ddaab330c125dfe9d06eee268aff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Oct 2017 14:54:54 -0700 Subject: [PATCH 1096/1322] Linux 4.14-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f0c5b21fadb2..cf007a31d575 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Fearless Coyote # *DOCUMENTATION* From e1cfcbe82b4534bd0f99fef92a6d33843fd85e0e Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: [PATCH 1097/1322] ipv4: Namespaceify tcp_fastopen knob Different namespace application might require enable TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs be per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - net/ipv4/af_inet.c | 7 ++++--- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_fastopen.c | 11 +++++------ net/ipv4/tcp_ipv4.c | 2 ++ 7 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index abc84d986da4..16420ccaef15 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; + int sysctl_tcp_fastopen; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index 770b608c8439..9e414a99034f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ddd126d120ac 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,8 +217,9 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). 
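 * (Editor's note: with the knob now per network namespace, the value is
 * read once from sock_net(sk)->ipv4.sysctl_tcp_fastopen into the local
 * 'tcp_fastopen' below, and both TFO_SERVER_* flag tests use that copy.)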
*/ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(true); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d3c038d7b04..e31e853cf486 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,13 +400,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "tcp_fastopen_key", .mode = 0600, @@ -1085,6 +1078,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..dac56c4ad357 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1126,7 +1126,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -2759,7 +2759,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3c33220c418..31b08ec38cb8 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,8 +9,6 @@ #include #include -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); @@ -279,21 +277,22 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ @@ -347,7 +346,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { 
cookie->len = -1; return true; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..88409b13c9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2472,6 +2472,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + return 0; fail: tcp_sk_exit(net); From dd000598a39b6937fcefdf143720ec9fb5250e72 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:41 +0800 Subject: [PATCH 1098/1322] ipv4: Remove the 'publish' logic in tcp_fastopen_init_key_once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'publish' logic is not necessary after commit dfea2aa65424 ("tcp: Do not call tcp_fastopen_reset_cipher from interrupt context"), because in tcp_fastopen_cookie_gen,it wouldn't call tcp_fastopen_init_key_once. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 5 ----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 4 ++-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 9e414a99034f..d9376e2458e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,7 +1555,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(bool publish); +void tcp_fastopen_init_key_once(void); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ddd126d120ac..e73ce79d7176 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -222,7 +222,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e31e853cf486..f6324ead0e19 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -282,11 +282,6 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. 
This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dac56c4ad357..4e395452d69f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 31b08ec38cb8..8c8f0f0af59d 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -13,7 +13,7 @@ struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(void) { static u8 key[TCP_FASTOPEN_KEY_LENGTH]; @@ -23,7 +23,7 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) + if (net_get_random_once(key, sizeof(key))) tcp_fastopen_reset_cipher(key, sizeof(key)); } From 437138485656c41e32b8c63c0987cfa0348be0e6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: [PATCH 1099/1322] ipv4: Namespaceify tcp_fastopen_key knob Different namespace application might require different tcp_fastopen_key independently of the host. David Miller pointed out there is a leak without releasing the context of tcp_fastopen_key during netns teardown. So add the release action in exit_batch path. Tested: 1. Container namespace: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Signed-off-by: Haishuang Yan Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 4 +++ include/net/tcp.h | 6 ++-- net/ipv4/af_inet.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 21 +++++++------ net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 64 +++++++++++++++++++++++++------------- net/ipv4/tcp_ipv4.c | 6 ++++ 7 files changed, 70 insertions(+), 35 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 16420ccaef15..7bb9603ff66c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -36,6 +36,8 @@ struct inet_timewait_death_row { int sysctl_max_tw_buckets; }; +struct tcp_fastopen_context; + struct netns_ipv4 { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; @@ -129,6 +131,8 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + spinlock_t tcp_fastopen_ctx_lock; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index d9376e2458e9..6d25d8305054 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1549,13 +1549,13 @@ struct tcp_fastopen_request { }; void tcp_free_fastopen_req(struct tcp_sock *tp); -extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; -int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_ctx_destroy(struct net *net); +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(void); +void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e73ce79d7176..43a1bbed7a42 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -222,7 +222,7 @@ int inet_listen(struct socket *sock, int backlog) (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f6324ead0e19..20e19fe78dbd 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -282,7 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, user_key, 
TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -395,12 +397,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, { .procname = "tcp_fastopen_blackhole_timeout_sec", .data = &sysctl_tcp_fastopen_blackhole_timeout, @@ -1080,6 +1076,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e395452d69f..23225c98d287 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2749,7 +2749,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8c8f0f0af59d..4eae44ac3cb0 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,13 +9,18 @@ #include #include -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(void) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -23,8 +28,8 @@ void tcp_fastopen_init_key_once(void) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. 
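 * (Editor's note: in the per-netns version the rcu_read_lock() check added
 * above returns early once net->ipv4.tcp_fastopen_ctx exists, so the
 * get_random_bytes()/tcp_fastopen_reset_cipher() pair below only runs for
 * a namespace that does not have a key yet.)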
*/ - if (net_get_random_once(key, sizeof(key))) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -35,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) { int err; struct tcp_fastopen_context *ctx, *octx; @@ -59,26 +79,27 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct net *net, + const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -94,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. 
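 * (Editor's note: the generator now takes struct net and looks the cipher
 * up via net->ipv4.tcp_fastopen_ctx; for IPv6 the cookie is built in two
 * passes, first over the source address and then over that result XORed
 * with the destination address, as the hunks below show.)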
*/ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct net *net, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -102,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(net, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -110,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(net, buf, foc); } } #endif @@ -296,7 +318,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 88409b13c9d2..49c74c0d0d21 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2473,6 +2473,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); return 0; fail: @@ -2483,7 +2484,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { From 3733be14a32bae288b61ed28341e593baba983af Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: [PATCH 1100/1322] ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Different namespace application might require different time period in second to disable Fastopen on active TCP sockets. Tested: Simulate following similar situation that the server's data gets dropped after 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] And then print netstat of TCPFastOpenBlackhole, the counter increased as expected when the firewall blackhole issue is detected and active TFO is disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 20 +++++++++++--------- net/ipv4/tcp_fastopen.c | 30 +++++++++++------------------- net/ipv4/tcp_ipv4.c | 2 ++ 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7bb9603ff66c..2c4222a5d102 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,9 @@ struct netns_ipv4 { int sysctl_tcp_fastopen; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; + unsigned int sysctl_tcp_fastopen_blackhole_timeout; + atomic_t tfo_active_disable_times; + unsigned long tfo_active_disable_stamp; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 20e19fe78dbd..cac8dd309f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -355,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -397,14 +399,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, @@ -1083,6 +1077,14 @@ static struct ctl_table ipv4_net_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 4eae44ac3cb0..de470e7e586f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -422,25 +422,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. 
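 * (Editor's note: the disable window is now per netns and backs off
 * exponentially; as tcp_fastopen_active_should_disable() below computes,
 * it spans (1 << min(tfo_da_times - 1, 6)) *
 * sysctl_tcp_fastopen_blackhole_timeout * HZ jiffies from the stamp
 * recorded at the last disable.)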
*/ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -449,17 +440,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -495,10 +487,10 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49c74c0d0d21..ad3b5bbaf942 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2474,6 +2474,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return 0; fail: From b80ccfe9bbcac70e66fdfaef73f0988a27f9a68c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 26 Sep 2017 20:37:22 -0700 Subject: [PATCH 1101/1322] net-ipv6: remove unused IP6_ECN_clear() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is unused, and furthermore it is buggy since it suffers from the same issue that requires IP6_ECN_set_ce() to take a pointer to the skb so that it may (in case of CHECKSUM_COMPLETE) update skb->csum Instead of fixing it, let's just outright remove it. Tested: builds, and 'git grep IP6_ECN_clear' comes up empty Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. 
Miller --- include/net/inet_ecn.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index dce2d586d9ce..f5ff16d72fe6 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -133,11 +133,6 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) return 1; } -static inline void IP6_ECN_clear(struct ipv6hdr *iph) -{ - *(__be32*)iph &= ~htonl(INET_ECN_MASK << 20); -} - static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; From 935a9749a36828af0e8be224a5cd4bc758112c34 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:53 +0800 Subject: [PATCH 1102/1322] ip_gre: get key from session_id correctly in erspan_rcv erspan only uses the first 10 bits of session_id as the key to look up the tunnel. But in erspan_rcv, it missed 'session_id & ID_MASK' when getting the key from session_id. If any other flag is also set in session_id in a packet, it would fail to find the tunnel due to incorrect key in erspan_rcv. This patch is to add 'session_id & ID_MASK' there and also remove the unnecessary variable session_id. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8b837f6f5532..b25b1e5112d0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, From 5513d08d29511c263c00933c00dd7a82fffda3c9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:54 +0800 Subject: [PATCH 1103/1322] ip_gre: check packet length and mtu correctly in erspan_xmit As a ARPHRD_ETHER device, skb->len in erspan_xmit is the length of the whole ether packet. So before checking if a packet size exceeds the mtu, skb->len should subtract dev->hard_header_len. Otherwise, all packets with max size according to mtu would be trimmed to be truncated packet. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b25b1e5112d0..2a4ef9dc48ff 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -731,7 +731,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { + if (skb->len - dev->hard_header_len > dev->mtu) { pskb_trim(skb, dev->mtu); truncate = true; } From c122fda271717f4fc618e0a31e833941fd5f1efd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:55 +0800 Subject: [PATCH 1104/1322] ip_gre: set tunnel hlen properly in erspan_tunnel_init According to __gre_tunnel_init, tunnel->hlen should be set as the headers' length between inner packet and outer iphdr. It would be used especially to calculate a proper mtu when updating mtu in tnl_update_pmtu. Now without setting it, a bigger mtu value than expected would be updated, which hurts performance a lot. This patch is to fix it by setting tunnel->hlen with: tunnel->tun_hlen + tunnel->encap_hlen + sizeof(struct erspanhdr) Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2a4ef9dc48ff..fad0bb1e3e9a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1245,7 +1245,9 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; From c84bed440e4e11a973e8c0254d0dfaccfca41fb0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2017 22:00:56 +0800 Subject: [PATCH 1105/1322] ip_gre: erspan device should keep dst The patch 'ip_gre: ipgre_tap device should keep dst' fixed the issue ipgre_tap dev mtu couldn't be updated in tx path. The same fix is needed for erspan as well. Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fad0bb1e3e9a..467e44d7587d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1254,6 +1254,7 @@ static int erspan_tunnel_init(struct net_device *dev) dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } From 9f775ead5e570e7e19015b9e4e2f3dd6e71a5935 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 28 Sep 2017 15:44:38 +0200 Subject: [PATCH 1106/1322] l2tp: fix l2tp_eth module loading The l2tp_eth module crashes if its netlink callbacks are run when the pernet data aren't initialised. We should normally register_pernet_device() before the genl callbacks. However, the pernet data only maintain a list of l2tpeth interfaces, and this list is never used. So let's just drop pernet handling instead. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_eth.c | 51 ++------------------------------------------- 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } From 90841047a01b452cc8c3f9b990698b264143334a Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Thu, 28 Sep 2017 11:35:00 -0700 Subject: [PATCH 1107/1322] r8152: add Linksys USB3GIGV1 id This linksys dongle by default comes up in cdc_ether mode. This patch allows r8152 to claim the device: Bus 002 Device 002: ID 13b1:0041 Linksys Signed-off-by: Grant Grundler Reviewed-by: Douglas Anderson Signed-off-by: David S. 
Miller --- drivers/net/usb/cdc_ether.c | 10 ++++++++++ drivers/net/usb/r8152.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 8ab281b478f2..677a85360db1 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -547,6 +547,7 @@ static const struct driver_info wwan_info = { #define REALTEK_VENDOR_ID 0x0bda #define SAMSUNG_VENDOR_ID 0x04e8 #define LENOVO_VENDOR_ID 0x17ef +#define LINKSYS_VENDOR_ID 0x13b1 #define NVIDIA_VENDOR_ID 0x0955 #define HP_VENDOR_ID 0x03f0 #define MICROSOFT_VENDOR_ID 0x045e @@ -737,6 +738,15 @@ static const struct usb_device_id products[] = { .driver_info = 0, }, +#if IS_ENABLED(CONFIG_USB_RTL8152) +/* Linksys USB3GIGV1 Ethernet Adapter */ +{ + USB_DEVICE_AND_INTERFACE_INFO(LINKSYS_VENDOR_ID, 0x0041, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), + .driver_info = 0, +}, +#endif + /* ThinkPad USB-C Dock (based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(LENOVO_VENDOR_ID, 0x3062, USB_CLASS_COMM, diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index ceb78e2ea4f0..941ece08ba78 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -613,6 +613,7 @@ enum rtl8152_flags { #define VENDOR_ID_MICROSOFT 0x045e #define VENDOR_ID_SAMSUNG 0x04e8 #define VENDOR_ID_LENOVO 0x17ef +#define VENDOR_ID_LINKSYS 0x13b1 #define VENDOR_ID_NVIDIA 0x0955 #define MCU_TYPE_PLA 0x0100 @@ -5316,6 +5317,7 @@ static const struct usb_device_id rtl8152_table[] = { {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x7205)}, {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x720c)}, {REALTEK_USB_DEVICE(VENDOR_ID_LENOVO, 0x7214)}, + {REALTEK_USB_DEVICE(VENDOR_ID_LINKSYS, 0x0041)}, {REALTEK_USB_DEVICE(VENDOR_ID_NVIDIA, 0x09ff)}, {} }; From 4792ea04bcd03b8ccfd1ae336c5deba52dd9edc9 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Fri, 29 Sep 2017 14:27:39 +0200 Subject: [PATCH 1108/1322] net: mvpp2: Fix clock resource by adding an optional bus clock On Armada 7K/8K we need to explicitly enable the bus clock. The bus clock is optional because not all the SoCs need them but at least for Armada 7K/8K it is actually mandatory. The binding documentation is updating accordingly. Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- .../devicetree/bindings/net/marvell-pp2.txt | 10 ++++++---- drivers/net/ethernet/marvell/mvpp2.c | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index 7e2dad08a12e..1814fa13f6ab 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -21,8 +21,9 @@ Required properties: - main controller clock (for both armada-375-pp2 and armada-7k-pp2) - GOP clock (for both armada-375-pp2 and armada-7k-pp2) - MG clock (only for armada-7k-pp2) -- clock-names: names of used clocks, must be "pp_clk", "gop_clk" and - "mg_clk" (the latter only for armada-7k-pp2). + - AXI clock (only for armada-7k-pp2) +- clock-names: names of used clocks, must be "pp_clk", "gop_clk", "mg_clk" + and "axi_clk" (the 2 latter only for armada-7k-pp2). The ethernet ports are represented by subnodes. At least one port is required. 
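[Editor's aside, illustrative only and not part of this patch: the mvpp2.c
hunk further below treats the new "axi_clk" as optional, failing the probe
only on -EPROBE_DEFER. A minimal sketch of that pattern, using hypothetical
function and variable names, might look like this:]

	#include <linux/clk.h>
	#include <linux/device.h>
	#include <linux/err.h>

	/* Illustrative helper: fetch an optional bus clock the way the mvpp2
	 * change below handles "axi_clk". Only -EPROBE_DEFER aborts the probe;
	 * an absent clock is simply left as NULL.
	 */
	static int example_get_optional_clk(struct device *dev, struct clk **out)
	{
		struct clk *clk;
		int err;

		clk = devm_clk_get(dev, "axi_clk");
		if (IS_ERR(clk)) {
			err = PTR_ERR(clk);
			if (err == -EPROBE_DEFER)
				return err;	/* provider not ready, retry probe later */
			clk = NULL;		/* no such clock on this SoC: optional */
		} else {
			err = clk_prepare_enable(clk);
			if (err)
				return err;
		}

		*out = clk;	/* NULL is a valid no-op clock for the clk API */
		return 0;
	}

[A missing optional clock leaves the pointer NULL, which clk_disable_unprepare()
and friends accept as a no-op, so teardown needs no extra conditionals.]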
@@ -78,8 +79,9 @@ Example for marvell,armada-7k-pp2: cpm_ethernet: ethernet@0 { compatible = "marvell,armada-7k-pp22"; reg = <0x0 0x100000>, <0x129000 0xb000>; - clocks = <&cpm_syscon0 1 3>, <&cpm_syscon0 1 9>, <&cpm_syscon0 1 5>; - clock-names = "pp_clk", "gop_clk", "gp_clk"; + clocks = <&cpm_syscon0 1 3>, <&cpm_syscon0 1 9>, + <&cpm_syscon0 1 5>, <&cpm_syscon0 1 18>; + clock-names = "pp_clk", "gop_clk", "gp_clk", "axi_clk"; eth0: eth0 { interrupts = , diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 161055564720..9c86cb7cb988 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -793,6 +793,7 @@ struct mvpp2 { struct clk *pp_clk; struct clk *gop_clk; struct clk *mg_clk; + struct clk *axi_clk; /* List of pointers to port structures */ struct mvpp2_port **port_list; @@ -7970,6 +7971,18 @@ static int mvpp2_probe(struct platform_device *pdev) err = clk_prepare_enable(priv->mg_clk); if (err < 0) goto err_gop_clk; + + priv->axi_clk = devm_clk_get(&pdev->dev, "axi_clk"); + if (IS_ERR(priv->axi_clk)) { + err = PTR_ERR(priv->axi_clk); + if (err == -EPROBE_DEFER) + goto err_gop_clk; + priv->axi_clk = NULL; + } else { + err = clk_prepare_enable(priv->axi_clk); + if (err < 0) + goto err_gop_clk; + } } /* Get system's tclk rate */ @@ -8024,6 +8037,7 @@ static int mvpp2_probe(struct platform_device *pdev) return 0; err_mg_clk: + clk_disable_unprepare(priv->axi_clk); if (priv->hw_version == MVPP22) clk_disable_unprepare(priv->mg_clk); err_gop_clk: @@ -8061,6 +8075,7 @@ static int mvpp2_remove(struct platform_device *pdev) aggr_txq->descs_dma); } + clk_disable_unprepare(priv->axi_clk); clk_disable_unprepare(priv->mg_clk); clk_disable_unprepare(priv->pp_clk); clk_disable_unprepare(priv->gop_clk); From 81359617f1b783a01e6e22b46cbb046e9513b9c6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 30 Sep 2017 07:34:34 +0200 Subject: [PATCH 1109/1322] net: hns3: Fix an error handling path in 'hclge_rss_init_hw()' If this sanity check fails, we must free 'rss_indir'. Otherwise there is a memory leak. 'goto err' as done in the other error handling paths to fix it. Fixes: 46a3df9f9718 ("net: hns3: Fix for setting rss_size incorrectly") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e0685e630afe..c1cdbfd83bdb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2652,7 +2652,8 @@ static int hclge_rss_init_hw(struct hclge_dev *hdev) dev_err(&hdev->pdev->dev, "Configure rss tc size failed, invalid TC_SIZE = %d\n", rss_size); - return -EINVAL; + ret = -EINVAL; + goto err; } roundup_size = roundup_pow_of_two(rss_size); From 9c86b846ce02f7e35d7234cf090b80553eba5389 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 1 Oct 2017 13:02:15 +0200 Subject: [PATCH 1110/1322] bcm63xx_enet: correct clock usage Check the return code of prepare_enable and change one last instance of enable only to prepare_enable. Also properly disable and release the clock in error paths and on remove for enetsw. Signed-off-by: Jonas Gorski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 33 ++++++++++++++------ 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index c6221f04a748..a45ec97b5b1e 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1773,7 +1773,9 @@ static int bcm_enet_probe(struct platform_device *pdev) ret = PTR_ERR(priv->mac_clk); goto out; } - clk_prepare_enable(priv->mac_clk); + ret = clk_prepare_enable(priv->mac_clk); + if (ret) + goto out_put_clk_mac; /* initialize default and fetch platform data */ priv->rx_ring_size = BCMENET_DEF_RX_DESC; @@ -1805,9 +1807,11 @@ static int bcm_enet_probe(struct platform_device *pdev) if (IS_ERR(priv->phy_clk)) { ret = PTR_ERR(priv->phy_clk); priv->phy_clk = NULL; - goto out_put_clk_mac; + goto out_disable_clk_mac; } - clk_prepare_enable(priv->phy_clk); + ret = clk_prepare_enable(priv->phy_clk); + if (ret) + goto out_put_clk_phy; } /* do minimal hardware init to be able to probe mii bus */ @@ -1900,13 +1904,16 @@ out_free_mdio: out_uninit_hw: /* turn off mdc clock */ enet_writel(priv, 0, ENET_MIISC_REG); - if (priv->phy_clk) { + if (priv->phy_clk) clk_disable_unprepare(priv->phy_clk); - clk_put(priv->phy_clk); - } -out_put_clk_mac: +out_put_clk_phy: + if (priv->phy_clk) + clk_put(priv->phy_clk); + +out_disable_clk_mac: clk_disable_unprepare(priv->mac_clk); +out_put_clk_mac: clk_put(priv->mac_clk); out: free_netdev(dev); @@ -2748,7 +2755,9 @@ static int bcm_enetsw_probe(struct platform_device *pdev) ret = PTR_ERR(priv->mac_clk); goto out_unmap; } - clk_enable(priv->mac_clk); + ret = clk_prepare_enable(priv->mac_clk); + if (ret) + goto out_put_clk; priv->rx_chan = 0; priv->tx_chan = 1; @@ -2769,7 +2778,7 @@ static int bcm_enetsw_probe(struct platform_device *pdev) ret = register_netdev(dev); if (ret) - goto out_put_clk; + goto out_disable_clk; netif_carrier_off(dev); platform_set_drvdata(pdev, dev); @@ -2778,6 +2787,9 @@ static int bcm_enetsw_probe(struct platform_device *pdev) return 0; +out_disable_clk: + clk_disable_unprepare(priv->mac_clk); + out_put_clk: clk_put(priv->mac_clk); @@ -2809,6 +2821,9 @@ static int bcm_enetsw_remove(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); release_mem_region(res->start, resource_size(res)); + clk_disable_unprepare(priv->mac_clk); + clk_put(priv->mac_clk); + free_netdev(dev); return 0; } From d6213c1f2ad54a964b77471690264ed685718928 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 1 Oct 2017 13:02:16 +0200 Subject: [PATCH 1111/1322] bcm63xx_enet: do not write to random DMA channel on BCM6345 The DMA controller regs actually point to DMA channel 0, so the write to ENETDMA_CFG_REG will actually modify a random DMA channel. Since DMA controller registers do not exist on BCM6345, guard the write with the usual check for dma_has_sram. Signed-off-by: Jonas Gorski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index a45ec97b5b1e..a1e1e12e187a 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1062,7 +1062,8 @@ static int bcm_enet_open(struct net_device *dev) val = enet_readl(priv, ENET_CTL_REG); val |= ENET_CTL_ENABLE_MASK; enet_writel(priv, val, ENET_CTL_REG); - enet_dma_writel(priv, ENETDMA_CFG_EN_MASK, ENETDMA_CFG_REG); + if (priv->dma_has_sram) + enet_dma_writel(priv, ENETDMA_CFG_EN_MASK, ENETDMA_CFG_REG); enet_dmac_writel(priv, priv->dma_chan_en_mask, ENETDMAC_CHANCFG, priv->rx_chan); From 527a48713b01057d94aeec8f4383b1e20c82522c Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 1 Oct 2017 13:02:17 +0200 Subject: [PATCH 1112/1322] bcm63xx_enet: do not rely on probe order Do not rely on the shared device being probed before the enet(sw) devices. This makes it easier to eventually move out the shared device as a dma controller driver (what it should be). Signed-off-by: Jonas Gorski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index a1e1e12e187a..8caf6abab3a6 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1722,10 +1722,8 @@ static int bcm_enet_probe(struct platform_device *pdev) const char *clk_name; int i, ret; - /* stop if shared driver failed, assume driver->probe will be - * called in the same order we register devices (correct ?) */ if (!bcm_enet_shared_base[0]) - return -ENODEV; + return -EPROBE_DEFER; res_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); res_irq_rx = platform_get_resource(pdev, IORESOURCE_IRQ, 1); @@ -2696,11 +2694,8 @@ static int bcm_enetsw_probe(struct platform_device *pdev) struct resource *res_mem; int ret, irq_rx, irq_tx; - /* stop if shared driver failed, assume driver->probe will be - * called in the same order we register devices (correct ?) - */ if (!bcm_enet_shared_base[0]) - return -ENODEV; + return -EPROBE_DEFER; res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); irq_rx = platform_get_irq(pdev, 0); From 7e697ce99ceb09538cdc1dfa9ebb3db60236b0a7 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 1 Oct 2017 13:02:18 +0200 Subject: [PATCH 1113/1322] bcm63xx_enet: use managed functions for clock/ioremap Use managed functions where possible to reduce the amount of resource handling on error and remove paths. Signed-off-by: Jonas Gorski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 54 +++++--------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 8caf6abab3a6..059ef4f1d137 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1767,14 +1767,14 @@ static int bcm_enet_probe(struct platform_device *pdev) clk_name = "enet1"; } - priv->mac_clk = clk_get(&pdev->dev, clk_name); + priv->mac_clk = devm_clk_get(&pdev->dev, clk_name); if (IS_ERR(priv->mac_clk)) { ret = PTR_ERR(priv->mac_clk); goto out; } ret = clk_prepare_enable(priv->mac_clk); if (ret) - goto out_put_clk_mac; + goto out; /* initialize default and fetch platform data */ priv->rx_ring_size = BCMENET_DEF_RX_DESC; @@ -1802,7 +1802,7 @@ static int bcm_enet_probe(struct platform_device *pdev) if (priv->mac_id == 0 && priv->has_phy && !priv->use_external_mii) { /* using internal PHY, enable clock */ - priv->phy_clk = clk_get(&pdev->dev, "ephy"); + priv->phy_clk = devm_clk_get(&pdev->dev, "ephy"); if (IS_ERR(priv->phy_clk)) { ret = PTR_ERR(priv->phy_clk); priv->phy_clk = NULL; @@ -1810,7 +1810,7 @@ static int bcm_enet_probe(struct platform_device *pdev) } ret = clk_prepare_enable(priv->phy_clk); if (ret) - goto out_put_clk_phy; + goto out_disable_clk_mac; } /* do minimal hardware init to be able to probe mii bus */ @@ -1906,14 +1906,8 @@ out_uninit_hw: if (priv->phy_clk) clk_disable_unprepare(priv->phy_clk); -out_put_clk_phy: - if (priv->phy_clk) - clk_put(priv->phy_clk); - out_disable_clk_mac: clk_disable_unprepare(priv->mac_clk); -out_put_clk_mac: - clk_put(priv->mac_clk); out: free_netdev(dev); return ret; @@ -1949,12 +1943,10 @@ static int bcm_enet_remove(struct platform_device *pdev) } /* disable hw block clocks */ - if (priv->phy_clk) { + if (priv->phy_clk) clk_disable_unprepare(priv->phy_clk); - clk_put(priv->phy_clk); - } + clk_disable_unprepare(priv->mac_clk); - clk_put(priv->mac_clk); free_netdev(dev); return 0; @@ -2734,26 +2726,20 @@ static int bcm_enetsw_probe(struct platform_device *pdev) if (ret) goto out; - if (!request_mem_region(res_mem->start, resource_size(res_mem), - "bcm63xx_enetsw")) { - ret = -EBUSY; + priv->base = devm_ioremap_resource(&pdev->dev, res_mem); + if (IS_ERR(priv->base)) { + ret = PTR_ERR(priv->base); goto out; } - priv->base = ioremap(res_mem->start, resource_size(res_mem)); - if (priv->base == NULL) { - ret = -ENOMEM; - goto out_release_mem; - } - - priv->mac_clk = clk_get(&pdev->dev, "enetsw"); + priv->mac_clk = devm_clk_get(&pdev->dev, "enetsw"); if (IS_ERR(priv->mac_clk)) { ret = PTR_ERR(priv->mac_clk); - goto out_unmap; + goto out; } ret = clk_prepare_enable(priv->mac_clk); if (ret) - goto out_put_clk; + goto out; priv->rx_chan = 0; priv->tx_chan = 1; @@ -2785,15 +2771,6 @@ static int bcm_enetsw_probe(struct platform_device *pdev) out_disable_clk: clk_disable_unprepare(priv->mac_clk); - -out_put_clk: - clk_put(priv->mac_clk); - -out_unmap: - iounmap(priv->base); - -out_release_mem: - release_mem_region(res_mem->start, resource_size(res_mem)); out: free_netdev(dev); return ret; @@ -2805,20 +2782,13 @@ static int bcm_enetsw_remove(struct platform_device *pdev) { struct bcm_enet_priv *priv; struct net_device *dev; - struct resource *res; /* stop netdevice */ dev = platform_get_drvdata(pdev); priv = netdev_priv(dev); unregister_netdev(dev); - /* release device resources */ - iounmap(priv->base); - res = platform_get_resource(pdev, IORESOURCE_MEM, 
0); - release_mem_region(res->start, resource_size(res)); - clk_disable_unprepare(priv->mac_clk); - clk_put(priv->mac_clk); free_netdev(dev); return 0; From 4e78e5c5d881bf2d6267545a554c1baf245257b7 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 1 Oct 2017 13:02:19 +0200 Subject: [PATCH 1114/1322] bcm63xx_enet: drop unneeded NULL phy_clk check clk_disable and clk_unprepare are NULL-safe, so need to duplicate the NULL check of the functions. Signed-off-by: Jonas Gorski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c index 059ef4f1d137..f6bc13fe8a99 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c @@ -1903,8 +1903,7 @@ out_free_mdio: out_uninit_hw: /* turn off mdc clock */ enet_writel(priv, 0, ENET_MIISC_REG); - if (priv->phy_clk) - clk_disable_unprepare(priv->phy_clk); + clk_disable_unprepare(priv->phy_clk); out_disable_clk_mac: clk_disable_unprepare(priv->mac_clk); @@ -1943,9 +1942,7 @@ static int bcm_enet_remove(struct platform_device *pdev) } /* disable hw block clocks */ - if (priv->phy_clk) - clk_disable_unprepare(priv->phy_clk); - + clk_disable_unprepare(priv->phy_clk); clk_disable_unprepare(priv->mac_clk); free_netdev(dev); From 840f922317fb5c20841d6d7f3853ead506546ade Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 1 Oct 2017 13:02:20 +0200 Subject: [PATCH 1115/1322] bcm63xx_enet: remove unneeded include We don't use anyhing from that file, so drop it. Signed-off-by: Jonas Gorski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcm63xx_enet.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.h b/drivers/net/ethernet/broadcom/bcm63xx_enet.h index 0a1b7b2e55bd..dd6ae3077433 100644 --- a/drivers/net/ethernet/broadcom/bcm63xx_enet.h +++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.h @@ -8,7 +8,6 @@ #include #include -#include #include #include From 45bfbc013b4294cadafbef821d377d3a99c7ab1e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 1 Oct 2017 17:27:35 +0100 Subject: [PATCH 1116/1322] mlxsw: spectrum: fix uninitialized value in err In the unlikely event that mfc->mfc_un.res.ttls[i] is 255 for all values of i from 0 to MAXIVS-1, the err is not set at all and hence has a garbage value on the error return at the end of the function, so initialize it to 0. Also, the error return check on err and goto to err: inside the for loop makes it impossible for err to be zero at the end of the for loop, so we can remove the redundant err check at the end of the loop. Detected by CoverityScan CID#1457207 ("Unitialized scalar value") Fixes: c011ec1bbfd6 ("mlxsw: spectrum: Add the multicast routing offloading logic") Signed-off-by: Colin Ian King Reviewed-by: Yotam Gigi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c index 09120259a45d..4aaf6ca1be7c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -349,7 +349,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, { struct mlxsw_sp_mr_route_vif_entry *rve, *tmp; struct mlxsw_sp_mr_route *mr_route; - int err; + int err = 0; int i; /* Allocate and init a new route and fill it with parameters */ @@ -376,8 +376,6 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, } } mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]); - if (err) - goto err; mr_route->route_action = mlxsw_sp_mr_route_action(mr_route); return mr_route; From 0929567a7a2dab8455a7313956973ff0d339709a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 1 Oct 2017 14:07:34 -0700 Subject: [PATCH 1117/1322] samples/bpf: fix warnings in xdp_monitor_user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make local functions static to fix HOSTCC samples/bpf/xdp_monitor_user.o samples/bpf/xdp_monitor_user.c:64:7: warning: no previous prototype for ‘gettime’ [-Wmissing-prototypes] __u64 gettime(void) ^~~~~~~ samples/bpf/xdp_monitor_user.c:209:6: warning: no previous prototype for ‘print_bpf_prog_info’ [-Wmissing-prototypes] void print_bpf_prog_info(void) ^~~~~~~~~~~~~~~~~~~ Fixes: 3ffab5460264 ("samples/bpf: xdp_monitor tool based on tracepoints") Signed-off-by: Stephen Hemminger Acked-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- samples/bpf/xdp_monitor_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index b51b4f5e3257..c5ab8b776973 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -61,7 +61,7 @@ static void usage(char *argv[]) } #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -__u64 gettime(void) +static __u64 gettime(void) { struct timespec t; int res; @@ -206,7 +206,7 @@ static void stats_poll(int interval, bool err_only) } } -void print_bpf_prog_info(void) +static void print_bpf_prog_info(void) { int i; From 2a5e597c6bb1b873e473e5f57147e9e5d2755430 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Mon, 18 Sep 2017 09:27:42 -0700 Subject: [PATCH 1118/1322] HID: wacom: Always increment hdev refcount within wacom_get_hdev_data The wacom_get_hdev_data function is used to find and return a reference to the "other half" of a Wacom device (i.e., the touch device associated with a pen, or vice-versa). To ensure these references are properly accounted for, the function is supposed to automatically increment the refcount before returning. This was not done, however, for devices which have pen & touch on different interfaces of the same USB device. This can lead to a WARNING ("refcount_t: underflow; use-after-free") when removing the module or device as we call kref_put() more times than kref_get(). Triggering an "actual" use- after-free would be difficult since both devices will disappear nearly- simultaneously. To silence this warning and prevent the potential error, we need to increment the refcount for all cases within wacom_get_hdev_data. 
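Reduced to a minimal sketch with invented names (shared_data, lookup_get and the matches callback are illustrative, not the driver's actual symbols), the rule being enforced is that any lookup which hands a shared object to a caller must take its own reference, so that every later kref_put() is matched by a kref_get():

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/types.h>

struct shared_data {
	struct kref kref;
	struct list_head list;
};

/* Taking the reference inside the search keeps get/put balanced no
 * matter which branch found the object.
 */
static struct shared_data *lookup_get(struct list_head *head,
				      bool (*matches)(struct shared_data *))
{
	struct shared_data *data;

	list_for_each_entry(data, head, list) {
		if (matches(data)) {
			kref_get(&data->kref);
			return data;
		}
	}

	return NULL;
}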
Fixes: 41372d5d40 ("HID: wacom: Augment 'oVid' and 'oPid' with heuristics for HID_GENERIC") Cc: # v4.9+ Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 735bfbbcaa82..906e654fb0ba 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -668,8 +668,10 @@ static struct wacom_hdev_data *wacom_get_hdev_data(struct hid_device *hdev) /* Try to find an already-probed interface from the same device */ list_for_each_entry(data, &wacom_udev_list, list) { - if (compare_device_paths(hdev, data->dev, '/')) + if (compare_device_paths(hdev, data->dev, '/')) { + kref_get(&data->kref); return data; + } } /* Fallback to finding devices that appear to be "siblings" */ From 814b6d17487fd970f293ee674c90ba267f82415d Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 19 Sep 2017 18:37:46 -0700 Subject: [PATCH 1119/1322] HID: hidraw: fix power sequence when closing device We should not try to bring HID device out of full power state before calling hid_hw_close(), so that transport driver operates on powered up device (making this inverse of the opening sequence). Signed-off-by: Dmitry Torokhov Reviewed-by: Guenter Roeck Reviewed-by: Benson Leung Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index ec530454e6f6..5fbe0f81ab2e 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -337,8 +337,8 @@ static void drop_ref(struct hidraw *hidraw, int exists_bit) kfree(hidraw); } else { /* close device for last reader */ - hid_hw_power(hidraw->hid, PM_HINT_NORMAL); hid_hw_close(hidraw->hid); + hid_hw_power(hidraw->hid, PM_HINT_NORMAL); } } } From 66dcdafe8e251a3edc5d84cf725835567bd3dd35 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 2 Oct 2017 15:50:34 +0800 Subject: [PATCH 1120/1322] Revert "HID: multitouch: Support ALPS PTP stick with pid 0x120A" This reverts commit fcaa4a07d2a4b541e91da7a55d8b3331f96d1865. As noted by Masaki [1], 0x120A + trackpoint will not be used in mass production machines, so remove the ID accordingly. 
[1] http://www.spinics.net/lists/linux-input/msg53222.html Signed-off-by: Kai-Heng Feng Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 - drivers/hid/hid-multitouch.c | 4 ---- 2 files changed, 5 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 832897d4a849..a98919199858 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -75,7 +75,6 @@ #define USB_VENDOR_ID_ALPS_JP 0x044E #define HID_DEVICE_ID_ALPS_U1_DUAL 0x120B -#define HID_DEVICE_ID_ALPS_U1_PTP_2 0x120A #define HID_DEVICE_ID_ALPS_U1_DUAL_PTP 0x121F #define HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP 0x1220 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index c78625dceced..9e8c4d2ba11d 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1419,10 +1419,6 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_ALPS_JP, HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP) }, - { .driver_data = MT_CLS_WIN_8_DUAL, - HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, - USB_VENDOR_ID_ALPS_JP, - HID_DEVICE_ID_ALPS_U1_PTP_2) }, /* Lenovo X1 TAB Gen 2 */ { .driver_data = MT_CLS_WIN_8_DUAL, From 51db452df07bb4c5754b73789253ba21681d9dc2 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 26 Sep 2017 09:11:49 +0900 Subject: [PATCH 1121/1322] Revert "ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members" This reverts commit 275353bb684e to fix a regression which can abort 'alsactl' program in alsa-utils due to assertion in alsa-lib. alsactl: control.c:2513: snd_ctl_elem_value_get_integer: Assertion `idx < sizeof(obj->value.integer.value) / sizeof(obj->value.integer.value[0])' failed. alsactl: control.c:2976: snd_ctl_elem_value_get_integer: Assertion `idx < ARRAY_SIZE(obj->value.integer.value)' failed. This commit is a band-aid. In a point of usage of ALSA control interface, the drivers still bring an issue that they prevent userspace applications to have a consistent way to parse each levels of the dimension information via ALSA control interface. Let me investigate this issue. Current implementation of the drivers have three control element sets with dimension information: * 'Monitor Mixer Volume' (type: integer) * 'VMixer Volume' (type: integer) * 'VU-meters' (type: boolean) Although the number of elements named as 'Monitor Mixer Volume' differs depending on drivers in this group, it can be calculated by macros defined by each driver (= (BX_NUM - BX_ANALOG_IN) * BX_ANALOG_IN). Each of the elements has one member for value and has dimension information with 2 levels (= BX_ANALOG_IN * (BX_NUM - BX_ANALOG_IN)). For these elements, userspace applications are expected to handle the dimension information so that all of the elements construct a matrix where the number of rows and columns are represented by the dimension information. The same way is applied to elements named as 'VMixer Volume'. The number of these elements can also be calculated by macros defined by each drivers (= PX_ANALOG_IN * BX_ANALOG_IN). Each of the element has one member for value and has dimension information with 2 levels (= BX_ANALOG_IN * PX_ANALOG_IN). All of the elements construct a matrix with the dimension information. An element named as 'VU-meters' gets a different way in a point of dimension information. The element includes 96 members for value. The element has dimension information with 3 levels (= 3 or 2 * 16 * 2). 
For this element, userspace applications are expected to handle the dimension information so that all of the members for value construct a matrix where the number of rows and columns are represented by the dimension information. This is different from the way for the former. As a summary, the drivers were not designed to produce a consistent way to parse the dimension information. This makes it hard for general userspace applications such as amixer to parse the information by a consistent way, and actually no userspace applications except for 'echomixer' utilize the dimension information. Additionally, no drivers excluding this group use the information. The reverted commit was written based on the latter way. A commit 860c1994a70a ('ALSA: control: add dimension validator for userspace elements') is written based on the latter way, too. The patch should be reconsider too in the same time to re-define a consistent way to parse the dimension information. Reported-by: Mark Hills Reported-by: S. Christian Collins Fixes: 275353bb684e ('ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members') Cc: # v4.8+ Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai --- sound/pci/echoaudio/echoaudio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/pci/echoaudio/echoaudio.c b/sound/pci/echoaudio/echoaudio.c index 7326695bca33..d68f99e076a8 100644 --- a/sound/pci/echoaudio/echoaudio.c +++ b/sound/pci/echoaudio/echoaudio.c @@ -1272,11 +1272,11 @@ static int snd_echo_mixer_info(struct snd_kcontrol *kcontrol, chip = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = ECHOGAIN_MAXOUT; uinfo->dimen.d[0] = num_busses_out(chip); uinfo->dimen.d[1] = num_busses_in(chip); - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1]; return 0; } @@ -1344,11 +1344,11 @@ static int snd_echo_vmixer_info(struct snd_kcontrol *kcontrol, chip = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = ECHOGAIN_MAXOUT; uinfo->dimen.d[0] = num_busses_out(chip); uinfo->dimen.d[1] = num_pipes_out(chip); - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1]; return 0; } @@ -1728,6 +1728,7 @@ static int snd_echo_vumeters_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 96; uinfo->value.integer.min = ECHOGAIN_MINOUT; uinfo->value.integer.max = 0; #ifdef ECHOCARD_HAS_VMIXER @@ -1737,7 +1738,6 @@ static int snd_echo_vumeters_info(struct snd_kcontrol *kcontrol, #endif uinfo->dimen.d[1] = 16; /* 16 channels */ uinfo->dimen.d[2] = 2; /* 0=level, 1=peak */ - uinfo->count = uinfo->dimen.d[0] * uinfo->dimen.d[1] * uinfo->dimen.d[2]; return 0; } From 17a91809942ca32c70026d2d5ba3348a2c4fdf8f Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 2 Oct 2017 07:17:50 -0700 Subject: [PATCH 1122/1322] fm10k: ensure we process SM mbx when processing VF mbx When we process VF mailboxes, the driver is likely going to also queue up messages to the switch manager. This process merely queues up the FIFO, but doesn't actually begin the transmission process. Because we hold the mailbox lock during this VF processing, the PF<->SM mailbox is not getting processed at this time. Ensure that we actually process the PF<->SM mailbox in between each PF<->VF mailbox. 
This should ensure prompt transmission of the messages queued up after each VF message is received and handled. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index 5f4dac0d36ef..2ec49116fe91 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -126,6 +126,9 @@ process_mbx: struct fm10k_mbx_info *mbx = &vf_info->mbx; u16 glort = vf_info->glort; + /* process the SM mailbox first to drain outgoing messages */ + hw->mbx.ops.process(hw, &hw->mbx); + /* verify port mapping is valid, if not reset port */ if (vf_info->vf_flags && !fm10k_glort_valid_pf(hw, glort)) hw->iov.ops.reset_lport(hw, vf_info); From b52b7f7059f2df8eb3258a25bc69e12dc21ebcd7 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 8 Mar 2017 15:55:43 -0800 Subject: [PATCH 1123/1322] fm10k: reschedule service event if we stall the PF<->SM mailbox When we are handling PF<->VF mailbox messages, it is possible that the VF will send us so many messages that the PF<->SM FIFO will fill up. In this case, we stop the loop and wait until the service event is rescheduled. Normally this should happen due to an interrupt. But it is possible that we don't get another interrupt for a while and it isn't until the service timer actually reschedules us. Instead, simply reschedule immediately which will cause the service event to be run again as soon as we exit. This ensures that we promptly handle all of the PF<->VF messages with minimal delay, while still giving time for the SM mailbox to drain. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index 2ec49116fe91..d8356c494f06 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -143,6 +143,10 @@ process_mbx: if (!hw->mbx.ops.tx_ready(&hw->mbx, FM10K_VFMBX_MSG_MTU)) { /* keep track of how many times this occurs */ interface->hw_sm_mbx_full++; + + /* make sure we try again momentarily */ + fm10k_service_event_schedule(interface); + break; } From 95f49d4bdee34dd0f68446bb260ab537f62ed9b3 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 8 May 2017 18:18:09 +0200 Subject: [PATCH 1124/1322] fm10k: Use seq_putc() in fm10k_dbg_desc_break() Two single characters should be put into a sequence. Thus use the corresponding function "seq_putc". This issue was detected by using the Coccinelle software. 
Signed-off-by: Markus Elfring Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c index 5116fd043630..14df09e2d964 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c @@ -52,9 +52,9 @@ static void fm10k_dbg_desc_seq_stop(struct seq_file __always_unused *s, static void fm10k_dbg_desc_break(struct seq_file *s, int i) { while (i--) - seq_puts(s, "-"); + seq_putc(s, '-'); - seq_puts(s, "\n"); + seq_putc(s, '\n'); } static int fm10k_dbg_tx_desc_seq_show(struct seq_file *s, void *v) From 5c66d1251d67714e9f6e6b0af18ca989109b876f Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:04 -0700 Subject: [PATCH 1125/1322] fm10k: stop spurious link down messages when Tx FIFO is full In fm10k_get_host_state_generic, we check the mailbox tx_read() function to ensure that the mailbox is still open. This function also checks to make sure we have space to transmit another message. Unfortunately, if we just recently sent a bunch of messages (such as enabling hundreds of VLANs on a VF) this can result in a race where the watchdog task thinks the link went down just because we haven't had time to process all these messages yet. Instead, lets just check whether the mailbox is still open. This ensures that we don't race with the Tx FIFO, and we only link down once the mailbox is not open. This is safe, because if the FIFO fills up and we're unable to send a message for too long, we'll end up triggering the timeout detection which results in a reset. Additionally, since we still check to ensure the mailbox state is OPEN, we'll transition to link down whenever the mailbox closes as well. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_common.c b/drivers/net/ethernet/intel/fm10k/fm10k_common.c index 62a6ad9b3eed..736a9f087bc9 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_common.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_common.c @@ -1,5 +1,5 @@ /* Intel(R) Ethernet Switch Host Interface Driver - * Copyright(c) 2013 - 2016 Intel Corporation. + * Copyright(c) 2013 - 2017 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -517,8 +517,8 @@ s32 fm10k_get_host_state_generic(struct fm10k_hw *hw, bool *host_ready) goto out; } - /* verify Mailbox is still valid */ - if (!mbx->ops.tx_ready(mbx, FM10K_VFMBX_MSG_MTU)) + /* verify Mailbox is still open */ + if (mbx->state != FM10K_STATE_OPEN) goto out; /* interface cannot receive traffic without logical ports */ From 375ce90eab7ee1c87eefa2cd312b0be9ac961082 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:05 -0700 Subject: [PATCH 1126/1322] fm10k: fix typos on fall through comments Newer versions of GCC since version 7 now warn when a case statement may fall through without an explicit comment. "Fallthough" does not count as it is misspelled. Fix the typos for these comments to appease the new warnings. 
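For reference, the convention the compiler checks, shown as a stand-alone sketch (the function and case values are invented for illustration): the comment must use a spelling GCC recognizes and must sit immediately before the case label that is fallen into.

static int classify(int mode)
{
	int score = 0;

	switch (mode) {
	case 2:
		score += 2;
		/* fall through */
	case 1:
		score += 1;
		break;
	default:
		break;
	}

	return score;
}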
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_mbx.c | 4 ++-- drivers/net/ethernet/intel/fm10k/fm10k_pf.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c index 334088a101c3..244d3ad58ca7 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c @@ -1,5 +1,5 @@ /* Intel(R) Ethernet Switch Host Interface Driver - * Copyright(c) 2013 - 2016 Intel Corporation. + * Copyright(c) 2013 - 2017 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -1586,7 +1586,7 @@ s32 fm10k_pfvf_mbx_init(struct fm10k_hw *hw, struct fm10k_mbx_info *mbx, mbx->mbmem_reg = FM10K_MBMEM_VF(id, 0); break; } - /* fallthough */ + /* fall through */ default: return FM10K_MBX_ERR_NO_MBX; } diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c index 40ee0242a80a..9e4fb3a44376 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c @@ -1,5 +1,5 @@ /* Intel(R) Ethernet Switch Host Interface Driver - * Copyright(c) 2013 - 2016 Intel Corporation. + * Copyright(c) 2013 - 2017 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -1334,19 +1334,19 @@ static u8 fm10k_iov_supported_xcast_mode_pf(struct fm10k_vf_info *vf_info, case FM10K_XCAST_MODE_PROMISC: if (vf_flags & FM10K_VF_FLAG_PROMISC_CAPABLE) return FM10K_XCAST_MODE_PROMISC; - /* fallthough */ + /* fall through */ case FM10K_XCAST_MODE_ALLMULTI: if (vf_flags & FM10K_VF_FLAG_ALLMULTI_CAPABLE) return FM10K_XCAST_MODE_ALLMULTI; - /* fallthough */ + /* fall through */ case FM10K_XCAST_MODE_MULTI: if (vf_flags & FM10K_VF_FLAG_MULTI_CAPABLE) return FM10K_XCAST_MODE_MULTI; - /* fallthough */ + /* fall through */ case FM10K_XCAST_MODE_NONE: if (vf_flags & FM10K_VF_FLAG_NONE_CAPABLE) return FM10K_XCAST_MODE_NONE; - /* fallthough */ + /* fall through */ default: break; } From b94dd008c401fc73a8d843e3219356255f40c1ed Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:06 -0700 Subject: [PATCH 1127/1322] fm10k: avoid possible truncation of q_vector->name New versions of GCC since version 7 began warning about possible truncation of calls to snprintf. We can fix this and avoid false positives. First, we should pass the full buffer size to snprintf, because it guarantees a NULL character as part of its passed length, so passing len-1 is simply wasting a byte of possible storage. Second, if we make the ri and ti variables unsigned, the compiler is able to correctly reason that the value never gets larger than 256, so it doesn't need to warn about the full space required to print a signed integer. 
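Both points can be reproduced in a small, self-contained user-space sketch (the buffer size and strings are invented; the driver's real buffer is q_vector->name):

#include <stdio.h>

int main(void)
{
	char name[16];
	unsigned int ri = 0;

	/* snprintf() NUL-terminates within the size it is given, so the
	 * full sizeof(name) is safe to pass; sizeof(name) - 1 only wastes
	 * a byte.  An unsigned counter also lets the compiler bound the
	 * width of the %u conversion.
	 */
	snprintf(name, sizeof(name), "%s-TxRx-%u", "eth0", ri++);
	printf("%s\n", name);

	return 0;
}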
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 63784576ae8b..9212b3fa3b62 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1544,7 +1544,7 @@ int fm10k_qv_request_irq(struct fm10k_intfc *interface) struct net_device *dev = interface->netdev; struct fm10k_hw *hw = &interface->hw; struct msix_entry *entry; - int ri = 0, ti = 0; + unsigned int ri = 0, ti = 0; int vector, err; entry = &interface->msix_entries[NON_Q_VECTORS(hw)]; @@ -1554,15 +1554,15 @@ int fm10k_qv_request_irq(struct fm10k_intfc *interface) /* name the vector */ if (q_vector->tx.count && q_vector->rx.count) { - snprintf(q_vector->name, sizeof(q_vector->name) - 1, - "%s-TxRx-%d", dev->name, ri++); + snprintf(q_vector->name, sizeof(q_vector->name), + "%s-TxRx-%u", dev->name, ri++); ti++; } else if (q_vector->rx.count) { - snprintf(q_vector->name, sizeof(q_vector->name) - 1, - "%s-rx-%d", dev->name, ri++); + snprintf(q_vector->name, sizeof(q_vector->name), + "%s-rx-%u", dev->name, ri++); } else if (q_vector->tx.count) { - snprintf(q_vector->name, sizeof(q_vector->name) - 1, - "%s-tx-%d", dev->name, ti++); + snprintf(q_vector->name, sizeof(q_vector->name), + "%s-tx-%u", dev->name, ti++); } else { /* skip this unused q_vector */ continue; From 523a0b558db4ca205522976077911e5efe235781 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:07 -0700 Subject: [PATCH 1128/1322] fm10k: add missing fall through comment Newer versions of GCC starting with 7 now additionally warn when a case statement may fall through without an explicit comment mentioning it. Add such a comment to silence the warning, as this is expected. Unfortunately the comment must come directly before the next case statement, so we put it outside the #ifdef. Otherwise, the compiler cannot properly detect it and thus the warning is displayed regardless. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 9dffaba85ae6..189d52a8a605 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -876,6 +876,7 @@ static void fm10k_tx_csum(struct fm10k_ring *tx_ring, case IPPROTO_GRE: if (skb->encapsulation) break; + /* fall through */ default: if (unlikely(net_ratelimit())) { dev_warn(tx_ring->dev, From 8bac58be1700dab3cac8cb53ed0651da40777024 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:08 -0700 Subject: [PATCH 1129/1322] fm10k: avoid needless delay when loading driver When we load the driver, we set the last_reset to be in the future, which delays the initial driver reset. Additionally, the service task isn't scheduled to run automatically until the timer runs out. This causes a needless delay of the first reset to begin talking to the switch manager. We can avoid this by simply not setting last_reset and immediately scheduling the service task while in probe. This allows the device to wake up faster, and avoids this delay. 
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 9212b3fa3b62..6c2c4bffaedf 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1800,9 +1800,6 @@ static int fm10k_sw_init(struct fm10k_intfc *interface, netdev->vlan_features |= NETIF_F_HIGHDMA; } - /* delay any future reset requests */ - interface->last_reset = jiffies + (10 * HZ); - /* reset and initialize the hardware so it is in a known state */ err = hw->mac.ops.reset_hw(hw); if (err) { @@ -2079,8 +2076,9 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* enable SR-IOV after registering netdev to enforce PF/VF ordering */ fm10k_iov_configure(pdev, 0); - /* clear the service task disable bit to allow service task to start */ + /* clear the service task disable bit and kick off service task */ clear_bit(__FM10K_SERVICE_DISABLE, interface->state); + fm10k_service_event_schedule(interface); return 0; From 4abf01b43b62525e4f1a20dd1a2bc4a1967d8928 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:09 -0700 Subject: [PATCH 1130/1322] fm10k: simplify reading PFVFLRE register We're doing a really convoluted bitshift and read for the PFVFLRE register. Just reading the PFVFLRE(1), shifting it by 32, then reading PFVFLRE(0) should be sufficient. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index d8356c494f06..dfc88a463735 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -1,5 +1,5 @@ /* Intel(R) Ethernet Switch Host Interface Driver - * Copyright(c) 2013 - 2016 Intel Corporation. + * Copyright(c) 2013 - 2017 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -67,10 +67,8 @@ s32 fm10k_iov_event(struct fm10k_intfc *interface) /* read VFLRE to determine if any VFs have been reset */ do { - vflre = fm10k_read_reg(hw, FM10K_PFVFLRE(0)); + vflre = fm10k_read_reg(hw, FM10K_PFVFLRE(1)); vflre <<= 32; - vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(1)); - vflre = (vflre << 32) | (vflre >> 32); vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(0)); i = iov_data->num_vfs; From d876c1583bb1b7f7264880265b824e88b791aa5d Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:10 -0700 Subject: [PATCH 1131/1322] fm10k: don't loop while resetting VFs due to VFLR event We've always had a really weird looping construction for resetting VFs. We read the VFLRE register and reset the VF if the corresponding bit is set, which makes sense. However we loop continuously until we no longer have any bits left unset. At first this makes sense, as a sort of "keep trying until we succeed" concept. Unfortunately this causes a problem if we happen to surprise remove while this code is executing, because in this case we'll always read all 1s for the VFLRE register. This results in a hard lockup on the CPU because the loop will never terminate. 
Because our own reset function will clear the VFLR event register always, (except when we've lost PCIe link obviously) there is no real reason to loop. In practice, we'll loop over once and find that no VFs are pending anymore. Lets just check once. Since we're clear the notification when we reset there's no benefit to the loop. Additionally, there shouldn't be a race as future VLFRE events should trigger an interrupt. Additionally, we didn't warn or do anything in the looped case anyways. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 24 +++++++++----------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index dfc88a463735..03897720bf0b 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -66,23 +66,21 @@ s32 fm10k_iov_event(struct fm10k_intfc *interface) goto read_unlock; /* read VFLRE to determine if any VFs have been reset */ - do { - vflre = fm10k_read_reg(hw, FM10K_PFVFLRE(1)); - vflre <<= 32; - vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(0)); + vflre = fm10k_read_reg(hw, FM10K_PFVFLRE(1)); + vflre <<= 32; + vflre |= fm10k_read_reg(hw, FM10K_PFVFLRE(0)); - i = iov_data->num_vfs; + i = iov_data->num_vfs; - for (vflre <<= 64 - i; vflre && i--; vflre += vflre) { - struct fm10k_vf_info *vf_info = &iov_data->vf_info[i]; + for (vflre <<= 64 - i; vflre && i--; vflre += vflre) { + struct fm10k_vf_info *vf_info = &iov_data->vf_info[i]; - if (vflre >= 0) - continue; + if (vflre >= 0) + continue; - hw->iov.ops.reset_resources(hw, vf_info); - vf_info->mbx.ops.connect(hw, &vf_info->mbx); - } - } while (i != iov_data->num_vfs); + hw->iov.ops.reset_resources(hw, vf_info); + vf_info->mbx.ops.connect(hw, &vf_info->mbx); + } read_unlock: rcu_read_unlock(); From dd5eede2b711350f684e8510300cb3762a821ae6 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:11 -0700 Subject: [PATCH 1132/1322] fm10k: avoid divide by zero in rare cases when device is resetting It is possible that under rare circumstances the device is undergoing a reset, such as when a PFLR occurs, and the device may be transmitting simultaneously. In this case, we might attempt to divide by zero when finding the proper r_idx. Instead, lets read the num_tx_queues once, and make sure it's non-zero. 
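Pulled out of the flattened diff below for readability (the identifiers are the patch's own; only the comment is added), the change amounts to sampling the queue count once and guarding the modulo:

	int num_tx_queues = READ_ONCE(interface->num_tx_queues);
	unsigned int r_idx = skb->queue_mapping;

	if (!num_tx_queues)
		return NETDEV_TX_BUSY;	/* device is mid-reset; retry later */

	if (r_idx >= num_tx_queues)
		r_idx %= num_tx_queues;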
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index e69d49d91d67..77d495fedced 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -643,9 +643,13 @@ int fm10k_close(struct net_device *netdev) static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev) { struct fm10k_intfc *interface = netdev_priv(dev); + int num_tx_queues = READ_ONCE(interface->num_tx_queues); unsigned int r_idx = skb->queue_mapping; int err; + if (!num_tx_queues) + return NETDEV_TX_BUSY; + if ((skb->protocol == htons(ETH_P_8021Q)) && !skb_vlan_tag_present(skb)) { /* FM10K only supports hardware tagging, any tags in frame @@ -698,8 +702,8 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev) __skb_put(skb, pad_len); } - if (r_idx >= interface->num_tx_queues) - r_idx %= interface->num_tx_queues; + if (r_idx >= num_tx_queues) + r_idx %= num_tx_queues; err = fm10k_xmit_frame_ring(skb, interface->tx_ring[r_idx]); From 65b0a469e9e6d025ce9cc46f6cc94d28abf5561c Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:12 -0700 Subject: [PATCH 1133/1322] fm10k: move fm10k_prepare_for_reset and fm10k_handle_reset A future patch needs these functions defined earlier in the file. Move them closer to above where they will be called. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 58 ++++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 6c2c4bffaedf..41335154d6b1 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -132,35 +132,6 @@ static void fm10k_service_timer(unsigned long data) fm10k_service_event_schedule(interface); } -static void fm10k_detach_subtask(struct fm10k_intfc *interface) -{ - struct net_device *netdev = interface->netdev; - u32 __iomem *hw_addr; - u32 value; - - /* do nothing if device is still present or hw_addr is set */ - if (netif_device_present(netdev) || interface->hw.hw_addr) - return; - - /* check the real address space to see if we've recovered */ - hw_addr = READ_ONCE(interface->uc_addr); - value = readl(hw_addr); - if (~value) { - interface->hw.hw_addr = interface->uc_addr; - netif_device_attach(netdev); - set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags); - netdev_warn(netdev, "PCIe link restored, device now attached\n"); - return; - } - - rtnl_lock(); - - if (netif_running(netdev)) - dev_close(netdev); - - rtnl_unlock(); -} - static void fm10k_prepare_for_reset(struct fm10k_intfc *interface) { struct net_device *netdev = interface->netdev; @@ -270,6 +241,35 @@ reinit_err: return err; } +static void fm10k_detach_subtask(struct fm10k_intfc *interface) +{ + struct net_device *netdev = interface->netdev; + u32 __iomem *hw_addr; + u32 value; + + /* do nothing if device is still present or hw_addr is set */ + if (netif_device_present(netdev) || interface->hw.hw_addr) + return; + + /* check the real address space to see if we've recovered */ + hw_addr = READ_ONCE(interface->uc_addr); + value = readl(hw_addr); + if (~value) { + interface->hw.hw_addr = 
interface->uc_addr; + netif_device_attach(netdev); + set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags); + netdev_warn(netdev, "PCIe link restored, device now attached\n"); + return; + } + + rtnl_lock(); + + if (netif_running(netdev)) + dev_close(netdev); + + rtnl_unlock(); +} + static void fm10k_reinit(struct fm10k_intfc *interface) { int err; From 04914390f5a197da1e042f585e1263ad8ebff632 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:13 -0700 Subject: [PATCH 1134/1322] fm10k: prevent race condition of __FM10K_SERVICE_SCHED Although very unlikely, it is possible that cancel_work_sync() may stop the service_task before it actually started. In this case, the __FM10K_SERVICE_SCHED bit will never be cleared. This results in the service task being unable to reschedule in the future. Add a helper function which sets the service disable bit, waits for the service task to stop and clears the schedule bit, thus avoiding the race condition. We know the schedule bit is safe to clear because the cancel_work_sync() guarantees the service task is not running. Add a helper function also to restart the service task, for symmetry. This is not strictly needed but helps the mental model of how to stop and start the service task. This race could only happen in fm10k_suspend/fm10k_resume as this is the only place where the service task is actually restarted. Thus, suspend/resume testing would be ideal. However, note that the chance of this happening is very slim as the service event is scheduled for immediate execution, and you would have to trigger a suspend at almost the exact same time as the service task was scheduled. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 32 +++++++++++++++----- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 41335154d6b1..9575f7c1862d 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -118,6 +118,27 @@ static void fm10k_service_event_complete(struct fm10k_intfc *interface) fm10k_service_event_schedule(interface); } +static void fm10k_stop_service_event(struct fm10k_intfc *interface) +{ + set_bit(__FM10K_SERVICE_DISABLE, interface->state); + cancel_work_sync(&interface->service_task); + + /* It's possible that cancel_work_sync stopped the service task from + * running before it could actually start. In this case the + * __FM10K_SERVICE_SCHED bit will never be cleared. Since we know that + * the service task cannot be running at this point, we need to clear + * the scheduled bit, as otherwise the service task may never be + * restarted. 
+ */ + clear_bit(__FM10K_SERVICE_SCHED, interface->state); +} + +static void fm10k_start_service_event(struct fm10k_intfc *interface) +{ + clear_bit(__FM10K_SERVICE_DISABLE, interface->state); + fm10k_service_event_schedule(interface); +} + /** * fm10k_service_timer - Timer Call-back * @data: pointer to interface cast into an unsigned long @@ -2116,8 +2137,7 @@ static void fm10k_remove(struct pci_dev *pdev) del_timer_sync(&interface->service_timer); - set_bit(__FM10K_SERVICE_DISABLE, interface->state); - cancel_work_sync(&interface->service_task); + fm10k_stop_service_event(interface); /* free netdev, this may bounce the interrupts due to setup_tc */ if (netdev->reg_state == NETREG_REGISTERED) @@ -2155,8 +2175,7 @@ static void fm10k_prepare_suspend(struct fm10k_intfc *interface) * stopped. We stop the watchdog task until after we resume software * activity. */ - set_bit(__FM10K_SERVICE_DISABLE, interface->state); - cancel_work_sync(&interface->service_task); + fm10k_stop_service_event(interface); fm10k_prepare_for_reset(interface); } @@ -2183,9 +2202,8 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) interface->link_down_event = jiffies + (HZ); set_bit(__FM10K_LINK_DOWN, interface->state); - /* clear the service task disable bit to allow service task to start */ - clear_bit(__FM10K_SERVICE_DISABLE, interface->state); - fm10k_service_event_schedule(interface); + /* restart the service task */ + fm10k_start_service_event(interface); return err; } From 7682e399485fe19622b6fd82510b1f4551e48a25 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 2 Oct 2017 14:06:43 +0200 Subject: [PATCH 1135/1322] ALSA: usx2y: Suppress kernel warning at page allocation failures The usx2y driver allocates the stream read/write buffers in continuous pages depending on the stream setup, and this may spew the kernel warning messages with a stack trace like: WARNING: CPU: 1 PID: 1846 at mm/page_alloc.c:3883 __alloc_pages_slowpath+0x1ef2/0x2d70 Modules linked in: CPU: 1 PID: 1846 Comm: kworker/1:2 Not tainted .... It may confuse user as if it were any serious error, although this is no fatal error and the driver handles the error case gracefully. Since the driver has already some sanity check of the given size (128 and 256 pages), it can't pass any crazy value. So it's merely page fragmentation. This patch adds __GFP_NOWARN to each caller for suppressing such kernel warnings. The original issue was spotted by syzkaller. 
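The resulting call, lifted out of the flattened diff below with a note on why the suppression is safe (identifiers are the driver's own):

	pg = get_order(read_size);
	sk->s = (void *) __get_free_pages(GFP_KERNEL | __GFP_COMP |
					  __GFP_ZERO | __GFP_NOWARN, pg);
	if (!sk->s) {
		/* a higher-order allocation may fail under fragmentation;
		 * the driver copes with that, so the page allocator's
		 * warning and stack trace would only alarm the user
		 */
		snd_printk(KERN_WARNING "couldn't __get_free_pages()\n");
		goto out;
	}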
Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Signed-off-by: Takashi Iwai --- sound/usb/usx2y/usb_stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/usb/usx2y/usb_stream.c b/sound/usb/usx2y/usb_stream.c index 4dab49080700..e229abd21652 100644 --- a/sound/usb/usx2y/usb_stream.c +++ b/sound/usb/usx2y/usb_stream.c @@ -191,7 +191,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, } pg = get_order(read_size); - sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->s) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); goto out; @@ -211,7 +212,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, pg = get_order(write_size); sk->write_page = - (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->write_page) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); usb_stream_free(sk); From 32f16369e59fcc505c5ed93a6a8cad3d5636b463 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2017 10:41:15 +0200 Subject: [PATCH 1136/1322] net/dst: Make skb parameter of skb{metadata_dst, tunnel_info}() const Make the skb parameter of skb_metadata_dst() and skb_tunnel_info() const as they are not modified. This is in preparation for using them in call-sites where skb is const. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a803129a4849..9fba2ebf6dda 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -24,7 +24,7 @@ struct metadata_dst { } u; }; -static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); @@ -34,7 +34,8 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info * +skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; From a38402bc50709aac76796a955a15152a76e3fd4e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2017 10:41:16 +0200 Subject: [PATCH 1137/1322] flow_dissector: dissect tunnel info Move dissection of tunnel info from the flower classifier to the flow dissector where all other dissection occurs. This should not have any behavioural affect on other users of the flow dissector. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 100 ++++++++++++++++++++++++++++++++++++++ net/sched/cls_flower.c | 25 ---------- 2 files changed, 100 insertions(+), 25 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0a977373d003..1f5caafb4492 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +116,102 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static void +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; +} + +static void +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + + /* A quick check to see if there might be something to do. */ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS)) + return; + + info = skb_tunnel_info(skb); + if (!info) + return; + + key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } +} + static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -478,6 +575,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + __skb_flow_dissect_tunnel_info(skb, flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, 
FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d230cb4c8094..db831ac708f6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -152,37 +152,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; - struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); - info = skb_tunnel_info(skb); - if (info) { - struct ip_tunnel_key *key = &info->key; - - switch (ip_tunnel_info_af(info)) { - case AF_INET: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV4_ADDRS; - skb_key.enc_ipv4.src = key->u.ipv4.src; - skb_key.enc_ipv4.dst = key->u.ipv4.dst; - break; - case AF_INET6: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV6_ADDRS; - skb_key.enc_ipv6.src = key->u.ipv6.src; - skb_key.enc_ipv6.dst = key->u.ipv6.dst; - break; - } - - skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); - skb_key.enc_tp.src = key->tp_src; - skb_key.enc_tp.dst = key->tp_dst; - } - skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. From 6227efc1a20b24a21ce8e9122c0569696d644aff Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Oct 2017 12:05:29 +0200 Subject: [PATCH 1138/1322] selftests: rtnetlink.sh: add vxlan and fou test cases fou test lifted from ip-fou man page. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- tools/testing/selftests/net/rtnetlink.sh | 96 ++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index a048f7a5f94c..62c87da92770 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -332,6 +332,101 @@ kci_test_vrf() echo "PASS: vrf" } +kci_test_encap_vxlan() +{ + ret=0 + vxlan="test-vxlan0" + vlan="test-vlan0" + testns="$1" + + ip netns exec "$testns" ip link add "$vxlan" type vxlan id 42 group 239.1.1.1 \ + dev "$devdummy" dstport 4789 2>/dev/null + if [ $? -ne 0 ]; then + echo "FAIL: can't add vxlan interface, skipping test" + return 0 + fi + check_err $? + + ip netns exec "$testns" ip addr add 10.2.11.49/24 dev "$vxlan" + check_err $? + + ip netns exec "$testns" ip link set up dev "$vxlan" + check_err $? + + ip netns exec "$testns" ip link add link "$vxlan" name "$vlan" type vlan id 1 + check_err $? + + ip netns exec "$testns" ip link del "$vxlan" + check_err $? + + if [ $ret -ne 0 ]; then + echo "FAIL: vxlan" + return 1 + fi + echo "PASS: vxlan" +} + +kci_test_encap_fou() +{ + ret=0 + name="test-fou" + testns="$1" + + ip fou help 2>&1 |grep -q 'Usage: ip fou' + if [ $? -ne 0 ];then + echo "SKIP: fou: iproute2 too old" + return 1 + fi + + ip netns exec "$testns" ip fou add port 7777 ipproto 47 2>/dev/null + if [ $? -ne 0 ];then + echo "FAIL: can't add fou port 7777, skipping test" + return 1 + fi + + ip netns exec "$testns" ip fou add port 8888 ipproto 4 + check_err $? + + ip netns exec "$testns" ip fou del port 9999 2>/dev/null + check_fail $? + + ip netns exec "$testns" ip fou del port 7777 + check_err $? + + if [ $ret -ne 0 ]; then + echo "FAIL: fou" + return 1 + fi + + echo "PASS: fou" +} + +# test various encap methods, use netns to avoid unwanted interference +kci_test_encap() +{ + testns="testns" + ret=0 + + ip netns add "$testns" + if [ $? 
-ne 0 ]; then + echo "SKIP encap tests: cannot add net namespace $testns" + return 1 + fi + + ip netns exec "$testns" ip link set lo up + check_err $? + + ip netns exec "$testns" ip link add name "$devdummy" type dummy + check_err $? + ip netns exec "$testns" ip link set "$devdummy" up + check_err $? + + kci_test_encap_vxlan "$testns" + kci_test_encap_fou "$testns" + + ip netns del "$testns" +} + kci_test_rtnl() { kci_add_dummy @@ -348,6 +443,7 @@ kci_test_rtnl() kci_test_addrlabel kci_test_ifalias kci_test_vrf + kci_test_encap kci_del_dummy } From 28a04c7b7bbecaab642fcb6a2d7354eb70ea7fbe Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 2 Oct 2017 12:14:56 +0200 Subject: [PATCH 1139/1322] mlxsw: spectrum_router: Move VRF refcounting When creating a new RIF, bumping RIF count of the containing VR is the last thing to be done. Symmetrically, when destroying a RIF, RIF count is first dropped and only then the rest of the cleanup proceeds. That's a problem for loopback RIFs. Those hold two VR references: one for overlay and one for underlay. mlxsw_sp_rif_destroy() releases the overlay one, and the deconfigure() callback the underlay one. But if both overlay and underlay are the same, and if there are no other artifacts holding the VR alive, this put actually destroys the VR. Later on, when mlxsw_sp_rif_destroy() calls mlxsw_sp_vr_put() for the same VR, the VR will already have been released and the kernel crashes with NULL pointer dereference. The underlying problem is that the RIF under destruction ends up referencing the overlay VR much longer than it claims: all the way until the call to mlxsw_sp_vr_put(). So line up the reference counting properly to reflect this. Make corresponding changes in mlxsw_sp_rif_create() as well for symmetry. Fixes: 6ddb7426a7d4 ("mlxsw: spectrum_router: Introduce loopback RIFs") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2cfb3f5d092d..3917b4dd4202 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5068,6 +5068,7 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN); if (IS_ERR(vr)) return ERR_CAST(vr); + vr->rif_count++; err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index); if (err) @@ -5099,7 +5100,6 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_rif_counters_alloc(rif); mlxsw_sp->router->rifs[rif_index] = rif; - vr->rif_count++; return rif; @@ -5110,6 +5110,7 @@ err_fid_get: kfree(rif); err_rif_alloc: err_rif_index_alloc: + vr->rif_count--; mlxsw_sp_vr_put(vr); return ERR_PTR(err); } @@ -5124,7 +5125,6 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) mlxsw_sp_router_rif_gone_sync(mlxsw_sp, rif); vr = &mlxsw_sp->router->vrs[rif->vr_id]; - vr->rif_count--; mlxsw_sp->router->rifs[rif->rif_index] = NULL; mlxsw_sp_rif_counters_free(rif); ops->deconfigure(rif); @@ -5132,6 +5132,7 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif) /* Loopback RIFs are not associated with a FID. 
*/ mlxsw_sp_fid_put(fid); kfree(rif); + vr->rif_count--; mlxsw_sp_vr_put(vr); } From de0f43c01a4b5d408a5c087c8a92ac1739938f8b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 2 Oct 2017 12:14:57 +0200 Subject: [PATCH 1140/1322] mlxsw: spectrum_router: Track RIF of IPIP next hops When considering whether to set RTNH_F_OFFLOAD flag on an IPv6 route, mlxsw_sp_fib6_entry_offload_set() looks up the mlxsw_sp_nexthop corresponding to a given route, and decides based on whether the next hop's offloaded flag was set. When looking for the matching next hop, it also takes into account the device of the route, which must match next hop's RIF. IPIP next hops however hitherto didn't set the RIF. As a result, IPv6 routes forwarding traffic to IP-in-IP netdevices are never marked as offloaded, even when they actually are. Thus track RIF of IPIP next hops the same way as that of ETHERNET next hops. Fixes: 8f28a3097645 ("mlxsw: spectrum_router: Support IPv6 overlay encap") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 3917b4dd4202..032089efc1a0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2723,6 +2723,7 @@ static void mlxsw_sp_nexthop_type_fini(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_nexthop_rif_fini(nh); break; case MLXSW_SP_NEXTHOP_TYPE_IPIP: + mlxsw_sp_nexthop_rif_fini(nh); mlxsw_sp_nexthop_ipip_fini(mlxsw_sp, nh); break; } @@ -2742,7 +2743,11 @@ static int mlxsw_sp_nexthop4_type_init(struct mlxsw_sp *mlxsw_sp, router->ipip_ops_arr[ipipt]->can_offload(mlxsw_sp, dev, MLXSW_SP_L3_PROTO_IPV4)) { nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; - return mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + err = mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + if (err) + return err; + mlxsw_sp_nexthop_rif_init(nh, &nh->ipip_entry->ol_lb->common); + return 0; } nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; @@ -4009,7 +4014,11 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp, router->ipip_ops_arr[ipipt]->can_offload(mlxsw_sp, dev, MLXSW_SP_L3_PROTO_IPV6)) { nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP; - return mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + err = mlxsw_sp_nexthop_ipip_init(mlxsw_sp, ipipt, nh, dev); + if (err) + return err; + mlxsw_sp_nexthop_rif_init(nh, &nh->ipip_entry->ol_lb->common); + return 0; } nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH; From 7ff176f81dd4675d08951d2395c7ddb1d9974da6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 2 Oct 2017 12:21:57 +0200 Subject: [PATCH 1141/1322] mlxsw: spectrum_router: Fix a typo Signed-off-by: Petr Machata Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index ef4b86b3aa9b..75078a3fbee8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1680,7 +1680,7 @@ __mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp,
 		err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(rauhtd),
 				      rauhtd_pl);
 		if (err) {
-			dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour talbe\n");
+			dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour table\n");
 			break;
 		}
 		num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl);

From 85f44a15b1920babc2d28c11f514f5b217a29968 Mon Sep 17 00:00:00 2001
From: Petr Machata
Date: Mon, 2 Oct 2017 12:21:58 +0200
Subject: [PATCH 1142/1322] mlxsw: spectrum_router: Drop a redundant condition

Signed-off-by: Petr Machata
Reviewed-by: Ido Schimmel
Signed-off-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 75078a3fbee8..58bc04cbbef4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -3251,7 +3251,7 @@ mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry,
 			return;
 		if (mlxsw_sp_fib_entry_should_offload(fib_entry))
 			mlxsw_sp_fib_entry_offload_set(fib_entry);
-		else if (!mlxsw_sp_fib_entry_should_offload(fib_entry))
+		else
 			mlxsw_sp_fib_entry_offload_unset(fib_entry);
 		return;
 	default:

From f2f2efb807d339513199b1bb771806c90cce83ae Mon Sep 17 00:00:00 2001
From: Mika Westerberg
Date: Mon, 2 Oct 2017 13:38:28 +0300
Subject: [PATCH 1143/1322] byteorder: Move {cpu_to_be32, be32_to_cpu}_array() from Thunderbolt to core

We will be using these when communicating XDomain discovery protocol
over Thunderbolt link but they might be useful for other drivers as
well. Make them available through byteorder/generic.h.

Suggested-by: Andy Shevchenko
Signed-off-by: Mika Westerberg
Reviewed-by: Michael Jamet
Reviewed-by: Yehezkel Bernat
Reviewed-by: Andy Shevchenko
Signed-off-by: David S.
Miller --- drivers/thunderbolt/ctl.c | 14 -------------- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index fb40dd0588b9..e6a4c9458c76 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -289,20 +289,6 @@ static void tb_cfg_print_error(struct tb_ctl *ctl, } } -static void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) -{ - int i; - for (i = 0; i < len; i++) - dst[i] = cpu_to_be32(src[i]); -} - -static void be32_to_cpu_array(u32 *dst, __be32 *src, size_t len) -{ - int i; - for (i = 0; i < len; i++) - dst[i] = be32_to_cpu(src[i]); -} - static __be32 tb_crc(const void *data, size_t len) { return cpu_to_be32(~__crc32c_le(~0, data, len)); diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 89f67c1c3160..805d16654459 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -170,4 +170,20 @@ static inline void be64_add_cpu(__be64 *var, u64 val) *var = cpu_to_be64(be64_to_cpu(*var) + val); } +static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = cpu_to_be32(src[i]); +} + +static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) +{ + int i; + + for (i = 0; i < len; i++) + dst[i] = be32_to_cpu(src[i]); +} + #endif /* _LINUX_BYTEORDER_GENERIC_H */ From 806717081ab6088957c4857d42941159a07cc571 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:29 +0300 Subject: [PATCH 1144/1322] thunderbolt: Remove __packed from ICM message structures These messages are all 32-bit aligned and they should be packed without the __packed attribute just fine. It also allows compiler to generate better code on some architectures. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/tb_msgs.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index de6441e4a060..f3adf58a40ce 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -130,7 +130,7 @@ struct icm_pkg_header { u8 flags; u8 packet_id; u8 total_packets; -} __packed; +}; #define ICM_FLAGS_ERROR BIT(0) #define ICM_FLAGS_NO_KEY BIT(1) @@ -139,20 +139,20 @@ struct icm_pkg_header { struct icm_pkg_driver_ready { struct icm_pkg_header hdr; -} __packed; +}; struct icm_pkg_driver_ready_response { struct icm_pkg_header hdr; u8 romver; u8 ramver; u16 security_level; -} __packed; +}; /* Falcon Ridge & Alpine Ridge common messages */ struct icm_fr_pkg_get_topology { struct icm_pkg_header hdr; -} __packed; +}; #define ICM_GET_TOPOLOGY_PACKETS 14 @@ -167,7 +167,7 @@ struct icm_fr_pkg_get_topology_response { u32 reserved[2]; u32 ports[16]; u32 port_hop_info[16]; -} __packed; +}; #define ICM_SWITCH_USED BIT(0) #define ICM_SWITCH_UPSTREAM_PORT_MASK GENMASK(7, 1) @@ -184,7 +184,7 @@ struct icm_fr_event_device_connected { u8 connection_id; u16 link_info; u32 ep_name[55]; -} __packed; +}; #define ICM_LINK_INFO_LINK_MASK 0x7 #define ICM_LINK_INFO_DEPTH_SHIFT 4 @@ -197,13 +197,13 @@ struct icm_fr_pkg_approve_device { u8 connection_key; u8 connection_id; u16 reserved; -} __packed; +}; struct icm_fr_event_device_disconnected { struct icm_pkg_header hdr; u16 reserved; u16 link_info; -} __packed; +}; struct icm_fr_pkg_add_device_key { struct icm_pkg_header hdr; @@ -212,7 +212,7 @@ struct icm_fr_pkg_add_device_key { u8 connection_id; u16 reserved; u32 key[8]; -} __packed; +}; struct icm_fr_pkg_add_device_key_response { struct icm_pkg_header hdr; @@ -220,7 +220,7 @@ struct icm_fr_pkg_add_device_key_response { u8 connection_key; u8 connection_id; u16 reserved; -} __packed; +}; struct icm_fr_pkg_challenge_device { struct icm_pkg_header hdr; @@ -229,7 +229,7 @@ struct icm_fr_pkg_challenge_device { u8 connection_id; u16 reserved; u32 challenge[8]; -} __packed; +}; struct icm_fr_pkg_challenge_device_response { struct icm_pkg_header hdr; @@ -239,7 +239,7 @@ struct icm_fr_pkg_challenge_device_response { u16 reserved; u32 challenge[8]; u32 response[8]; -} __packed; +}; /* Alpine Ridge only messages */ @@ -247,7 +247,7 @@ struct icm_ar_pkg_get_route { struct icm_pkg_header hdr; u16 reserved; u16 link_info; -} __packed; +}; struct icm_ar_pkg_get_route_response { struct icm_pkg_header hdr; @@ -255,6 +255,6 @@ struct icm_ar_pkg_get_route_response { u16 link_info; u32 route_hi; u32 route_lo; -} __packed; +}; #endif From cdae7c07e3e3509eaabc18c1640a55dc5b99c179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:30 +0300 Subject: [PATCH 1145/1322] thunderbolt: Add support for XDomain properties Thunderbolt XDomain discovery protocol uses directories which contain properties and other directories to exchange information about what capabilities the remote host supports. This also includes identification information like device ID and name. This adds support for parsing and formatting these properties and establishes an API drivers can use in addition to the core Thunderbolt driver. This API is exposed in a new header: include/linux/thunderbolt.h. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/Makefile | 2 +- drivers/thunderbolt/property.c | 670 +++++++++++++++++++++++++++++++++ include/linux/thunderbolt.h | 89 +++++ 3 files changed, 760 insertions(+), 1 deletion(-) create mode 100644 drivers/thunderbolt/property.c create mode 100644 include/linux/thunderbolt.h diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index 4900febc6c8a..7afd21f5383a 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -1,3 +1,3 @@ obj-${CONFIG_THUNDERBOLT} := thunderbolt.o thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o -thunderbolt-objs += domain.o dma_port.o icm.o +thunderbolt-objs += domain.o dma_port.o icm.o property.o diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c new file mode 100644 index 000000000000..8fe913a95b4a --- /dev/null +++ b/drivers/thunderbolt/property.c @@ -0,0 +1,670 @@ +/* + * Thunderbolt XDomain property support + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +struct tb_property_entry { + u32 key_hi; + u32 key_lo; + u16 length; + u8 reserved; + u8 type; + u32 value; +}; + +struct tb_property_rootdir_entry { + u32 magic; + u32 length; + struct tb_property_entry entries[]; +}; + +struct tb_property_dir_entry { + u32 uuid[4]; + struct tb_property_entry entries[]; +}; + +#define TB_PROPERTY_ROOTDIR_MAGIC 0x55584401 + +static struct tb_property_dir *__tb_property_parse_dir(const u32 *block, + size_t block_len, unsigned int dir_offset, size_t dir_len, + bool is_root); + +static inline void parse_dwdata(void *dst, const void *src, size_t dwords) +{ + be32_to_cpu_array(dst, src, dwords); +} + +static inline void format_dwdata(void *dst, const void *src, size_t dwords) +{ + cpu_to_be32_array(dst, src, dwords); +} + +static bool tb_property_entry_valid(const struct tb_property_entry *entry, + size_t block_len) +{ + switch (entry->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + case TB_PROPERTY_TYPE_DATA: + case TB_PROPERTY_TYPE_TEXT: + if (entry->length > block_len) + return false; + if (entry->value + entry->length > block_len) + return false; + break; + + case TB_PROPERTY_TYPE_VALUE: + if (entry->length != 1) + return false; + break; + } + + return true; +} + +static bool tb_property_key_valid(const char *key) +{ + return key && strlen(key) <= TB_PROPERTY_KEY_SIZE; +} + +static struct tb_property * +tb_property_alloc(const char *key, enum tb_property_type type) +{ + struct tb_property *property; + + property = kzalloc(sizeof(*property), GFP_KERNEL); + if (!property) + return NULL; + + strcpy(property->key, key); + property->type = type; + INIT_LIST_HEAD(&property->list); + + return property; +} + +static struct tb_property *tb_property_parse(const u32 *block, size_t block_len, + const struct tb_property_entry *entry) +{ + char key[TB_PROPERTY_KEY_SIZE + 1]; + struct tb_property *property; + struct tb_property_dir *dir; + + if (!tb_property_entry_valid(entry, block_len)) + return NULL; + + parse_dwdata(key, entry, 2); + key[TB_PROPERTY_KEY_SIZE] = '\0'; + + property = tb_property_alloc(key, entry->type); + if (!property) + return NULL; + + property->length = entry->length; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + dir = 
__tb_property_parse_dir(block, block_len, entry->value, + entry->length, false); + if (!dir) { + kfree(property); + return NULL; + } + property->value.dir = dir; + break; + + case TB_PROPERTY_TYPE_DATA: + property->value.data = kcalloc(property->length, sizeof(u32), + GFP_KERNEL); + if (!property->value.data) { + kfree(property); + return NULL; + } + parse_dwdata(property->value.data, block + entry->value, + entry->length); + break; + + case TB_PROPERTY_TYPE_TEXT: + property->value.text = kcalloc(property->length, sizeof(u32), + GFP_KERNEL); + if (!property->value.text) { + kfree(property); + return NULL; + } + parse_dwdata(property->value.text, block + entry->value, + entry->length); + /* Force null termination */ + property->value.text[property->length * 4 - 1] = '\0'; + break; + + case TB_PROPERTY_TYPE_VALUE: + property->value.immediate = entry->value; + break; + + default: + property->type = TB_PROPERTY_TYPE_UNKNOWN; + break; + } + + return property; +} + +static struct tb_property_dir *__tb_property_parse_dir(const u32 *block, + size_t block_len, unsigned int dir_offset, size_t dir_len, bool is_root) +{ + const struct tb_property_entry *entries; + size_t i, content_len, nentries; + unsigned int content_offset; + struct tb_property_dir *dir; + + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return NULL; + + if (is_root) { + content_offset = dir_offset + 2; + content_len = dir_len; + } else { + dir->uuid = kmemdup(&block[dir_offset], sizeof(*dir->uuid), + GFP_KERNEL); + content_offset = dir_offset + 4; + content_len = dir_len - 4; /* Length includes UUID */ + } + + entries = (const struct tb_property_entry *)&block[content_offset]; + nentries = content_len / (sizeof(*entries) / 4); + + INIT_LIST_HEAD(&dir->properties); + + for (i = 0; i < nentries; i++) { + struct tb_property *property; + + property = tb_property_parse(block, block_len, &entries[i]); + if (!property) { + tb_property_free_dir(dir); + return NULL; + } + + list_add_tail(&property->list, &dir->properties); + } + + return dir; +} + +/** + * tb_property_parse_dir() - Parses properties from given property block + * @block: Property block to parse + * @block_len: Number of dword elements in the property block + * + * This function parses the XDomain properties data block into format that + * can be traversed using the helper functions provided by this module. + * Upon success returns the parsed directory. In case of error returns + * %NULL. The resulting &struct tb_property_dir needs to be released by + * calling tb_property_free_dir() when not needed anymore. + * + * The @block is expected to be root directory. + */ +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len) +{ + const struct tb_property_rootdir_entry *rootdir = + (const struct tb_property_rootdir_entry *)block; + + if (rootdir->magic != TB_PROPERTY_ROOTDIR_MAGIC) + return NULL; + if (rootdir->length > block_len) + return NULL; + + return __tb_property_parse_dir(block, block_len, 0, rootdir->length, + true); +} + +/** + * tb_property_create_dir() - Creates new property directory + * @uuid: UUID used to identify the particular directory + * + * Creates new, empty property directory. If @uuid is %NULL then the + * directory is assumed to be root directory. 
+ */ +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid) +{ + struct tb_property_dir *dir; + + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return NULL; + + INIT_LIST_HEAD(&dir->properties); + if (uuid) { + dir->uuid = kmemdup(uuid, sizeof(*dir->uuid), GFP_KERNEL); + if (!dir->uuid) { + kfree(dir); + return NULL; + } + } + + return dir; +} +EXPORT_SYMBOL_GPL(tb_property_create_dir); + +static void tb_property_free(struct tb_property *property) +{ + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + tb_property_free_dir(property->value.dir); + break; + + case TB_PROPERTY_TYPE_DATA: + kfree(property->value.data); + break; + + case TB_PROPERTY_TYPE_TEXT: + kfree(property->value.text); + break; + + default: + break; + } + + kfree(property); +} + +/** + * tb_property_free_dir() - Release memory allocated for property directory + * @dir: Directory to release + * + * This will release all the memory the directory occupies including all + * descendants. It is OK to pass %NULL @dir, then the function does + * nothing. + */ +void tb_property_free_dir(struct tb_property_dir *dir) +{ + struct tb_property *property, *tmp; + + if (!dir) + return; + + list_for_each_entry_safe(property, tmp, &dir->properties, list) { + list_del(&property->list); + tb_property_free(property); + } + kfree(dir->uuid); + kfree(dir); +} +EXPORT_SYMBOL_GPL(tb_property_free_dir); + +static size_t tb_property_dir_length(const struct tb_property_dir *dir, + bool recurse, size_t *data_len) +{ + const struct tb_property *property; + size_t len = 0; + + if (dir->uuid) + len += sizeof(*dir->uuid) / 4; + else + len += sizeof(struct tb_property_rootdir_entry) / 4; + + list_for_each_entry(property, &dir->properties, list) { + len += sizeof(struct tb_property_entry) / 4; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + if (recurse) { + len += tb_property_dir_length( + property->value.dir, recurse, data_len); + } + /* Reserve dword padding after each directory */ + if (data_len) + *data_len += 1; + break; + + case TB_PROPERTY_TYPE_DATA: + case TB_PROPERTY_TYPE_TEXT: + if (data_len) + *data_len += property->length; + break; + + default: + break; + } + } + + return len; +} + +static ssize_t __tb_property_format_dir(const struct tb_property_dir *dir, + u32 *block, unsigned int start_offset, size_t block_len) +{ + unsigned int data_offset, dir_end; + const struct tb_property *property; + struct tb_property_entry *entry; + size_t dir_len, data_len = 0; + int ret; + + /* + * The structure of property block looks like following. Leaf + * data/text is included right after the directory and each + * directory follows each other (even nested ones). + * + * +----------+ <-- start_offset + * | header | <-- root directory header + * +----------+ --- + * | entry 0 | -^--------------------. + * +----------+ | | + * | entry 1 | -|--------------------|--. + * +----------+ | | | + * | entry 2 | -|-----------------. | | + * +----------+ | | | | + * : : | dir_len | | | + * . . | | | | + * : : | | | | + * +----------+ | | | | + * | entry n | v | | | + * +----------+ <-- data_offset | | | + * | data 0 | <------------------|--' | + * +----------+ | | + * | data 1 | <------------------|-----' + * +----------+ | + * | 00000000 | padding | + * +----------+ <-- dir_end <------' + * | UUID | <-- directory UUID (child directory) + * +----------+ + * | entry 0 | + * +----------+ + * | entry 1 | + * +----------+ + * : : + * . . 
+ * : : + * +----------+ + * | entry n | + * +----------+ + * | data 0 | + * +----------+ + * + * We use dir_end to hold pointer to the end of the directory. It + * will increase as we add directories and each directory should be + * added starting from previous dir_end. + */ + dir_len = tb_property_dir_length(dir, false, &data_len); + data_offset = start_offset + dir_len; + dir_end = start_offset + data_len + dir_len; + + if (data_offset > dir_end) + return -EINVAL; + if (dir_end > block_len) + return -EINVAL; + + /* Write headers first */ + if (dir->uuid) { + struct tb_property_dir_entry *pe; + + pe = (struct tb_property_dir_entry *)&block[start_offset]; + memcpy(pe->uuid, dir->uuid, sizeof(pe->uuid)); + entry = pe->entries; + } else { + struct tb_property_rootdir_entry *re; + + re = (struct tb_property_rootdir_entry *)&block[start_offset]; + re->magic = TB_PROPERTY_ROOTDIR_MAGIC; + re->length = dir_len - sizeof(*re) / 4; + entry = re->entries; + } + + list_for_each_entry(property, &dir->properties, list) { + const struct tb_property_dir *child; + + format_dwdata(entry, property->key, 2); + entry->type = property->type; + + switch (property->type) { + case TB_PROPERTY_TYPE_DIRECTORY: + child = property->value.dir; + ret = __tb_property_format_dir(child, block, dir_end, + block_len); + if (ret < 0) + return ret; + entry->length = tb_property_dir_length(child, false, + NULL); + entry->value = dir_end; + dir_end = ret; + break; + + case TB_PROPERTY_TYPE_DATA: + format_dwdata(&block[data_offset], property->value.data, + property->length); + entry->length = property->length; + entry->value = data_offset; + data_offset += entry->length; + break; + + case TB_PROPERTY_TYPE_TEXT: + format_dwdata(&block[data_offset], property->value.text, + property->length); + entry->length = property->length; + entry->value = data_offset; + data_offset += entry->length; + break; + + case TB_PROPERTY_TYPE_VALUE: + entry->length = property->length; + entry->value = property->value.immediate; + break; + + default: + break; + } + + entry++; + } + + return dir_end; +} + +/** + * tb_property_format_dir() - Formats directory to the packed XDomain format + * @dir: Directory to format + * @block: Property block where the packed data is placed + * @block_len: Length of the property block + * + * This function formats the directory to the packed format that can be + * then send over the thunderbolt fabric to receiving host. Returns %0 in + * case of success and negative errno on faulure. Passing %NULL in @block + * returns number of entries the block takes. + */ +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len) +{ + ssize_t ret; + + if (!block) { + size_t dir_len, data_len = 0; + + dir_len = tb_property_dir_length(dir, true, &data_len); + return dir_len + data_len; + } + + ret = __tb_property_format_dir(dir, block, 0, block_len); + return ret < 0 ? 
ret : 0; +} + +/** + * tb_property_add_immediate() - Add immediate property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @value: Immediate value to store with the property + */ +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value) +{ + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_VALUE); + if (!property) + return -ENOMEM; + + property->length = 1; + property->value.immediate = value; + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_immediate); + +/** + * tb_property_add_data() - Adds arbitrary data property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @buf: Data buffer to add + * @buflen: Number of bytes in the data buffer + * + * Function takes a copy of @buf and adds it to the directory. + */ +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen) +{ + /* Need to pad to dword boundary */ + size_t size = round_up(buflen, 4); + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_DATA); + if (!property) + return -ENOMEM; + + property->length = size / 4; + property->value.data = kzalloc(size, GFP_KERNEL); + memcpy(property->value.data, buf, buflen); + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_data); + +/** + * tb_property_add_text() - Adds string property to directory + * @parent: Directory to add the property + * @key: Key for the property + * @text: String to add + * + * Function takes a copy of @text and adds it to the directory. + */ +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text) +{ + /* Need to pad to dword boundary */ + size_t size = round_up(strlen(text) + 1, 4); + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_TEXT); + if (!property) + return -ENOMEM; + + property->length = size / 4; + property->value.data = kzalloc(size, GFP_KERNEL); + strcpy(property->value.text, text); + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_text); + +/** + * tb_property_add_dir() - Adds a directory to the parent directory + * @parent: Directory to add the property + * @key: Key for the property + * @dir: Directory to add + */ +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir) +{ + struct tb_property *property; + + if (!tb_property_key_valid(key)) + return -EINVAL; + + property = tb_property_alloc(key, TB_PROPERTY_TYPE_DIRECTORY); + if (!property) + return -ENOMEM; + + property->value.dir = dir; + + list_add_tail(&property->list, &parent->properties); + return 0; +} +EXPORT_SYMBOL_GPL(tb_property_add_dir); + +/** + * tb_property_remove() - Removes property from a parent directory + * @property: Property to remove + * + * Note memory for @property is released as well so it is not allowed to + * touch the object after call to this function. 
+ */ +void tb_property_remove(struct tb_property *property) +{ + list_del(&property->list); + kfree(property); +} +EXPORT_SYMBOL_GPL(tb_property_remove); + +/** + * tb_property_find() - Find a property from a directory + * @dir: Directory where the property is searched + * @key: Key to look for + * @type: Type of the property + * + * Finds and returns property from the given directory. Does not recurse + * into sub-directories. Returns %NULL if the property was not found. + */ +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type) +{ + struct tb_property *property; + + list_for_each_entry(property, &dir->properties, list) { + if (property->type == type && !strcmp(property->key, key)) + return property; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(tb_property_find); + +/** + * tb_property_get_next() - Get next property from directory + * @dir: Directory holding properties + * @prev: Previous property in the directory (%NULL returns the first) + */ +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev) +{ + if (prev) { + if (list_is_last(&prev->list, &dir->properties)) + return NULL; + return list_next_entry(prev, list); + } + return list_first_entry_or_null(&dir->properties, struct tb_property, + list); +} +EXPORT_SYMBOL_GPL(tb_property_get_next); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h new file mode 100644 index 000000000000..96561c1265ae --- /dev/null +++ b/include/linux/thunderbolt.h @@ -0,0 +1,89 @@ +/* + * Thunderbolt service API + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef THUNDERBOLT_H_ +#define THUNDERBOLT_H_ + +#include +#include + +/** + * struct tb_property_dir - XDomain property directory + * @uuid: Directory UUID or %NULL if root directory + * @properties: List of properties in this directory + * + * User needs to provide serialization if needed. + */ +struct tb_property_dir { + const uuid_t *uuid; + struct list_head properties; +}; + +enum tb_property_type { + TB_PROPERTY_TYPE_UNKNOWN = 0x00, + TB_PROPERTY_TYPE_DIRECTORY = 0x44, + TB_PROPERTY_TYPE_DATA = 0x64, + TB_PROPERTY_TYPE_TEXT = 0x74, + TB_PROPERTY_TYPE_VALUE = 0x76, +}; + +#define TB_PROPERTY_KEY_SIZE 8 + +/** + * struct tb_property - XDomain property + * @list: Used to link properties together in a directory + * @key: Key for the property (always terminated). + * @type: Type of the property + * @length: Length of the property data in dwords + * @value: Property value + * + * Users use @type to determine which field in @value is filled. 
+ */ +struct tb_property { + struct list_head list; + char key[TB_PROPERTY_KEY_SIZE + 1]; + enum tb_property_type type; + size_t length; + union { + struct tb_property_dir *dir; + u8 *data; + char *text; + u32 immediate; + } value; +}; + +struct tb_property_dir *tb_property_parse_dir(const u32 *block, + size_t block_len); +ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block, + size_t block_len); +struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid); +void tb_property_free_dir(struct tb_property_dir *dir); +int tb_property_add_immediate(struct tb_property_dir *parent, const char *key, + u32 value); +int tb_property_add_data(struct tb_property_dir *parent, const char *key, + const void *buf, size_t buflen); +int tb_property_add_text(struct tb_property_dir *parent, const char *key, + const char *text); +int tb_property_add_dir(struct tb_property_dir *parent, const char *key, + struct tb_property_dir *dir); +void tb_property_remove(struct tb_property *tb_property); +struct tb_property *tb_property_find(struct tb_property_dir *dir, + const char *key, enum tb_property_type type); +struct tb_property *tb_property_get_next(struct tb_property_dir *dir, + struct tb_property *prev); + +#define tb_property_for_each(dir, property) \ + for (property = tb_property_get_next(dir, NULL); \ + property; \ + property = tb_property_get_next(dir, property)) + +#endif /* THUNDERBOLT_H_ */ From eaf8ff35a345449207ad116e2574c19780ec9a98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:31 +0300 Subject: [PATCH 1146/1322] thunderbolt: Move enum tb_cfg_pkg_type to thunderbolt.h These will be needed by Thunderbolt services when sending and receiving XDomain control messages. While there change TB_CFG_PKG_PREPARE_TO_SLEEP value to be decimal in order to be consistent with other members. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/ctl.h | 1 + drivers/thunderbolt/tb_msgs.h | 17 ----------------- include/linux/thunderbolt.h | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h index 36fd28b1c1c5..d0f21e1e0b8b 100644 --- a/drivers/thunderbolt/ctl.h +++ b/drivers/thunderbolt/ctl.h @@ -8,6 +8,7 @@ #define _TB_CFG #include +#include #include "nhi.h" #include "tb_msgs.h" diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index f3adf58a40ce..f2b2550cd97c 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -15,23 +15,6 @@ #include #include -enum tb_cfg_pkg_type { - TB_CFG_PKG_READ = 1, - TB_CFG_PKG_WRITE = 2, - TB_CFG_PKG_ERROR = 3, - TB_CFG_PKG_NOTIFY_ACK = 4, - TB_CFG_PKG_EVENT = 5, - TB_CFG_PKG_XDOMAIN_REQ = 6, - TB_CFG_PKG_XDOMAIN_RESP = 7, - TB_CFG_PKG_OVERRIDE = 8, - TB_CFG_PKG_RESET = 9, - TB_CFG_PKG_ICM_EVENT = 10, - TB_CFG_PKG_ICM_CMD = 11, - TB_CFG_PKG_ICM_RESP = 12, - TB_CFG_PKG_PREPARE_TO_SLEEP = 0xd, - -}; - enum tb_cfg_space { TB_CFG_HOPS = 0, TB_CFG_PORT = 1, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 96561c1265ae..b512b1e2b4f2 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -1,6 +1,7 @@ /* * Thunderbolt service API * + * Copyright (C) 2014 Andreas Noever * Copyright (C) 2017, Intel Corporation * Authors: Michael Jamet * Mika Westerberg @@ -16,6 +17,22 @@ #include #include +enum tb_cfg_pkg_type { + TB_CFG_PKG_READ = 1, + TB_CFG_PKG_WRITE = 2, + TB_CFG_PKG_ERROR = 3, + TB_CFG_PKG_NOTIFY_ACK = 4, + TB_CFG_PKG_EVENT = 5, + TB_CFG_PKG_XDOMAIN_REQ = 6, + TB_CFG_PKG_XDOMAIN_RESP = 7, + TB_CFG_PKG_OVERRIDE = 8, + TB_CFG_PKG_RESET = 9, + TB_CFG_PKG_ICM_EVENT = 10, + TB_CFG_PKG_ICM_CMD = 11, + TB_CFG_PKG_ICM_RESP = 12, + TB_CFG_PKG_PREPARE_TO_SLEEP = 13, +}; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory From 9e99b9f4d5c36340dabda6d14053195b2a43796b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:32 +0300 Subject: [PATCH 1147/1322] thunderbolt: Move thunderbolt domain structure to thunderbolt.h These are needed by Thunderbolt services so move them to thunderbolt.h to make sure they are available outside of drivers/thunderbolt. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/tb.h | 42 ---------------------------------- include/linux/thunderbolt.h | 45 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index e0deee4f1eb0..2fefe76621ca 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -39,20 +39,6 @@ struct tb_switch_nvm { bool authenticating; }; -/** - * enum tb_security_level - Thunderbolt security level - * @TB_SECURITY_NONE: No security, legacy mode - * @TB_SECURITY_USER: User approval required at minimum - * @TB_SECURITY_SECURE: One time saved key required at minimum - * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) - */ -enum tb_security_level { - TB_SECURITY_NONE, - TB_SECURITY_USER, - TB_SECURITY_SECURE, - TB_SECURITY_DPONLY, -}; - #define TB_SWITCH_KEY_SIZE 32 /* Each physical port contains 2 links on modern controllers */ #define TB_SWITCH_LINKS_PER_PHY_PORT 2 @@ -223,33 +209,6 @@ struct tb_cm_ops { int (*disconnect_pcie_paths)(struct tb *tb); }; -/** - * struct tb - main thunderbolt bus structure - * @dev: Domain device - * @lock: Big lock. Must be held when accessing any struct - * tb_switch / struct tb_port. - * @nhi: Pointer to the NHI structure - * @ctl: Control channel for this domain - * @wq: Ordered workqueue for all domain specific work - * @root_switch: Root switch of this domain - * @cm_ops: Connection manager specific operations vector - * @index: Linux assigned domain number - * @security_level: Current security level - * @privdata: Private connection manager specific data - */ -struct tb { - struct device dev; - struct mutex lock; - struct tb_nhi *nhi; - struct tb_ctl *ctl; - struct workqueue_struct *wq; - struct tb_switch *root_switch; - const struct tb_cm_ops *cm_ops; - int index; - enum tb_security_level security_level; - unsigned long privdata[0]; -}; - static inline void *tb_priv(struct tb *tb) { return (void *)tb->privdata; @@ -368,7 +327,6 @@ static inline int tb_port_write(struct tb_port *port, const void *buffer, struct tb *icm_probe(struct tb_nhi *nhi); struct tb *tb_probe(struct tb_nhi *nhi); -extern struct bus_type tb_bus_type; extern struct device_type tb_domain_type; extern struct device_type tb_switch_type; diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index b512b1e2b4f2..910b1bf92112 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -14,7 +14,9 @@ #ifndef THUNDERBOLT_H_ #define THUNDERBOLT_H_ +#include #include +#include #include enum tb_cfg_pkg_type { @@ -33,6 +35,49 @@ enum tb_cfg_pkg_type { TB_CFG_PKG_PREPARE_TO_SLEEP = 13, }; +/** + * enum tb_security_level - Thunderbolt security level + * @TB_SECURITY_NONE: No security, legacy mode + * @TB_SECURITY_USER: User approval required at minimum + * @TB_SECURITY_SECURE: One time saved key required at minimum + * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB) + */ +enum tb_security_level { + TB_SECURITY_NONE, + TB_SECURITY_USER, + TB_SECURITY_SECURE, + TB_SECURITY_DPONLY, +}; + +/** + * struct tb - main thunderbolt bus structure + * @dev: Domain device + * @lock: Big lock. Must be held when accessing any struct + * tb_switch / struct tb_port. 
+ * @nhi: Pointer to the NHI structure + * @ctl: Control channel for this domain + * @wq: Ordered workqueue for all domain specific work + * @root_switch: Root switch of this domain + * @cm_ops: Connection manager specific operations vector + * @index: Linux assigned domain number + * @security_level: Current security level + * @privdata: Private connection manager specific data + */ +struct tb { + struct device dev; + struct mutex lock; + struct tb_nhi *nhi; + struct tb_ctl *ctl; + struct workqueue_struct *wq; + struct tb_switch *root_switch; + const struct tb_cm_ops *cm_ops; + int index; + enum tb_security_level security_level; + unsigned long privdata[0]; +}; + +extern struct bus_type tb_bus_type; + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory From e69b71f8458b78a2ef44e3d07374a8f46e45123d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:33 +0300 Subject: [PATCH 1148/1322] thunderbolt: Move tb_switch_phy_port_from_link() to thunderbolt.h A Thunderbolt service might need to find the physical port from a link the cable is connected to. For instance networking driver uses this information to generate MAC address according the Apple ThunderboltIP protocol. Move this function to thunderbolt.h and rename it to tb_phy_port_from_link() to reflect the fact that it does not take switch as parameter. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/icm.c | 2 +- drivers/thunderbolt/tb.h | 7 ------- include/linux/thunderbolt.h | 7 +++++++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 53250fc057e1..8c22b91ed040 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -89,7 +89,7 @@ static inline struct tb *icm_to_tb(struct icm *icm) static inline u8 phy_port_from_route(u64 route, u8 depth) { - return tb_switch_phy_port_from_link(route >> ((depth - 1) * 8)); + return tb_phy_port_from_link(route >> ((depth - 1) * 8)); } static inline u8 dual_link_from_link(u8 link) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index 2fefe76621ca..ea21d927bd09 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -40,8 +40,6 @@ struct tb_switch_nvm { }; #define TB_SWITCH_KEY_SIZE 32 -/* Each physical port contains 2 links on modern controllers */ -#define TB_SWITCH_LINKS_PER_PHY_PORT 2 /** * struct tb_switch - a thunderbolt switch @@ -367,11 +365,6 @@ struct tb_switch *tb_switch_find_by_link_depth(struct tb *tb, u8 link, u8 depth); struct tb_switch *tb_switch_find_by_uuid(struct tb *tb, const uuid_t *uuid); -static inline unsigned int tb_switch_phy_port_from_link(unsigned int link) -{ - return (link - 1) / TB_SWITCH_LINKS_PER_PHY_PORT; -} - static inline void tb_switch_put(struct tb_switch *sw) { put_device(&sw->dev); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 910b1bf92112..43b8d1e09341 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -78,6 +78,13 @@ struct tb { extern struct bus_type tb_bus_type; +#define TB_LINKS_PER_PHY_PORT 2 + +static inline unsigned int tb_phy_port_from_link(unsigned int link) +{ + return (link - 1) / TB_LINKS_PER_PHY_PORT; +} + /** * struct tb_property_dir - XDomain property directory * @uuid: Directory UUID or %NULL if root directory From d1ff70241a275133e1a0258b7c23588b122276c8 Mon Sep 17 00:00:00 
2001
From: Mika Westerberg
Date: Mon, 2 Oct 2017 13:38:34 +0300
Subject: [PATCH 1149/1322] thunderbolt: Add support for XDomain discovery protocol

When two hosts are connected over a Thunderbolt cable, there is a
protocol they can use to communicate capabilities supported by the
host. The discovery protocol uses the automatically configured control
channel (ring 0) and is built on top of request/response transactions
using special XDomain primitives provided by the Thunderbolt base
protocol.

The capabilities consist of a root directory block of basic properties
used for identification of the host, and then there can be zero or more
directories, each describing a Thunderbolt service and its
capabilities. Once both sides have discovered what is supported, the
two hosts can set up high-speed DMA paths and transfer data to the
other side using whatever protocol was agreed based on the properties.
The software protocol used to communicate which DMA paths to enable is
service specific.

This patch adds support for the XDomain discovery protocol to the
Thunderbolt bus. We model each remote host connection as a Linux
XDomain device. For each Thunderbolt service found supported on the
XDomain device, we create a Linux Thunderbolt service device which
Thunderbolt service drivers can then bind to based on the protocol
identification information retrieved from the property directory
describing the service.

This code is based on the work done by Amir Levy and Michael Jamet.

Signed-off-by: Michael Jamet
Signed-off-by: Mika Westerberg
Reviewed-by: Yehezkel Bernat
Reviewed-by: Andy Shevchenko
Signed-off-by: David S. Miller
---
 .../ABI/testing/sysfs-bus-thunderbolt | 48 +
 drivers/thunderbolt/Makefile | 2 +-
 drivers/thunderbolt/ctl.c | 11 +-
 drivers/thunderbolt/ctl.h | 2 +-
 drivers/thunderbolt/domain.c | 197 ++-
 drivers/thunderbolt/icm.c | 218 ++-
 drivers/thunderbolt/nhi.h | 2 +
 drivers/thunderbolt/switch.c | 7 +-
 drivers/thunderbolt/tb.h | 39 +-
 drivers/thunderbolt/tb_msgs.h | 123 ++
 drivers/thunderbolt/xdomain.c | 1576 +++++++++++++++++
 include/linux/mod_devicetable.h | 26 +
 include/linux/thunderbolt.h | 242 +++
 scripts/mod/devicetable-offsets.c | 7 +
 scripts/mod/file2alias.c | 25 +
 15 files changed, 2507 insertions(+), 18 deletions(-)
 create mode 100644 drivers/thunderbolt/xdomain.c

diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
index 392bef5bd399..93798c02e28b 100644
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -110,3 +110,51 @@ Description: When new NVM image is written to the non-active NVM
 		is directly the status value from the DMA configuration based
 		mailbox before the device is power cycled. Writing 0 here
 		clears the status.
+
+What:		/sys/bus/thunderbolt/devices/./key
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	This contains name of the property directory the XDomain
+		service exposes. This entry describes the protocol in
+		question. Following directories are already reserved by
+		the Apple XDomain specification:
+
+		network:  IP/ethernet over Thunderbolt
+		targetdm: Target disk mode protocol over Thunderbolt
+		extdisp:  External display mode protocol over Thunderbolt
+
+What:		/sys/bus/thunderbolt/devices/./modalias
+Date:		Jan 2018
+KernelVersion:	4.15
+Contact:	thunderbolt-software@lists.01.org
+Description:	Stores the same MODALIAS value emitted by uevent for
+		the XDomain service.
Format: tbtsvc:kSpNvNrN + +What: /sys/bus/thunderbolt/devices/./prtcid +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains XDomain protocol identifier the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/./prtcvers +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains XDomain protocol version the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/./prtcrevs +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains XDomain software version the XDomain + service supports. + +What: /sys/bus/thunderbolt/devices/./prtcstns +Date: Jan 2018 +KernelVersion: 4.15 +Contact: thunderbolt-software@lists.01.org +Description: This contains XDomain service specific settings as + bitmask. Format: %x diff --git a/drivers/thunderbolt/Makefile b/drivers/thunderbolt/Makefile index 7afd21f5383a..f2f0de27252b 100644 --- a/drivers/thunderbolt/Makefile +++ b/drivers/thunderbolt/Makefile @@ -1,3 +1,3 @@ obj-${CONFIG_THUNDERBOLT} := thunderbolt.o thunderbolt-objs := nhi.o ctl.o tb.o switch.o cap.o path.o tunnel_pci.o eeprom.o -thunderbolt-objs += domain.o dma_port.o icm.o property.o +thunderbolt-objs += domain.o dma_port.o icm.o property.o xdomain.o diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index e6a4c9458c76..46e393c5fd1d 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -368,10 +368,10 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len, /** * tb_ctl_handle_event() - acknowledge a plug event, invoke ctl->callback */ -static void tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, +static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, struct ctl_pkg *pkg, size_t size) { - ctl->callback(ctl->callback_data, type, pkg->buffer, size); + return ctl->callback(ctl->callback_data, type, pkg->buffer, size); } static void tb_ctl_rx_submit(struct ctl_pkg *pkg) @@ -444,6 +444,8 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame, break; case TB_CFG_PKG_EVENT: + case TB_CFG_PKG_XDOMAIN_RESP: + case TB_CFG_PKG_XDOMAIN_REQ: if (*(__be32 *)(pkg->buffer + frame->size) != crc32) { tb_ctl_err(pkg->ctl, "RX: checksum mismatch, dropping packet\n"); @@ -451,8 +453,9 @@ static void tb_ctl_rx_callback(struct tb_ring *ring, struct ring_frame *frame, } /* Fall through */ case TB_CFG_PKG_ICM_EVENT: - tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size); - goto rx; + if (tb_ctl_handle_event(pkg->ctl, frame->eof, pkg, frame->size)) + goto rx; + break; default: break; diff --git a/drivers/thunderbolt/ctl.h b/drivers/thunderbolt/ctl.h index d0f21e1e0b8b..85c49dd301ea 100644 --- a/drivers/thunderbolt/ctl.h +++ b/drivers/thunderbolt/ctl.h @@ -16,7 +16,7 @@ /* control channel */ struct tb_ctl; -typedef void (*event_cb)(void *data, enum tb_cfg_pkg_type type, +typedef bool (*event_cb)(void *data, enum tb_cfg_pkg_type type, const void *buf, size_t size); struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data); diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c index 9f2dcd48974d..9b90115319ce 100644 --- a/drivers/thunderbolt/domain.c +++ b/drivers/thunderbolt/domain.c @@ -20,6 +20,98 @@ static DEFINE_IDA(tb_domain_ida); +static bool match_service_id(const struct tb_service_id *id, + const struct tb_service *svc) +{ + if (id->match_flags & TBSVC_MATCH_PROTOCOL_KEY) { + if 
(strcmp(id->protocol_key, svc->key)) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_ID) { + if (id->protocol_id != svc->prtcid) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { + if (id->protocol_version != svc->prtcvers) + return false; + } + + if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { + if (id->protocol_revision != svc->prtcrevs) + return false; + } + + return true; +} + +static const struct tb_service_id *__tb_service_match(struct device *dev, + struct device_driver *drv) +{ + struct tb_service_driver *driver; + const struct tb_service_id *ids; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return NULL; + + driver = container_of(drv, struct tb_service_driver, driver); + if (!driver->id_table) + return NULL; + + for (ids = driver->id_table; ids->match_flags != 0; ids++) { + if (match_service_id(ids, svc)) + return ids; + } + + return NULL; +} + +static int tb_service_match(struct device *dev, struct device_driver *drv) +{ + return !!__tb_service_match(dev, drv); +} + +static int tb_service_probe(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tb_service_driver *driver; + const struct tb_service_id *id; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + id = __tb_service_match(dev, &driver->driver); + + return driver->probe(svc, id); +} + +static int tb_service_remove(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tb_service_driver *driver; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + if (driver->remove) + driver->remove(svc); + + return 0; +} + +static void tb_service_shutdown(struct device *dev) +{ + struct tb_service_driver *driver; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc || !dev->driver) + return; + + driver = container_of(dev->driver, struct tb_service_driver, driver); + if (driver->shutdown) + driver->shutdown(svc); +} + static const char * const tb_security_names[] = { [TB_SECURITY_NONE] = "none", [TB_SECURITY_USER] = "user", @@ -52,6 +144,10 @@ static const struct attribute_group *domain_attr_groups[] = { struct bus_type tb_bus_type = { .name = "thunderbolt", + .match = tb_service_match, + .probe = tb_service_probe, + .remove = tb_service_remove, + .shutdown = tb_service_shutdown, }; static void tb_domain_release(struct device *dev) @@ -128,17 +224,26 @@ err_free: return NULL; } -static void tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type, +static bool tb_domain_event_cb(void *data, enum tb_cfg_pkg_type type, const void *buf, size_t size) { struct tb *tb = data; if (!tb->cm_ops->handle_event) { tb_warn(tb, "domain does not have event handler\n"); - return; + return true; } - tb->cm_ops->handle_event(tb, type, buf, size); + switch (type) { + case TB_CFG_PKG_XDOMAIN_REQ: + case TB_CFG_PKG_XDOMAIN_RESP: + return tb_xdomain_handle_request(tb, type, buf, size); + + default: + tb->cm_ops->handle_event(tb, type, buf, size); + } + + return true; } /** @@ -443,9 +548,92 @@ int tb_domain_disconnect_pcie_paths(struct tb *tb) return tb->cm_ops->disconnect_pcie_paths(tb); } +/** + * tb_domain_approve_xdomain_paths() - Enable DMA paths for XDomain + * @tb: Domain enabling the DMA paths + * @xd: XDomain DMA paths are created to + * + * Calls connection manager specific method to enable DMA paths to the + * XDomain in question. + * + * Return: 0% in case of success and negative errno otherwise. 
In + * particular returns %-ENOTSUPP if the connection manager + * implementation does not support XDomains. + */ +int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + if (!tb->cm_ops->approve_xdomain_paths) + return -ENOTSUPP; + + return tb->cm_ops->approve_xdomain_paths(tb, xd); +} + +/** + * tb_domain_disconnect_xdomain_paths() - Disable DMA paths for XDomain + * @tb: Domain disabling the DMA paths + * @xd: XDomain whose DMA paths are disconnected + * + * Calls connection manager specific method to disconnect DMA paths to + * the XDomain in question. + * + * Return: 0% in case of success and negative errno otherwise. In + * particular returns %-ENOTSUPP if the connection manager + * implementation does not support XDomains. + */ +int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + if (!tb->cm_ops->disconnect_xdomain_paths) + return -ENOTSUPP; + + return tb->cm_ops->disconnect_xdomain_paths(tb, xd); +} + +static int disconnect_xdomain(struct device *dev, void *data) +{ + struct tb_xdomain *xd; + struct tb *tb = data; + int ret = 0; + + xd = tb_to_xdomain(dev); + if (xd && xd->tb == tb) + ret = tb_xdomain_disable_paths(xd); + + return ret; +} + +/** + * tb_domain_disconnect_all_paths() - Disconnect all paths for the domain + * @tb: Domain whose paths are disconnected + * + * This function can be used to disconnect all paths (PCIe, XDomain) for + * example in preparation for host NVM firmware upgrade. After this is + * called the paths cannot be established without resetting the switch. + * + * Return: %0 in case of success and negative errno otherwise. + */ +int tb_domain_disconnect_all_paths(struct tb *tb) +{ + int ret; + + ret = tb_domain_disconnect_pcie_paths(tb); + if (ret) + return ret; + + return bus_for_each_dev(&tb_bus_type, NULL, tb, disconnect_xdomain); +} + int tb_domain_init(void) { - return bus_register(&tb_bus_type); + int ret; + + ret = tb_xdomain_init(); + if (ret) + return ret; + ret = bus_register(&tb_bus_type); + if (ret) + tb_xdomain_exit(); + + return ret; } void tb_domain_exit(void) @@ -453,4 +641,5 @@ void tb_domain_exit(void) bus_unregister(&tb_bus_type); ida_destroy(&tb_domain_ida); tb_switch_exit(); + tb_xdomain_exit(); } diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 8c22b91ed040..ab02d13f40b7 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -60,6 +60,8 @@ * @get_route: Find a route string for given switch * @device_connected: Handle device connected ICM message * @device_disconnected: Handle device disconnected ICM message + * @xdomain_connected - Handle XDomain connected ICM message + * @xdomain_disconnected - Handle XDomain disconnected ICM message */ struct icm { struct mutex request_lock; @@ -74,6 +76,10 @@ struct icm { const struct icm_pkg_header *hdr); void (*device_disconnected)(struct tb *tb, const struct icm_pkg_header *hdr); + void (*xdomain_connected)(struct tb *tb, + const struct icm_pkg_header *hdr); + void (*xdomain_disconnected)(struct tb *tb, + const struct icm_pkg_header *hdr); }; struct icm_notification { @@ -89,7 +95,10 @@ static inline struct tb *icm_to_tb(struct icm *icm) static inline u8 phy_port_from_route(u64 route, u8 depth) { - return tb_phy_port_from_link(route >> ((depth - 1) * 8)); + u8 link; + + link = depth ? 
route >> ((depth - 1) * 8) : route; + return tb_phy_port_from_link(link); } static inline u8 dual_link_from_link(u8 link) @@ -320,6 +329,51 @@ static int icm_fr_challenge_switch_key(struct tb *tb, struct tb_switch *sw, return 0; } +static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + struct icm_fr_pkg_approve_xdomain_response reply; + struct icm_fr_pkg_approve_xdomain request; + int ret; + + memset(&request, 0, sizeof(request)); + request.hdr.code = ICM_APPROVE_XDOMAIN; + request.link_info = xd->depth << ICM_LINK_INFO_DEPTH_SHIFT | xd->link; + memcpy(&request.remote_uuid, xd->remote_uuid, sizeof(*xd->remote_uuid)); + + request.transmit_path = xd->transmit_path; + request.transmit_ring = xd->transmit_ring; + request.receive_path = xd->receive_path; + request.receive_ring = xd->receive_ring; + + memset(&reply, 0, sizeof(reply)); + ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply), + 1, ICM_TIMEOUT); + if (ret) + return ret; + + if (reply.hdr.flags & ICM_FLAGS_ERROR) + return -EIO; + + return 0; +} + +static int icm_fr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd) +{ + u8 phy_port; + u8 cmd; + + phy_port = tb_phy_port_from_link(xd->link); + if (phy_port == 0) + cmd = NHI_MAILBOX_DISCONNECT_PA; + else + cmd = NHI_MAILBOX_DISCONNECT_PB; + + nhi_mailbox_cmd(tb->nhi, cmd, 1); + usleep_range(10, 50); + nhi_mailbox_cmd(tb->nhi, cmd, 2); + return 0; +} + static void remove_switch(struct tb_switch *sw) { struct tb_switch *parent_sw; @@ -475,6 +529,141 @@ icm_fr_device_disconnected(struct tb *tb, const struct icm_pkg_header *hdr) tb_switch_put(sw); } +static void remove_xdomain(struct tb_xdomain *xd) +{ + struct tb_switch *sw; + + sw = tb_to_switch(xd->dev.parent); + tb_port_at(xd->route, sw)->xdomain = NULL; + tb_xdomain_remove(xd); +} + +static void +icm_fr_xdomain_connected(struct tb *tb, const struct icm_pkg_header *hdr) +{ + const struct icm_fr_event_xdomain_connected *pkg = + (const struct icm_fr_event_xdomain_connected *)hdr; + struct tb_xdomain *xd; + struct tb_switch *sw; + u8 link, depth; + bool approved; + u64 route; + + /* + * After NVM upgrade adding root switch device fails because we + * initiated reset. During that time ICM might still send + * XDomain connected message which we ignore here. + */ + if (!tb->root_switch) + return; + + link = pkg->link_info & ICM_LINK_INFO_LINK_MASK; + depth = (pkg->link_info & ICM_LINK_INFO_DEPTH_MASK) >> + ICM_LINK_INFO_DEPTH_SHIFT; + approved = pkg->link_info & ICM_LINK_INFO_APPROVED; + + if (link > ICM_MAX_LINK || depth > ICM_MAX_DEPTH) { + tb_warn(tb, "invalid topology %u.%u, ignoring\n", link, depth); + return; + } + + route = get_route(pkg->local_route_hi, pkg->local_route_lo); + + xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid); + if (xd) { + u8 xd_phy_port, phy_port; + + xd_phy_port = phy_port_from_route(xd->route, xd->depth); + phy_port = phy_port_from_route(route, depth); + + if (xd->depth == depth && xd_phy_port == phy_port) { + xd->link = link; + xd->route = route; + xd->is_unplugged = false; + tb_xdomain_put(xd); + return; + } + + /* + * If we find an existing XDomain connection remove it + * now. We need to go through login handshake and + * everything anyway to be able to re-establish the + * connection. + */ + remove_xdomain(xd); + tb_xdomain_put(xd); + } + + /* + * Look if there already exists an XDomain in the same place + * than the new one and in that case remove it because it is + * most likely another host that got disconnected. 
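The link/depth pair decoded above comes straight from the ICM link_info field, while the route string packs one downstream port number per byte per depth level (compare tb_downstream_route() in tb.h). A small illustrative helper equivalent to the fixed phy_port_from_route() above; the name port_from_route is made up for this sketch:

static u8 port_from_route(u64 route, u8 depth)
{
	/*
	 * Byte (depth - 1) of the route holds the port leading to a device
	 * at that depth; depth 0 is handled separately so the shift count
	 * never goes negative.
	 */
	return depth ? route >> ((depth - 1) * 8) : route;
}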
+ */ + xd = tb_xdomain_find_by_link_depth(tb, link, depth); + if (!xd) { + u8 dual_link; + + dual_link = dual_link_from_link(link); + if (dual_link) + xd = tb_xdomain_find_by_link_depth(tb, dual_link, + depth); + } + if (xd) { + remove_xdomain(xd); + tb_xdomain_put(xd); + } + + /* + * If the user disconnected a switch during suspend and + * connected another host to the same port, remove the switch + * first. + */ + sw = get_switch_at_route(tb->root_switch, route); + if (sw) + remove_switch(sw); + + sw = tb_switch_find_by_link_depth(tb, link, depth); + if (!sw) { + tb_warn(tb, "no switch exists at %u.%u, ignoring\n", link, + depth); + return; + } + + xd = tb_xdomain_alloc(sw->tb, &sw->dev, route, + &pkg->local_uuid, &pkg->remote_uuid); + if (!xd) { + tb_switch_put(sw); + return; + } + + xd->link = link; + xd->depth = depth; + + tb_port_at(route, sw)->xdomain = xd; + + tb_xdomain_add(xd); + tb_switch_put(sw); +} + +static void +icm_fr_xdomain_disconnected(struct tb *tb, const struct icm_pkg_header *hdr) +{ + const struct icm_fr_event_xdomain_disconnected *pkg = + (const struct icm_fr_event_xdomain_disconnected *)hdr; + struct tb_xdomain *xd; + + /* + * If the connection is through one or multiple devices, the + * XDomain device is removed along with them so it is fine if we + * cannot find it here. + */ + xd = tb_xdomain_find_by_uuid(tb, &pkg->remote_uuid); + if (xd) { + remove_xdomain(xd); + tb_xdomain_put(xd); + } +} + static struct pci_dev *get_upstream_port(struct pci_dev *pdev) { struct pci_dev *parent; @@ -594,6 +783,12 @@ static void icm_handle_notification(struct work_struct *work) case ICM_EVENT_DEVICE_DISCONNECTED: icm->device_disconnected(tb, n->pkg); break; + case ICM_EVENT_XDOMAIN_CONNECTED: + icm->xdomain_connected(tb, n->pkg); + break; + case ICM_EVENT_XDOMAIN_DISCONNECTED: + icm->xdomain_disconnected(tb, n->pkg); + break; } mutex_unlock(&tb->lock); @@ -927,6 +1122,10 @@ static void icm_unplug_children(struct tb_switch *sw) if (tb_is_upstream_port(port)) continue; + if (port->xdomain) { + port->xdomain->is_unplugged = true; + continue; + } if (!port->remote) continue; @@ -943,6 +1142,13 @@ static void icm_free_unplugged_children(struct tb_switch *sw) if (tb_is_upstream_port(port)) continue; + + if (port->xdomain && port->xdomain->is_unplugged) { + tb_xdomain_remove(port->xdomain); + port->xdomain = NULL; + continue; + } + if (!port->remote) continue; @@ -1009,8 +1215,10 @@ static int icm_start(struct tb *tb) tb->root_switch->no_nvm_upgrade = x86_apple_machine; ret = tb_switch_add(tb->root_switch); - if (ret) + if (ret) { tb_switch_put(tb->root_switch); + tb->root_switch = NULL; + } return ret; } @@ -1042,6 +1250,8 @@ static const struct tb_cm_ops icm_fr_ops = { .add_switch_key = icm_fr_add_switch_key, .challenge_switch_key = icm_fr_challenge_switch_key, .disconnect_pcie_paths = icm_disconnect_pcie_paths, + .approve_xdomain_paths = icm_fr_approve_xdomain_paths, + .disconnect_xdomain_paths = icm_fr_disconnect_xdomain_paths, }; struct tb *icm_probe(struct tb_nhi *nhi) @@ -1064,6 +1274,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) icm->get_route = icm_fr_get_route; icm->device_connected = icm_fr_device_connected; icm->device_disconnected = icm_fr_device_disconnected; + icm->xdomain_connected = icm_fr_xdomain_connected; + icm->xdomain_disconnected = icm_fr_xdomain_disconnected; tb->cm_ops = &icm_fr_ops; break; @@ -1077,6 +1289,8 @@ struct tb *icm_probe(struct tb_nhi *nhi) icm->get_route = icm_ar_get_route; icm->device_connected = icm_fr_device_connected; icm->device_disconnected = 
icm_fr_device_disconnected; + icm->xdomain_connected = icm_fr_xdomain_connected; + icm->xdomain_disconnected = icm_fr_xdomain_disconnected; tb->cm_ops = &icm_fr_ops; break; } diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 5b5bb2c436be..0e05828983db 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -157,6 +157,8 @@ enum nhi_mailbox_cmd { NHI_MAILBOX_SAVE_DEVS = 0x05, NHI_MAILBOX_DISCONNECT_PCIE_PATHS = 0x06, NHI_MAILBOX_DRV_UNLOADS = 0x07, + NHI_MAILBOX_DISCONNECT_PA = 0x10, + NHI_MAILBOX_DISCONNECT_PB = 0x11, NHI_MAILBOX_ALLOW_ALL_DEVS = 0x23, }; diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 53f40c57df59..dfc357d33e1e 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -171,11 +171,11 @@ static int nvm_authenticate_host(struct tb_switch *sw) /* * Root switch NVM upgrade requires that we disconnect the - * existing PCIe paths first (in case it is not in safe mode + * existing paths first (in case it is not in safe mode * already). */ if (!sw->safe_mode) { - ret = tb_domain_disconnect_pcie_paths(sw->tb); + ret = tb_domain_disconnect_all_paths(sw->tb); if (ret) return ret; /* @@ -1363,6 +1363,9 @@ void tb_switch_remove(struct tb_switch *sw) if (sw->ports[i].remote) tb_switch_remove(sw->ports[i].remote->sw); sw->ports[i].remote = NULL; + if (sw->ports[i].xdomain) + tb_xdomain_remove(sw->ports[i].xdomain); + sw->ports[i].xdomain = NULL; } if (!sw->is_unplugged) diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index ea21d927bd09..74af9d4929ab 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -9,6 +9,7 @@ #include #include +#include #include #include "tb_regs.h" @@ -109,14 +110,25 @@ struct tb_switch { /** * struct tb_port - a thunderbolt port, part of a tb_switch + * @config: Cached port configuration read from registers + * @sw: Switch the port belongs to + * @remote: Remote port (%NULL if not connected) + * @xdomain: Remote host (%NULL if not connected) + * @cap_phy: Offset, zero if not found + * @port: Port number on switch + * @disabled: Disabled by eeprom + * @dual_link_port: If the switch is connected using two ports, points + * to the other port. + * @link_nr: Is this primary or secondary port on the dual_link. 
*/ struct tb_port { struct tb_regs_port_header config; struct tb_switch *sw; - struct tb_port *remote; /* remote port, NULL if not connected */ - int cap_phy; /* offset, zero if not found */ - u8 port; /* port number on switch */ - bool disabled; /* disabled by eeprom */ + struct tb_port *remote; + struct tb_xdomain *xdomain; + int cap_phy; + u8 port; + bool disabled; struct tb_port *dual_link_port; u8 link_nr:1; }; @@ -189,6 +201,8 @@ struct tb_path { * @add_switch_key: Add key to switch * @challenge_switch_key: Challenge switch using key * @disconnect_pcie_paths: Disconnects PCIe paths before NVM update + * @approve_xdomain_paths: Approve (establish) XDomain DMA paths + * @disconnect_xdomain_paths: Disconnect XDomain DMA paths */ struct tb_cm_ops { int (*driver_ready)(struct tb *tb); @@ -205,6 +219,8 @@ struct tb_cm_ops { int (*challenge_switch_key)(struct tb *tb, struct tb_switch *sw, const u8 *challenge, u8 *response); int (*disconnect_pcie_paths)(struct tb *tb); + int (*approve_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd); + int (*disconnect_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd); }; static inline void *tb_priv(struct tb *tb) @@ -331,6 +347,8 @@ extern struct device_type tb_switch_type; int tb_domain_init(void); void tb_domain_exit(void); void tb_switch_exit(void); +int tb_xdomain_init(void); +void tb_xdomain_exit(void); struct tb *tb_domain_alloc(struct tb_nhi *nhi, size_t privsize); int tb_domain_add(struct tb *tb); @@ -343,6 +361,9 @@ int tb_domain_approve_switch(struct tb *tb, struct tb_switch *sw); int tb_domain_approve_switch_key(struct tb *tb, struct tb_switch *sw); int tb_domain_challenge_switch_key(struct tb *tb, struct tb_switch *sw); int tb_domain_disconnect_pcie_paths(struct tb *tb); +int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd); +int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd); +int tb_domain_disconnect_all_paths(struct tb *tb); static inline void tb_domain_put(struct tb *tb) { @@ -422,4 +443,14 @@ static inline u64 tb_downstream_route(struct tb_port *port) | ((u64) port->port << (port->sw->config.depth * 8)); } +bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type, + const void *buf, size_t size); +struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent, + u64 route, const uuid_t *local_uuid, + const uuid_t *remote_uuid); +void tb_xdomain_add(struct tb_xdomain *xd); +void tb_xdomain_remove(struct tb_xdomain *xd); +struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link, + u8 depth); + #endif diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h index f2b2550cd97c..b0a092baa605 100644 --- a/drivers/thunderbolt/tb_msgs.h +++ b/drivers/thunderbolt/tb_msgs.h @@ -101,11 +101,14 @@ enum icm_pkg_code { ICM_CHALLENGE_DEVICE = 0x5, ICM_ADD_DEVICE_KEY = 0x6, ICM_GET_ROUTE = 0xa, + ICM_APPROVE_XDOMAIN = 0x10, }; enum icm_event_code { ICM_EVENT_DEVICE_CONNECTED = 3, ICM_EVENT_DEVICE_DISCONNECTED = 4, + ICM_EVENT_XDOMAIN_CONNECTED = 6, + ICM_EVENT_XDOMAIN_DISCONNECTED = 7, }; struct icm_pkg_header { @@ -188,6 +191,25 @@ struct icm_fr_event_device_disconnected { u16 link_info; }; +struct icm_fr_event_xdomain_connected { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + uuid_t local_uuid; + u32 local_route_hi; + u32 local_route_lo; + u32 remote_route_hi; + u32 remote_route_lo; +}; + +struct icm_fr_event_xdomain_disconnected { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t 
remote_uuid; +}; + struct icm_fr_pkg_add_device_key { struct icm_pkg_header hdr; uuid_t ep_uuid; @@ -224,6 +246,28 @@ struct icm_fr_pkg_challenge_device_response { u32 response[8]; }; +struct icm_fr_pkg_approve_xdomain { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; +}; + +struct icm_fr_pkg_approve_xdomain_response { + struct icm_pkg_header hdr; + u16 reserved; + u16 link_info; + uuid_t remote_uuid; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; +}; + /* Alpine Ridge only messages */ struct icm_ar_pkg_get_route { @@ -240,4 +284,83 @@ struct icm_ar_pkg_get_route_response { u32 route_lo; }; +/* XDomain messages */ + +struct tb_xdomain_header { + u32 route_hi; + u32 route_lo; + u32 length_sn; +}; + +#define TB_XDOMAIN_LENGTH_MASK GENMASK(5, 0) +#define TB_XDOMAIN_SN_MASK GENMASK(28, 27) +#define TB_XDOMAIN_SN_SHIFT 27 + +enum tb_xdp_type { + UUID_REQUEST_OLD = 1, + UUID_RESPONSE = 2, + PROPERTIES_REQUEST, + PROPERTIES_RESPONSE, + PROPERTIES_CHANGED_REQUEST, + PROPERTIES_CHANGED_RESPONSE, + ERROR_RESPONSE, + UUID_REQUEST = 12, +}; + +struct tb_xdp_header { + struct tb_xdomain_header xd_hdr; + uuid_t uuid; + u32 type; +}; + +struct tb_xdp_properties { + struct tb_xdp_header hdr; + uuid_t src_uuid; + uuid_t dst_uuid; + u16 offset; + u16 reserved; +}; + +struct tb_xdp_properties_response { + struct tb_xdp_header hdr; + uuid_t src_uuid; + uuid_t dst_uuid; + u16 offset; + u16 data_length; + u32 generation; + u32 data[0]; +}; + +/* + * Max length of data array single XDomain property response is allowed + * to carry. + */ +#define TB_XDP_PROPERTIES_MAX_DATA_LENGTH \ + (((256 - 4 - sizeof(struct tb_xdp_properties_response))) / 4) + +/* Maximum size of the total property block in dwords we allow */ +#define TB_XDP_PROPERTIES_MAX_LENGTH 500 + +struct tb_xdp_properties_changed { + struct tb_xdp_header hdr; + uuid_t src_uuid; +}; + +struct tb_xdp_properties_changed_response { + struct tb_xdp_header hdr; +}; + +enum tb_xdp_error { + ERROR_SUCCESS, + ERROR_UNKNOWN_PACKET, + ERROR_UNKNOWN_DOMAIN, + ERROR_NOT_SUPPORTED, + ERROR_NOT_READY, +}; + +struct tb_xdp_error_response { + struct tb_xdp_header hdr; + u32 error; +}; + #endif diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c new file mode 100644 index 000000000000..f2d06f6f7be9 --- /dev/null +++ b/drivers/thunderbolt/xdomain.c @@ -0,0 +1,1576 @@ +/* + * Thunderbolt XDomain discovery protocol support + * + * Copyright (C) 2017, Intel Corporation + * Authors: Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "tb.h" + +#define XDOMAIN_DEFAULT_TIMEOUT 5000 /* ms */ +#define XDOMAIN_PROPERTIES_RETRIES 60 +#define XDOMAIN_PROPERTIES_CHANGED_RETRIES 10 + +struct xdomain_request_work { + struct work_struct work; + struct tb_xdp_header *pkg; + struct tb *tb; +}; + +/* Serializes access to the properties and protocol handlers below */ +static DEFINE_MUTEX(xdomain_lock); + +/* Properties exposed to the remote domains */ +static struct tb_property_dir *xdomain_property_dir; +static u32 *xdomain_property_block; +static u32 xdomain_property_block_len; +static u32 xdomain_property_block_gen; + +/* Additional protocol handlers */ +static LIST_HEAD(protocol_handlers); + +/* UUID for XDomain discovery protocol: b638d70e-42ff-40bb-97c2-90e2c0b2ff07 */ +static const uuid_t tb_xdp_uuid = + UUID_INIT(0xb638d70e, 0x42ff, 0x40bb, + 0x97, 0xc2, 0x90, 0xe2, 0xc0, 0xb2, 0xff, 0x07); + +static bool tb_xdomain_match(const struct tb_cfg_request *req, + const struct ctl_pkg *pkg) +{ + switch (pkg->frame.eof) { + case TB_CFG_PKG_ERROR: + return true; + + case TB_CFG_PKG_XDOMAIN_RESP: { + const struct tb_xdp_header *res_hdr = pkg->buffer; + const struct tb_xdp_header *req_hdr = req->request; + u8 req_seq, res_seq; + + if (pkg->frame.size < req->response_size / 4) + return false; + + /* Make sure route matches */ + if ((res_hdr->xd_hdr.route_hi & ~BIT(31)) != + req_hdr->xd_hdr.route_hi) + return false; + if ((res_hdr->xd_hdr.route_lo) != req_hdr->xd_hdr.route_lo) + return false; + + /* Then check that the sequence number matches */ + res_seq = res_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK; + res_seq >>= TB_XDOMAIN_SN_SHIFT; + req_seq = req_hdr->xd_hdr.length_sn & TB_XDOMAIN_SN_MASK; + req_seq >>= TB_XDOMAIN_SN_SHIFT; + if (res_seq != req_seq) + return false; + + /* Check that the XDomain protocol matches */ + if (!uuid_equal(&res_hdr->uuid, &req_hdr->uuid)) + return false; + + return true; + } + + default: + return false; + } +} + +static bool tb_xdomain_copy(struct tb_cfg_request *req, + const struct ctl_pkg *pkg) +{ + memcpy(req->response, pkg->buffer, req->response_size); + req->result.err = 0; + return true; +} + +static void response_ready(void *data) +{ + tb_cfg_request_put(data); +} + +static int __tb_xdomain_response(struct tb_ctl *ctl, const void *response, + size_t size, enum tb_cfg_pkg_type type) +{ + struct tb_cfg_request *req; + + req = tb_cfg_request_alloc(); + if (!req) + return -ENOMEM; + + req->match = tb_xdomain_match; + req->copy = tb_xdomain_copy; + req->request = response; + req->request_size = size; + req->request_type = type; + + return tb_cfg_request(ctl, req, response_ready, req); +} + +/** + * tb_xdomain_response() - Send a XDomain response message + * @xd: XDomain to send the message + * @response: Response to send + * @size: Size of the response + * @type: PDF type of the response + * + * This can be used to send a XDomain response message to the other + * domain. No response for the message is expected. 
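The sequence-number check above relies on the length_sn packing defined in tb_msgs.h: the low bits carry the payload length in dwords (excluding the common XDomain header) and two high bits carry the sequence number used to match retried requests to their responses. A short sketch of the unpacking, consistent with tb_xdp_fill_header() below; the helper name is illustrative only:

static void tb_xdp_unpack_length_sn(u32 length_sn, u32 *len_dwords, u8 *seq)
{
	*len_dwords = length_sn & TB_XDOMAIN_LENGTH_MASK;
	*seq = (length_sn & TB_XDOMAIN_SN_MASK) >> TB_XDOMAIN_SN_SHIFT;
}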
+ * + * Return: %0 in case of success and negative errno in case of failure + */ +int tb_xdomain_response(struct tb_xdomain *xd, const void *response, + size_t size, enum tb_cfg_pkg_type type) +{ + return __tb_xdomain_response(xd->tb->ctl, response, size, type); +} +EXPORT_SYMBOL_GPL(tb_xdomain_response); + +static int __tb_xdomain_request(struct tb_ctl *ctl, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, void *response, + size_t response_size, enum tb_cfg_pkg_type response_type, + unsigned int timeout_msec) +{ + struct tb_cfg_request *req; + struct tb_cfg_result res; + + req = tb_cfg_request_alloc(); + if (!req) + return -ENOMEM; + + req->match = tb_xdomain_match; + req->copy = tb_xdomain_copy; + req->request = request; + req->request_size = request_size; + req->request_type = request_type; + req->response = response; + req->response_size = response_size; + req->response_type = response_type; + + res = tb_cfg_request_sync(ctl, req, timeout_msec); + + tb_cfg_request_put(req); + + return res.err == 1 ? -EIO : res.err; +} + +/** + * tb_xdomain_request() - Send a XDomain request + * @xd: XDomain to send the request + * @request: Request to send + * @request_size: Size of the request in bytes + * @request_type: PDF type of the request + * @response: Response is copied here + * @response_size: Expected size of the response in bytes + * @response_type: Expected PDF type of the response + * @timeout_msec: Timeout in milliseconds to wait for the response + * + * This function can be used to send XDomain control channel messages to + * the other domain. The function waits until the response is received + * or when timeout triggers. Whichever comes first. + * + * Return: %0 in case of success and negative errno in case of failure + */ +int tb_xdomain_request(struct tb_xdomain *xd, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, + void *response, size_t response_size, + enum tb_cfg_pkg_type response_type, unsigned int timeout_msec) +{ + return __tb_xdomain_request(xd->tb->ctl, request, request_size, + request_type, response, response_size, + response_type, timeout_msec); +} +EXPORT_SYMBOL_GPL(tb_xdomain_request); + +static inline void tb_xdp_fill_header(struct tb_xdp_header *hdr, u64 route, + u8 sequence, enum tb_xdp_type type, size_t size) +{ + u32 length_sn; + + length_sn = (size - sizeof(hdr->xd_hdr)) / 4; + length_sn |= (sequence << TB_XDOMAIN_SN_SHIFT) & TB_XDOMAIN_SN_MASK; + + hdr->xd_hdr.route_hi = upper_32_bits(route); + hdr->xd_hdr.route_lo = lower_32_bits(route); + hdr->xd_hdr.length_sn = length_sn; + hdr->type = type; + memcpy(&hdr->uuid, &tb_xdp_uuid, sizeof(tb_xdp_uuid)); +} + +static int tb_xdp_handle_error(const struct tb_xdp_header *hdr) +{ + const struct tb_xdp_error_response *error; + + if (hdr->type != ERROR_RESPONSE) + return 0; + + error = (const struct tb_xdp_error_response *)hdr; + + switch (error->error) { + case ERROR_UNKNOWN_PACKET: + case ERROR_UNKNOWN_DOMAIN: + return -EIO; + case ERROR_NOT_SUPPORTED: + return -ENOTSUPP; + case ERROR_NOT_READY: + return -EAGAIN; + default: + break; + } + + return 0; +} + +static int tb_xdp_error_response(struct tb_ctl *ctl, u64 route, u8 sequence, + enum tb_xdp_error error) +{ + struct tb_xdp_error_response res; + + memset(&res, 0, sizeof(res)); + tb_xdp_fill_header(&res.hdr, route, sequence, ERROR_RESPONSE, + sizeof(res)); + res.error = error; + + return __tb_xdomain_response(ctl, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP); +} + +static int tb_xdp_properties_request(struct 
tb_ctl *ctl, u64 route, + const uuid_t *src_uuid, const uuid_t *dst_uuid, int retry, + u32 **block, u32 *generation) +{ + struct tb_xdp_properties_response *res; + struct tb_xdp_properties req; + u16 data_len, len; + size_t total_size; + u32 *data = NULL; + int ret; + + total_size = sizeof(*res) + TB_XDP_PROPERTIES_MAX_DATA_LENGTH * 4; + res = kzalloc(total_size, GFP_KERNEL); + if (!res) + return -ENOMEM; + + memset(&req, 0, sizeof(req)); + tb_xdp_fill_header(&req.hdr, route, retry % 4, PROPERTIES_REQUEST, + sizeof(req)); + memcpy(&req.src_uuid, src_uuid, sizeof(*src_uuid)); + memcpy(&req.dst_uuid, dst_uuid, sizeof(*dst_uuid)); + + len = 0; + data_len = 0; + + do { + ret = __tb_xdomain_request(ctl, &req, sizeof(req), + TB_CFG_PKG_XDOMAIN_REQ, res, + total_size, TB_CFG_PKG_XDOMAIN_RESP, + XDOMAIN_DEFAULT_TIMEOUT); + if (ret) + goto err; + + ret = tb_xdp_handle_error(&res->hdr); + if (ret) + goto err; + + /* + * Package length includes the whole payload without the + * XDomain header. Validate first that the package is at + * least size of the response structure. + */ + len = res->hdr.xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK; + if (len < sizeof(*res) / 4) { + ret = -EINVAL; + goto err; + } + + len += sizeof(res->hdr.xd_hdr) / 4; + len -= sizeof(*res) / 4; + + if (res->offset != req.offset) { + ret = -EINVAL; + goto err; + } + + /* + * First time allocate block that has enough space for + * the whole properties block. + */ + if (!data) { + data_len = res->data_length; + if (data_len > TB_XDP_PROPERTIES_MAX_LENGTH) { + ret = -E2BIG; + goto err; + } + + data = kcalloc(data_len, sizeof(u32), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto err; + } + } + + memcpy(data + req.offset, res->data, len * 4); + req.offset += len; + } while (!data_len || req.offset < data_len); + + *block = data; + *generation = res->generation; + + kfree(res); + + return data_len; + +err: + kfree(data); + kfree(res); + + return ret; +} + +static int tb_xdp_properties_response(struct tb *tb, struct tb_ctl *ctl, + u64 route, u8 sequence, const uuid_t *src_uuid, + const struct tb_xdp_properties *req) +{ + struct tb_xdp_properties_response *res; + size_t total_size; + u16 len; + int ret; + + /* + * Currently we expect all requests to be directed to us. The + * protocol supports forwarding, though which we might add + * support later on. 
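Each PROPERTIES_RESPONSE carries at most TB_XDP_PROPERTIES_MAX_DATA_LENGTH dwords of property data, so the request loop above keeps advancing req.offset until the full data_length dwords have arrived. A rough sketch of the resulting round-trip count for a block of a given size; the helper is illustrative and not part of the driver:

static unsigned int property_round_trips(u32 data_length)
{
	/*
	 * One request/response pair per chunk of at most
	 * TB_XDP_PROPERTIES_MAX_DATA_LENGTH dwords.
	 */
	return DIV_ROUND_UP(data_length, TB_XDP_PROPERTIES_MAX_DATA_LENGTH);
}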
+ */ + if (!uuid_equal(src_uuid, &req->dst_uuid)) { + tb_xdp_error_response(ctl, route, sequence, + ERROR_UNKNOWN_DOMAIN); + return 0; + } + + mutex_lock(&xdomain_lock); + + if (req->offset >= xdomain_property_block_len) { + mutex_unlock(&xdomain_lock); + return -EINVAL; + } + + len = xdomain_property_block_len - req->offset; + len = min_t(u16, len, TB_XDP_PROPERTIES_MAX_DATA_LENGTH); + total_size = sizeof(*res) + len * 4; + + res = kzalloc(total_size, GFP_KERNEL); + if (!res) { + mutex_unlock(&xdomain_lock); + return -ENOMEM; + } + + tb_xdp_fill_header(&res->hdr, route, sequence, PROPERTIES_RESPONSE, + total_size); + res->generation = xdomain_property_block_gen; + res->data_length = xdomain_property_block_len; + res->offset = req->offset; + uuid_copy(&res->src_uuid, src_uuid); + uuid_copy(&res->dst_uuid, &req->src_uuid); + memcpy(res->data, &xdomain_property_block[req->offset], len * 4); + + mutex_unlock(&xdomain_lock); + + ret = __tb_xdomain_response(ctl, res, total_size, + TB_CFG_PKG_XDOMAIN_RESP); + + kfree(res); + return ret; +} + +static int tb_xdp_properties_changed_request(struct tb_ctl *ctl, u64 route, + int retry, const uuid_t *uuid) +{ + struct tb_xdp_properties_changed_response res; + struct tb_xdp_properties_changed req; + int ret; + + memset(&req, 0, sizeof(req)); + tb_xdp_fill_header(&req.hdr, route, retry % 4, + PROPERTIES_CHANGED_REQUEST, sizeof(req)); + uuid_copy(&req.src_uuid, uuid); + + memset(&res, 0, sizeof(res)); + ret = __tb_xdomain_request(ctl, &req, sizeof(req), + TB_CFG_PKG_XDOMAIN_REQ, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP, + XDOMAIN_DEFAULT_TIMEOUT); + if (ret) + return ret; + + return tb_xdp_handle_error(&res.hdr); +} + +static int +tb_xdp_properties_changed_response(struct tb_ctl *ctl, u64 route, u8 sequence) +{ + struct tb_xdp_properties_changed_response res; + + memset(&res, 0, sizeof(res)); + tb_xdp_fill_header(&res.hdr, route, sequence, + PROPERTIES_CHANGED_RESPONSE, sizeof(res)); + return __tb_xdomain_response(ctl, &res, sizeof(res), + TB_CFG_PKG_XDOMAIN_RESP); +} + +/** + * tb_register_protocol_handler() - Register protocol handler + * @handler: Handler to register + * + * This allows XDomain service drivers to hook into incoming XDomain + * messages. After this function is called the service driver needs to + * be able to handle calls to callback whenever a package with the + * registered protocol is received. + */ +int tb_register_protocol_handler(struct tb_protocol_handler *handler) +{ + if (!handler->uuid || !handler->callback) + return -EINVAL; + if (uuid_equal(handler->uuid, &tb_xdp_uuid)) + return -EINVAL; + + mutex_lock(&xdomain_lock); + list_add_tail(&handler->list, &protocol_handlers); + mutex_unlock(&xdomain_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(tb_register_protocol_handler); + +/** + * tb_unregister_protocol_handler() - Unregister protocol handler + * @handler: Handler to unregister + * + * Removes the previously registered protocol handler. 
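A service driver that wants to see raw XDomain packets for its own protocol registers a tb_protocol_handler as described above. A minimal hypothetical example; the UUID, the my_proto_* names and the decision to always consume the packet are made up for illustration:

static int my_proto_callback(const void *buf, size_t size, void *data)
{
	/*
	 * Verify the packet is really addressed to this service, then act
	 * on it. Returning 1 tells the XDomain core the packet was consumed
	 * and should not be passed to other handlers.
	 */
	return 1;
}

static const uuid_t my_proto_uuid =
	UUID_INIT(0x12345678, 0x1234, 0x1234,
		  0x12, 0x34, 0x12, 0x34, 0x12, 0x34, 0x12, 0x34);

static struct tb_protocol_handler my_proto_handler = {
	.uuid = &my_proto_uuid,
	.callback = my_proto_callback,
};

/* Registered with tb_register_protocol_handler(&my_proto_handler) and later
 * removed with tb_unregister_protocol_handler(&my_proto_handler). */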
+ */ +void tb_unregister_protocol_handler(struct tb_protocol_handler *handler) +{ + mutex_lock(&xdomain_lock); + list_del_init(&handler->list); + mutex_unlock(&xdomain_lock); +} +EXPORT_SYMBOL_GPL(tb_unregister_protocol_handler); + +static void tb_xdp_handle_request(struct work_struct *work) +{ + struct xdomain_request_work *xw = container_of(work, typeof(*xw), work); + const struct tb_xdp_header *pkg = xw->pkg; + const struct tb_xdomain_header *xhdr = &pkg->xd_hdr; + struct tb *tb = xw->tb; + struct tb_ctl *ctl = tb->ctl; + const uuid_t *uuid; + int ret = 0; + u8 sequence; + u64 route; + + route = ((u64)xhdr->route_hi << 32 | xhdr->route_lo) & ~BIT_ULL(63); + sequence = xhdr->length_sn & TB_XDOMAIN_SN_MASK; + sequence >>= TB_XDOMAIN_SN_SHIFT; + + mutex_lock(&tb->lock); + if (tb->root_switch) + uuid = tb->root_switch->uuid; + else + uuid = NULL; + mutex_unlock(&tb->lock); + + if (!uuid) { + tb_xdp_error_response(ctl, route, sequence, ERROR_NOT_READY); + goto out; + } + + switch (pkg->type) { + case PROPERTIES_REQUEST: + ret = tb_xdp_properties_response(tb, ctl, route, sequence, uuid, + (const struct tb_xdp_properties *)pkg); + break; + + case PROPERTIES_CHANGED_REQUEST: { + const struct tb_xdp_properties_changed *xchg = + (const struct tb_xdp_properties_changed *)pkg; + struct tb_xdomain *xd; + + ret = tb_xdp_properties_changed_response(ctl, route, sequence); + + /* + * Since the properties have been changed, let's update + * the xdomain related to this connection as well in + * case there is a change in services it offers. + */ + xd = tb_xdomain_find_by_uuid_locked(tb, &xchg->src_uuid); + if (xd) { + queue_delayed_work(tb->wq, &xd->get_properties_work, + msecs_to_jiffies(50)); + tb_xdomain_put(xd); + } + + break; + } + + default: + break; + } + + if (ret) { + tb_warn(tb, "failed to send XDomain response for %#x\n", + pkg->type); + } + +out: + kfree(xw->pkg); + kfree(xw); +} + +static void +tb_xdp_schedule_request(struct tb *tb, const struct tb_xdp_header *hdr, + size_t size) +{ + struct xdomain_request_work *xw; + + xw = kmalloc(sizeof(*xw), GFP_KERNEL); + if (!xw) + return; + + INIT_WORK(&xw->work, tb_xdp_handle_request); + xw->pkg = kmemdup(hdr, size, GFP_KERNEL); + xw->tb = tb; + + queue_work(tb->wq, &xw->work); +} + +/** + * tb_register_service_driver() - Register XDomain service driver + * @drv: Driver to register + * + * Registers new service driver from @drv to the bus. + */ +int tb_register_service_driver(struct tb_service_driver *drv) +{ + drv->driver.bus = &tb_bus_type; + return driver_register(&drv->driver); +} +EXPORT_SYMBOL_GPL(tb_register_service_driver); + +/** + * tb_unregister_service_driver() - Unregister XDomain service driver + * @xdrv: Driver to unregister + * + * Unregisters XDomain service driver from the bus. + */ +void tb_unregister_service_driver(struct tb_service_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL_GPL(tb_unregister_service_driver); + +static ssize_t key_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + /* + * It should be null terminated but anything else is pretty much + * allowed. 
+ */ + return sprintf(buf, "%*pEp\n", (int)strlen(svc->key), svc->key); +} +static DEVICE_ATTR_RO(key); + +static int get_modalias(struct tb_service *svc, char *buf, size_t size) +{ + return snprintf(buf, size, "tbsvc:k%sp%08Xv%08Xr%08X", svc->key, + svc->prtcid, svc->prtcvers, svc->prtcrevs); +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + /* Full buffer size except new line and null termination */ + get_modalias(svc, buf, PAGE_SIZE - 2); + return sprintf(buf, "%s\n", buf); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t prtcid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcid); +} +static DEVICE_ATTR_RO(prtcid); + +static ssize_t prtcvers_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcvers); +} +static DEVICE_ATTR_RO(prtcvers); + +static ssize_t prtcrevs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "%u\n", svc->prtcrevs); +} +static DEVICE_ATTR_RO(prtcrevs); + +static ssize_t prtcstns_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + + return sprintf(buf, "0x%08x\n", svc->prtcstns); +} +static DEVICE_ATTR_RO(prtcstns); + +static struct attribute *tb_service_attrs[] = { + &dev_attr_key.attr, + &dev_attr_modalias.attr, + &dev_attr_prtcid.attr, + &dev_attr_prtcvers.attr, + &dev_attr_prtcrevs.attr, + &dev_attr_prtcstns.attr, + NULL, +}; + +static struct attribute_group tb_service_attr_group = { + .attrs = tb_service_attrs, +}; + +static const struct attribute_group *tb_service_attr_groups[] = { + &tb_service_attr_group, + NULL, +}; + +static int tb_service_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + char modalias[64]; + + get_modalias(svc, modalias, sizeof(modalias)); + return add_uevent_var(env, "MODALIAS=%s", modalias); +} + +static void tb_service_release(struct device *dev) +{ + struct tb_service *svc = container_of(dev, struct tb_service, dev); + struct tb_xdomain *xd = tb_service_parent(svc); + + ida_simple_remove(&xd->service_ids, svc->id); + kfree(svc->key); + kfree(svc); +} + +struct device_type tb_service_type = { + .name = "thunderbolt_service", + .groups = tb_service_attr_groups, + .uevent = tb_service_uevent, + .release = tb_service_release, +}; +EXPORT_SYMBOL_GPL(tb_service_type); + +static int remove_missing_service(struct device *dev, void *data) +{ + struct tb_xdomain *xd = data; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return 0; + + if (!tb_property_find(xd->properties, svc->key, + TB_PROPERTY_TYPE_DIRECTORY)) + device_unregister(dev); + + return 0; +} + +static int find_service(struct device *dev, void *data) +{ + const struct tb_property *p = data; + struct tb_service *svc; + + svc = tb_to_service(dev); + if (!svc) + return 0; + + return !strcmp(svc->key, p->key); +} + +static int populate_service(struct tb_service *svc, + struct tb_property *property) +{ + struct tb_property_dir *dir = property->value.dir; + struct tb_property *p; + + /* Fill in 
standard properties */ + p = tb_property_find(dir, "prtcid", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcid = p->value.immediate; + p = tb_property_find(dir, "prtcvers", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcvers = p->value.immediate; + p = tb_property_find(dir, "prtcrevs", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcrevs = p->value.immediate; + p = tb_property_find(dir, "prtcstns", TB_PROPERTY_TYPE_VALUE); + if (p) + svc->prtcstns = p->value.immediate; + + svc->key = kstrdup(property->key, GFP_KERNEL); + if (!svc->key) + return -ENOMEM; + + return 0; +} + +static void enumerate_services(struct tb_xdomain *xd) +{ + struct tb_service *svc; + struct tb_property *p; + struct device *dev; + + /* + * First remove all services that are not available anymore in + * the updated property block. + */ + device_for_each_child_reverse(&xd->dev, xd, remove_missing_service); + + /* Then re-enumerate properties creating new services as we go */ + tb_property_for_each(xd->properties, p) { + if (p->type != TB_PROPERTY_TYPE_DIRECTORY) + continue; + + /* If the service exists already we are fine */ + dev = device_find_child(&xd->dev, p, find_service); + if (dev) { + put_device(dev); + continue; + } + + svc = kzalloc(sizeof(*svc), GFP_KERNEL); + if (!svc) + break; + + if (populate_service(svc, p)) { + kfree(svc); + break; + } + + svc->id = ida_simple_get(&xd->service_ids, 0, 0, GFP_KERNEL); + svc->dev.bus = &tb_bus_type; + svc->dev.type = &tb_service_type; + svc->dev.parent = &xd->dev; + dev_set_name(&svc->dev, "%s.%d", dev_name(&xd->dev), svc->id); + + if (device_register(&svc->dev)) { + put_device(&svc->dev); + break; + } + } +} + +static int populate_properties(struct tb_xdomain *xd, + struct tb_property_dir *dir) +{ + const struct tb_property *p; + + /* Required properties */ + p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_VALUE); + if (!p) + return -EINVAL; + xd->device = p->value.immediate; + + p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_VALUE); + if (!p) + return -EINVAL; + xd->vendor = p->value.immediate; + + kfree(xd->device_name); + xd->device_name = NULL; + kfree(xd->vendor_name); + xd->vendor_name = NULL; + + /* Optional properties */ + p = tb_property_find(dir, "deviceid", TB_PROPERTY_TYPE_TEXT); + if (p) + xd->device_name = kstrdup(p->value.text, GFP_KERNEL); + p = tb_property_find(dir, "vendorid", TB_PROPERTY_TYPE_TEXT); + if (p) + xd->vendor_name = kstrdup(p->value.text, GFP_KERNEL); + + return 0; +} + +/* Called with @xd->lock held */ +static void tb_xdomain_restore_paths(struct tb_xdomain *xd) +{ + if (!xd->resume) + return; + + xd->resume = false; + if (xd->transmit_path) { + dev_dbg(&xd->dev, "re-establishing DMA path\n"); + tb_domain_approve_xdomain_paths(xd->tb, xd); + } +} + +static void tb_xdomain_get_properties(struct work_struct *work) +{ + struct tb_xdomain *xd = container_of(work, typeof(*xd), + get_properties_work.work); + struct tb_property_dir *dir; + struct tb *tb = xd->tb; + bool update = false; + u32 *block = NULL; + u32 gen = 0; + int ret; + + ret = tb_xdp_properties_request(tb->ctl, xd->route, xd->local_uuid, + xd->remote_uuid, xd->properties_retries, + &block, &gen); + if (ret < 0) { + if (xd->properties_retries-- > 0) { + queue_delayed_work(xd->tb->wq, &xd->get_properties_work, + msecs_to_jiffies(1000)); + } else { + /* Give up now */ + dev_err(&xd->dev, + "failed read XDomain properties from %pUb\n", + xd->remote_uuid); + } + return; + } + + xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES; + + mutex_lock(&xd->lock); + + /* Only accept 
newer generation properties */ + if (xd->properties && gen <= xd->property_block_gen) { + /* + * On resume it is likely that the properties block is + * not changed (unless the other end added or removed + * services). However, we need to make sure the existing + * DMA paths are restored properly. + */ + tb_xdomain_restore_paths(xd); + goto err_free_block; + } + + dir = tb_property_parse_dir(block, ret); + if (!dir) { + dev_err(&xd->dev, "failed to parse XDomain properties\n"); + goto err_free_block; + } + + ret = populate_properties(xd, dir); + if (ret) { + dev_err(&xd->dev, "missing XDomain properties in response\n"); + goto err_free_dir; + } + + /* Release the existing one */ + if (xd->properties) { + tb_property_free_dir(xd->properties); + update = true; + } + + xd->properties = dir; + xd->property_block_gen = gen; + + tb_xdomain_restore_paths(xd); + + mutex_unlock(&xd->lock); + + kfree(block); + + /* + * Now the device should be ready enough so we can add it to the + * bus and let userspace know about it. If the device is already + * registered, we notify the userspace that it has changed. + */ + if (!update) { + if (device_add(&xd->dev)) { + dev_err(&xd->dev, "failed to add XDomain device\n"); + return; + } + } else { + kobject_uevent(&xd->dev.kobj, KOBJ_CHANGE); + } + + enumerate_services(xd); + return; + +err_free_dir: + tb_property_free_dir(dir); +err_free_block: + kfree(block); + mutex_unlock(&xd->lock); +} + +static void tb_xdomain_properties_changed(struct work_struct *work) +{ + struct tb_xdomain *xd = container_of(work, typeof(*xd), + properties_changed_work.work); + int ret; + + ret = tb_xdp_properties_changed_request(xd->tb->ctl, xd->route, + xd->properties_changed_retries, xd->local_uuid); + if (ret) { + if (xd->properties_changed_retries-- > 0) + queue_delayed_work(xd->tb->wq, + &xd->properties_changed_work, + msecs_to_jiffies(1000)); + return; + } + + xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES; +} + +static ssize_t device_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%#x\n", xd->device); +} +static DEVICE_ATTR_RO(device); + +static ssize_t +device_name_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + int ret; + + if (mutex_lock_interruptible(&xd->lock)) + return -ERESTARTSYS; + ret = sprintf(buf, "%s\n", xd->device_name ? xd->device_name : ""); + mutex_unlock(&xd->lock); + + return ret; +} +static DEVICE_ATTR_RO(device_name); + +static ssize_t vendor_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%#x\n", xd->vendor); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t +vendor_name_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + int ret; + + if (mutex_lock_interruptible(&xd->lock)) + return -ERESTARTSYS; + ret = sprintf(buf, "%s\n", xd->vendor_name ? 
xd->vendor_name : ""); + mutex_unlock(&xd->lock); + + return ret; +} +static DEVICE_ATTR_RO(vendor_name); + +static ssize_t unique_id_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + return sprintf(buf, "%pUb\n", xd->remote_uuid); +} +static DEVICE_ATTR_RO(unique_id); + +static struct attribute *xdomain_attrs[] = { + &dev_attr_device.attr, + &dev_attr_device_name.attr, + &dev_attr_unique_id.attr, + &dev_attr_vendor.attr, + &dev_attr_vendor_name.attr, + NULL, +}; + +static struct attribute_group xdomain_attr_group = { + .attrs = xdomain_attrs, +}; + +static const struct attribute_group *xdomain_attr_groups[] = { + &xdomain_attr_group, + NULL, +}; + +static void tb_xdomain_release(struct device *dev) +{ + struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev); + + put_device(xd->dev.parent); + + tb_property_free_dir(xd->properties); + ida_destroy(&xd->service_ids); + + kfree(xd->local_uuid); + kfree(xd->remote_uuid); + kfree(xd->device_name); + kfree(xd->vendor_name); + kfree(xd); +} + +static void start_handshake(struct tb_xdomain *xd) +{ + xd->properties_retries = XDOMAIN_PROPERTIES_RETRIES; + xd->properties_changed_retries = XDOMAIN_PROPERTIES_CHANGED_RETRIES; + + /* Start exchanging properties with the other host */ + queue_delayed_work(xd->tb->wq, &xd->properties_changed_work, + msecs_to_jiffies(100)); + queue_delayed_work(xd->tb->wq, &xd->get_properties_work, + msecs_to_jiffies(1000)); +} + +static void stop_handshake(struct tb_xdomain *xd) +{ + xd->properties_retries = 0; + xd->properties_changed_retries = 0; + + cancel_delayed_work_sync(&xd->get_properties_work); + cancel_delayed_work_sync(&xd->properties_changed_work); +} + +static int __maybe_unused tb_xdomain_suspend(struct device *dev) +{ + stop_handshake(tb_to_xdomain(dev)); + return 0; +} + +static int __maybe_unused tb_xdomain_resume(struct device *dev) +{ + struct tb_xdomain *xd = tb_to_xdomain(dev); + + /* + * Ask tb_xdomain_get_properties() restore any existing DMA + * paths after properties are re-read. + */ + xd->resume = true; + start_handshake(xd); + + return 0; +} + +static const struct dev_pm_ops tb_xdomain_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(tb_xdomain_suspend, tb_xdomain_resume) +}; + +struct device_type tb_xdomain_type = { + .name = "thunderbolt_xdomain", + .release = tb_xdomain_release, + .pm = &tb_xdomain_pm_ops, +}; +EXPORT_SYMBOL_GPL(tb_xdomain_type); + +/** + * tb_xdomain_alloc() - Allocate new XDomain object + * @tb: Domain where the XDomain belongs + * @parent: Parent device (the switch through the connection to the + * other domain is reached). + * @route: Route string used to reach the other domain + * @local_uuid: Our local domain UUID + * @remote_uuid: UUID of the other domain + * + * Allocates new XDomain structure and returns pointer to that. The + * object must be released by calling tb_xdomain_put(). 
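From a connection manager's point of view the lifecycle is: allocate the XDomain when the remote host is discovered, attach it to the port, add it to the bus, and remove it again on disconnect; icm_fr_xdomain_connected() earlier in this patch does exactly this with the helpers documented here and defined just below. A condensed, hypothetical sketch with error handling trimmed and names made up:

static void cm_remote_host_connected(struct tb *tb, struct tb_switch *sw,
				     u64 route, const uuid_t *local_uuid,
				     const uuid_t *remote_uuid)
{
	struct tb_xdomain *xd;

	xd = tb_xdomain_alloc(tb, &sw->dev, route, local_uuid, remote_uuid);
	if (!xd)
		return;

	tb_port_at(route, sw)->xdomain = xd;
	tb_xdomain_add(xd);	/* kicks off the property exchange handshake */
}

/* On disconnect: tb_port_at(route, sw)->xdomain = NULL; tb_xdomain_remove(xd); */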
+ */ +struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent, + u64 route, const uuid_t *local_uuid, + const uuid_t *remote_uuid) +{ + struct tb_xdomain *xd; + + xd = kzalloc(sizeof(*xd), GFP_KERNEL); + if (!xd) + return NULL; + + xd->tb = tb; + xd->route = route; + ida_init(&xd->service_ids); + mutex_init(&xd->lock); + INIT_DELAYED_WORK(&xd->get_properties_work, tb_xdomain_get_properties); + INIT_DELAYED_WORK(&xd->properties_changed_work, + tb_xdomain_properties_changed); + + xd->local_uuid = kmemdup(local_uuid, sizeof(uuid_t), GFP_KERNEL); + if (!xd->local_uuid) + goto err_free; + + xd->remote_uuid = kmemdup(remote_uuid, sizeof(uuid_t), GFP_KERNEL); + if (!xd->remote_uuid) + goto err_free_local_uuid; + + device_initialize(&xd->dev); + xd->dev.parent = get_device(parent); + xd->dev.bus = &tb_bus_type; + xd->dev.type = &tb_xdomain_type; + xd->dev.groups = xdomain_attr_groups; + dev_set_name(&xd->dev, "%u-%llx", tb->index, route); + + return xd; + +err_free_local_uuid: + kfree(xd->local_uuid); +err_free: + kfree(xd); + + return NULL; +} + +/** + * tb_xdomain_add() - Add XDomain to the bus + * @xd: XDomain to add + * + * This function starts XDomain discovery protocol handshake and + * eventually adds the XDomain to the bus. After calling this function + * the caller needs to call tb_xdomain_remove() in order to remove and + * release the object regardless whether the handshake succeeded or not. + */ +void tb_xdomain_add(struct tb_xdomain *xd) +{ + /* Start exchanging properties with the other host */ + start_handshake(xd); +} + +static int unregister_service(struct device *dev, void *data) +{ + device_unregister(dev); + return 0; +} + +/** + * tb_xdomain_remove() - Remove XDomain from the bus + * @xd: XDomain to remove + * + * This will stop all ongoing configuration work and remove the XDomain + * along with any services from the bus. When the last reference to @xd + * is released the object will be released as well. + */ +void tb_xdomain_remove(struct tb_xdomain *xd) +{ + stop_handshake(xd); + + device_for_each_child_reverse(&xd->dev, xd, unregister_service); + + if (!device_is_registered(&xd->dev)) + put_device(&xd->dev); + else + device_unregister(&xd->dev); +} + +/** + * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection + * @xd: XDomain connection + * @transmit_path: HopID of the transmit path the other end is using to + * send packets + * @transmit_ring: DMA ring used to receive packets from the other end + * @receive_path: HopID of the receive path the other end is using to + * receive packets + * @receive_ring: DMA ring used to send packets to the other end + * + * The function enables DMA paths accordingly so that after successful + * return the caller can send and receive packets using high-speed DMA + * path. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path, + u16 transmit_ring, u16 receive_path, + u16 receive_ring) +{ + int ret; + + mutex_lock(&xd->lock); + + if (xd->transmit_path) { + ret = xd->transmit_path == transmit_path ? 
0 : -EBUSY; + goto exit_unlock; + } + + xd->transmit_path = transmit_path; + xd->transmit_ring = transmit_ring; + xd->receive_path = receive_path; + xd->receive_ring = receive_ring; + + ret = tb_domain_approve_xdomain_paths(xd->tb, xd); + +exit_unlock: + mutex_unlock(&xd->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tb_xdomain_enable_paths); + +/** + * tb_xdomain_disable_paths() - Disable DMA paths for XDomain connection + * @xd: XDomain connection + * + * This does the opposite of tb_xdomain_enable_paths(). After call to + * this the caller is not expected to use the rings anymore. + * + * Return: %0 in case of success and negative errno in case of error + */ +int tb_xdomain_disable_paths(struct tb_xdomain *xd) +{ + int ret = 0; + + mutex_lock(&xd->lock); + if (xd->transmit_path) { + xd->transmit_path = 0; + xd->transmit_ring = 0; + xd->receive_path = 0; + xd->receive_ring = 0; + + ret = tb_domain_disconnect_xdomain_paths(xd->tb, xd); + } + mutex_unlock(&xd->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tb_xdomain_disable_paths); + +struct tb_xdomain_lookup { + const uuid_t *uuid; + u8 link; + u8 depth; +}; + +static struct tb_xdomain *switch_find_xdomain(struct tb_switch *sw, + const struct tb_xdomain_lookup *lookup) +{ + int i; + + for (i = 1; i <= sw->config.max_port_number; i++) { + struct tb_port *port = &sw->ports[i]; + struct tb_xdomain *xd; + + if (tb_is_upstream_port(port)) + continue; + + if (port->xdomain) { + xd = port->xdomain; + + if (lookup->uuid) { + if (uuid_equal(xd->remote_uuid, lookup->uuid)) + return xd; + } else if (lookup->link == xd->link && + lookup->depth == xd->depth) { + return xd; + } + } else if (port->remote) { + xd = switch_find_xdomain(port->remote->sw, lookup); + if (xd) + return xd; + } + } + + return NULL; +} + +/** + * tb_xdomain_find_by_uuid() - Find an XDomain by UUID + * @tb: Domain where the XDomain belongs to + * @uuid: UUID to look for + * + * Finds XDomain by walking through the Thunderbolt topology below @tb. + * The returned XDomain will have its reference count increased so the + * caller needs to call tb_xdomain_put() when it is done with the + * object. + * + * This will find all XDomains including the ones that are not yet added + * to the bus (handshake is still in progress). + * + * The caller needs to hold @tb->lock. + */ +struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid) +{ + struct tb_xdomain_lookup lookup; + struct tb_xdomain *xd; + + memset(&lookup, 0, sizeof(lookup)); + lookup.uuid = uuid; + + xd = switch_find_xdomain(tb->root_switch, &lookup); + if (xd) { + get_device(&xd->dev); + return xd; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(tb_xdomain_find_by_uuid); + +/** + * tb_xdomain_find_by_link_depth() - Find an XDomain by link and depth + * @tb: Domain where the XDomain belongs to + * @link: Root switch link number + * @depth: Depth in the link + * + * Finds XDomain by walking through the Thunderbolt topology below @tb. + * The returned XDomain will have its reference count increased so the + * caller needs to call tb_xdomain_put() when it is done with the + * object. + * + * This will find all XDomains including the ones that are not yet added + * to the bus (handshake is still in progress). + * + * The caller needs to hold @tb->lock. 
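Once the two service drivers (one on each host) have agreed on hop IDs and ring numbers over their own protocol, either side enables the high-speed DMA path with tb_xdomain_enable_paths() above and tears it down again with tb_xdomain_disable_paths(). A hypothetical usage sketch; the hop and ring values are placeholders, not negotiated values:

static int my_svc_start_dma(struct tb_xdomain *xd)
{
	/* The values below are examples only; real drivers negotiate them */
	return tb_xdomain_enable_paths(xd, 8 /* transmit_path */,
				       1 /* transmit_ring */,
				       8 /* receive_path */,
				       1 /* receive_ring */);
}

static void my_svc_stop_dma(struct tb_xdomain *xd)
{
	tb_xdomain_disable_paths(xd);
}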
+ */ +struct tb_xdomain *tb_xdomain_find_by_link_depth(struct tb *tb, u8 link, + u8 depth) +{ + struct tb_xdomain_lookup lookup; + struct tb_xdomain *xd; + + memset(&lookup, 0, sizeof(lookup)); + lookup.link = link; + lookup.depth = depth; + + xd = switch_find_xdomain(tb->root_switch, &lookup); + if (xd) { + get_device(&xd->dev); + return xd; + } + + return NULL; +} + +bool tb_xdomain_handle_request(struct tb *tb, enum tb_cfg_pkg_type type, + const void *buf, size_t size) +{ + const struct tb_protocol_handler *handler, *tmp; + const struct tb_xdp_header *hdr = buf; + unsigned int length; + int ret = 0; + + /* We expect the packet is at least size of the header */ + length = hdr->xd_hdr.length_sn & TB_XDOMAIN_LENGTH_MASK; + if (length != size / 4 - sizeof(hdr->xd_hdr) / 4) + return true; + if (length < sizeof(*hdr) / 4 - sizeof(hdr->xd_hdr) / 4) + return true; + + /* + * Handle XDomain discovery protocol packets directly here. For + * other protocols (based on their UUID) we call registered + * handlers in turn. + */ + if (uuid_equal(&hdr->uuid, &tb_xdp_uuid)) { + if (type == TB_CFG_PKG_XDOMAIN_REQ) { + tb_xdp_schedule_request(tb, hdr, size); + return true; + } + return false; + } + + mutex_lock(&xdomain_lock); + list_for_each_entry_safe(handler, tmp, &protocol_handlers, list) { + if (!uuid_equal(&hdr->uuid, handler->uuid)) + continue; + + mutex_unlock(&xdomain_lock); + ret = handler->callback(buf, size, handler->data); + mutex_lock(&xdomain_lock); + + if (ret) + break; + } + mutex_unlock(&xdomain_lock); + + return ret > 0; +} + +static int rebuild_property_block(void) +{ + u32 *block, len; + int ret; + + ret = tb_property_format_dir(xdomain_property_dir, NULL, 0); + if (ret < 0) + return ret; + + len = ret; + + block = kcalloc(len, sizeof(u32), GFP_KERNEL); + if (!block) + return -ENOMEM; + + ret = tb_property_format_dir(xdomain_property_dir, block, len); + if (ret) { + kfree(block); + return ret; + } + + kfree(xdomain_property_block); + xdomain_property_block = block; + xdomain_property_block_len = len; + xdomain_property_block_gen++; + + return 0; +} + +static int update_xdomain(struct device *dev, void *data) +{ + struct tb_xdomain *xd; + + xd = tb_to_xdomain(dev); + if (xd) { + queue_delayed_work(xd->tb->wq, &xd->properties_changed_work, + msecs_to_jiffies(50)); + } + + return 0; +} + +static void update_all_xdomains(void) +{ + bus_for_each_dev(&tb_bus_type, NULL, NULL, update_xdomain); +} + +static bool remove_directory(const char *key, const struct tb_property_dir *dir) +{ + struct tb_property *p; + + p = tb_property_find(xdomain_property_dir, key, + TB_PROPERTY_TYPE_DIRECTORY); + if (p && p->value.dir == dir) { + tb_property_remove(p); + return true; + } + return false; +} + +/** + * tb_register_property_dir() - Register property directory to the host + * @key: Key (name) of the directory to add + * @dir: Directory to add + * + * Service drivers can use this function to add new property directory + * to the host available properties. The other connected hosts are + * notified so they can re-read properties of this host if they are + * interested. 
+ * + * Return: %0 on success and negative errno on failure + */ +int tb_register_property_dir(const char *key, struct tb_property_dir *dir) +{ + int ret; + + if (!key || strlen(key) > 8) + return -EINVAL; + + mutex_lock(&xdomain_lock); + if (tb_property_find(xdomain_property_dir, key, + TB_PROPERTY_TYPE_DIRECTORY)) { + ret = -EEXIST; + goto err_unlock; + } + + ret = tb_property_add_dir(xdomain_property_dir, key, dir); + if (ret) + goto err_unlock; + + ret = rebuild_property_block(); + if (ret) { + remove_directory(key, dir); + goto err_unlock; + } + + mutex_unlock(&xdomain_lock); + update_all_xdomains(); + return 0; + +err_unlock: + mutex_unlock(&xdomain_lock); + return ret; +} +EXPORT_SYMBOL_GPL(tb_register_property_dir); + +/** + * tb_unregister_property_dir() - Removes property directory from host + * @key: Key (name) of the directory + * @dir: Directory to remove + * + * This will remove the existing directory from this host and notify the + * connected hosts about the change. + */ +void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir) +{ + int ret = 0; + + mutex_lock(&xdomain_lock); + if (remove_directory(key, dir)) + ret = rebuild_property_block(); + mutex_unlock(&xdomain_lock); + + if (!ret) + update_all_xdomains(); +} +EXPORT_SYMBOL_GPL(tb_unregister_property_dir); + +int tb_xdomain_init(void) +{ + int ret; + + xdomain_property_dir = tb_property_create_dir(NULL); + if (!xdomain_property_dir) + return -ENOMEM; + + /* + * Initialize standard set of properties without any service + * directories. Those will be added by service drivers + * themselves when they are loaded. + */ + tb_property_add_immediate(xdomain_property_dir, "vendorid", + PCI_VENDOR_ID_INTEL); + tb_property_add_text(xdomain_property_dir, "vendorid", "Intel Corp."); + tb_property_add_immediate(xdomain_property_dir, "deviceid", 0x1); + tb_property_add_text(xdomain_property_dir, "deviceid", + utsname()->nodename); + tb_property_add_immediate(xdomain_property_dir, "devicerv", 0x80000100); + + ret = rebuild_property_block(); + if (ret) { + tb_property_free_dir(xdomain_property_dir); + xdomain_property_dir = NULL; + } + + return ret; +} + +void tb_xdomain_exit(void) +{ + kfree(xdomain_property_block); + tb_property_free_dir(xdomain_property_dir); +} diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 694cebb50f72..7625c3b81f84 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -683,5 +683,31 @@ struct fsl_mc_device_id { const char obj_type[16]; }; +/** + * struct tb_service_id - Thunderbolt service identifiers + * @match_flags: Flags used to match the structure + * @protocol_key: Protocol key the service supports + * @protocol_id: Protocol id the service supports + * @protocol_version: Version of the protocol + * @protocol_revision: Revision of the protocol software + * @driver_data: Driver specific data + * + * Thunderbolt XDomain services are exposed as devices where each device + * carries the protocol information the service supports. Thunderbolt + * XDomain service drivers match against that information. 
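Service drivers announce themselves to remote hosts by adding a directory to the host property block with tb_register_property_dir() above; the remote end then sees it during property enumeration and creates a matching tb_service. A sketch assuming a hypothetical service keyed "mysvc" with protocol ID 1 (all names and values illustrative):

static struct tb_property_dir *my_svc_dir;

static int my_svc_announce(void)
{
	my_svc_dir = tb_property_create_dir(NULL);
	if (!my_svc_dir)
		return -ENOMEM;

	/* These are the keys populate_service() reads on the other side */
	tb_property_add_immediate(my_svc_dir, "prtcid", 1);
	tb_property_add_immediate(my_svc_dir, "prtcvers", 1);
	tb_property_add_immediate(my_svc_dir, "prtcrevs", 1);

	/* Directory keys are limited to 8 characters */
	return tb_register_property_dir("mysvc", my_svc_dir);
}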
+ */ +struct tb_service_id { + __u32 match_flags; + char protocol_key[8 + 1]; + __u32 protocol_id; + __u32 protocol_version; + __u32 protocol_revision; + kernel_ulong_t driver_data; +}; + +#define TBSVC_MATCH_PROTOCOL_KEY 0x0001 +#define TBSVC_MATCH_PROTOCOL_ID 0x0002 +#define TBSVC_MATCH_PROTOCOL_VERSION 0x0004 +#define TBSVC_MATCH_PROTOCOL_REVISION 0x0008 #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 43b8d1e09341..18c0e3d5e85c 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -17,6 +17,7 @@ #include #include #include +#include #include enum tb_cfg_pkg_type { @@ -77,6 +78,8 @@ struct tb { }; extern struct bus_type tb_bus_type; +extern struct device_type tb_service_type; +extern struct device_type tb_xdomain_type; #define TB_LINKS_PER_PHY_PORT 2 @@ -155,4 +158,243 @@ struct tb_property *tb_property_get_next(struct tb_property_dir *dir, property; \ property = tb_property_get_next(dir, property)) +int tb_register_property_dir(const char *key, struct tb_property_dir *dir); +void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); + +/** + * struct tb_xdomain - Cross-domain (XDomain) connection + * @dev: XDomain device + * @tb: Pointer to the domain + * @remote_uuid: UUID of the remote domain (host) + * @local_uuid: Cached local UUID + * @route: Route string the other domain can be reached + * @vendor: Vendor ID of the remote domain + * @device: Device ID of the demote domain + * @lock: Lock to serialize access to the following fields of this structure + * @vendor_name: Name of the vendor (or %NULL if not known) + * @device_name: Name of the device (or %NULL if not known) + * @is_unplugged: The XDomain is unplugged + * @resume: The XDomain is being resumed + * @transmit_path: HopID which the remote end expects us to transmit + * @transmit_ring: Local ring (hop) where outgoing packets are pushed + * @receive_path: HopID which we expect the remote end to transmit + * @receive_ring: Local ring (hop) where incoming packets arrive + * @service_ids: Used to generate IDs for the services + * @properties: Properties exported by the remote domain + * @property_block_gen: Generation of @properties + * @properties_lock: Lock protecting @properties. + * @get_properties_work: Work used to get remote domain properties + * @properties_retries: Number of times left to read properties + * @properties_changed_work: Work used to notify the remote domain that + * our properties have changed + * @properties_changed_retries: Number of times left to send properties + * changed notification + * @link: Root switch link the remote domain is connected (ICM only) + * @depth: Depth in the chain the remote domain is connected (ICM only) + * + * This structure represents connection across two domains (hosts). + * Each XDomain contains zero or more services which are exposed as + * &struct tb_service objects. + * + * Service drivers may access this structure if they need to enumerate + * non-standard properties but they need hold @lock when doing so + * because properties can be changed asynchronously in response to + * changes in the remote domain. 
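For non-standard properties a service driver walks @properties itself, holding @lock as noted above because the block is replaced whenever the remote end announces a change. A minimal sketch; the helper name and the property key passed in are hypothetical:

static u32 my_svc_read_remote_value(struct tb_xdomain *xd, const char *key)
{
	const struct tb_property *p;
	u32 value = 0;

	mutex_lock(&xd->lock);
	p = tb_property_find(xd->properties, key, TB_PROPERTY_TYPE_VALUE);
	if (p)
		value = p->value.immediate;
	mutex_unlock(&xd->lock);

	return value;
}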
+ */ +struct tb_xdomain { + struct device dev; + struct tb *tb; + uuid_t *remote_uuid; + const uuid_t *local_uuid; + u64 route; + u16 vendor; + u16 device; + struct mutex lock; + const char *vendor_name; + const char *device_name; + bool is_unplugged; + bool resume; + u16 transmit_path; + u16 transmit_ring; + u16 receive_path; + u16 receive_ring; + struct ida service_ids; + struct tb_property_dir *properties; + u32 property_block_gen; + struct delayed_work get_properties_work; + int properties_retries; + struct delayed_work properties_changed_work; + int properties_changed_retries; + u8 link; + u8 depth; +}; + +int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path, + u16 transmit_ring, u16 receive_path, + u16 receive_ring); +int tb_xdomain_disable_paths(struct tb_xdomain *xd); +struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid); + +static inline struct tb_xdomain * +tb_xdomain_find_by_uuid_locked(struct tb *tb, const uuid_t *uuid) +{ + struct tb_xdomain *xd; + + mutex_lock(&tb->lock); + xd = tb_xdomain_find_by_uuid(tb, uuid); + mutex_unlock(&tb->lock); + + return xd; +} + +static inline struct tb_xdomain *tb_xdomain_get(struct tb_xdomain *xd) +{ + if (xd) + get_device(&xd->dev); + return xd; +} + +static inline void tb_xdomain_put(struct tb_xdomain *xd) +{ + if (xd) + put_device(&xd->dev); +} + +static inline bool tb_is_xdomain(const struct device *dev) +{ + return dev->type == &tb_xdomain_type; +} + +static inline struct tb_xdomain *tb_to_xdomain(struct device *dev) +{ + if (tb_is_xdomain(dev)) + return container_of(dev, struct tb_xdomain, dev); + return NULL; +} + +int tb_xdomain_response(struct tb_xdomain *xd, const void *response, + size_t size, enum tb_cfg_pkg_type type); +int tb_xdomain_request(struct tb_xdomain *xd, const void *request, + size_t request_size, enum tb_cfg_pkg_type request_type, + void *response, size_t response_size, + enum tb_cfg_pkg_type response_type, + unsigned int timeout_msec); + +/** + * tb_protocol_handler - Protocol specific handler + * @uuid: XDomain messages with this UUID are dispatched to this handler + * @callback: Callback called with the XDomain message. Returning %1 + * here tells the XDomain core that the message was handled + * by this handler and should not be forwared to other + * handlers. + * @data: Data passed with the callback + * @list: Handlers are linked using this + * + * Thunderbolt services can hook into incoming XDomain requests by + * registering protocol handler. Only limitation is that the XDomain + * discovery protocol UUID cannot be registered since it is handled by + * the core XDomain code. + * + * The @callback must check that the message is really directed to the + * service the driver implements. 
+ */ +struct tb_protocol_handler { + const uuid_t *uuid; + int (*callback)(const void *buf, size_t size, void *data); + void *data; + struct list_head list; +}; + +int tb_register_protocol_handler(struct tb_protocol_handler *handler); +void tb_unregister_protocol_handler(struct tb_protocol_handler *handler); + +/** + * struct tb_service - Thunderbolt service + * @dev: XDomain device + * @id: ID of the service (shown in sysfs) + * @key: Protocol key from the properties directory + * @prtcid: Protocol ID from the properties directory + * @prtcvers: Protocol version from the properties directory + * @prtcrevs: Protocol software revision from the properties directory + * @prtcstns: Protocol settings mask from the properties directory + * + * Each domain exposes set of services it supports as collection of + * properties. For each service there will be one corresponding + * &struct tb_service. Service drivers are bound to these. + */ +struct tb_service { + struct device dev; + int id; + const char *key; + u32 prtcid; + u32 prtcvers; + u32 prtcrevs; + u32 prtcstns; +}; + +static inline struct tb_service *tb_service_get(struct tb_service *svc) +{ + if (svc) + get_device(&svc->dev); + return svc; +} + +static inline void tb_service_put(struct tb_service *svc) +{ + if (svc) + put_device(&svc->dev); +} + +static inline bool tb_is_service(const struct device *dev) +{ + return dev->type == &tb_service_type; +} + +static inline struct tb_service *tb_to_service(struct device *dev) +{ + if (tb_is_service(dev)) + return container_of(dev, struct tb_service, dev); + return NULL; +} + +/** + * tb_service_driver - Thunderbolt service driver + * @driver: Driver structure + * @probe: Called when the driver is probed + * @remove: Called when the driver is removed (optional) + * @shutdown: Called at shutdown time to stop the service (optional) + * @id_table: Table of service identifiers the driver supports + */ +struct tb_service_driver { + struct device_driver driver; + int (*probe)(struct tb_service *svc, const struct tb_service_id *id); + void (*remove)(struct tb_service *svc); + void (*shutdown)(struct tb_service *svc); + const struct tb_service_id *id_table; +}; + +#define TB_SERVICE(key, id) \ + .match_flags = TBSVC_MATCH_PROTOCOL_KEY | \ + TBSVC_MATCH_PROTOCOL_ID, \ + .protocol_key = (key), \ + .protocol_id = (id) + +int tb_register_service_driver(struct tb_service_driver *drv); +void tb_unregister_service_driver(struct tb_service_driver *drv); + +static inline void *tb_service_get_drvdata(const struct tb_service *svc) +{ + return dev_get_drvdata(&svc->dev); +} + +static inline void tb_service_set_drvdata(struct tb_service *svc, void *data) +{ + dev_set_drvdata(&svc->dev, data); +} + +static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) +{ + return tb_to_xdomain(svc->dev.parent); +} + #endif /* THUNDERBOLT_H_ */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index e4d90e50f6fe..57263f2f8f2f 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -206,5 +206,12 @@ int main(void) DEVID_FIELD(fsl_mc_device_id, vendor); DEVID_FIELD(fsl_mc_device_id, obj_type); + DEVID(tb_service_id); + DEVID_FIELD(tb_service_id, match_flags); + DEVID_FIELD(tb_service_id, protocol_key); + DEVID_FIELD(tb_service_id, protocol_id); + DEVID_FIELD(tb_service_id, protocol_version); + DEVID_FIELD(tb_service_id, protocol_revision); + return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 29d6699d5a06..6ef6e63f96fd 
100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1301,6 +1301,31 @@ static int do_fsl_mc_entry(const char *filename, void *symval, } ADD_TO_DEVTABLE("fslmc", fsl_mc_device_id, do_fsl_mc_entry); +/* Looks like: tbsvc:kSpNvNrN */ +static int do_tbsvc_entry(const char *filename, void *symval, char *alias) +{ + DEF_FIELD(symval, tb_service_id, match_flags); + DEF_FIELD_ADDR(symval, tb_service_id, protocol_key); + DEF_FIELD(symval, tb_service_id, protocol_id); + DEF_FIELD(symval, tb_service_id, protocol_version); + DEF_FIELD(symval, tb_service_id, protocol_revision); + + strcpy(alias, "tbsvc:"); + if (match_flags & TBSVC_MATCH_PROTOCOL_KEY) + sprintf(alias + strlen(alias), "k%s", *protocol_key); + else + strcat(alias + strlen(alias), "k*"); + ADD(alias, "p", match_flags & TBSVC_MATCH_PROTOCOL_ID, protocol_id); + ADD(alias, "v", match_flags & TBSVC_MATCH_PROTOCOL_VERSION, + protocol_version); + ADD(alias, "r", match_flags & TBSVC_MATCH_PROTOCOL_REVISION, + protocol_revision); + + add_wildcard(alias); + return 1; +} +ADD_TO_DEVTABLE("tbsvc", tb_service_id, do_tbsvc_entry); + /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { From 8c6bba10fb9262d7b3a11e86a40621d5b37810a6 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:35 +0300 Subject: [PATCH 1150/1322] thunderbolt: Configure interrupt throttling for all interrupts This will keep the interrupt delivery rate reasonable. The value used here (128 us) is a recommendation from the hardware people. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi.c | 23 ++++++++++++++++++++--- drivers/thunderbolt/nhi_regs.h | 2 ++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 05af126a2435..8a7a3d0133f9 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -651,6 +651,22 @@ static int nhi_suspend_noirq(struct device *dev) return tb_domain_suspend_noirq(tb); } +static void nhi_enable_int_throttling(struct tb_nhi *nhi) +{ + /* Throttling is specified in 256ns increments */ + u32 throttle = DIV_ROUND_UP(128 * NSEC_PER_USEC, 256); + unsigned int i; + + /* + * Configure interrupt throttling for all vectors even if we + * only use few. + */ + for (i = 0; i < MSIX_MAX_VECS; i++) { + u32 reg = REG_INT_THROTTLING_RATE + i * 4; + iowrite32(throttle, nhi->iobase + reg); + } +} + static int nhi_resume_noirq(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); @@ -663,6 +679,8 @@ static int nhi_resume_noirq(struct device *dev) */ if (!pci_device_is_present(pdev)) tb->nhi->going_away = true; + else + nhi_enable_int_throttling(tb->nhi); return tb_domain_resume_noirq(tb); } @@ -717,6 +735,8 @@ static int nhi_init_msi(struct tb_nhi *nhi) /* In case someone left them on. */ nhi_disable_interrupts(nhi); + nhi_enable_int_throttling(nhi); + ida_init(&nhi->msix_ida); /* @@ -796,9 +816,6 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); - /* magic value - clock related? 
*/ - iowrite32(3906250 / 10000, nhi->iobase + 0x38c00); - tb = icm_probe(nhi); if (!tb) tb = tb_probe(nhi); diff --git a/drivers/thunderbolt/nhi_regs.h b/drivers/thunderbolt/nhi_regs.h index 09ed574e92ff..46eff69b19ad 100644 --- a/drivers/thunderbolt/nhi_regs.h +++ b/drivers/thunderbolt/nhi_regs.h @@ -95,6 +95,8 @@ struct ring_desc { #define REG_RING_INTERRUPT_BASE 0x38200 #define RING_INTERRUPT_REG_COUNT(nhi) ((31 + 2 * nhi->hop_count) / 32) +#define REG_INT_THROTTLING_RATE 0x38c00 + /* Interrupt Vector Allocation */ #define REG_INT_VEC_ALLOC_BASE 0x38c40 #define REG_INT_VEC_ALLOC_BITS 4 From 9fb1e654dcf781e71a0ea7c5bdfea3ba85d1d06d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:36 +0300 Subject: [PATCH 1151/1322] thunderbolt: Add support for frame mode When high-speed DMA paths are used to transfer arbitrary data over a Thunderbolt link, DMA rings should be in frame mode instead of raw mode. The latter is used by the control channel (ring 0). In frame mode each data frame can hold up to 4kB payload. This patch modifies the DMA ring code to allow configuring a ring to be in frame mode by passing a new flag (RING_FLAG_FRAME) to the ring when it is allocated. In addition there might be need to enable end-to-end (E2E) workaround for the ring to prevent losing Rx frames in certain situations. We add another flag (RING_FLAG_E2E) that can be used for this purpose. This code is based on the work done by Amir Levy and Michael Jamet. Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 3 +- drivers/thunderbolt/nhi.c | 76 +++++++++++++++++++++------------- drivers/thunderbolt/nhi.h | 10 ++++- drivers/thunderbolt/nhi_regs.h | 2 + 4 files changed, 61 insertions(+), 30 deletions(-) diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index 46e393c5fd1d..05400b77dcd7 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -618,7 +618,8 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) if (!ctl->tx) goto err; - ctl->rx = ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); + ctl->rx = ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, + 0xffff); if (!ctl->rx) goto err; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 8a7a3d0133f9..bebcad3d2c1f 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -21,6 +21,12 @@ #define RING_TYPE(ring) ((ring)->is_tx ? "TX ring" : "RX ring") +/* + * Used to enable end-to-end workaround for missing RX packets. Do not + * use this ring for anything else. + */ +#define RING_E2E_UNUSED_HOPID 2 + /* * Minimal number of vectors when we use MSI-X. Two for control channel * Rx/Tx and the rest four are for cross domain DMA paths. 
@@ -229,23 +235,6 @@ static void ring_work(struct work_struct *work) frame->eof = ring->descriptors[ring->tail].eof; frame->sof = ring->descriptors[ring->tail].sof; frame->flags = ring->descriptors[ring->tail].flags; - if (frame->sof != 0) - dev_WARN(&ring->nhi->pdev->dev, - "%s %d got unexpected SOF: %#x\n", - RING_TYPE(ring), ring->hop, - frame->sof); - /* - * known flags: - * raw not enabled, interupt not set: 0x2=0010 - * raw enabled: 0xa=1010 - * raw not enabled: 0xb=1011 - * partial frame (>MAX_FRAME_SIZE): 0xe=1110 - */ - if (frame->flags != 0xa) - dev_WARN(&ring->nhi->pdev->dev, - "%s %d got unexpected flags: %#x\n", - RING_TYPE(ring), ring->hop, - frame->flags); } ring->tail = (ring->tail + 1) % ring->size; } @@ -321,12 +310,17 @@ static void ring_release_msix(struct tb_ring *ring) } static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size, - bool transmit, unsigned int flags) + bool transmit, unsigned int flags, + u16 sof_mask, u16 eof_mask) { struct tb_ring *ring = NULL; dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", transmit ? "TX" : "RX", hop, size); + /* Tx Ring 2 is reserved for E2E workaround */ + if (transmit && hop == RING_E2E_UNUSED_HOPID) + return NULL; + mutex_lock(&nhi->lock); if (hop >= nhi->hop_count) { dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); @@ -353,6 +347,8 @@ static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->is_tx = transmit; ring->size = size; ring->flags = flags; + ring->sof_mask = sof_mask; + ring->eof_mask = eof_mask; ring->head = 0; ring->tail = 0; ring->running = false; @@ -384,13 +380,13 @@ err: struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags) { - return ring_alloc(nhi, hop, size, true, flags); + return ring_alloc(nhi, hop, size, true, flags, 0, 0); } struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags) + unsigned int flags, u16 sof_mask, u16 eof_mask) { - return ring_alloc(nhi, hop, size, false, flags); + return ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); } /** @@ -400,6 +396,9 @@ struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, */ void ring_start(struct tb_ring *ring) { + u16 frame_size; + u32 flags; + mutex_lock(&ring->nhi->lock); mutex_lock(&ring->lock); if (ring->nhi->going_away) @@ -411,18 +410,39 @@ void ring_start(struct tb_ring *ring) dev_info(&ring->nhi->pdev->dev, "starting %s %d\n", RING_TYPE(ring), ring->hop); + if (ring->flags & RING_FLAG_FRAME) { + /* Means 4096 */ + frame_size = 0; + flags = RING_FLAG_ENABLE; + } else { + frame_size = TB_FRAME_SIZE; + flags = RING_FLAG_ENABLE | RING_FLAG_RAW; + } + + if (ring->flags & RING_FLAG_E2E && !ring->is_tx) { + u32 hop; + + /* + * In order not to lose Rx packets we enable end-to-end + * workaround which transfers Rx credits to an unused Tx + * HopID. + */ + hop = RING_E2E_UNUSED_HOPID << REG_RX_OPTIONS_E2E_HOP_SHIFT; + hop &= REG_RX_OPTIONS_E2E_HOP_MASK; + flags |= hop | RING_FLAG_E2E_FLOW_CONTROL; + } + ring_iowrite64desc(ring, ring->descriptors_dma, 0); if (ring->is_tx) { ring_iowrite32desc(ring, ring->size, 12); ring_iowrite32options(ring, 0, 4); /* time releated ? 
*/ - ring_iowrite32options(ring, - RING_FLAG_ENABLE | RING_FLAG_RAW, 0); + ring_iowrite32options(ring, flags, 0); } else { - ring_iowrite32desc(ring, - (TB_FRAME_SIZE << 16) | ring->size, 12); - ring_iowrite32options(ring, 0xffffffff, 4); /* SOF EOF mask */ - ring_iowrite32options(ring, - RING_FLAG_ENABLE | RING_FLAG_RAW, 0); + u32 sof_eof_mask = ring->sof_mask << 16 | ring->eof_mask; + + ring_iowrite32desc(ring, (frame_size << 16) | ring->size, 12); + ring_iowrite32options(ring, sof_eof_mask, 4); + ring_iowrite32options(ring, flags, 0); } ring_interrupt_active(ring, true); ring->running = true; diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 0e05828983db..4503ddbeccb3 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -56,6 +56,8 @@ struct tb_nhi { * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) * @flags: Ring specific flags + * @sof_mask: Bit mask used to detect start of frame PDF + * @eof_mask: Bit mask used to detect end of frame PDF */ struct tb_ring { struct mutex lock; @@ -74,10 +76,16 @@ struct tb_ring { int irq; u8 vector; unsigned int flags; + u16 sof_mask; + u16 eof_mask; }; /* Leave ring interrupt enabled on suspend */ #define RING_FLAG_NO_SUSPEND BIT(0) +/* Configure the ring to be in frame mode */ +#define RING_FLAG_FRAME BIT(1) +/* Enable end-to-end flow control */ +#define RING_FLAG_E2E BIT(2) struct ring_frame; typedef void (*ring_cb)(struct tb_ring*, struct ring_frame*, bool canceled); @@ -100,7 +108,7 @@ struct ring_frame { struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags); struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags); + unsigned int flags, u16 sof_mask, u16 eof_mask); void ring_start(struct tb_ring *ring); void ring_stop(struct tb_ring *ring); void ring_free(struct tb_ring *ring); diff --git a/drivers/thunderbolt/nhi_regs.h b/drivers/thunderbolt/nhi_regs.h index 46eff69b19ad..491a4c0c18fc 100644 --- a/drivers/thunderbolt/nhi_regs.h +++ b/drivers/thunderbolt/nhi_regs.h @@ -77,6 +77,8 @@ struct ring_desc { * ..: unknown */ #define REG_RX_OPTIONS_BASE 0x29800 +#define REG_RX_OPTIONS_E2E_HOP_MASK GENMASK(22, 12) +#define REG_RX_OPTIONS_E2E_HOP_SHIFT 12 /* * three bitfields: tx, rx, rx overflow From 3b3d9f4da96493e4f68d0a80ab210763a24f8b33 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:37 +0300 Subject: [PATCH 1152/1322] thunderbolt: Export ring handling functions to modules These are used by Thunderbolt services to send and receive frames over the high-speed DMA rings. We also put the functions to tb_ namespace to make sure we do not collide with others and add missing kernel-doc comments for the exported functions. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/ctl.c | 20 ++--- drivers/thunderbolt/nhi.c | 70 ++++++++++------ drivers/thunderbolt/nhi.h | 147 +-------------------------------- include/linux/thunderbolt.h | 158 ++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 180 deletions(-) diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index 05400b77dcd7..dd10789e1dbb 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -359,7 +359,7 @@ static int tb_ctl_tx(struct tb_ctl *ctl, const void *data, size_t len, cpu_to_be32_array(pkg->buffer, data, len / 4); *(__be32 *) (pkg->buffer + len) = tb_crc(pkg->buffer, len); - res = ring_tx(ctl->tx, &pkg->frame); + res = tb_ring_tx(ctl->tx, &pkg->frame); if (res) /* ring is stopped */ tb_ctl_pkg_free(pkg); return res; @@ -376,7 +376,7 @@ static bool tb_ctl_handle_event(struct tb_ctl *ctl, enum tb_cfg_pkg_type type, static void tb_ctl_rx_submit(struct ctl_pkg *pkg) { - ring_rx(pkg->ctl->rx, &pkg->frame); /* + tb_ring_rx(pkg->ctl->rx, &pkg->frame); /* * We ignore failures during stop. * All rx packets are referenced * from ctl->rx_packets, so we do @@ -614,11 +614,11 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) if (!ctl->frame_pool) goto err; - ctl->tx = ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); + ctl->tx = tb_ring_alloc_tx(nhi, 0, 10, RING_FLAG_NO_SUSPEND); if (!ctl->tx) goto err; - ctl->rx = ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, + ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, 0xffff); if (!ctl->rx) goto err; @@ -652,9 +652,9 @@ void tb_ctl_free(struct tb_ctl *ctl) return; if (ctl->rx) - ring_free(ctl->rx); + tb_ring_free(ctl->rx); if (ctl->tx) - ring_free(ctl->tx); + tb_ring_free(ctl->tx); /* free RX packets */ for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++) @@ -673,8 +673,8 @@ void tb_ctl_start(struct tb_ctl *ctl) { int i; tb_ctl_info(ctl, "control channel starting...\n"); - ring_start(ctl->tx); /* is used to ack hotplug packets, start first */ - ring_start(ctl->rx); + tb_ring_start(ctl->tx); /* is used to ack hotplug packets, start first */ + tb_ring_start(ctl->rx); for (i = 0; i < TB_CTL_RX_PKG_COUNT; i++) tb_ctl_rx_submit(ctl->rx_packets[i]); @@ -695,8 +695,8 @@ void tb_ctl_stop(struct tb_ctl *ctl) ctl->running = false; mutex_unlock(&ctl->request_queue_lock); - ring_stop(ctl->rx); - ring_stop(ctl->tx); + tb_ring_stop(ctl->rx); + tb_ring_stop(ctl->tx); if (!list_empty(&ctl->request_queue)) tb_ctl_WARN(ctl, "dangling request in request_queue\n"); diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index bebcad3d2c1f..e0a47f7581cb 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -253,7 +253,7 @@ invoke_callback: } } -int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) +int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) { int ret = 0; mutex_lock(&ring->lock); @@ -266,6 +266,7 @@ int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) mutex_unlock(&ring->lock); return ret; } +EXPORT_SYMBOL_GPL(__tb_ring_enqueue); static irqreturn_t ring_msix(int irq, void *data) { @@ -309,9 +310,9 @@ static void ring_release_msix(struct tb_ring *ring) ring->irq = 0; } -static struct tb_ring *ring_alloc(struct tb_nhi *nhi, u32 hop, int size, - bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask) +static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, + bool transmit, unsigned int flags, + u16 sof_mask, u16 eof_mask) { struct tb_ring *ring = NULL; 
dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", @@ -377,24 +378,42 @@ err: return NULL; } -struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags) +/** + * tb_ring_alloc_tx() - Allocate DMA ring for transmit + * @nhi: Pointer to the NHI the ring is to be allocated + * @hop: HopID (ring) to allocate + * @size: Number of entries in the ring + * @flags: Flags for the ring + */ +struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags) { - return ring_alloc(nhi, hop, size, true, flags, 0, 0); -} - -struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask) -{ - return ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0); } +EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); /** - * ring_start() - enable a ring - * - * Must not be invoked in parallel with ring_stop(). + * tb_ring_alloc_rx() - Allocate DMA ring for receive + * @nhi: Pointer to the NHI the ring is to be allocated + * @hop: HopID (ring) to allocate + * @size: Number of entries in the ring + * @flags: Flags for the ring + * @sof_mask: Mask of PDF values that start a frame + * @eof_mask: Mask of PDF values that end a frame */ -void ring_start(struct tb_ring *ring) +struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags, u16 sof_mask, u16 eof_mask) +{ + return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); +} +EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); + +/** + * tb_ring_start() - enable a ring + * + * Must not be invoked in parallel with tb_ring_stop(). + */ +void tb_ring_start(struct tb_ring *ring) { u16 frame_size; u32 flags; @@ -450,21 +469,22 @@ err: mutex_unlock(&ring->lock); mutex_unlock(&ring->nhi->lock); } - +EXPORT_SYMBOL_GPL(tb_ring_start); /** - * ring_stop() - shutdown a ring + * tb_ring_stop() - shutdown a ring * * Must not be invoked from a callback. * - * This method will disable the ring. Further calls to ring_tx/ring_rx will - * return -ESHUTDOWN until ring_stop has been called. + * This method will disable the ring. Further calls to + * tb_ring_tx/tb_ring_rx will return -ESHUTDOWN until ring_stop has been + * called. * * All enqueued frames will be canceled and their callbacks will be executed * with frame->canceled set to true (on the callback thread). This method * returns only after all callback invocations have finished. */ -void ring_stop(struct tb_ring *ring) +void tb_ring_stop(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); mutex_lock(&ring->lock); @@ -497,9 +517,10 @@ err: schedule_work(&ring->work); flush_work(&ring->work); } +EXPORT_SYMBOL_GPL(tb_ring_stop); /* - * ring_free() - free ring + * tb_ring_free() - free ring * * When this method returns all invocations of ring->callback will have * finished. @@ -508,7 +529,7 @@ err: * * Must NOT be called from ring_frame->callback! 
*/ -void ring_free(struct tb_ring *ring) +void tb_ring_free(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); /* @@ -550,6 +571,7 @@ void ring_free(struct tb_ring *ring) mutex_destroy(&ring->lock); kfree(ring); } +EXPORT_SYMBOL_GPL(tb_ring_free); /** * nhi_mailbox_cmd() - Send a command through NHI mailbox diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 4503ddbeccb3..771d09ca5dc5 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -7,152 +7,7 @@ #ifndef DSL3510_H_ #define DSL3510_H_ -#include -#include -#include - -/** - * struct tb_nhi - thunderbolt native host interface - * @lock: Must be held during ring creation/destruction. Is acquired by - * interrupt_work when dispatching interrupts to individual rings. - * @pdev: Pointer to the PCI device - * @iobase: MMIO space of the NHI - * @tx_rings: All Tx rings available on this host controller - * @rx_rings: All Rx rings available on this host controller - * @msix_ida: Used to allocate MSI-X vectors for rings - * @going_away: The host controller device is about to disappear so when - * this flag is set, avoid touching the hardware anymore. - * @interrupt_work: Work scheduled to handle ring interrupt when no - * MSI-X is used. - * @hop_count: Number of rings (end point hops) supported by NHI. - */ -struct tb_nhi { - struct mutex lock; - struct pci_dev *pdev; - void __iomem *iobase; - struct tb_ring **tx_rings; - struct tb_ring **rx_rings; - struct ida msix_ida; - bool going_away; - struct work_struct interrupt_work; - u32 hop_count; -}; - -/** - * struct tb_ring - thunderbolt TX or RX ring associated with a NHI - * @lock: Lock serializing actions to this ring. Must be acquired after - * nhi->lock. - * @nhi: Pointer to the native host controller interface - * @size: Size of the ring - * @hop: Hop (DMA channel) associated with this ring - * @head: Head of the ring (write next descriptor here) - * @tail: Tail of the ring (complete next descriptor here) - * @descriptors: Allocated descriptors for this ring - * @queue: Queue holding frames to be transferred over this ring - * @in_flight: Queue holding frames that are currently in flight - * @work: Interrupt work structure - * @is_tx: Is the ring Tx or Rx - * @running: Is the ring running - * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. 
- * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) - * @flags: Ring specific flags - * @sof_mask: Bit mask used to detect start of frame PDF - * @eof_mask: Bit mask used to detect end of frame PDF - */ -struct tb_ring { - struct mutex lock; - struct tb_nhi *nhi; - int size; - int hop; - int head; - int tail; - struct ring_desc *descriptors; - dma_addr_t descriptors_dma; - struct list_head queue; - struct list_head in_flight; - struct work_struct work; - bool is_tx:1; - bool running:1; - int irq; - u8 vector; - unsigned int flags; - u16 sof_mask; - u16 eof_mask; -}; - -/* Leave ring interrupt enabled on suspend */ -#define RING_FLAG_NO_SUSPEND BIT(0) -/* Configure the ring to be in frame mode */ -#define RING_FLAG_FRAME BIT(1) -/* Enable end-to-end flow control */ -#define RING_FLAG_E2E BIT(2) - -struct ring_frame; -typedef void (*ring_cb)(struct tb_ring*, struct ring_frame*, bool canceled); - -/** - * struct ring_frame - for use with ring_rx/ring_tx - */ -struct ring_frame { - dma_addr_t buffer_phy; - ring_cb callback; - struct list_head list; - u32 size:12; /* TX: in, RX: out*/ - u32 flags:12; /* RX: out */ - u32 eof:4; /* TX:in, RX: out */ - u32 sof:4; /* TX:in, RX: out */ -}; - -#define TB_FRAME_SIZE 0x100 /* minimum size for ring_rx */ - -struct tb_ring *ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags); -struct tb_ring *ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask); -void ring_start(struct tb_ring *ring); -void ring_stop(struct tb_ring *ring); -void ring_free(struct tb_ring *ring); - -int __ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); - -/** - * ring_rx() - enqueue a frame on an RX ring - * - * frame->buffer, frame->buffer_phy and frame->callback have to be set. The - * buffer must contain at least TB_FRAME_SIZE bytes. - * - * frame->callback will be invoked with frame->size, frame->flags, frame->eof, - * frame->sof set once the frame has been received. - * - * If ring_stop is called after the packet has been enqueued frame->callback - * will be called with canceled set to true. - * - * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise. - */ -static inline int ring_rx(struct tb_ring *ring, struct ring_frame *frame) -{ - WARN_ON(ring->is_tx); - return __ring_enqueue(ring, frame); -} - -/** - * ring_tx() - enqueue a frame on an TX ring - * - * frame->buffer, frame->buffer_phy, frame->callback, frame->size, frame->eof - * and frame->sof have to be set. - * - * frame->callback will be invoked with once the frame has been transmitted. - * - * If ring_stop is called after the packet has been enqueued frame->callback - * will be called with canceled set to true. - * - * Return: Returns ESHUTDOWN if ring_stop has been called. Zero otherwise. 
- */ -static inline int ring_tx(struct tb_ring *ring, struct ring_frame *frame) -{ - WARN_ON(!ring->is_tx); - return __ring_enqueue(ring, frame); -} +#include enum nhi_fw_mode { NHI_FW_SAFE_MODE, diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 18c0e3d5e85c..9ddb83ad890f 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -15,10 +15,12 @@ #define THUNDERBOLT_H_ #include +#include #include #include #include #include +#include enum tb_cfg_pkg_type { TB_CFG_PKG_READ = 1, @@ -397,4 +399,160 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) return tb_to_xdomain(svc->dev.parent); } +/** + * struct tb_nhi - thunderbolt native host interface + * @lock: Must be held during ring creation/destruction. Is acquired by + * interrupt_work when dispatching interrupts to individual rings. + * @pdev: Pointer to the PCI device + * @iobase: MMIO space of the NHI + * @tx_rings: All Tx rings available on this host controller + * @rx_rings: All Rx rings available on this host controller + * @msix_ida: Used to allocate MSI-X vectors for rings + * @going_away: The host controller device is about to disappear so when + * this flag is set, avoid touching the hardware anymore. + * @interrupt_work: Work scheduled to handle ring interrupt when no + * MSI-X is used. + * @hop_count: Number of rings (end point hops) supported by NHI. + */ +struct tb_nhi { + struct mutex lock; + struct pci_dev *pdev; + void __iomem *iobase; + struct tb_ring **tx_rings; + struct tb_ring **rx_rings; + struct ida msix_ida; + bool going_away; + struct work_struct interrupt_work; + u32 hop_count; +}; + +/** + * struct tb_ring - thunderbolt TX or RX ring associated with a NHI + * @lock: Lock serializing actions to this ring. Must be acquired after + * nhi->lock. + * @nhi: Pointer to the native host controller interface + * @size: Size of the ring + * @hop: Hop (DMA channel) associated with this ring + * @head: Head of the ring (write next descriptor here) + * @tail: Tail of the ring (complete next descriptor here) + * @descriptors: Allocated descriptors for this ring + * @queue: Queue holding frames to be transferred over this ring + * @in_flight: Queue holding frames that are currently in flight + * @work: Interrupt work structure + * @is_tx: Is the ring Tx or Rx + * @running: Is the ring running + * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise. 
+ * @vector: MSI-X vector number the ring uses (only set if @irq is > 0) + * @flags: Ring specific flags + * @sof_mask: Bit mask used to detect start of frame PDF + * @eof_mask: Bit mask used to detect end of frame PDF + */ +struct tb_ring { + struct mutex lock; + struct tb_nhi *nhi; + int size; + int hop; + int head; + int tail; + struct ring_desc *descriptors; + dma_addr_t descriptors_dma; + struct list_head queue; + struct list_head in_flight; + struct work_struct work; + bool is_tx:1; + bool running:1; + int irq; + u8 vector; + unsigned int flags; + u16 sof_mask; + u16 eof_mask; +}; + +/* Leave ring interrupt enabled on suspend */ +#define RING_FLAG_NO_SUSPEND BIT(0) +/* Configure the ring to be in frame mode */ +#define RING_FLAG_FRAME BIT(1) +/* Enable end-to-end flow control */ +#define RING_FLAG_E2E BIT(2) + +struct ring_frame; +typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); + +/** + * struct ring_frame - For use with ring_rx/ring_tx + * @buffer_phy: DMA mapped address of the frame + * @callback: Callback called when the frame is finished + * @list: Frame is linked to a queue using this + * @size: Size of the frame in bytes (%0 means %4096) + * @flags: Flags for the frame (see &enum ring_desc_flags) + * @eof: End of frame protocol defined field + * @sof: Start of frame protocol defined field + */ +struct ring_frame { + dma_addr_t buffer_phy; + ring_cb callback; + struct list_head list; + u32 size:12; + u32 flags:12; + u32 eof:4; + u32 sof:4; +}; + +/* Minimum size for ring_rx */ +#define TB_FRAME_SIZE 0x100 + +struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags); +struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, + unsigned int flags, u16 sof_mask, + u16 eof_mask); +void tb_ring_start(struct tb_ring *ring); +void tb_ring_stop(struct tb_ring *ring); +void tb_ring_free(struct tb_ring *ring); + +int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); + +/** + * tb_ring_rx() - enqueue a frame on an RX ring + * @ring: Ring to enqueue the frame + * @frame: Frame to enqueue + * + * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The + * buffer must contain at least %TB_FRAME_SIZE bytes. + * + * @frame->callback will be invoked with @frame->size, @frame->flags, + * @frame->eof, @frame->sof set once the frame has been received. + * + * If ring_stop() is called after the packet has been enqueued + * @frame->callback will be called with canceled set to true. + * + * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + */ +static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) +{ + WARN_ON(ring->is_tx); + return __tb_ring_enqueue(ring, frame); +} + +/** + * tb_ring_tx() - enqueue a frame on an TX ring + * @ring: Ring the enqueue the frame + * @frame: Frame to enqueue + * + * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, + * @frame->eof and @frame->sof have to be set. + * + * @frame->callback will be invoked with once the frame has been transmitted. + * + * If ring_stop() is called after the packet has been enqueued @frame->callback + * will be called with canceled set to true. + * + * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. 
+ */ +static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) +{ + WARN_ON(!ring->is_tx); + return __tb_ring_enqueue(ring, frame); +} + #endif /* THUNDERBOLT_H_ */ From 2a91ec63f8a11e70d4b958dd4df867fec0247179 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:38 +0300 Subject: [PATCH 1153/1322] thunderbolt: Move ring descriptor flags to thunderbolt.h A Thunderbolt service driver might need to check if there was an error with the descriptor when in frame mode. We also add two Rx specific error flags RING_DESC_CRC_ERROR and RING_DESC_BUFFER_OVERRUN. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi_regs.h | 7 ------- include/linux/thunderbolt.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/thunderbolt/nhi_regs.h b/drivers/thunderbolt/nhi_regs.h index 491a4c0c18fc..5ed6934e31e7 100644 --- a/drivers/thunderbolt/nhi_regs.h +++ b/drivers/thunderbolt/nhi_regs.h @@ -17,13 +17,6 @@ enum ring_flags { RING_FLAG_ENABLE = 1 << 31, }; -enum ring_desc_flags { - RING_DESC_ISOCH = 0x1, /* TX only? */ - RING_DESC_COMPLETED = 0x2, /* set by NHI */ - RING_DESC_POSTED = 0x4, /* always set this */ - RING_DESC_INTERRUPT = 0x8, /* request an interrupt on completion */ -}; - /** * struct ring_desc - TX/RX ring entry * diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9ddb83ad890f..e3b9af7be0ad 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -478,6 +478,24 @@ struct tb_ring { struct ring_frame; typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); +/** + * enum ring_desc_flags - Flags for DMA ring descriptor + * %RING_DESC_ISOCH: Enable isonchronous DMA (Tx only) + * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) + * %RING_DESC_COMPLETED: Descriptor completed (set by NHI) + * %RING_DESC_POSTED: Always set this + * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun + * %RING_DESC_INTERRUPT: Request an interrupt on completion + */ +enum ring_desc_flags { + RING_DESC_ISOCH = 0x1, + RING_DESC_CRC_ERROR = 0x1, + RING_DESC_COMPLETED = 0x2, + RING_DESC_POSTED = 0x4, + RING_DESC_BUFFER_OVERRUN = 0x04, + RING_DESC_INTERRUPT = 0x8, +}; + /** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame From 22b7de1000e66d739c431d6be4e7e97c69fa7c98 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:39 +0300 Subject: [PATCH 1154/1322] thunderbolt: Use spinlock in ring serialization This makes it possible to enqueue frames also from atomic context which is needed for example, when networking packets are sent over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- drivers/thunderbolt/nhi.c | 26 ++++++++++++++------------ include/linux/thunderbolt.h | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index e0a47f7581cb..7d1891ec3c47 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -212,8 +212,10 @@ static void ring_work(struct work_struct *work) struct tb_ring *ring = container_of(work, typeof(*ring), work); struct ring_frame *frame; bool canceled = false; + unsigned long flags; LIST_HEAD(done); - mutex_lock(&ring->lock); + + spin_lock_irqsave(&ring->lock, flags); if (!ring->running) { /* Move all frames to done and mark them as canceled. */ @@ -241,7 +243,8 @@ static void ring_work(struct work_struct *work) ring_write_descriptors(ring); invoke_callback: - mutex_unlock(&ring->lock); /* allow callbacks to schedule new work */ + /* allow callbacks to schedule new work */ + spin_unlock_irqrestore(&ring->lock, flags); while (!list_empty(&done)) { frame = list_first_entry(&done, typeof(*frame), list); /* @@ -255,15 +258,17 @@ invoke_callback: int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) { + unsigned long flags; int ret = 0; - mutex_lock(&ring->lock); + + spin_lock_irqsave(&ring->lock, flags); if (ring->running) { list_add_tail(&frame->list, &ring->queue); ring_write_descriptors(ring); } else { ret = -ESHUTDOWN; } - mutex_unlock(&ring->lock); + spin_unlock_irqrestore(&ring->lock, flags); return ret; } EXPORT_SYMBOL_GPL(__tb_ring_enqueue); @@ -338,7 +343,7 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (!ring) goto err; - mutex_init(&ring->lock); + spin_lock_init(&ring->lock); INIT_LIST_HEAD(&ring->queue); INIT_LIST_HEAD(&ring->in_flight); INIT_WORK(&ring->work, ring_work); @@ -371,8 +376,6 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, return ring; err: - if (ring) - mutex_destroy(&ring->lock); kfree(ring); mutex_unlock(&nhi->lock); return NULL; @@ -419,7 +422,7 @@ void tb_ring_start(struct tb_ring *ring) u32 flags; mutex_lock(&ring->nhi->lock); - mutex_lock(&ring->lock); + spin_lock_irq(&ring->lock); if (ring->nhi->going_away) goto err; if (ring->running) { @@ -466,7 +469,7 @@ void tb_ring_start(struct tb_ring *ring) ring_interrupt_active(ring, true); ring->running = true; err: - mutex_unlock(&ring->lock); + spin_unlock_irq(&ring->lock); mutex_unlock(&ring->nhi->lock); } EXPORT_SYMBOL_GPL(tb_ring_start); @@ -487,7 +490,7 @@ EXPORT_SYMBOL_GPL(tb_ring_start); void tb_ring_stop(struct tb_ring *ring) { mutex_lock(&ring->nhi->lock); - mutex_lock(&ring->lock); + spin_lock_irq(&ring->lock); dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n", RING_TYPE(ring), ring->hop); if (ring->nhi->going_away) @@ -508,7 +511,7 @@ void tb_ring_stop(struct tb_ring *ring) ring->running = false; err: - mutex_unlock(&ring->lock); + spin_unlock_irq(&ring->lock); mutex_unlock(&ring->nhi->lock); /* @@ -568,7 +571,6 @@ void tb_ring_free(struct tb_ring *ring) * to finish before freeing the ring. 
*/ flush_work(&ring->work); - mutex_destroy(&ring->lock); kfree(ring); } EXPORT_SYMBOL_GPL(tb_ring_free); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index e3b9af7be0ad..cf9e42db780f 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -448,7 +448,7 @@ struct tb_nhi { * @eof_mask: Bit mask used to detect end of frame PDF */ struct tb_ring { - struct mutex lock; + spinlock_t lock; struct tb_nhi *nhi; int size; int hop; From 59120e06101db72442acf4c8b364a0c76d8faa68 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:40 +0300 Subject: [PATCH 1155/1322] thunderbolt: Use spinlock in NHI serialization This is needed because ring polling functionality can be called from atomic contexts when networking and other high-speed traffic is transferred over a Thunderbolt cable. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi.c | 75 ++++++++++++++++++++----------------- include/linux/thunderbolt.h | 2 +- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 7d1891ec3c47..0b3c0640048b 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -327,21 +327,9 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (transmit && hop == RING_E2E_UNUSED_HOPID) return NULL; - mutex_lock(&nhi->lock); - if (hop >= nhi->hop_count) { - dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); - goto err; - } - if (transmit && nhi->tx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); - goto err; - } else if (!transmit && nhi->rx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); - goto err; - } ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) - goto err; + return NULL; spin_lock_init(&ring->lock); INIT_LIST_HEAD(&ring->queue); @@ -359,25 +347,45 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->tail = 0; ring->running = false; - if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) - goto err; - ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev, size * sizeof(*ring->descriptors), &ring->descriptors_dma, GFP_KERNEL | __GFP_ZERO); if (!ring->descriptors) - goto err; + goto err_free_ring; + if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) + goto err_free_descs; + + spin_lock_irq(&nhi->lock); + if (hop >= nhi->hop_count) { + dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); + goto err_release_msix; + } + if (transmit && nhi->tx_rings[hop]) { + dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); + goto err_release_msix; + } else if (!transmit && nhi->rx_rings[hop]) { + dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); + goto err_release_msix; + } if (transmit) nhi->tx_rings[hop] = ring; else nhi->rx_rings[hop] = ring; - mutex_unlock(&nhi->lock); + spin_unlock_irq(&nhi->lock); + return ring; -err: +err_release_msix: + spin_unlock_irq(&nhi->lock); + ring_release_msix(ring); +err_free_descs: + dma_free_coherent(&ring->nhi->pdev->dev, + ring->size * sizeof(*ring->descriptors), + ring->descriptors, ring->descriptors_dma); +err_free_ring: kfree(ring); - mutex_unlock(&nhi->lock); + return NULL; } @@ -421,8 +429,8 @@ void tb_ring_start(struct tb_ring *ring) u16 frame_size; u32 flags; - mutex_lock(&ring->nhi->lock); - spin_lock_irq(&ring->lock); + 
spin_lock_irq(&ring->nhi->lock); + spin_lock(&ring->lock); if (ring->nhi->going_away) goto err; if (ring->running) { @@ -469,8 +477,8 @@ void tb_ring_start(struct tb_ring *ring) ring_interrupt_active(ring, true); ring->running = true; err: - spin_unlock_irq(&ring->lock); - mutex_unlock(&ring->nhi->lock); + spin_unlock(&ring->lock); + spin_unlock_irq(&ring->nhi->lock); } EXPORT_SYMBOL_GPL(tb_ring_start); @@ -489,8 +497,8 @@ EXPORT_SYMBOL_GPL(tb_ring_start); */ void tb_ring_stop(struct tb_ring *ring) { - mutex_lock(&ring->nhi->lock); - spin_lock_irq(&ring->lock); + spin_lock_irq(&ring->nhi->lock); + spin_lock(&ring->lock); dev_info(&ring->nhi->pdev->dev, "stopping %s %d\n", RING_TYPE(ring), ring->hop); if (ring->nhi->going_away) @@ -511,8 +519,8 @@ void tb_ring_stop(struct tb_ring *ring) ring->running = false; err: - spin_unlock_irq(&ring->lock); - mutex_unlock(&ring->nhi->lock); + spin_unlock(&ring->lock); + spin_unlock_irq(&ring->nhi->lock); /* * schedule ring->work to invoke callbacks on all remaining frames. @@ -534,7 +542,7 @@ EXPORT_SYMBOL_GPL(tb_ring_stop); */ void tb_ring_free(struct tb_ring *ring) { - mutex_lock(&ring->nhi->lock); + spin_lock_irq(&ring->nhi->lock); /* * Dissociate the ring from the NHI. This also ensures that * nhi_interrupt_work cannot reschedule ring->work. @@ -564,7 +572,7 @@ void tb_ring_free(struct tb_ring *ring) RING_TYPE(ring), ring->hop); - mutex_unlock(&ring->nhi->lock); + spin_unlock_irq(&ring->nhi->lock); /** * ring->work can no longer be scheduled (it is scheduled only * by nhi_interrupt_work, ring_stop and ring_msix). Wait for it @@ -639,7 +647,7 @@ static void nhi_interrupt_work(struct work_struct *work) int type = 0; /* current interrupt type 0: TX, 1: RX, 2: RX overflow */ struct tb_ring *ring; - mutex_lock(&nhi->lock); + spin_lock_irq(&nhi->lock); /* * Starting at REG_RING_NOTIFY_BASE there are three status bitfields @@ -677,7 +685,7 @@ static void nhi_interrupt_work(struct work_struct *work) /* we do not check ring->running, this is done in ring->work */ schedule_work(&ring->work); } - mutex_unlock(&nhi->lock); + spin_unlock_irq(&nhi->lock); } static irqreturn_t nhi_msi(int irq, void *data) @@ -767,7 +775,6 @@ static void nhi_shutdown(struct tb_nhi *nhi) devm_free_irq(&nhi->pdev->dev, nhi->pdev->irq, nhi); flush_work(&nhi->interrupt_work); } - mutex_destroy(&nhi->lock); ida_destroy(&nhi->msix_ida); } @@ -856,7 +863,7 @@ static int nhi_probe(struct pci_dev *pdev, const struct pci_device_id *id) return res; } - mutex_init(&nhi->lock); + spin_lock_init(&nhi->lock); pci_set_master(pdev); diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index cf9e42db780f..d59e3f9a35c4 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -415,7 +415,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc) * @hop_count: Number of rings (end point hops) supported by NHI. */ struct tb_nhi { - struct mutex lock; + spinlock_t lock; struct pci_dev *pdev; void __iomem *iobase; struct tb_ring **tx_rings; From 4ffe722eefcb07c76701f03e0d759fbaecedf79f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:41 +0300 Subject: [PATCH 1156/1322] thunderbolt: Add polling mode for rings In order to support things like networking over Thunderbolt cable, there needs to be a way to switch the ring to a mode where it can be polled with the interrupt masked. We implement such mode so that the caller can allocate a ring by passing pointer to a function that is then called when an interrupt is triggered. 
Completed frames can be fetched using tb_ring_poll() and the interrupt can be re-enabled when the caller is finished with polling by using tb_ring_poll_complete(). Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/ctl.c | 2 +- drivers/thunderbolt/nhi.c | 126 +++++++++++++++++++++++++++++++++--- include/linux/thunderbolt.h | 23 +++++-- 3 files changed, 134 insertions(+), 17 deletions(-) diff --git a/drivers/thunderbolt/ctl.c b/drivers/thunderbolt/ctl.c index dd10789e1dbb..d079dbba2c03 100644 --- a/drivers/thunderbolt/ctl.c +++ b/drivers/thunderbolt/ctl.c @@ -619,7 +619,7 @@ struct tb_ctl *tb_ctl_alloc(struct tb_nhi *nhi, event_cb cb, void *cb_data) goto err; ctl->rx = tb_ring_alloc_rx(nhi, 0, 10, RING_FLAG_NO_SUSPEND, 0xffff, - 0xffff); + 0xffff, NULL, NULL); if (!ctl->rx) goto err; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 0b3c0640048b..af0a80ddf594 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -252,7 +252,8 @@ invoke_callback: * Do not hold on to it. */ list_del_init(&frame->list); - frame->callback(ring, frame, canceled); + if (frame->callback) + frame->callback(ring, frame, canceled); } } @@ -273,11 +274,106 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame) } EXPORT_SYMBOL_GPL(__tb_ring_enqueue); +/** + * tb_ring_poll() - Poll one completed frame from the ring + * @ring: Ring to poll + * + * This function can be called when @start_poll callback of the @ring + * has been called. It will read one completed frame from the ring and + * return it to the caller. Returns %NULL if there is no more completed + * frames. + */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring) +{ + struct ring_frame *frame = NULL; + unsigned long flags; + + spin_lock_irqsave(&ring->lock, flags); + if (!ring->running) + goto unlock; + if (ring_empty(ring)) + goto unlock; + + if (ring->descriptors[ring->tail].flags & RING_DESC_COMPLETED) { + frame = list_first_entry(&ring->in_flight, typeof(*frame), + list); + list_del_init(&frame->list); + + if (!ring->is_tx) { + frame->size = ring->descriptors[ring->tail].length; + frame->eof = ring->descriptors[ring->tail].eof; + frame->sof = ring->descriptors[ring->tail].sof; + frame->flags = ring->descriptors[ring->tail].flags; + } + + ring->tail = (ring->tail + 1) % ring->size; + } + +unlock: + spin_unlock_irqrestore(&ring->lock, flags); + return frame; +} +EXPORT_SYMBOL_GPL(tb_ring_poll); + +static void __ring_interrupt_mask(struct tb_ring *ring, bool mask) +{ + int idx = ring_interrupt_index(ring); + int reg = REG_RING_INTERRUPT_BASE + idx / 32 * 4; + int bit = idx % 32; + u32 val; + + val = ioread32(ring->nhi->iobase + reg); + if (mask) + val &= ~BIT(bit); + else + val |= BIT(bit); + iowrite32(val, ring->nhi->iobase + reg); +} + +/* Both @nhi->lock and @ring->lock should be held */ +static void __ring_interrupt(struct tb_ring *ring) +{ + if (!ring->running) + return; + + if (ring->start_poll) { + __ring_interrupt_mask(ring, false); + ring->start_poll(ring->poll_data); + } else { + schedule_work(&ring->work); + } +} + +/** + * tb_ring_poll_complete() - Re-start interrupt for the ring + * @ring: Ring to re-start the interrupt + * + * This will re-start (unmask) the ring interrupt once the user is done + * with polling. 
+ */ +void tb_ring_poll_complete(struct tb_ring *ring) +{ + unsigned long flags; + + spin_lock_irqsave(&ring->nhi->lock, flags); + spin_lock(&ring->lock); + if (ring->start_poll) + __ring_interrupt_mask(ring, false); + spin_unlock(&ring->lock); + spin_unlock_irqrestore(&ring->nhi->lock, flags); +} +EXPORT_SYMBOL_GPL(tb_ring_poll_complete); + static irqreturn_t ring_msix(int irq, void *data) { struct tb_ring *ring = data; - schedule_work(&ring->work); + spin_lock(&ring->nhi->lock); + spin_lock(&ring->lock); + __ring_interrupt(ring); + spin_unlock(&ring->lock); + spin_unlock(&ring->nhi->lock); + return IRQ_HANDLED; } @@ -317,7 +413,9 @@ static void ring_release_msix(struct tb_ring *ring) static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, bool transmit, unsigned int flags, - u16 sof_mask, u16 eof_mask) + u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), + void *poll_data) { struct tb_ring *ring = NULL; dev_info(&nhi->pdev->dev, "allocating %s ring %d of size %d\n", @@ -346,6 +444,8 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, ring->head = 0; ring->tail = 0; ring->running = false; + ring->start_poll = start_poll; + ring->poll_data = poll_data; ring->descriptors = dma_alloc_coherent(&ring->nhi->pdev->dev, size * sizeof(*ring->descriptors), @@ -399,7 +499,7 @@ err_free_ring: struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags) { - return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0); + return tb_ring_alloc(nhi, hop, size, true, flags, 0, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); @@ -411,11 +511,17 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); * @flags: Flags for the ring * @sof_mask: Mask of PDF values that start a frame * @eof_mask: Mask of PDF values that end a frame + * @start_poll: If not %NULL the ring will call this function when an + * interrupt is triggered and masked, instead of callback + * in each Rx frame. + * @poll_data: Optional data passed to @start_poll */ struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, u16 eof_mask) + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data) { - return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask); + return tb_ring_alloc(nhi, hop, size, false, flags, sof_mask, eof_mask, + start_poll, poll_data); } EXPORT_SYMBOL_GPL(tb_ring_alloc_rx); @@ -556,6 +662,7 @@ void tb_ring_free(struct tb_ring *ring) dev_WARN(&ring->nhi->pdev->dev, "%s %d still running\n", RING_TYPE(ring), ring->hop); } + spin_unlock_irq(&ring->nhi->lock); ring_release_msix(ring); @@ -572,7 +679,6 @@ void tb_ring_free(struct tb_ring *ring) RING_TYPE(ring), ring->hop); - spin_unlock_irq(&ring->nhi->lock); /** * ring->work can no longer be scheduled (it is scheduled only * by nhi_interrupt_work, ring_stop and ring_msix). 
Wait for it @@ -682,8 +788,10 @@ static void nhi_interrupt_work(struct work_struct *work) hop); continue; } - /* we do not check ring->running, this is done in ring->work */ - schedule_work(&ring->work); + + spin_lock(&ring->lock); + __ring_interrupt(ring); + spin_unlock(&ring->lock); } spin_unlock_irq(&nhi->lock); } diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index d59e3f9a35c4..36925e3aec7c 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -446,6 +446,9 @@ struct tb_nhi { * @flags: Ring specific flags * @sof_mask: Bit mask used to detect start of frame PDF * @eof_mask: Bit mask used to detect end of frame PDF + * @start_poll: Called when ring interrupt is triggered to start + * polling. Passing %NULL keeps the ring in interrupt mode. + * @poll_data: Data passed to @start_poll */ struct tb_ring { spinlock_t lock; @@ -466,6 +469,8 @@ struct tb_ring { unsigned int flags; u16 sof_mask; u16 eof_mask; + void (*start_poll)(void *data); + void *poll_data; }; /* Leave ring interrupt enabled on suspend */ @@ -499,7 +504,7 @@ enum ring_desc_flags { /** * struct ring_frame - For use with ring_rx/ring_tx * @buffer_phy: DMA mapped address of the frame - * @callback: Callback called when the frame is finished + * @callback: Callback called when the frame is finished (optional) * @list: Frame is linked to a queue using this * @size: Size of the frame in bytes (%0 means %4096) * @flags: Flags for the frame (see &enum ring_desc_flags) @@ -522,8 +527,8 @@ struct ring_frame { struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size, unsigned int flags); struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size, - unsigned int flags, u16 sof_mask, - u16 eof_mask); + unsigned int flags, u16 sof_mask, u16 eof_mask, + void (*start_poll)(void *), void *poll_data); void tb_ring_start(struct tb_ring *ring); void tb_ring_stop(struct tb_ring *ring); void tb_ring_free(struct tb_ring *ring); @@ -535,8 +540,8 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); * @ring: Ring to enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The - * buffer must contain at least %TB_FRAME_SIZE bytes. + * @frame->buffer, @frame->buffer_phy have to be set. The buffer must + * contain at least %TB_FRAME_SIZE bytes. * * @frame->callback will be invoked with @frame->size, @frame->flags, * @frame->eof, @frame->sof set once the frame has been received. @@ -557,8 +562,8 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) * @ring: Ring the enqueue the frame * @frame: Frame to enqueue * - * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size, - * @frame->eof and @frame->sof have to be set. + * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and + * @frame->sof have to be set. * * @frame->callback will be invoked with once the frame has been transmitted. 
* @@ -573,4 +578,8 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) return __tb_ring_enqueue(ring, frame); } +/* Used only when the ring is in polling mode */ +struct ring_frame *tb_ring_poll(struct tb_ring *ring); +void tb_ring_poll_complete(struct tb_ring *ring); + #endif /* THUNDERBOLT_H_ */ From 3304559e353f098d7e0ed5ca981e26c406513e12 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:42 +0300 Subject: [PATCH 1157/1322] thunderbolt: Add function to retrieve DMA device for the ring This is needed when Thunderbolt service drivers need to DMA map memory before it is passed down to the ring. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 36925e3aec7c..7b69853188b1 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -582,4 +583,16 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) struct ring_frame *tb_ring_poll(struct tb_ring *ring); void tb_ring_poll_complete(struct tb_ring *ring); +/** + * tb_ring_dma_device() - Return device used for DMA mapping + * @ring: Ring whose DMA device is retrieved + * + * Use this function when you are mapping DMA for buffers that are + * passed to the ring for sending/receiving. + */ +static inline struct device *tb_ring_dma_device(struct tb_ring *ring) +{ + return &ring->nhi->pdev->dev; +} + #endif /* THUNDERBOLT_H_ */ From 9a01c7c26cf7cbd1f58d06319e798833e85ff550 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:43 +0300 Subject: [PATCH 1158/1322] thunderbolt: Allocate ring HopID automatically if requested Thunderbolt services should not care which HopID (ring) they use for sending and receiving packets over the high-speed DMA path, so make tb_ring_alloc_rx() and tb_ring_alloc_tx() accept negative HopID. This means that the NHI will allocate next available HopID for the caller automatically. These HopIDs will be allocated from the range which is not reserved for the Thunderbolt protocol (8 .. hop_count - 1). The allocated HopID can be retrieved from ring->hop field after the ring has been allocated successfully if needed. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/thunderbolt/nhi.c | 78 ++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index af0a80ddf594..0e79eebfcbb7 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -26,6 +26,8 @@ * use this ring for anything else. */ #define RING_E2E_UNUSED_HOPID 2 +/* HopIDs 0-7 are reserved by the Thunderbolt protocol */ +#define RING_FIRST_USABLE_HOPID 8 /* * Minimal number of vectors when we use MSI-X. Two for control channel @@ -411,6 +413,62 @@ static void ring_release_msix(struct tb_ring *ring) ring->irq = 0; } +static int nhi_alloc_hop(struct tb_nhi *nhi, struct tb_ring *ring) +{ + int ret = 0; + + spin_lock_irq(&nhi->lock); + + if (ring->hop < 0) { + unsigned int i; + + /* + * Automatically allocate HopID from the non-reserved + * range 8 .. hop_count - 1. 
+ */ + for (i = RING_FIRST_USABLE_HOPID; i < nhi->hop_count; i++) { + if (ring->is_tx) { + if (!nhi->tx_rings[i]) { + ring->hop = i; + break; + } + } else { + if (!nhi->rx_rings[i]) { + ring->hop = i; + break; + } + } + } + } + + if (ring->hop < 0 || ring->hop >= nhi->hop_count) { + dev_warn(&nhi->pdev->dev, "invalid hop: %d\n", ring->hop); + ret = -EINVAL; + goto err_unlock; + } + if (ring->is_tx && nhi->tx_rings[ring->hop]) { + dev_warn(&nhi->pdev->dev, "TX hop %d already allocated\n", + ring->hop); + ret = -EBUSY; + goto err_unlock; + } else if (!ring->is_tx && nhi->rx_rings[ring->hop]) { + dev_warn(&nhi->pdev->dev, "RX hop %d already allocated\n", + ring->hop); + ret = -EBUSY; + goto err_unlock; + } + + if (ring->is_tx) + nhi->tx_rings[ring->hop] = ring; + else + nhi->rx_rings[ring->hop] = ring; + +err_unlock: + spin_unlock_irq(&nhi->lock); + + return ret; +} + static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, bool transmit, unsigned int flags, u16 sof_mask, u16 eof_mask, @@ -456,28 +514,12 @@ static struct tb_ring *tb_ring_alloc(struct tb_nhi *nhi, u32 hop, int size, if (ring_request_msix(ring, flags & RING_FLAG_NO_SUSPEND)) goto err_free_descs; - spin_lock_irq(&nhi->lock); - if (hop >= nhi->hop_count) { - dev_WARN(&nhi->pdev->dev, "invalid hop: %d\n", hop); + if (nhi_alloc_hop(nhi, ring)) goto err_release_msix; - } - if (transmit && nhi->tx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "TX hop %d already allocated\n", hop); - goto err_release_msix; - } else if (!transmit && nhi->rx_rings[hop]) { - dev_WARN(&nhi->pdev->dev, "RX hop %d already allocated\n", hop); - goto err_release_msix; - } - if (transmit) - nhi->tx_rings[hop] = ring; - else - nhi->rx_rings[hop] = ring; - spin_unlock_irq(&nhi->lock); return ring; err_release_msix: - spin_unlock_irq(&nhi->lock); ring_release_msix(ring); err_free_descs: dma_free_coherent(&ring->nhi->pdev->dev, @@ -506,7 +548,7 @@ EXPORT_SYMBOL_GPL(tb_ring_alloc_tx); /** * tb_ring_alloc_rx() - Allocate DMA ring for receive * @nhi: Pointer to the NHI the ring is to be allocated - * @hop: HopID (ring) to allocate + * @hop: HopID (ring) to allocate. Pass %-1 for automatic allocation. * @size: Number of entries in the ring * @flags: Flags for the ring * @sof_mask: Mask of PDF values that start a frame From 467cd25bf2aec14c2e2990512248b72f46f94c53 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:44 +0300 Subject: [PATCH 1159/1322] MAINTAINERS: Add thunderbolt.h to the Thunderbolt driver entry The new API header (include/linux/thunderbolt.h) is maintained by the Thunderbolt driver maintainers. Signed-off-by: Mika Westerberg Reviewed-by: Michael Jamet Reviewed-by: Yehezkel Bernat Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6671f375f7fc..c1c90d962012 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13278,6 +13278,7 @@ M: Mika Westerberg M: Yehezkel Bernat S: Maintained F: drivers/thunderbolt/ +F: include/linux/thunderbolt.h THUNDERX GPIO DRIVER M: David Daney From e69b6c02b4c3b8d03be7136f90dd9551ad5a5a5e Mon Sep 17 00:00:00 2001 From: Amir Levy Date: Mon, 2 Oct 2017 13:38:45 +0300 Subject: [PATCH 1160/1322] net: Add support for networking over Thunderbolt cable ThunderboltIP is a protocol created by Apple to tunnel IP/ethernet traffic over a Thunderbolt cable. 
The protocol consists of configuration phase where each side sends ThunderboltIP login packets (the protocol is determined by UUID in the XDomain packet header) over the configuration channel. Once both sides get positive acknowledgment to their login packet, they configure high-speed DMA path accordingly. This DMA path is then used to transmit and receive networking traffic. This patch creates a virtual ethernet interface the host software can use in the same way as any other networking interface. Once the interface is brought up successfully network packets get tunneled over the Thunderbolt cable to the remote host and back. The connection is terminated by sending a ThunderboltIP logout packet over the configuration channel. We do this when the network interface is brought down by user or the driver is unloaded. Signed-off-by: Amir Levy Signed-off-by: Michael Jamet Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- Documentation/admin-guide/thunderbolt.rst | 24 + drivers/net/Kconfig | 12 + drivers/net/Makefile | 3 + drivers/net/thunderbolt.c | 1362 +++++++++++++++++++++ 4 files changed, 1401 insertions(+) create mode 100644 drivers/net/thunderbolt.c diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst index 6a4cd1f159ca..5c62d11d77e8 100644 --- a/Documentation/admin-guide/thunderbolt.rst +++ b/Documentation/admin-guide/thunderbolt.rst @@ -197,3 +197,27 @@ information is missing. To recover from this mode, one needs to flash a valid NVM image to the host host controller in the same way it is done in the previous chapter. + +Networking over Thunderbolt cable +--------------------------------- +Thunderbolt technology allows software communication across two hosts +connected by a Thunderbolt cable. + +It is possible to tunnel any kind of traffic over Thunderbolt link but +currently we only support Apple ThunderboltIP protocol. + +If the other host is running Windows or macOS only thing you need to +do is to connect Thunderbolt cable between the two hosts, the +``thunderbolt-net`` is loaded automatically. If the other host is also +Linux you should load ``thunderbolt-net`` manually on one host (it does +not matter which one):: + + # modprobe thunderbolt-net + +This triggers module load on the other host automatically. If the driver +is built-in to the kernel image, there is no need to do anything. + +The driver will create one virtual ethernet interface per Thunderbolt +port which are named like ``thunderbolt0`` and so on. From this point +you can either use standard userspace tools like ``ifconfig`` to +configure the interface or let your GUI to handle it automatically. diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index aba0d652095b..0936da592e12 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -483,6 +483,18 @@ config FUJITSU_ES This driver provides support for Extended Socket network device on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series. +config THUNDERBOLT_NET + tristate "Networking over Thunderbolt cable" + depends on THUNDERBOLT && INET + help + Select this if you want to create network between two + computers over a Thunderbolt cable. The driver supports Apple + ThunderboltIP protocol and allows communication with any host + supporting the same protocol including Windows and macOS. + + To compile this driver a module, choose M here. The module will be + called thunderbolt-net. 
+ source "drivers/net/hyperv/Kconfig" endif # NETDEVICES diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 8dff900085d6..7c8f4dd3a7c5 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -74,3 +74,6 @@ obj-$(CONFIG_HYPERV_NET) += hyperv/ obj-$(CONFIG_NTB_NETDEV) += ntb_netdev.o obj-$(CONFIG_FUJITSU_ES) += fjes/ + +thunderbolt-net-y += thunderbolt.o +obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c new file mode 100644 index 000000000000..1a7bc0bf4598 --- /dev/null +++ b/drivers/net/thunderbolt.c @@ -0,0 +1,1362 @@ +/* + * Networking over Thunderbolt cable using Apple ThunderboltIP protocol + * + * Copyright (C) 2017, Intel Corporation + * Authors: Amir Levy + * Michael Jamet + * Mika Westerberg + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Protocol timeouts in ms */ +#define TBNET_LOGIN_DELAY 4500 +#define TBNET_LOGIN_TIMEOUT 500 +#define TBNET_LOGOUT_TIMEOUT 100 + +#define TBNET_RING_SIZE 256 +#define TBNET_LOCAL_PATH 0xf +#define TBNET_LOGIN_RETRIES 60 +#define TBNET_LOGOUT_RETRIES 5 +#define TBNET_MATCH_FRAGS_ID BIT(1) +#define TBNET_MAX_MTU SZ_64K +#define TBNET_FRAME_SIZE SZ_4K +#define TBNET_MAX_PAYLOAD_SIZE \ + (TBNET_FRAME_SIZE - sizeof(struct thunderbolt_ip_frame_header)) +/* Rx packets need to hold space for skb_shared_info */ +#define TBNET_RX_MAX_SIZE \ + (TBNET_FRAME_SIZE + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) +#define TBNET_RX_PAGE_ORDER get_order(TBNET_RX_MAX_SIZE) +#define TBNET_RX_PAGE_SIZE (PAGE_SIZE << TBNET_RX_PAGE_ORDER) + +#define TBNET_L0_PORT_NUM(route) ((route) & GENMASK(5, 0)) + +/** + * struct thunderbolt_ip_frame_header - Header for each Thunderbolt frame + * @frame_size: size of the data with the frame + * @frame_index: running index on the frames + * @frame_id: ID of the frame to match frames to specific packet + * @frame_count: how many frames assembles a full packet + * + * Each data frame passed to the high-speed DMA ring has this header. If + * the XDomain network directory announces that %TBNET_MATCH_FRAGS_ID is + * supported then @frame_id is filled, otherwise it stays %0. 
+ */ +struct thunderbolt_ip_frame_header { + u32 frame_size; + u16 frame_index; + u16 frame_id; + u32 frame_count; +}; + +enum thunderbolt_ip_frame_pdf { + TBIP_PDF_FRAME_START = 1, + TBIP_PDF_FRAME_END, +}; + +enum thunderbolt_ip_type { + TBIP_LOGIN, + TBIP_LOGIN_RESPONSE, + TBIP_LOGOUT, + TBIP_STATUS, +}; + +struct thunderbolt_ip_header { + u32 route_hi; + u32 route_lo; + u32 length_sn; + uuid_t uuid; + uuid_t initiator_uuid; + uuid_t target_uuid; + u32 type; + u32 command_id; +}; + +#define TBIP_HDR_LENGTH_MASK GENMASK(5, 0) +#define TBIP_HDR_SN_MASK GENMASK(28, 27) +#define TBIP_HDR_SN_SHIFT 27 + +struct thunderbolt_ip_login { + struct thunderbolt_ip_header hdr; + u32 proto_version; + u32 transmit_path; + u32 reserved[4]; +}; + +#define TBIP_LOGIN_PROTO_VERSION 1 + +struct thunderbolt_ip_login_response { + struct thunderbolt_ip_header hdr; + u32 status; + u32 receiver_mac[2]; + u32 receiver_mac_len; + u32 reserved[4]; +}; + +struct thunderbolt_ip_logout { + struct thunderbolt_ip_header hdr; +}; + +struct thunderbolt_ip_status { + struct thunderbolt_ip_header hdr; + u32 status; +}; + +struct tbnet_stats { + u64 tx_packets; + u64 rx_packets; + u64 tx_bytes; + u64 rx_bytes; + u64 rx_errors; + u64 tx_errors; + u64 rx_length_errors; + u64 rx_over_errors; + u64 rx_crc_errors; + u64 rx_missed_errors; +}; + +struct tbnet_frame { + struct net_device *dev; + struct page *page; + struct ring_frame frame; +}; + +struct tbnet_ring { + struct tbnet_frame frames[TBNET_RING_SIZE]; + unsigned int cons; + unsigned int prod; + struct tb_ring *ring; +}; + +/** + * struct tbnet - ThunderboltIP network driver private data + * @svc: XDomain service the driver is bound to + * @xd: XDomain the service blongs to + * @handler: ThunderboltIP configuration protocol handler + * @dev: Networking device + * @napi: NAPI structure for Rx polling + * @stats: Network statistics + * @skb: Network packet that is currently processed on Rx path + * @command_id: ID used for next configuration protocol packet + * @login_sent: ThunderboltIP login message successfully sent + * @login_received: ThunderboltIP login message received from the remote + * host + * @transmit_path: HopID the other end needs to use building the + * opposite side path. + * @connection_lock: Lock serializing access to @login_sent, + * @login_received and @transmit_path. + * @login_retries: Number of login retries currently done + * @login_work: Worker to send ThunderboltIP login packets + * @connected_work: Worker that finalizes the ThunderboltIP connection + * setup and enables DMA paths for high speed data + * transfers + * @rx_hdr: Copy of the currently processed Rx frame. Used when a + * network packet consists of multiple Thunderbolt frames. + * In host byte order. 
+ * @rx_ring: Software ring holding Rx frames + * @frame_id: Frame ID use for next Tx packet + * (if %TBNET_MATCH_FRAGS_ID is supported in both ends) + * @tx_ring: Software ring holding Tx frames + */ +struct tbnet { + const struct tb_service *svc; + struct tb_xdomain *xd; + struct tb_protocol_handler handler; + struct net_device *dev; + struct napi_struct napi; + struct tbnet_stats stats; + struct sk_buff *skb; + atomic_t command_id; + bool login_sent; + bool login_received; + u32 transmit_path; + struct mutex connection_lock; + int login_retries; + struct delayed_work login_work; + struct work_struct connected_work; + struct thunderbolt_ip_frame_header rx_hdr; + struct tbnet_ring rx_ring; + atomic_t frame_id; + struct tbnet_ring tx_ring; +}; + +/* Network property directory UUID: c66189ca-1cce-4195-bdb8-49592e5f5a4f */ +static const uuid_t tbnet_dir_uuid = + UUID_INIT(0xc66189ca, 0x1cce, 0x4195, + 0xbd, 0xb8, 0x49, 0x59, 0x2e, 0x5f, 0x5a, 0x4f); + +/* ThunderboltIP protocol UUID: 798f589e-3616-8a47-97c6-5664a920c8dd */ +static const uuid_t tbnet_svc_uuid = + UUID_INIT(0x798f589e, 0x3616, 0x8a47, + 0x97, 0xc6, 0x56, 0x64, 0xa9, 0x20, 0xc8, 0xdd); + +static struct tb_property_dir *tbnet_dir; + +static void tbnet_fill_header(struct thunderbolt_ip_header *hdr, u64 route, + u8 sequence, const uuid_t *initiator_uuid, const uuid_t *target_uuid, + enum thunderbolt_ip_type type, size_t size, u32 command_id) +{ + u32 length_sn; + + /* Length does not include route_hi/lo and length_sn fields */ + length_sn = (size - 3 * 4) / 4; + length_sn |= (sequence << TBIP_HDR_SN_SHIFT) & TBIP_HDR_SN_MASK; + + hdr->route_hi = upper_32_bits(route); + hdr->route_lo = lower_32_bits(route); + hdr->length_sn = length_sn; + uuid_copy(&hdr->uuid, &tbnet_svc_uuid); + uuid_copy(&hdr->initiator_uuid, initiator_uuid); + uuid_copy(&hdr->target_uuid, target_uuid); + hdr->type = type; + hdr->command_id = command_id; +} + +static int tbnet_login_response(struct tbnet *net, u64 route, u8 sequence, + u32 command_id) +{ + struct thunderbolt_ip_login_response reply; + struct tb_xdomain *xd = net->xd; + + memset(&reply, 0, sizeof(reply)); + tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid, + xd->remote_uuid, TBIP_LOGIN_RESPONSE, sizeof(reply), + command_id); + memcpy(reply.receiver_mac, net->dev->dev_addr, ETH_ALEN); + reply.receiver_mac_len = ETH_ALEN; + + return tb_xdomain_response(xd, &reply, sizeof(reply), + TB_CFG_PKG_XDOMAIN_RESP); +} + +static int tbnet_login_request(struct tbnet *net, u8 sequence) +{ + struct thunderbolt_ip_login_response reply; + struct thunderbolt_ip_login request; + struct tb_xdomain *xd = net->xd; + + memset(&request, 0, sizeof(request)); + tbnet_fill_header(&request.hdr, xd->route, sequence, xd->local_uuid, + xd->remote_uuid, TBIP_LOGIN, sizeof(request), + atomic_inc_return(&net->command_id)); + + request.proto_version = TBIP_LOGIN_PROTO_VERSION; + request.transmit_path = TBNET_LOCAL_PATH; + + return tb_xdomain_request(xd, &request, sizeof(request), + TB_CFG_PKG_XDOMAIN_RESP, &reply, + sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP, + TBNET_LOGIN_TIMEOUT); +} + +static int tbnet_logout_response(struct tbnet *net, u64 route, u8 sequence, + u32 command_id) +{ + struct thunderbolt_ip_status reply; + struct tb_xdomain *xd = net->xd; + + memset(&reply, 0, sizeof(reply)); + tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid, + xd->remote_uuid, TBIP_STATUS, sizeof(reply), + atomic_inc_return(&net->command_id)); + return tb_xdomain_response(xd, &reply, sizeof(reply), + 
TB_CFG_PKG_XDOMAIN_RESP); +} + +static int tbnet_logout_request(struct tbnet *net) +{ + struct thunderbolt_ip_logout request; + struct thunderbolt_ip_status reply; + struct tb_xdomain *xd = net->xd; + + memset(&request, 0, sizeof(request)); + tbnet_fill_header(&request.hdr, xd->route, 0, xd->local_uuid, + xd->remote_uuid, TBIP_LOGOUT, sizeof(request), + atomic_inc_return(&net->command_id)); + + return tb_xdomain_request(xd, &request, sizeof(request), + TB_CFG_PKG_XDOMAIN_RESP, &reply, + sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP, + TBNET_LOGOUT_TIMEOUT); +} + +static void start_login(struct tbnet *net) +{ + mutex_lock(&net->connection_lock); + net->login_sent = false; + net->login_received = false; + mutex_unlock(&net->connection_lock); + + queue_delayed_work(system_long_wq, &net->login_work, + msecs_to_jiffies(1000)); +} + +static void stop_login(struct tbnet *net) +{ + cancel_delayed_work_sync(&net->login_work); + cancel_work_sync(&net->connected_work); +} + +static inline unsigned int tbnet_frame_size(const struct tbnet_frame *tf) +{ + return tf->frame.size ? : TBNET_FRAME_SIZE; +} + +static void tbnet_free_buffers(struct tbnet_ring *ring) +{ + unsigned int i; + + for (i = 0; i < TBNET_RING_SIZE; i++) { + struct device *dma_dev = tb_ring_dma_device(ring->ring); + struct tbnet_frame *tf = &ring->frames[i]; + enum dma_data_direction dir; + unsigned int order; + size_t size; + + if (!tf->page) + continue; + + if (ring->ring->is_tx) { + dir = DMA_TO_DEVICE; + order = 0; + size = tbnet_frame_size(tf); + } else { + dir = DMA_FROM_DEVICE; + order = TBNET_RX_PAGE_ORDER; + size = TBNET_RX_PAGE_SIZE; + } + + if (tf->frame.buffer_phy) + dma_unmap_page(dma_dev, tf->frame.buffer_phy, size, + dir); + + __free_pages(tf->page, order); + tf->page = NULL; + } + + ring->cons = 0; + ring->prod = 0; +} + +static void tbnet_tear_down(struct tbnet *net, bool send_logout) +{ + netif_carrier_off(net->dev); + netif_stop_queue(net->dev); + + stop_login(net); + + mutex_lock(&net->connection_lock); + + if (net->login_sent && net->login_received) { + int retries = TBNET_LOGOUT_RETRIES; + + while (send_logout && retries-- > 0) { + int ret = tbnet_logout_request(net); + if (ret != -ETIMEDOUT) + break; + } + + tb_ring_stop(net->rx_ring.ring); + tb_ring_stop(net->tx_ring.ring); + tbnet_free_buffers(&net->rx_ring); + tbnet_free_buffers(&net->tx_ring); + + if (tb_xdomain_disable_paths(net->xd)) + netdev_warn(net->dev, "failed to disable DMA paths\n"); + } + + net->login_retries = 0; + net->login_sent = false; + net->login_received = false; + + mutex_unlock(&net->connection_lock); +} + +static int tbnet_handle_packet(const void *buf, size_t size, void *data) +{ + const struct thunderbolt_ip_login *pkg = buf; + struct tbnet *net = data; + u32 command_id; + int ret = 0; + u8 sequence; + u64 route; + + /* Make sure the packet is for us */ + if (size < sizeof(struct thunderbolt_ip_header)) + return 0; + if (!uuid_equal(&pkg->hdr.initiator_uuid, net->xd->remote_uuid)) + return 0; + if (!uuid_equal(&pkg->hdr.target_uuid, net->xd->local_uuid)) + return 0; + + route = ((u64)pkg->hdr.route_hi << 32) | pkg->hdr.route_lo; + route &= ~BIT_ULL(63); + if (route != net->xd->route) + return 0; + + sequence = pkg->hdr.length_sn & TBIP_HDR_SN_MASK; + sequence >>= TBIP_HDR_SN_SHIFT; + command_id = pkg->hdr.command_id; + + switch (pkg->hdr.type) { + case TBIP_LOGIN: + if (!netif_running(net->dev)) + break; + + ret = tbnet_login_response(net, route, sequence, + pkg->hdr.command_id); + if (!ret) { + mutex_lock(&net->connection_lock); + 
net->login_received = true; + net->transmit_path = pkg->transmit_path; + + /* If we reached the number of max retries or + * previous logout, schedule another round of + * login retries + */ + if (net->login_retries >= TBNET_LOGIN_RETRIES || + !net->login_sent) { + net->login_retries = 0; + queue_delayed_work(system_long_wq, + &net->login_work, 0); + } + mutex_unlock(&net->connection_lock); + + queue_work(system_long_wq, &net->connected_work); + } + break; + + case TBIP_LOGOUT: + ret = tbnet_logout_response(net, route, sequence, command_id); + if (!ret) + tbnet_tear_down(net, false); + break; + + default: + return 0; + } + + if (ret) + netdev_warn(net->dev, "failed to send ThunderboltIP response\n"); + + return 1; +} + +static unsigned int tbnet_available_buffers(const struct tbnet_ring *ring) +{ + return ring->prod - ring->cons; +} + +static int tbnet_alloc_rx_buffers(struct tbnet *net, unsigned int nbuffers) +{ + struct tbnet_ring *ring = &net->rx_ring; + int ret; + + while (nbuffers--) { + struct device *dma_dev = tb_ring_dma_device(ring->ring); + unsigned int index = ring->prod & (TBNET_RING_SIZE - 1); + struct tbnet_frame *tf = &ring->frames[index]; + dma_addr_t dma_addr; + + if (tf->page) + break; + + /* Allocate page (order > 0) so that it can hold maximum + * ThunderboltIP frame (4kB) and the additional room for + * SKB shared info required by build_skb(). + */ + tf->page = dev_alloc_pages(TBNET_RX_PAGE_ORDER); + if (!tf->page) { + ret = -ENOMEM; + goto err_free; + } + + dma_addr = dma_map_page(dma_dev, tf->page, 0, + TBNET_RX_PAGE_SIZE, DMA_FROM_DEVICE); + if (dma_mapping_error(dma_dev, dma_addr)) { + ret = -ENOMEM; + goto err_free; + } + + tf->frame.buffer_phy = dma_addr; + tf->dev = net->dev; + + tb_ring_rx(ring->ring, &tf->frame); + + ring->prod++; + } + + return 0; + +err_free: + tbnet_free_buffers(ring); + return ret; +} + +static struct tbnet_frame *tbnet_get_tx_buffer(struct tbnet *net) +{ + struct tbnet_ring *ring = &net->tx_ring; + struct tbnet_frame *tf; + unsigned int index; + + if (!tbnet_available_buffers(ring)) + return NULL; + + index = ring->cons++ & (TBNET_RING_SIZE - 1); + + tf = &ring->frames[index]; + tf->frame.size = 0; + tf->frame.buffer_phy = 0; + + return tf; +} + +static void tbnet_tx_callback(struct tb_ring *ring, struct ring_frame *frame, + bool canceled) +{ + struct tbnet_frame *tf = container_of(frame, typeof(*tf), frame); + struct device *dma_dev = tb_ring_dma_device(ring); + struct tbnet *net = netdev_priv(tf->dev); + + dma_unmap_page(dma_dev, tf->frame.buffer_phy, tbnet_frame_size(tf), + DMA_TO_DEVICE); + + /* Return buffer to the ring */ + net->tx_ring.prod++; + + if (tbnet_available_buffers(&net->tx_ring) >= TBNET_RING_SIZE / 2) + netif_wake_queue(net->dev); +} + +static int tbnet_alloc_tx_buffers(struct tbnet *net) +{ + struct tbnet_ring *ring = &net->tx_ring; + unsigned int i; + + for (i = 0; i < TBNET_RING_SIZE; i++) { + struct tbnet_frame *tf = &ring->frames[i]; + + tf->page = alloc_page(GFP_KERNEL); + if (!tf->page) { + tbnet_free_buffers(ring); + return -ENOMEM; + } + + tf->dev = net->dev; + tf->frame.callback = tbnet_tx_callback; + tf->frame.sof = TBIP_PDF_FRAME_START; + tf->frame.eof = TBIP_PDF_FRAME_END; + } + + ring->cons = 0; + ring->prod = TBNET_RING_SIZE - 1; + + return 0; +} + +static void tbnet_connected_work(struct work_struct *work) +{ + struct tbnet *net = container_of(work, typeof(*net), connected_work); + bool connected; + int ret; + + if (netif_carrier_ok(net->dev)) + return; + + mutex_lock(&net->connection_lock); + connected 
= net->login_sent && net->login_received; + mutex_unlock(&net->connection_lock); + + if (!connected) + return; + + /* Both logins successful so enable the high-speed DMA paths and + * start the network device queue. + */ + ret = tb_xdomain_enable_paths(net->xd, TBNET_LOCAL_PATH, + net->rx_ring.ring->hop, + net->transmit_path, + net->tx_ring.ring->hop); + if (ret) { + netdev_err(net->dev, "failed to enable DMA paths\n"); + return; + } + + tb_ring_start(net->tx_ring.ring); + tb_ring_start(net->rx_ring.ring); + + ret = tbnet_alloc_rx_buffers(net, TBNET_RING_SIZE); + if (ret) + goto err_stop_rings; + + ret = tbnet_alloc_tx_buffers(net); + if (ret) + goto err_free_rx_buffers; + + netif_carrier_on(net->dev); + netif_start_queue(net->dev); + return; + +err_free_rx_buffers: + tbnet_free_buffers(&net->rx_ring); +err_stop_rings: + tb_ring_stop(net->rx_ring.ring); + tb_ring_stop(net->tx_ring.ring); +} + +static void tbnet_login_work(struct work_struct *work) +{ + struct tbnet *net = container_of(work, typeof(*net), login_work.work); + unsigned long delay = msecs_to_jiffies(TBNET_LOGIN_DELAY); + int ret; + + if (netif_carrier_ok(net->dev)) + return; + + ret = tbnet_login_request(net, net->login_retries % 4); + if (ret) { + if (net->login_retries++ < TBNET_LOGIN_RETRIES) { + queue_delayed_work(system_long_wq, &net->login_work, + delay); + } else { + netdev_info(net->dev, "ThunderboltIP login timed out\n"); + } + } else { + net->login_retries = 0; + + mutex_lock(&net->connection_lock); + net->login_sent = true; + mutex_unlock(&net->connection_lock); + + queue_work(system_long_wq, &net->connected_work); + } +} + +static bool tbnet_check_frame(struct tbnet *net, const struct tbnet_frame *tf, + const struct thunderbolt_ip_frame_header *hdr) +{ + u32 frame_id, frame_count, frame_size, frame_index; + unsigned int size; + + if (tf->frame.flags & RING_DESC_CRC_ERROR) { + net->stats.rx_crc_errors++; + return false; + } else if (tf->frame.flags & RING_DESC_BUFFER_OVERRUN) { + net->stats.rx_over_errors++; + return false; + } + + /* Should be greater than just header i.e. contains data */ + size = tbnet_frame_size(tf); + if (size <= sizeof(*hdr)) { + net->stats.rx_length_errors++; + return false; + } + + frame_count = le32_to_cpu(hdr->frame_count); + frame_size = le32_to_cpu(hdr->frame_size); + frame_index = le16_to_cpu(hdr->frame_index); + frame_id = le16_to_cpu(hdr->frame_id); + + if ((frame_size > size - sizeof(*hdr)) || !frame_size) { + net->stats.rx_length_errors++; + return false; + } + + /* In case we're in the middle of packet, validate the frame + * header based on first fragment of the packet. + */ + if (net->skb && net->rx_hdr.frame_count) { + /* Check the frame count fits the count field */ + if (frame_count != net->rx_hdr.frame_count) { + net->stats.rx_length_errors++; + return false; + } + + /* Check the frame identifiers are incremented correctly, + * and id is matching. 
+ */ + if (frame_index != net->rx_hdr.frame_index + 1 || + frame_id != net->rx_hdr.frame_id) { + net->stats.rx_missed_errors++; + return false; + } + + if (net->skb->len + frame_size > TBNET_MAX_MTU) { + net->stats.rx_length_errors++; + return false; + } + + return true; + } + + /* Start of packet, validate the frame header */ + if (frame_count == 0 || frame_count > TBNET_RING_SIZE / 4) { + net->stats.rx_length_errors++; + return false; + } + if (frame_index != 0) { + net->stats.rx_missed_errors++; + return false; + } + + return true; +} + +static int tbnet_poll(struct napi_struct *napi, int budget) +{ + struct tbnet *net = container_of(napi, struct tbnet, napi); + unsigned int cleaned_count = tbnet_available_buffers(&net->rx_ring); + struct device *dma_dev = tb_ring_dma_device(net->rx_ring.ring); + unsigned int rx_packets = 0; + + while (rx_packets < budget) { + const struct thunderbolt_ip_frame_header *hdr; + unsigned int hdr_size = sizeof(*hdr); + struct sk_buff *skb = NULL; + struct ring_frame *frame; + struct tbnet_frame *tf; + struct page *page; + bool last = true; + u32 frame_size; + + /* Return some buffers to hardware, one at a time is too + * slow so allocate MAX_SKB_FRAGS buffers at the same + * time. + */ + if (cleaned_count >= MAX_SKB_FRAGS) { + tbnet_alloc_rx_buffers(net, cleaned_count); + cleaned_count = 0; + } + + frame = tb_ring_poll(net->rx_ring.ring); + if (!frame) + break; + + dma_unmap_page(dma_dev, frame->buffer_phy, + TBNET_RX_PAGE_SIZE, DMA_FROM_DEVICE); + + tf = container_of(frame, typeof(*tf), frame); + + page = tf->page; + tf->page = NULL; + net->rx_ring.cons++; + cleaned_count++; + + hdr = page_address(page); + if (!tbnet_check_frame(net, tf, hdr)) { + __free_pages(page, TBNET_RX_PAGE_ORDER); + dev_kfree_skb_any(net->skb); + net->skb = NULL; + continue; + } + + frame_size = le32_to_cpu(hdr->frame_size); + + skb = net->skb; + if (!skb) { + skb = build_skb(page_address(page), + TBNET_RX_PAGE_SIZE); + if (!skb) { + __free_pages(page, TBNET_RX_PAGE_ORDER); + net->stats.rx_errors++; + break; + } + + skb_reserve(skb, hdr_size); + skb_put(skb, frame_size); + + net->skb = skb; + } else { + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + page, hdr_size, frame_size, + TBNET_RX_PAGE_SIZE - hdr_size); + } + + net->rx_hdr.frame_size = frame_size; + net->rx_hdr.frame_count = le32_to_cpu(hdr->frame_count); + net->rx_hdr.frame_index = le16_to_cpu(hdr->frame_index); + net->rx_hdr.frame_id = le16_to_cpu(hdr->frame_id); + last = net->rx_hdr.frame_index == net->rx_hdr.frame_count - 1; + + rx_packets++; + net->stats.rx_bytes += frame_size; + + if (last) { + skb->protocol = eth_type_trans(skb, net->dev); + napi_gro_receive(&net->napi, skb); + net->skb = NULL; + } + } + + net->stats.rx_packets += rx_packets; + + if (cleaned_count) + tbnet_alloc_rx_buffers(net, cleaned_count); + + if (rx_packets >= budget) + return budget; + + napi_complete_done(napi, rx_packets); + /* Re-enable the ring interrupt */ + tb_ring_poll_complete(net->rx_ring.ring); + + return rx_packets; +} + +static void tbnet_start_poll(void *data) +{ + struct tbnet *net = data; + + napi_schedule(&net->napi); +} + +static int tbnet_open(struct net_device *dev) +{ + struct tbnet *net = netdev_priv(dev); + struct tb_xdomain *xd = net->xd; + u16 sof_mask, eof_mask; + struct tb_ring *ring; + + netif_carrier_off(dev); + + ring = tb_ring_alloc_tx(xd->tb->nhi, -1, TBNET_RING_SIZE, + RING_FLAG_FRAME); + if (!ring) { + netdev_err(dev, "failed to allocate Tx ring\n"); + return -ENOMEM; + } + net->tx_ring.ring = ring; + + sof_mask 
= BIT(TBIP_PDF_FRAME_START); + eof_mask = BIT(TBIP_PDF_FRAME_END); + + ring = tb_ring_alloc_rx(xd->tb->nhi, -1, TBNET_RING_SIZE, + RING_FLAG_FRAME | RING_FLAG_E2E, sof_mask, + eof_mask, tbnet_start_poll, net); + if (!ring) { + netdev_err(dev, "failed to allocate Rx ring\n"); + tb_ring_free(net->tx_ring.ring); + net->tx_ring.ring = NULL; + return -ENOMEM; + } + net->rx_ring.ring = ring; + + napi_enable(&net->napi); + start_login(net); + + return 0; +} + +static int tbnet_stop(struct net_device *dev) +{ + struct tbnet *net = netdev_priv(dev); + + napi_disable(&net->napi); + + tbnet_tear_down(net, true); + + tb_ring_free(net->rx_ring.ring); + net->rx_ring.ring = NULL; + tb_ring_free(net->tx_ring.ring); + net->tx_ring.ring = NULL; + + return 0; +} + +static bool tbnet_xmit_map(struct device *dma_dev, struct tbnet_frame *tf) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_page(dma_dev, tf->page, 0, tbnet_frame_size(tf), + DMA_TO_DEVICE); + if (dma_mapping_error(dma_dev, dma_addr)) + return false; + + tf->frame.buffer_phy = dma_addr; + return true; +} + +static bool tbnet_xmit_csum_and_map(struct tbnet *net, struct sk_buff *skb, + struct tbnet_frame **frames, u32 frame_count) +{ + struct thunderbolt_ip_frame_header *hdr = page_address(frames[0]->page); + struct device *dma_dev = tb_ring_dma_device(net->tx_ring.ring); + __wsum wsum = htonl(skb->len - skb_transport_offset(skb)); + unsigned int i, len, offset = skb_transport_offset(skb); + __be16 protocol = skb->protocol; + void *data = skb->data; + void *dest = hdr + 1; + __sum16 *tucso; + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + /* No need to calculate checksum so we just update the + * total frame count and map the frames for DMA. + */ + for (i = 0; i < frame_count; i++) { + hdr = page_address(frames[i]->page); + hdr->frame_count = cpu_to_le32(frame_count); + if (!tbnet_xmit_map(dma_dev, frames[i])) + goto err_unmap; + } + + return true; + } + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_hdr *vhdr, vh; + + vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh); + if (!vhdr) + return false; + + protocol = vhdr->h_vlan_encapsulated_proto; + } + + /* Data points on the beginning of packet. + * Check is the checksum absolute place in the packet. + * ipcso will update IP checksum. + * tucso will update TCP/UPD checksum. + */ + if (protocol == htons(ETH_P_IP)) { + __sum16 *ipcso = dest + ((void *)&(ip_hdr(skb)->check) - data); + + *ipcso = 0; + *ipcso = ip_fast_csum(dest + skb_network_offset(skb), + ip_hdr(skb)->ihl); + + if (ip_hdr(skb)->protocol == IPPROTO_TCP) + tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data); + else if (ip_hdr(skb)->protocol == IPPROTO_UDP) + tucso = dest + ((void *)&(udp_hdr(skb)->check) - data); + else + return false; + + *tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, 0, + ip_hdr(skb)->protocol, 0); + } else if (skb_is_gso_v6(skb)) { + tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data); + *tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, 0, + IPPROTO_TCP, 0); + return false; + } else if (protocol == htons(ETH_P_IPV6)) { + tucso = dest + skb_checksum_start_offset(skb) + skb->csum_offset; + *tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, 0, + ipv6_hdr(skb)->nexthdr, 0); + } else { + return false; + } + + /* First frame was headers, rest of the frames contain data. + * Calculate checksum over each frame. 
+ */ + for (i = 0; i < frame_count; i++) { + hdr = page_address(frames[i]->page); + dest = (void *)(hdr + 1) + offset; + len = le32_to_cpu(hdr->frame_size) - offset; + wsum = csum_partial(dest, len, wsum); + hdr->frame_count = cpu_to_le32(frame_count); + + offset = 0; + } + + *tucso = csum_fold(wsum); + + /* Checksum is finally calculated and we don't touch the memory + * anymore, so DMA map the frames now. + */ + for (i = 0; i < frame_count; i++) { + if (!tbnet_xmit_map(dma_dev, frames[i])) + goto err_unmap; + } + + return true; + +err_unmap: + while (i--) + dma_unmap_page(dma_dev, frames[i]->frame.buffer_phy, + tbnet_frame_size(frames[i]), DMA_TO_DEVICE); + + return false; +} + +static void *tbnet_kmap_frag(struct sk_buff *skb, unsigned int frag_num, + unsigned int *len) +{ + const skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_num]; + + *len = skb_frag_size(frag); + return kmap_atomic(skb_frag_page(frag)) + frag->page_offset; +} + +static netdev_tx_t tbnet_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct tbnet *net = netdev_priv(dev); + struct tbnet_frame *frames[MAX_SKB_FRAGS]; + u16 frame_id = atomic_read(&net->frame_id); + struct thunderbolt_ip_frame_header *hdr; + unsigned int len = skb_headlen(skb); + unsigned int data_len = skb->len; + unsigned int nframes, i; + unsigned int frag = 0; + void *src = skb->data; + u32 frame_index = 0; + bool unmap = false; + void *dest; + + nframes = DIV_ROUND_UP(data_len, TBNET_MAX_PAYLOAD_SIZE); + if (tbnet_available_buffers(&net->tx_ring) < nframes) { + netif_stop_queue(net->dev); + return NETDEV_TX_BUSY; + } + + frames[frame_index] = tbnet_get_tx_buffer(net); + if (!frames[frame_index]) + goto err_drop; + + hdr = page_address(frames[frame_index]->page); + dest = hdr + 1; + + /* If overall packet is bigger than the frame data size */ + while (data_len > TBNET_MAX_PAYLOAD_SIZE) { + unsigned int size_left = TBNET_MAX_PAYLOAD_SIZE; + + hdr->frame_size = cpu_to_le32(TBNET_MAX_PAYLOAD_SIZE); + hdr->frame_index = cpu_to_le16(frame_index); + hdr->frame_id = cpu_to_le16(frame_id); + + do { + if (len > size_left) { + /* Copy data onto Tx buffer data with + * full frame size then break and go to + * next frame + */ + memcpy(dest, src, size_left); + len -= size_left; + dest += size_left; + src += size_left; + break; + } + + memcpy(dest, src, len); + size_left -= len; + dest += len; + + if (unmap) { + kunmap_atomic(src); + unmap = false; + } + + /* Ensure all fragments have been processed */ + if (frag < skb_shinfo(skb)->nr_frags) { + /* Map and then unmap quickly */ + src = tbnet_kmap_frag(skb, frag++, &len); + unmap = true; + } else if (unlikely(size_left > 0)) { + goto err_drop; + } + } while (size_left > 0); + + data_len -= TBNET_MAX_PAYLOAD_SIZE; + frame_index++; + + frames[frame_index] = tbnet_get_tx_buffer(net); + if (!frames[frame_index]) + goto err_drop; + + hdr = page_address(frames[frame_index]->page); + dest = hdr + 1; + } + + hdr->frame_size = cpu_to_le32(data_len); + hdr->frame_index = cpu_to_le16(frame_index); + hdr->frame_id = cpu_to_le16(frame_id); + + frames[frame_index]->frame.size = data_len + sizeof(*hdr); + + /* In case the remaining data_len is smaller than a frame */ + while (len < data_len) { + memcpy(dest, src, len); + data_len -= len; + dest += len; + + if (unmap) { + kunmap_atomic(src); + unmap = false; + } + + if (frag < skb_shinfo(skb)->nr_frags) { + src = tbnet_kmap_frag(skb, frag++, &len); + unmap = true; + } else if (unlikely(data_len > 0)) { + goto err_drop; + } + } + + memcpy(dest, src, data_len); + + 
if (unmap) + kunmap_atomic(src); + + if (!tbnet_xmit_csum_and_map(net, skb, frames, frame_index + 1)) + goto err_drop; + + for (i = 0; i < frame_index + 1; i++) + tb_ring_tx(net->tx_ring.ring, &frames[i]->frame); + + if (net->svc->prtcstns & TBNET_MATCH_FRAGS_ID) + atomic_inc(&net->frame_id); + + net->stats.tx_packets++; + net->stats.tx_bytes += skb->len; + + dev_consume_skb_any(skb); + + return NETDEV_TX_OK; + +err_drop: + /* We can re-use the buffers */ + net->tx_ring.cons -= frame_index; + + dev_kfree_skb_any(skb); + net->stats.tx_errors++; + + return NETDEV_TX_OK; +} + +static void tbnet_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct tbnet *net = netdev_priv(dev); + + stats->tx_packets = net->stats.tx_packets; + stats->rx_packets = net->stats.rx_packets; + stats->tx_bytes = net->stats.tx_bytes; + stats->rx_bytes = net->stats.rx_bytes; + stats->rx_errors = net->stats.rx_errors + net->stats.rx_length_errors + + net->stats.rx_over_errors + net->stats.rx_crc_errors + + net->stats.rx_missed_errors; + stats->tx_errors = net->stats.tx_errors; + stats->rx_length_errors = net->stats.rx_length_errors; + stats->rx_over_errors = net->stats.rx_over_errors; + stats->rx_crc_errors = net->stats.rx_crc_errors; + stats->rx_missed_errors = net->stats.rx_missed_errors; +} + +static const struct net_device_ops tbnet_netdev_ops = { + .ndo_open = tbnet_open, + .ndo_stop = tbnet_stop, + .ndo_start_xmit = tbnet_start_xmit, + .ndo_get_stats64 = tbnet_get_stats64, +}; + +static void tbnet_generate_mac(struct net_device *dev) +{ + const struct tbnet *net = netdev_priv(dev); + const struct tb_xdomain *xd = net->xd; + u8 phy_port; + u32 hash; + + phy_port = tb_phy_port_from_link(TBNET_L0_PORT_NUM(xd->route)); + + /* Unicast and locally administered MAC */ + dev->dev_addr[0] = phy_port << 4 | 0x02; + hash = jhash2((u32 *)xd->local_uuid, 4, 0); + memcpy(dev->dev_addr + 1, &hash, sizeof(hash)); + hash = jhash2((u32 *)xd->local_uuid, 4, hash); + dev->dev_addr[5] = hash & 0xff; +} + +static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id) +{ + struct tb_xdomain *xd = tb_service_parent(svc); + struct net_device *dev; + struct tbnet *net; + int ret; + + dev = alloc_etherdev(sizeof(*net)); + if (!dev) + return -ENOMEM; + + SET_NETDEV_DEV(dev, &svc->dev); + + net = netdev_priv(dev); + INIT_DELAYED_WORK(&net->login_work, tbnet_login_work); + INIT_WORK(&net->connected_work, tbnet_connected_work); + mutex_init(&net->connection_lock); + atomic_set(&net->command_id, 0); + atomic_set(&net->frame_id, 0); + net->svc = svc; + net->dev = dev; + net->xd = xd; + + tbnet_generate_mac(dev); + + strcpy(dev->name, "thunderbolt%d"); + dev->netdev_ops = &tbnet_netdev_ops; + + /* ThunderboltIP takes advantage of TSO packets but instead of + * segmenting them we just split the packet into Thunderbolt + * frames (maximum payload size of each frame is 4084 bytes) and + * calculate checksum over the whole packet here. + * + * The receiving side does the opposite if the host OS supports + * LRO, otherwise it needs to split the large packet into MTU + * sized smaller packets. + * + * In order to receive large packets from the networking stack, + * we need to announce support for most of the offloading + * features here. 
+ */ + dev->hw_features = NETIF_F_SG | NETIF_F_ALL_TSO | NETIF_F_GRO | + NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + dev->features = dev->hw_features | NETIF_F_HIGHDMA; + dev->hard_header_len += sizeof(struct thunderbolt_ip_frame_header); + + netif_napi_add(dev, &net->napi, tbnet_poll, NAPI_POLL_WEIGHT); + + /* MTU range: 68 - 65522 */ + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = TBNET_MAX_MTU - ETH_HLEN; + + net->handler.uuid = &tbnet_svc_uuid; + net->handler.callback = tbnet_handle_packet, + net->handler.data = net; + tb_register_protocol_handler(&net->handler); + + tb_service_set_drvdata(svc, net); + + ret = register_netdev(dev); + if (ret) { + tb_unregister_protocol_handler(&net->handler); + free_netdev(dev); + return ret; + } + + return 0; +} + +static void tbnet_remove(struct tb_service *svc) +{ + struct tbnet *net = tb_service_get_drvdata(svc); + + unregister_netdev(net->dev); + tb_unregister_protocol_handler(&net->handler); + free_netdev(net->dev); +} + +static void tbnet_shutdown(struct tb_service *svc) +{ + tbnet_tear_down(tb_service_get_drvdata(svc), true); +} + +static int __maybe_unused tbnet_suspend(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tbnet *net = tb_service_get_drvdata(svc); + + stop_login(net); + if (netif_running(net->dev)) { + netif_device_detach(net->dev); + tb_ring_stop(net->rx_ring.ring); + tb_ring_stop(net->tx_ring.ring); + tbnet_free_buffers(&net->rx_ring); + tbnet_free_buffers(&net->tx_ring); + } + + return 0; +} + +static int __maybe_unused tbnet_resume(struct device *dev) +{ + struct tb_service *svc = tb_to_service(dev); + struct tbnet *net = tb_service_get_drvdata(svc); + + netif_carrier_off(net->dev); + if (netif_running(net->dev)) { + netif_device_attach(net->dev); + start_login(net); + } + + return 0; +} + +static const struct dev_pm_ops tbnet_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(tbnet_suspend, tbnet_resume) +}; + +static const struct tb_service_id tbnet_ids[] = { + { TB_SERVICE("network", 1) }, + { }, +}; +MODULE_DEVICE_TABLE(tbsvc, tbnet_ids); + +static struct tb_service_driver tbnet_driver = { + .driver = { + .owner = THIS_MODULE, + .name = "thunderbolt-net", + .pm = &tbnet_pm_ops, + }, + .probe = tbnet_probe, + .remove = tbnet_remove, + .shutdown = tbnet_shutdown, + .id_table = tbnet_ids, +}; + +static int __init tbnet_init(void) +{ + int ret; + + tbnet_dir = tb_property_create_dir(&tbnet_dir_uuid); + if (!tbnet_dir) + return -ENOMEM; + + tb_property_add_immediate(tbnet_dir, "prtcid", 1); + tb_property_add_immediate(tbnet_dir, "prtcvers", 1); + tb_property_add_immediate(tbnet_dir, "prtcrevs", 1); + tb_property_add_immediate(tbnet_dir, "prtcstns", + TBNET_MATCH_FRAGS_ID); + + ret = tb_register_property_dir("network", tbnet_dir); + if (ret) { + tb_property_free_dir(tbnet_dir); + return ret; + } + + return tb_register_service_driver(&tbnet_driver); +} +module_init(tbnet_init); + +static void __exit tbnet_exit(void) +{ + tb_unregister_service_driver(&tbnet_driver); + tb_unregister_property_dir("network", tbnet_dir); + tb_property_free_dir(tbnet_dir); +} +module_exit(tbnet_exit); + +MODULE_AUTHOR("Amir Levy "); +MODULE_AUTHOR("Michael Jamet "); +MODULE_AUTHOR("Mika Westerberg "); +MODULE_DESCRIPTION("Thunderbolt network driver"); +MODULE_LICENSE("GPL v2"); From c024297e7be34b6f0e578afc76c5b9f7708a5832 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2017 13:38:46 +0300 Subject: [PATCH 1161/1322] MAINTAINERS: Add entry for Thunderbolt network driver I will be maintaining the Thunderbolt network driver along 
with Michael and Yehezkel. Signed-off-by: Michael Jamet Signed-off-by: Yehezkel Bernat Signed-off-by: Mika Westerberg Signed-off-by: David S. Miller --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c1c90d962012..5231392cf4bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13280,6 +13280,14 @@ S: Maintained F: drivers/thunderbolt/ F: include/linux/thunderbolt.h +THUNDERBOLT NETWORK DRIVER +M: Michael Jamet +M: Mika Westerberg +M: Yehezkel Bernat +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/thunderbolt.c + THUNDERX GPIO DRIVER M: David Daney S: Maintained From d312fefea8387503375f728855c9a62de20c9665 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 2 Oct 2017 19:31:24 +0100 Subject: [PATCH 1162/1322] ahci: don't ignore result code of ahci_reset_controller() ahci_pci_reset_controller() calls ahci_reset_controller(), which may fail, but ignores the result code and always returns success. This may result in failures like below ahci 0000:02:00.0: version 3.0 ahci 0000:02:00.0: enabling device (0000 -> 0003) ahci 0000:02:00.0: SSS flag set, parallel bus scan disabled ahci 0000:02:00.0: controller reset failed (0xffffffff) ahci 0000:02:00.0: failed to stop engine (-5) ... repeated many times ... ahci 0000:02:00.0: failed to stop engine (-5) Unable to handle kernel paging request at virtual address ffff0000093f9018 ... PC is at ahci_stop_engine+0x5c/0xd8 [libahci] LR is at ahci_deinit_port.constprop.12+0x1c/0xc0 [libahci] ... [] ahci_stop_engine+0x5c/0xd8 [libahci] [] ahci_deinit_port.constprop.12+0x1c/0xc0 [libahci] [] ahci_init_controller+0x80/0x168 [libahci] [] ahci_pci_init_controller+0x60/0x68 [ahci] [] ahci_init_one+0x75c/0xd88 [ahci] [] local_pci_probe+0x3c/0xb8 [] pci_device_probe+0x138/0x170 [] driver_probe_device+0x2dc/0x458 [] __driver_attach+0x114/0x118 [] bus_for_each_dev+0x60/0xa0 [] driver_attach+0x20/0x28 [] bus_add_driver+0x1f0/0x2a8 [] driver_register+0x60/0xf8 [] __pci_register_driver+0x3c/0x48 [] ahci_pci_driver_init+0x1c/0x1000 [ahci] [] do_one_initcall+0x38/0x120 where an obvious hardware level failure results in an unnecessary 15 second delay and a subsequent crash. So record the result code of ahci_reset_controller() and relay it, rather than ignoring it. Signed-off-by: Ard Biesheuvel Signed-off-by: Tejun Heo --- drivers/ata/ahci.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index cb9b0e9090e3..9f78bb03bb76 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -621,8 +621,11 @@ static void ahci_pci_save_initial_config(struct pci_dev *pdev, static int ahci_pci_reset_controller(struct ata_host *host) { struct pci_dev *pdev = to_pci_dev(host->dev); + int rc; - ahci_reset_controller(host); + rc = ahci_reset_controller(host); + if (rc) + return rc; if (pdev->vendor == PCI_VENDOR_ID_INTEL) { struct ahci_host_priv *hpriv = host->private_data; From e50d5751c807853cd0fce0b5c46479cc6274014f Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 24 Jul 2017 18:17:42 -0700 Subject: [PATCH 1163/1322] i40e: limit lan queue count in large CPU count machine When a machine has more CPUs than queue pairs, e.g. 512 cores, the counting gets a little funky and turns off Flow Director with the message: not enough queues for Flow Director. Flow Director feature is disabled This patch limits the number of lan queues initially allocated to be sure we have some left for FD and other features. 
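
As an editorial illustration of the clamping this patch introduces: the LAN queue count ends up bounded by the RSS maximum, the online CPU count, the available Tx queue pairs and the MSI-X vector budget, so some queues remain for Flow Director. A minimal standalone sketch of that logic (illustrative only, not the driver code itself; the parameter names are placeholders for the capability values read from the hardware):

	static int clamp_lan_queues(int rss_size_max, int online_cpus,
				    int num_tx_qp, int num_msix_vectors)
	{
		int q_max = rss_size_max > online_cpus ? rss_size_max : online_cpus;

		/* Never claim more queues than the hardware has Tx queue
		 * pairs or MSI-X vectors; whatever is left over stays
		 * available for Flow Director and other features.
		 */
		if (q_max > num_tx_qp)
			q_max = num_tx_qp;
		if (q_max > num_msix_vectors)
			q_max = num_msix_vectors;

		return q_max;
	}
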
Signed-off-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 47f71d7c3ae0..387f0863f794 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11093,6 +11093,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit) static void i40e_determine_queue_usage(struct i40e_pf *pf) { int queues_left; + int q_max; pf->num_lan_qps = 0; @@ -11139,10 +11140,12 @@ static void i40e_determine_queue_usage(struct i40e_pf *pf) I40E_FLAG_DCB_ENABLED); dev_info(&pf->pdev->dev, "not enough queues for DCB. DCB is disabled.\n"); } - pf->num_lan_qps = max_t(int, pf->rss_size_max, - num_online_cpus()); - pf->num_lan_qps = min_t(int, pf->num_lan_qps, - pf->hw.func_caps.num_tx_qp); + + /* limit lan qps to the smaller of qps, cpus or msix */ + q_max = max_t(int, pf->rss_size_max, num_online_cpus()); + q_max = min_t(int, q_max, pf->hw.func_caps.num_tx_qp); + q_max = min_t(int, q_max, pf->hw.func_caps.num_msix_vectors); + pf->num_lan_qps = q_max; queues_left -= pf->num_lan_qps; } From 5872866e166c38ad1c1028fb9cf7dd756c0ef43e Mon Sep 17 00:00:00 2001 From: Lihong Yang Date: Thu, 27 Jul 2017 03:17:09 -0700 Subject: [PATCH 1164/1322] i40e: remove logically dead code This patch removes the !vf condition check that cannot be true in i40e_ndo_set_vf_trust function Detected by CoverityScan, CID 1397531 Logically dead code Signed-off-by: Lihong Yang Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index a75396c157d9..e6b95e1e1a33 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3354,8 +3354,6 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting) vf = &pf->vf[vf_id]; - if (!vf) - return -EINVAL; if (setting == vf->trusted) goto out; From 54902349ee95045b67e2f0c39b75f5418540064b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 6 Aug 2017 23:37:01 +0200 Subject: [PATCH 1165/1322] i40e: Fix a potential NULL pointer dereference If 'kzalloc()' fails, a NULL pointer will be dereferenced. Return an error code (-ENOMEM) instead. 
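
The fix follows the usual check-before-use pattern for kzalloc(). A hedged sketch of the resulting shape of the code (hypothetical helper and struct names, not the exact driver function):

	#include <linux/types.h>
	#include <linux/slab.h>

	struct example_qvlist_info {
		u32 num_vectors;
		/* ... */
	};

	/* Bail out with -ENOMEM when the allocation fails instead of
	 * dereferencing a NULL pointer right after it.
	 */
	static int example_alloc_qvlist(struct example_qvlist_info **info,
					size_t size, u32 num_vectors)
	{
		*info = kzalloc(size, GFP_KERNEL);
		if (!*info)
			return -ENOMEM;

		(*info)->num_vectors = num_vectors;
		return 0;
	}
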
Signed-off-by: Christophe JAILLET Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index e6b95e1e1a33..9e3667fc7f6a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -423,6 +423,9 @@ static int i40e_config_iwarp_qvlist(struct i40e_vf *vf, (sizeof(struct virtchnl_iwarp_qv_info) * (qvlist_info->num_vectors - 1)); vf->qvlist_info = kzalloc(size, GFP_KERNEL); + if (!vf->qvlist_info) + return -ENOMEM; + vf->qvlist_info->num_vectors = qvlist_info->num_vectors; msix_vf = pf->hw.func_caps.num_msix_vectors_vf; From d60bcc798000e015940fb47eb23b79dd2fda5c9e Mon Sep 17 00:00:00 2001 From: Filip Sadowski Date: Tue, 22 Aug 2017 06:57:43 -0400 Subject: [PATCH 1166/1322] i40e: Fix reporting of supported link modes This patch fixes incorrect reporting of supported link modes on some NICs. Signed-off-by: Filip Sadowski Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 20 +++++++++++++++++-- drivers/net/ethernet/intel/i40e/i40e_common.c | 11 +++++++++- .../ethernet/intel/i40evf/i40e_adminq_cmd.h | 20 +++++++++++++++++-- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index e2a9ec80a623..5d0291c1337e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -1734,6 +1734,8 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_10GBASE_CR1_CU = 0xB, I40E_PHY_TYPE_10GBASE_AOC = 0xC, I40E_PHY_TYPE_40GBASE_AOC = 0xD, + I40E_PHY_TYPE_UNRECOGNIZED = 0xE, + I40E_PHY_TYPE_UNSUPPORTED = 0xF, I40E_PHY_TYPE_100BASE_TX = 0x11, I40E_PHY_TYPE_1000BASE_T = 0x12, I40E_PHY_TYPE_10GBASE_T = 0x13, @@ -1752,6 +1754,8 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_25GBASE_CR = 0x20, I40E_PHY_TYPE_25GBASE_SR = 0x21, I40E_PHY_TYPE_25GBASE_LR = 0x22, + I40E_PHY_TYPE_EMPTY = 0xFE, + I40E_PHY_TYPE_DEFAULT = 0xFF, I40E_PHY_TYPE_MAX }; @@ -1942,19 +1946,31 @@ struct i40e_aqc_get_link_status { #define I40E_AQ_25G_SERDES_UCODE_ERR 0X04 #define I40E_AQ_25G_NIMB_UCODE_ERR 0X05 u8 loopback; /* use defines from i40e_aqc_set_lb_mode */ +/* Since firmware API 1.7 loopback field keeps power class info as well */ +#define I40E_AQ_LOOPBACK_MASK 0x07 +#define I40E_AQ_PWR_CLASS_SHIFT_LB 6 +#define I40E_AQ_PWR_CLASS_MASK_LB (0x03 << I40E_AQ_PWR_CLASS_SHIFT_LB) __le16 max_frame_size; u8 config; #define I40E_AQ_CONFIG_FEC_KR_ENA 0x01 #define I40E_AQ_CONFIG_FEC_RS_ENA 0x02 #define I40E_AQ_CONFIG_CRC_ENA 0x04 #define I40E_AQ_CONFIG_PACING_MASK 0x78 - u8 power_desc; + union { + struct { + u8 power_desc; #define I40E_AQ_LINK_POWER_CLASS_1 0x00 #define I40E_AQ_LINK_POWER_CLASS_2 0x01 #define I40E_AQ_LINK_POWER_CLASS_3 0x02 #define I40E_AQ_LINK_POWER_CLASS_4 0x03 #define I40E_AQ_PWR_CLASS_MASK 0x03 - u8 reserved[4]; + u8 reserved[4]; + }; + struct { + u8 link_type[4]; + u8 link_type_ext; + }; + }; }; I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status); diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 7346d8850c8e..64c15f4c9d2b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1821,7 +1821,7 @@ i40e_status i40e_aq_get_link_info(struct 
i40e_hw *hw, hw_link_info->fec_info = resp->config & (I40E_AQ_CONFIG_FEC_KR_ENA | I40E_AQ_CONFIG_FEC_RS_ENA); hw_link_info->ext_info = resp->ext_info; - hw_link_info->loopback = resp->loopback; + hw_link_info->loopback = resp->loopback & I40E_AQ_LOOPBACK_MASK; hw_link_info->max_frame_size = le16_to_cpu(resp->max_frame_size); hw_link_info->pacing = resp->config & I40E_AQ_CONFIG_PACING_MASK; @@ -1852,6 +1852,15 @@ i40e_status i40e_aq_get_link_info(struct i40e_hw *hw, hw->aq.fw_min_ver < 40)) && hw_link_info->phy_type == 0xE) hw_link_info->phy_type = I40E_PHY_TYPE_10GBASE_SFPP_CU; + if (hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver >= 7) { + __le32 tmp; + + memcpy(&tmp, resp->link_type, sizeof(tmp)); + hw->phy.phy_types = le32_to_cpu(tmp); + hw->phy.phy_types |= ((u64)resp->link_type_ext << 32); + } + /* save link status information */ if (link) *link = *hw_link_info; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h index f9f48d1900b0..709d114fc305 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h @@ -1730,6 +1730,8 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_10GBASE_CR1_CU = 0xB, I40E_PHY_TYPE_10GBASE_AOC = 0xC, I40E_PHY_TYPE_40GBASE_AOC = 0xD, + I40E_PHY_TYPE_UNRECOGNIZED = 0xE, + I40E_PHY_TYPE_UNSUPPORTED = 0xF, I40E_PHY_TYPE_100BASE_TX = 0x11, I40E_PHY_TYPE_1000BASE_T = 0x12, I40E_PHY_TYPE_10GBASE_T = 0x13, @@ -1748,6 +1750,8 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_25GBASE_CR = 0x20, I40E_PHY_TYPE_25GBASE_SR = 0x21, I40E_PHY_TYPE_25GBASE_LR = 0x22, + I40E_PHY_TYPE_EMPTY = 0xFE, + I40E_PHY_TYPE_DEFAULT = 0xFF, I40E_PHY_TYPE_MAX }; @@ -1938,19 +1942,31 @@ struct i40e_aqc_get_link_status { #define I40E_AQ_25G_SERDES_UCODE_ERR 0X04 #define I40E_AQ_25G_NIMB_UCODE_ERR 0X05 u8 loopback; /* use defines from i40e_aqc_set_lb_mode */ +/* Since firmware API 1.7 loopback field keeps power class info as well */ +#define I40E_AQ_LOOPBACK_MASK 0x07 +#define I40E_AQ_PWR_CLASS_SHIFT_LB 6 +#define I40E_AQ_PWR_CLASS_MASK_LB (0x03 << I40E_AQ_PWR_CLASS_SHIFT_LB) __le16 max_frame_size; u8 config; #define I40E_AQ_CONFIG_FEC_KR_ENA 0x01 #define I40E_AQ_CONFIG_FEC_RS_ENA 0x02 #define I40E_AQ_CONFIG_CRC_ENA 0x04 #define I40E_AQ_CONFIG_PACING_MASK 0x78 - u8 power_desc; + union { + struct { + u8 power_desc; #define I40E_AQ_LINK_POWER_CLASS_1 0x00 #define I40E_AQ_LINK_POWER_CLASS_2 0x01 #define I40E_AQ_LINK_POWER_CLASS_3 0x02 #define I40E_AQ_LINK_POWER_CLASS_4 0x03 #define I40E_AQ_PWR_CLASS_MASK 0x03 - u8 reserved[4]; + u8 reserved[4]; + }; + struct { + u8 link_type[4]; + u8 link_type_ext; + }; + }; }; I40E_CHECK_CMD_LENGTH(i40e_aqc_get_link_status); From 9c0e5caf6398d6b892dc5046c421890e32ab5fa3 Mon Sep 17 00:00:00 2001 From: Filip Sadowski Date: Tue, 22 Aug 2017 06:57:44 -0400 Subject: [PATCH 1167/1322] i40e: Add support for 'ethtool -m' This patch adds support for 'ethtool -m' command which displays information about (Q)SFP+ module plugged into NIC's cage. 
Signed-off-by: Filip Sadowski Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 18 ++ drivers/net/ethernet/intel/i40e/i40e_common.c | 69 ++++++++ .../net/ethernet/intel/i40e/i40e_ethtool.c | 154 ++++++++++++++++++ .../net/ethernet/intel/i40e/i40e_prototype.h | 9 + drivers/net/ethernet/intel/i40e/i40e_type.h | 13 ++ .../ethernet/intel/i40evf/i40e_adminq_cmd.h | 18 ++ .../net/ethernet/intel/i40evf/i40e_common.c | 69 ++++++++ .../ethernet/intel/i40evf/i40e_prototype.h | 9 + drivers/net/ethernet/intel/i40evf/i40e_type.h | 12 ++ 9 files changed, 371 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 5d0291c1337e..ed7bbe14bc6e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -244,6 +244,8 @@ enum i40e_admin_queue_opc { i40e_aqc_opc_set_phy_debug = 0x0622, i40e_aqc_opc_upload_ext_phy_fm = 0x0625, i40e_aqc_opc_run_phy_activity = 0x0626, + i40e_aqc_opc_set_phy_register = 0x0628, + i40e_aqc_opc_get_phy_register = 0x0629, /* NVM commands */ i40e_aqc_opc_nvm_read = 0x0701, @@ -2053,6 +2055,22 @@ struct i40e_aqc_run_phy_activity { I40E_CHECK_CMD_LENGTH(i40e_aqc_run_phy_activity); +/* Set PHY Register command (0x0628) */ +/* Get PHY Register command (0x0629) */ +struct i40e_aqc_phy_register_access { + u8 phy_interface; +#define I40E_AQ_PHY_REG_ACCESS_INTERNAL 0 +#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL 1 +#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE 2 + u8 dev_address; + u8 reserved1[2]; + __le32 reg_address; + __le32 reg_value; + u8 reserved2[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_phy_register_access); + /* NVM Read command (indirect 0x0701) * NVM Erase commands (direct 0x0702) * NVM Update commands (indirect 0x0703) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 64c15f4c9d2b..fada03799850 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -5062,6 +5062,75 @@ do_retry: wr32(hw, reg_addr, reg_val); } +/** + * i40e_aq_set_phy_register + * @hw: pointer to the hw struct + * @phy_select: select which phy should be accessed + * @dev_addr: PHY device address + * @reg_addr: PHY register address + * @reg_val: new register value + * @cmd_details: pointer to command details structure or NULL + * + * Write the external PHY register. + **/ +i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_phy_register_access *cmd = + (struct i40e_aqc_phy_register_access *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_register); + + cmd->phy_interface = phy_select; + cmd->dev_address = dev_addr; + cmd->reg_address = cpu_to_le32(reg_addr); + cmd->reg_value = cpu_to_le32(reg_val); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40e_aq_get_phy_register + * @hw: pointer to the hw struct + * @phy_select: select which phy should be accessed + * @dev_addr: PHY device address + * @reg_addr: PHY register address + * @reg_val: read register value + * @cmd_details: pointer to command details structure or NULL + * + * Read the external PHY register. 
+ **/ +i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_phy_register_access *cmd = + (struct i40e_aqc_phy_register_access *)&desc.params.raw; + i40e_status status; + + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_phy_register); + + cmd->phy_interface = phy_select; + cmd->dev_address = dev_addr; + cmd->reg_address = cpu_to_le32(reg_addr); + + status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); + if (!status) + *reg_val = le32_to_cpu(cmd->reg_value); + + return status; +} + /** * i40e_aq_write_ppp - Write pipeline personalization profile (ppp) * @hw: pointer to the hw struct diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 05e89864f781..1136d02e2e95 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -4196,6 +4196,158 @@ flags_complete: return 0; } +/** + * i40e_get_module_info - get (Q)SFP+ module type info + * @netdev: network interface device structure + * @modinfo: module EEPROM size and layout information structure + **/ +static int i40e_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + u32 sff8472_comp = 0; + u32 sff8472_swap = 0; + u32 sff8636_rev = 0; + i40e_status status; + u32 type = 0; + + /* Check if firmware supports reading module EEPROM. */ + if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) { + netdev_err(vsi->netdev, "Module EEPROM memory read not supported. Please update the NVM image.\n"); + return -EINVAL; + } + + status = i40e_update_link_info(hw); + if (status) + return -EIO; + + if (hw->phy.link_info.phy_type == I40E_PHY_TYPE_EMPTY) { + netdev_err(vsi->netdev, "Cannot read module EEPROM memory. No module connected.\n"); + return -EINVAL; + } + + type = hw->phy.link_info.module_type[0]; + + switch (type) { + case I40E_MODULE_TYPE_SFP: + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + I40E_I2C_EEPROM_DEV_ADDR, + I40E_MODULE_SFF_8472_COMP, + &sff8472_comp, NULL); + if (status) + return -EIO; + + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + I40E_I2C_EEPROM_DEV_ADDR, + I40E_MODULE_SFF_8472_SWAP, + &sff8472_swap, NULL); + if (status) + return -EIO; + + /* Check if the module requires address swap to access + * the other EEPROM memory page. + */ + if (sff8472_swap & I40E_MODULE_SFF_ADDR_MODE) { + netdev_warn(vsi->netdev, "Module address swap to access page 0xA2 is not supported.\n"); + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else if (sff8472_comp == 0x00) { + /* Module is not SFF-8472 compliant */ + modinfo->type = ETH_MODULE_SFF_8079; + modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + } + break; + case I40E_MODULE_TYPE_QSFP_PLUS: + /* Read from memory page 0. 
*/ + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + 0, + I40E_MODULE_REVISION_ADDR, + &sff8636_rev, NULL); + if (status) + return -EIO; + /* Determine revision compliance byte */ + if (sff8636_rev > 0x02) { + /* Module is SFF-8636 compliant */ + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN; + } + break; + case I40E_MODULE_TYPE_QSFP28: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = I40E_MODULE_QSFP_MAX_LEN; + break; + default: + netdev_err(vsi->netdev, "Module type unrecognized\n"); + return -EINVAL; + } + return 0; +} + +/** + * i40e_get_module_eeprom - fills buffer with (Q)SFP+ module memory contents + * @netdev: network interface device structure + * @ee: EEPROM dump request structure + * @data: buffer to be filled with EEPROM contents + **/ +static int i40e_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct i40e_netdev_priv *np = netdev_priv(netdev); + struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; + struct i40e_hw *hw = &pf->hw; + bool is_sfp = false; + i40e_status status; + u32 value = 0; + int i; + + if (!ee || !ee->len || !data) + return -EINVAL; + + if (hw->phy.link_info.module_type[0] == I40E_MODULE_TYPE_SFP) + is_sfp = true; + + for (i = 0; i < ee->len; i++) { + u32 offset = i + ee->offset; + u32 addr = is_sfp ? I40E_I2C_EEPROM_DEV_ADDR : 0; + + /* Check if we need to access the other memory page */ + if (is_sfp) { + if (offset >= ETH_MODULE_SFF_8079_LEN) { + offset -= ETH_MODULE_SFF_8079_LEN; + addr = I40E_I2C_EEPROM_DEV_ADDR2; + } + } else { + while (offset >= ETH_MODULE_SFF_8436_LEN) { + /* Compute memory page number and offset. 
*/ + offset -= ETH_MODULE_SFF_8436_LEN / 2; + addr++; + } + } + + status = i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, + addr, offset, &value, NULL); + if (status) + return -EIO; + data[i] = value; + } + return 0; +} + static const struct ethtool_ops i40e_ethtool_ops = { .get_drvinfo = i40e_get_drvinfo, .get_regs_len = i40e_get_regs_len, @@ -4228,6 +4380,8 @@ static const struct ethtool_ops i40e_ethtool_ops = { .set_rxfh = i40e_set_rxfh, .get_channels = i40e_get_channels, .set_channels = i40e_set_channels, + .get_module_info = i40e_get_module_info, + .get_module_eeprom = i40e_get_module_eeprom, .get_ts_info = i40e_get_ts_info, .get_priv_flags = i40e_get_priv_flags, .set_priv_flags = i40e_set_priv_flags, diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index a39b13197891..01502561035c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -360,6 +360,15 @@ i40e_status i40e_aq_rx_ctl_write_register(struct i40e_hw *hw, u32 reg_addr, u32 reg_val, struct i40e_asq_cmd_details *cmd_details); void i40e_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val); +i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details); + i40e_status i40e_read_phy_register_clause22(struct i40e_hw *hw, u16 reg, u8 phy_addr, u16 *value); i40e_status i40e_write_phy_register_clause22(struct i40e_hw *hw, diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index fd4bbdd88b57..8b0b9f826b7f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -428,6 +428,18 @@ struct i40e_nvm_access { u8 data[1]; }; +/* (Q)SFP module access definitions */ +#define I40E_I2C_EEPROM_DEV_ADDR 0xA0 +#define I40E_I2C_EEPROM_DEV_ADDR2 0xA2 +#define I40E_MODULE_TYPE_ADDR 0x00 +#define I40E_MODULE_REVISION_ADDR 0x01 +#define I40E_MODULE_SFF_8472_COMP 0x5E +#define I40E_MODULE_SFF_8472_SWAP 0x5C +#define I40E_MODULE_SFF_ADDR_MODE 0x04 +#define I40E_MODULE_TYPE_QSFP_PLUS 0x0D +#define I40E_MODULE_TYPE_QSFP28 0x11 +#define I40E_MODULE_QSFP_MAX_LEN 640 + /* PCI bus types */ enum i40e_bus_type { i40e_bus_type_unknown = 0, @@ -598,6 +610,7 @@ struct i40e_hw { struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */ #define I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE BIT_ULL(0) +#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2) u64 flags; /* debug mask */ diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h index 709d114fc305..eee7ece42b39 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h @@ -244,6 +244,8 @@ enum i40e_admin_queue_opc { i40e_aqc_opc_set_phy_debug = 0x0622, i40e_aqc_opc_upload_ext_phy_fm = 0x0625, i40e_aqc_opc_run_phy_activity = 0x0626, + i40e_aqc_opc_set_phy_register = 0x0628, + i40e_aqc_opc_get_phy_register = 0x0629, /* NVM commands */ i40e_aqc_opc_nvm_read = 0x0701, @@ -2046,6 +2048,22 @@ struct i40e_aqc_run_phy_activity { I40E_CHECK_CMD_LENGTH(i40e_aqc_run_phy_activity); +/* Set PHY Register command (0x0628) */ +/* Get PHY Register command (0x0629) */ +struct 
i40e_aqc_phy_register_access { + u8 phy_interface; +#define I40E_AQ_PHY_REG_ACCESS_INTERNAL 0 +#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL 1 +#define I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE 2 + u8 dev_address; + u8 reserved1[2]; + __le32 reg_address; + __le32 reg_value; + u8 reserved2[4]; +}; + +I40E_CHECK_CMD_LENGTH(i40e_aqc_phy_register_access); + /* NVM Read command (indirect 0x0701) * NVM Erase commands (direct 0x0702) * NVM Update commands (indirect 0x0703) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c index 8d3a2bfe186a..7d70bf69b249 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_common.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c @@ -1041,6 +1041,75 @@ do_retry: wr32(hw, reg_addr, reg_val); } +/** + * i40evf_aq_set_phy_register + * @hw: pointer to the hw struct + * @phy_select: select which phy should be accessed + * @dev_addr: PHY device address + * @reg_addr: PHY register address + * @reg_val: new register value + * @cmd_details: pointer to command details structure or NULL + * + * Reset the external PHY. + **/ +i40e_status i40evf_aq_set_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_phy_register_access *cmd = + (struct i40e_aqc_phy_register_access *)&desc.params.raw; + i40e_status status; + + i40evf_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_set_phy_register); + + cmd->phy_interface = phy_select; + cmd->dev_address = dev_addr; + cmd->reg_address = cpu_to_le32(reg_addr); + cmd->reg_value = cpu_to_le32(reg_val); + + status = i40evf_asq_send_command(hw, &desc, NULL, 0, cmd_details); + + return status; +} + +/** + * i40evf_aq_get_phy_register + * @hw: pointer to the hw struct + * @phy_select: select which phy should be accessed + * @dev_addr: PHY device address + * @reg_addr: PHY register address + * @reg_val: read register value + * @cmd_details: pointer to command details structure or NULL + * + * Reset the external PHY. 
+ **/ +i40e_status i40evf_aq_get_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_desc desc; + struct i40e_aqc_phy_register_access *cmd = + (struct i40e_aqc_phy_register_access *)&desc.params.raw; + i40e_status status; + + i40evf_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_phy_register); + + cmd->phy_interface = phy_select; + cmd->dev_address = dev_addr; + cmd->reg_address = cpu_to_le32(reg_addr); + + status = i40evf_asq_send_command(hw, &desc, NULL, 0, cmd_details); + if (!status) + *reg_val = le32_to_cpu(cmd->reg_value); + + return status; +} + /** * i40e_aq_send_msg_to_pf * @hw: pointer to the hardware structure diff --git a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h index c9836bba487d..b624b5994075 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h @@ -111,6 +111,15 @@ i40e_status i40evf_aq_rx_ctl_write_register(struct i40e_hw *hw, u32 reg_addr, u32 reg_val, struct i40e_asq_cmd_details *cmd_details); void i40evf_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val); +i40e_status i40e_aq_set_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 reg_val, + struct i40e_asq_cmd_details *cmd_details); +i40e_status i40e_aq_get_phy_register(struct i40e_hw *hw, + u8 phy_select, u8 dev_addr, + u32 reg_addr, u32 *reg_val, + struct i40e_asq_cmd_details *cmd_details); + i40e_status i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg, u8 phy_addr, u16 *value); i40e_status i40e_write_phy_register(struct i40e_hw *hw, u8 page, diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h index 2ea919d9cdcf..b53584e3d580 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_type.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h @@ -401,6 +401,18 @@ struct i40e_nvm_access { u8 data[1]; }; +/* (Q)SFP module access definitions */ +#define I40E_I2C_EEPROM_DEV_ADDR 0xA0 +#define I40E_I2C_EEPROM_DEV_ADDR2 0xA2 +#define I40E_MODULE_TYPE_ADDR 0x00 +#define I40E_MODULE_REVISION_ADDR 0x01 +#define I40E_MODULE_SFF_8472_COMP 0x5E +#define I40E_MODULE_SFF_8472_SWAP 0x5C +#define I40E_MODULE_SFF_ADDR_MODE 0x04 +#define I40E_MODULE_TYPE_QSFP_PLUS 0x0D +#define I40E_MODULE_TYPE_QSFP28 0x11 +#define I40E_MODULE_QSFP_MAX_LEN 640 + /* PCI bus types */ enum i40e_bus_type { i40e_bus_type_unknown = 0, From 00f6c2f5e20bbdb638e58c50c6e6b1d8b796d6f6 Mon Sep 17 00:00:00 2001 From: Mariusz Stachura Date: Tue, 22 Aug 2017 06:57:45 -0400 Subject: [PATCH 1168/1322] i40e: use admin queue for setting LEDs behavior Instead of accessing register directly, use newly added AQC in order to blink LEDs. Introduce and utilize a new flag to prevent excessive API version checking. 
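In sketch form, the access pattern implemented by the new i40e_led_get_reg()/i40e_led_set_reg() helpers below is a single capability check followed by one of two paths (illustrative excerpt mirroring the hunks in this patch, not additional code):

	if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)
		/* firmware performs the PHY access on our behalf */
		status = i40e_aq_get_phy_register(hw,
						  I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
						  I40E_PHY_COM_REG_PAGE,
						  I40E_PHY_LED_PROV_REG_1,
						  reg_val, NULL);
	else
		/* older firmware: fall back to direct clause 45 MDIO access */
		status = i40e_read_phy_register_clause45(hw,
							 I40E_PHY_COM_REG_PAGE,
							 led_addr, phy_addr,
							 (u16 *)reg_val);

The I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE flag is set once in i40e_init_adminq() from the MAC type and firmware API version, so callers no longer repeat the version check.
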
Signed-off-by: Mariusz Stachura Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_adminq.c | 6 + drivers/net/ethernet/intel/i40e/i40e_common.c | 113 ++++++++++++++---- drivers/net/ethernet/intel/i40evf/i40e_type.h | 2 + 3 files changed, 98 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index ba04988e0598..08f63226105a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -607,6 +607,12 @@ i40e_status i40e_init_adminq(struct i40e_hw *hw) &oem_lo); hw->nvm.oem_ver = ((u32)oem_hi << 16) | oem_lo; + if (hw->mac.type == I40E_MAC_XL710 && + hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR && + hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710) { + hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE; + } + if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) { ret_code = I40E_ERR_FIRMWARE_API_VERSION; goto init_adminq_free_arq; diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index fada03799850..a4838779de5d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -4836,6 +4836,74 @@ phy_blinking_end: return status; } +/** + * i40e_led_get_reg - read LED register + * @hw: pointer to the HW structure + * @led_addr: LED register address + * @reg_val: read register value + **/ +static enum i40e_status_code i40e_led_get_reg(struct i40e_hw *hw, u16 led_addr, + u32 *reg_val) +{ + enum i40e_status_code status; + u8 phy_addr = 0; + u8 port_num; + u32 i; + + *reg_val = 0; + if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) { + status = + i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL, + I40E_PHY_COM_REG_PAGE, + I40E_PHY_LED_PROV_REG_1, + reg_val, NULL); + } else { + i = rd32(hw, I40E_PFGEN_PORTNUM); + port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); + phy_addr = i40e_get_phy_address(hw, port_num); + status = i40e_read_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, + (u16 *)reg_val); + } + return status; +} + +/** + * i40e_led_set_reg - write LED register + * @hw: pointer to the HW structure + * @led_addr: LED register address + * @reg_val: register value to write + **/ +static enum i40e_status_code i40e_led_set_reg(struct i40e_hw *hw, u16 led_addr, + u32 reg_val) +{ + enum i40e_status_code status; + u8 phy_addr = 0; + u8 port_num; + u32 i; + + if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) { + status = + i40e_aq_set_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL, + I40E_PHY_COM_REG_PAGE, + I40E_PHY_LED_PROV_REG_1, + reg_val, NULL); + } else { + i = rd32(hw, I40E_PFGEN_PORTNUM); + port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); + phy_addr = i40e_get_phy_address(hw, port_num); + status = i40e_write_phy_register_clause45(hw, + I40E_PHY_COM_REG_PAGE, + led_addr, phy_addr, + (u16)reg_val); + } + + return status; +} + /** * i40e_led_get_phy - return current on/off mode * @hw: pointer to the hw struct @@ -4853,7 +4921,19 @@ i40e_status i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr, u16 temp_addr; u8 port_num; u32 i; + u32 reg_val_aq; + if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) { + status = + i40e_aq_get_phy_register(hw, + I40E_AQ_PHY_REG_ACCESS_EXTERNAL, + I40E_PHY_COM_REG_PAGE, + I40E_PHY_LED_PROV_REG_1, + ®_val_aq, NULL); + if (status == I40E_SUCCESS) + *val = (u16)reg_val_aq; + return status; + } temp_addr = 
I40E_PHY_LED_PROV_REG_1; i = rd32(hw, I40E_PFGEN_PORTNUM); port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); @@ -4888,51 +4968,38 @@ i40e_status i40e_led_set_phy(struct i40e_hw *hw, bool on, u16 led_addr, u32 mode) { i40e_status status = 0; - u16 led_ctl = 0; - u16 led_reg = 0; - u8 phy_addr = 0; - u8 port_num; - u32 i; + u32 led_ctl = 0; + u32 led_reg = 0; - i = rd32(hw, I40E_PFGEN_PORTNUM); - port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK); - phy_addr = i40e_get_phy_address(hw, port_num); - status = i40e_read_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, &led_reg); + status = i40e_led_get_reg(hw, led_addr, &led_reg); if (status) return status; led_ctl = led_reg; if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) { led_reg = 0; - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, - led_reg); + status = i40e_led_set_reg(hw, led_addr, led_reg); if (status) return status; } - status = i40e_read_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, &led_reg); + status = i40e_led_get_reg(hw, led_addr, &led_reg); if (status) goto restore_config; if (on) led_reg = I40E_PHY_LED_MANUAL_ON; else led_reg = 0; - status = i40e_write_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_reg); + + status = i40e_led_set_reg(hw, led_addr, led_reg); if (status) goto restore_config; if (mode & I40E_PHY_LED_MODE_ORIG) { led_ctl = (mode & I40E_PHY_LED_MODE_MASK); - status = i40e_write_phy_register_clause45(hw, - I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_ctl); + status = i40e_led_set_reg(hw, led_addr, led_ctl); } return status; + restore_config: - status = i40e_write_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE, - led_addr, phy_addr, led_ctl); + status = i40e_led_set_reg(hw, led_addr, led_ctl); return status; } diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h index b53584e3d580..48eacf5e73e4 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_type.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h @@ -568,6 +568,8 @@ struct i40e_hw { /* LLDP/DCBX Status */ u16 dcbx_status; +#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2) + /* DCBX info */ struct i40e_dcbx_config local_dcbx_config; /* Oper/Local Cfg */ struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */ From ba4e003d29c1d32776f156695fb00adf7df86ee2 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 22 Aug 2017 06:57:46 -0400 Subject: [PATCH 1169/1322] i40e: don't hold spinlock while resetting VF When we refactored handling of the PVID in commit 9af52f60b2d9 ("i40e: use (add|rm)_vlan_all_mac helper functions when changing PVID") we introduced a scheduling while atomic regression. This occurred because we now held the spinlock across a call to i40e_reset_vf(), which results in a usleep_range() call that triggers a scheduling while atomic bug. This was rare as it only occurred if the user configured a VLAN on a VF and also attempted to reconfigure the VF from the host system with a port VLAN. We do need to hold the lock while calling i40e_is_vsi_in_vlan(), but we should not be holding it while we reset the VF. We'll fix this by introducing a separate helper function i40e_vsi_has_vlans which checks whether we have a PVID and whether the VSI has configured VLANs. This helper function will manage its own need for the mac_filter_hash_lock. Then, we can move the acquiring of the spinlock until after we reset the VF, which ensures that we do not sleep while holding the lock. 
Using a separate function like this makes the code more clear and is easier to read than attempting to release and re-acquire the spinlock when we reset the VF. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 36 ++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 9e3667fc7f6a..53ead127b293 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2925,6 +2925,34 @@ error_param: return ret; } +/** + * i40e_vsi_has_vlans - True if VSI has configured VLANs + * @vsi: pointer to the vsi + * + * Check if a VSI has configured any VLANs. False if we have a port VLAN or if + * we have no configured VLANs. Do not call while holding the + * mac_filter_hash_lock. + */ +static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi) +{ + bool have_vlans; + + /* If we have a port VLAN, then the VSI cannot have any VLANs + * configured, as all MAC/VLAN filters will be assigned to the PVID. + */ + if (vsi->info.pvid) + return false; + + /* Since we don't have a PVID, we know that if the device is in VLAN + * mode it must be because of a VLAN filter configured on this VSI. + */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + have_vlans = i40e_is_vsi_in_vlan(vsi); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + + return have_vlans; +} + /** * i40e_ndo_set_vf_port_vlan * @netdev: network interface device structure @@ -2977,10 +3005,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, /* duplicate request, so just return success */ goto error_pvid; - /* Locked once because multiple functions below iterate list */ - spin_lock_bh(&vsi->mac_filter_hash_lock); - - if (le16_to_cpu(vsi->info.pvid) == 0 && i40e_is_vsi_in_vlan(vsi)) { + if (i40e_vsi_has_vlans(vsi)) { dev_err(&pf->pdev->dev, "VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n", vf_id); @@ -2993,6 +3018,9 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, vsi = pf->vsi[vf->lan_vsi_idx]; } + /* Locked once because multiple functions below iterate list */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + /* Check for condition where there was already a port VLAN ID * filter set and now it is being deleted by setting it to zero. * Additionally check for the condition where there was a port From eeeddbb80640ef63466a54bc118f66c81487bc42 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 22 Aug 2017 06:57:47 -0400 Subject: [PATCH 1170/1322] i40e: drop i40e_pf *pf from i40e_vc_disable_vf() It's never used, and the vf structure could get back to the PF if necessary. Lets just drop the extra unneeded parameter. 
Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 53ead127b293..70a79864177a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -154,12 +154,11 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf) /** * i40e_vc_disable_vf - * @pf: pointer to the PF info * @vf: pointer to the VF info * * Disable the VF through a SW reset **/ -static inline void i40e_vc_disable_vf(struct i40e_pf *pf, struct i40e_vf *vf) +static inline void i40e_vc_disable_vf(struct i40e_vf *vf) { i40e_vc_notify_vf_reset(vf); i40e_reset_vf(vf, false); @@ -2918,7 +2917,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) } /* Force the VF driver stop so it has to reload with new MAC address */ - i40e_vc_disable_vf(pf, vf); + i40e_vc_disable_vf(vf); dev_info(&pf->pdev->dev, "Reload the VF driver to make this change effective.\n"); error_param: @@ -3013,7 +3012,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, * the right thing by reconfiguring his network correctly * and then reloading the VF driver. */ - i40e_vc_disable_vf(pf, vf); + i40e_vc_disable_vf(vf); /* During reset the VF got a new VSI, so refresh the pointer. */ vsi = pf->vsi[vf->lan_vsi_idx]; } From f18d20218a14d11d8fd6ed32e66ad199c8c93280 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 22 Aug 2017 06:57:48 -0400 Subject: [PATCH 1171/1322] i40e: make use of i40e_vc_disable_vf Replace i40e_vc_notify_vf_reset and i40e_reset_vf with a call to i40e_vc_disable_vf which does this exact thing. This matches similar code patterns throughout the driver. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 70a79864177a..94ee243f110e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3388,8 +3388,7 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting) goto out; vf->trusted = setting; - i40e_vc_notify_vf_reset(vf); - i40e_reset_vf(vf, false); + i40e_vc_disable_vf(vf); dev_info(&pf->pdev->dev, "VF %u is now %strusted\n", vf_id, setting ? "" : "un"); out: From d43d60e5eb9504aa6f8f390aa0313cc8e3816b82 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 22 Aug 2017 06:57:49 -0400 Subject: [PATCH 1172/1322] i40e: ensure reset occurs when disabling VF It is possible although rare that we may not reset when i40e_vc_disable_vf() is called. This can lead to some weird circumstances with some values not being properly set. Modify i40e_reset_vf() to return a code indicating whether it reset or not. Now, i40e_vc_disable_vf() can wait until a reset actually occurs. If it fails to free up within a reasonable time frame we'll display a warning message. 
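For context on the bound: the wait added below is a retry loop of up to 20 iterations with a usleep_range(10000, 20000) between attempts, so the total time before the warning is printed works out to roughly 20 x 10-20 ms, i.e. about 200-400 ms in the worst case.
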
Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 42 +++++++++++++++---- .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 4 +- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 94ee243f110e..7742cf3d38d9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -156,12 +156,28 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf) * i40e_vc_disable_vf * @vf: pointer to the VF info * - * Disable the VF through a SW reset + * Disable the VF through a SW reset. **/ static inline void i40e_vc_disable_vf(struct i40e_vf *vf) { + int i; + i40e_vc_notify_vf_reset(vf); - i40e_reset_vf(vf, false); + + /* We want to ensure that an actual reset occurs initiated after this + * function was called. However, we do not want to wait forever, so + * we'll give a reasonable time and print a message if we failed to + * ensure a reset. + */ + for (i = 0; i < 20; i++) { + if (i40e_reset_vf(vf, false)) + return; + usleep_range(10000, 20000); + } + + dev_warn(&vf->pf->pdev->dev, + "Failed to initiate reset for VF %d after 200 milliseconds\n", + vf->vf_id); } /** @@ -1051,9 +1067,9 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf) * @vf: pointer to the VF structure * @flr: VFLR was issued or not * - * reset the VF + * Returns true if the VF is reset, false otherwise. **/ -void i40e_reset_vf(struct i40e_vf *vf, bool flr) +bool i40e_reset_vf(struct i40e_vf *vf, bool flr) { struct i40e_pf *pf = vf->pf; struct i40e_hw *hw = &pf->hw; @@ -1061,9 +1077,11 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr) u32 reg; int i; - /* If VFs have been disabled, there is no need to reset */ + /* If the VFs have been disabled, this means something else is + * resetting the VF, so we shouldn't continue. + */ if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) - return; + return false; i40e_trigger_vf_reset(vf, flr); @@ -1100,6 +1118,8 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr) i40e_flush(hw); clear_bit(__I40E_VF_DISABLE, pf->state); + + return true; } /** @@ -1111,8 +1131,10 @@ void i40e_reset_vf(struct i40e_vf *vf, bool flr) * VF, then do all the waiting in one chunk, and finally finish restoring each * VF after the wait. This is useful during PF routines which need to reset * all VFs, as otherwise it must perform these resets in a serialized fashion. + * + * Returns true if any VFs were reset, and false otherwise. 
**/ -void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) +bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) { struct i40e_hw *hw = &pf->hw; struct i40e_vf *vf; @@ -1121,11 +1143,11 @@ void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* If we don't have any VFs, then there is nothing to reset */ if (!pf->num_alloc_vfs) - return; + return false; /* If VFs have been disabled, there is no need to reset */ if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) - return; + return false; /* Begin reset on all VFs at once */ for (v = 0; v < pf->num_alloc_vfs; v++) @@ -1200,6 +1222,8 @@ void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) i40e_flush(hw); clear_bit(__I40E_VF_DISABLE, pf->state); + + return true; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 5111d05d5f2f..5ea42ad094bc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -122,8 +122,8 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs); int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen); int i40e_vc_process_vflr_event(struct i40e_pf *pf); -void i40e_reset_vf(struct i40e_vf *vf, bool flr); -void i40e_reset_all_vfs(struct i40e_pf *pf, bool flr); +bool i40e_reset_vf(struct i40e_vf *vf, bool flr); +bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr); void i40e_vc_notify_vf_reset(struct i40e_vf *vf); /* VF configuration related iplink handlers */ From 5b36e8d04b4439c9ceb814bfdfe1284737f9c632 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Tue, 22 Aug 2017 06:57:50 -0400 Subject: [PATCH 1173/1322] i40evf: Enable VF to request an alternate queue allocation Currently the VF gets a default number of allocated queues from HW on init and it could choose to enable or disable those allocated queues. This makes it such that the VF can request more or less underlying allocated queues from the PF. First the VF negotiates the number of queues it wants that can be supported by the PF and if successful asks for a reset. During reset the PF will reallocate the HW queues for the VF and will then remap the new queues. 
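From user space the request is driven through the new set_channels ethtool op, e.g. "ethtool -L ens1f0v0 combined 4" (interface name illustrative): the VF validates the count against I40EVF_MAX_REQ_QUEUES, sends VIRTCHNL_OP_REQUEST_QUEUES to the PF, and on a successful response schedules a reset so the new queue allocation takes effect.
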
Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40evf/i40evf.h | 4 + .../ethernet/intel/i40evf/i40evf_ethtool.c | 38 +++++++- .../net/ethernet/intel/i40evf/i40evf_main.c | 94 ++++++++++++++++++- .../ethernet/intel/i40evf/i40evf_virtchnl.c | 44 ++++++++- 4 files changed, 173 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index 82f69031e5cd..5982362c5643 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -102,6 +102,7 @@ struct i40e_vsi { #define I40E_TX_CTXTDESC(R, i) \ (&(((struct i40e_tx_context_desc *)((R)->desc))[i])) #define MAX_QUEUES 16 +#define I40EVF_MAX_REQ_QUEUES 4 #define I40EVF_HKEY_ARRAY_SIZE ((I40E_VFQF_HKEY_MAX_INDEX + 1) * 4) #define I40EVF_HLUT_ARRAY_SIZE ((I40E_VFQF_HLUT_MAX_INDEX + 1) * 4) @@ -200,6 +201,7 @@ struct i40evf_adapter { struct list_head vlan_filter_list; char misc_vector_name[IFNAMSIZ + 9]; int num_active_queues; + int num_req_queues; /* TX */ struct i40e_ring *tx_rings; @@ -235,6 +237,7 @@ struct i40evf_adapter { #define I40EVF_FLAG_PROMISC_ON BIT(18) #define I40EVF_FLAG_ALLMULTI_ON BIT(19) #define I40EVF_FLAG_LEGACY_RX BIT(20) +#define I40EVF_FLAG_REINIT_ITR_NEEDED BIT(21) /* duplicates for common code */ #define I40E_FLAG_DCB_ENABLED 0 #define I40E_FLAG_RX_CSUM_ENABLED I40EVF_FLAG_RX_CSUM_ENABLED @@ -349,6 +352,7 @@ void i40evf_deconfigure_queues(struct i40evf_adapter *adapter); void i40evf_enable_queues(struct i40evf_adapter *adapter); void i40evf_disable_queues(struct i40evf_adapter *adapter); void i40evf_map_queues(struct i40evf_adapter *adapter); +int i40evf_request_queues(struct i40evf_adapter *adapter, int num); void i40evf_add_ether_addrs(struct i40evf_adapter *adapter); void i40evf_del_ether_addrs(struct i40evf_adapter *adapter); void i40evf_add_vlans(struct i40evf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index 65874d6b3ab9..da006fa3fec1 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c @@ -669,7 +669,7 @@ static void i40evf_get_channels(struct net_device *netdev, struct i40evf_adapter *adapter = netdev_priv(netdev); /* Report maximum channels */ - ch->max_combined = adapter->num_active_queues; + ch->max_combined = I40EVF_MAX_REQ_QUEUES; ch->max_other = NONQ_VECS; ch->other_count = NONQ_VECS; @@ -677,6 +677,41 @@ static void i40evf_get_channels(struct net_device *netdev, ch->combined_count = adapter->num_active_queues; } +/** + * i40evf_set_channels: set the new channel count + * @netdev: network interface device structure + * @ch: channel information structure + * + * Negotiate a new number of channels with the PF then do a reset. During + * reset we'll realloc queues and fix the RSS table. Returns 0 on success, + * negative on failure. + **/ +static int i40evf_set_channels(struct net_device *netdev, + struct ethtool_channels *ch) +{ + struct i40evf_adapter *adapter = netdev_priv(netdev); + int num_req = ch->combined_count; + + if (num_req != adapter->num_active_queues && + !(adapter->vf_res->vf_cap_flags & + VIRTCHNL_VF_OFFLOAD_REQ_QUEUES)) { + dev_info(&adapter->pdev->dev, "PF is not capable of queue negotiation.\n"); + return -EINVAL; + } + + /* All of these should have already been checked by ethtool before this + * even gets to us, but just to be sure. 
+ */ + if (num_req <= 0 || num_req > I40EVF_MAX_REQ_QUEUES) + return -EINVAL; + + if (ch->rx_count || ch->tx_count || ch->other_count != NONQ_VECS) + return -EINVAL; + + adapter->num_req_queues = num_req; + return i40evf_request_queues(adapter, num_req); +} + /** * i40evf_get_rxfh_key_size - get the RSS hash key size * @netdev: network interface device structure @@ -785,6 +820,7 @@ static const struct ethtool_ops i40evf_ethtool_ops = { .get_rxfh = i40evf_get_rxfh, .set_rxfh = i40evf_set_rxfh, .get_channels = i40evf_get_channels, + .set_channels = i40evf_set_channels, .get_rxfh_key_size = i40evf_get_rxfh_key_size, .get_link_ksettings = i40evf_get_link_ksettings, }; diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 69ef6c1d5364..8c513ce84345 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -1189,9 +1189,18 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter) { int i, num_active_queues; - num_active_queues = min_t(int, - adapter->vsi_res->num_queue_pairs, - (int)(num_online_cpus())); + /* If we're in reset reallocating queues we don't actually know yet for + * certain the PF gave us the number of queues we asked for but we'll + * assume it did. Once basic reset is finished we'll confirm once we + * start negotiating config with PF. + */ + if (adapter->num_req_queues) + num_active_queues = adapter->num_req_queues; + else + num_active_queues = min_t(int, + adapter->vsi_res->num_queue_pairs, + (int)(num_online_cpus())); + adapter->tx_rings = kcalloc(num_active_queues, sizeof(struct i40e_ring), GFP_KERNEL); @@ -1539,6 +1548,48 @@ static void i40evf_free_rss(struct i40evf_adapter *adapter) adapter->rss_lut = NULL; } +/** + * i40evf_reinit_interrupt_scheme - Reallocate queues and vectors + * @adapter: board private structure + * + * Returns 0 on success, negative on failure + **/ +static int i40evf_reinit_interrupt_scheme(struct i40evf_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + int err; + + if (netif_running(netdev)) + i40evf_free_traffic_irqs(adapter); + i40evf_free_misc_irq(adapter); + i40evf_reset_interrupt_capability(adapter); + i40evf_free_q_vectors(adapter); + i40evf_free_queues(adapter); + + err = i40evf_init_interrupt_scheme(adapter); + if (err) + goto err; + + netif_tx_stop_all_queues(netdev); + + err = i40evf_request_misc_irq(adapter); + if (err) + goto err; + + set_bit(__I40E_VSI_DOWN, adapter->vsi.state); + + err = i40evf_map_rings_to_vectors(adapter); + if (err) + goto err; + + if (RSS_AQ(adapter)) + adapter->aq_required |= I40EVF_FLAG_AQ_CONFIGURE_RSS; + else + err = i40evf_init_rss(adapter); +err: + return err; +} + /** * i40evf_watchdog_timer - Periodic call-back timer * @data: pointer to adapter disguised as unsigned long @@ -1885,8 +1936,15 @@ continue_reset: if (err) dev_info(&adapter->pdev->dev, "Failed to init adminq: %d\n", err); + adapter->aq_required = 0; - adapter->aq_required = I40EVF_FLAG_AQ_GET_CONFIG; + if (adapter->flags & I40EVF_FLAG_REINIT_ITR_NEEDED) { + err = i40evf_reinit_interrupt_scheme(adapter); + if (err) + goto reset_err; + } + + adapter->aq_required |= I40EVF_FLAG_AQ_GET_CONFIG; adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS; /* re-add all MAC filters */ @@ -1916,6 +1974,15 @@ continue_reset: if (err) goto reset_err; + if (adapter->flags & I40EVF_FLAG_REINIT_ITR_NEEDED) { + err = i40evf_request_traffic_irqs(adapter, + netdev->name); + if (err) + goto reset_err; + + adapter->flags 
&= ~I40EVF_FLAG_REINIT_ITR_NEEDED; + } + i40evf_configure(adapter); i40evf_up_complete(adapter); @@ -2431,9 +2498,9 @@ static int i40evf_check_reset_complete(struct i40e_hw *hw) int i40evf_process_config(struct i40evf_adapter *adapter) { struct virtchnl_vf_resource *vfres = adapter->vf_res; + int i, num_req_queues = adapter->num_req_queues; struct net_device *netdev = adapter->netdev; struct i40e_vsi *vsi = &adapter->vsi; - int i; netdev_features_t hw_enc_features; netdev_features_t hw_features; @@ -2447,6 +2514,23 @@ int i40evf_process_config(struct i40evf_adapter *adapter) return -ENODEV; } + if (num_req_queues && + num_req_queues != adapter->vsi_res->num_queue_pairs) { + /* Problem. The PF gave us fewer queues than what we had + * negotiated in our request. Need a reset to see if we can't + * get back to a working state. + */ + dev_err(&adapter->pdev->dev, + "Requested %d queues, but PF only gave us %d.\n", + num_req_queues, + adapter->vsi_res->num_queue_pairs); + adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED; + adapter->num_req_queues = adapter->vsi_res->num_queue_pairs; + i40evf_schedule_reset(adapter); + return -ENODEV; + } + adapter->num_req_queues = 0; + hw_enc_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 2bb0fe00361f..2bb81c39d85f 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -160,7 +160,8 @@ int i40evf_send_vf_config_msg(struct i40evf_adapter *adapter) VIRTCHNL_VF_OFFLOAD_WB_ON_ITR | VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 | VIRTCHNL_VF_OFFLOAD_ENCAP | - VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM; + VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM | + VIRTCHNL_VF_OFFLOAD_REQ_QUEUES; adapter->current_op = VIRTCHNL_OP_GET_VF_RESOURCES; adapter->aq_required &= ~I40EVF_FLAG_AQ_GET_CONFIG; @@ -384,6 +385,32 @@ void i40evf_map_queues(struct i40evf_adapter *adapter) kfree(vimi); } +/** + * i40evf_request_queues + * @adapter: adapter structure + * @num: number of requested queues + * + * We get a default number of queues from the PF. This enables us to request a + * different number. 
Returns 0 on success, negative on failure + **/ +int i40evf_request_queues(struct i40evf_adapter *adapter, int num) +{ + struct virtchnl_vf_res_request vfres; + + if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { + /* bail because we already have a command pending */ + dev_err(&adapter->pdev->dev, "Cannot request queues, command %d pending\n", + adapter->current_op); + return -EBUSY; + } + + vfres.num_queue_pairs = num; + + adapter->current_op = VIRTCHNL_OP_REQUEST_QUEUES; + return i40evf_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES, + (u8 *)&vfres, sizeof(vfres)); +} + /** * i40evf_add_ether_addrs * @adapter: adapter structure @@ -1068,6 +1095,21 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter, "Invalid message %d from PF\n", v_opcode); } break; + case VIRTCHNL_OP_REQUEST_QUEUES: { + struct virtchnl_vf_res_request *vfres = + (struct virtchnl_vf_res_request *)msg; + if (vfres->num_queue_pairs == adapter->num_req_queues) { + adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED; + i40evf_schedule_reset(adapter); + } else { + dev_info(&adapter->pdev->dev, + "Requested %d queues, PF can support %d\n", + adapter->num_req_queues, + vfres->num_queue_pairs); + adapter->num_req_queues = 0; + } + } + break; default: if (adapter->current_op && (v_opcode != adapter->current_op)) dev_warn(&adapter->pdev->dev, "Expected response %d from PF, received %d\n", From 1b7b7596aeebc21913bad49eb6a2c364c4b2988a Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Tue, 22 Aug 2017 06:57:51 -0400 Subject: [PATCH 1174/1322] i40e: make i40evf_map_rings_to_vectors void This function cannot fail, so why is it returning a value? And why are we checking it? Why shouldn't we just make it void? Why is this commit message made up of only questions? Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 8c513ce84345..f2f1e754c2ce 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -430,12 +430,11 @@ i40evf_map_vector_to_txq(struct i40evf_adapter *adapter, int v_idx, int t_idx) * group the rings as "efficiently" as possible. You would add new * mapping configurations in here. **/ -static int i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter) +static void i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter) { int rings_remaining = adapter->num_active_queues; int ridx = 0, vidx = 0; int q_vectors; - int err = 0; q_vectors = adapter->num_msix_vectors - NONQ_VECS; @@ -451,8 +450,6 @@ static int i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter) } adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS; - - return err; } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -1578,9 +1575,7 @@ static int i40evf_reinit_interrupt_scheme(struct i40evf_adapter *adapter) set_bit(__I40E_VSI_DOWN, adapter->vsi.state); - err = i40evf_map_rings_to_vectors(adapter); - if (err) - goto err; + i40evf_map_rings_to_vectors(adapter); if (RSS_AQ(adapter)) adapter->aq_required |= I40EVF_FLAG_AQ_CONFIGURE_RSS; From 41d0a4d0c8b144e44d92ea95e975d2434748d806 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Tue, 22 Aug 2017 06:57:52 -0400 Subject: [PATCH 1175/1322] i40e: fix handling of vf_states variable Currently we inappropriately clear the vf_states variable with a null assignment. 
This is problematic because we should be using atomic bitops on this variable and we don't actually want to clear all the flags. We should just clear the ones we know we want to clear. Additionally remove the I40E_VF_STATE_FCOEENA bit because it is no longer being used. Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 5 ++++- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 7742cf3d38d9..989a65d60ac9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -879,7 +879,8 @@ static void i40e_free_vf_res(struct i40e_vf *vf) } /* reset some of the state variables keeping track of the resources */ vf->num_queue_pairs = 0; - vf->vf_states = 0; + clear_bit(I40E_VF_STATE_MC_PROMISC, &vf->vf_states); + clear_bit(I40E_VF_STATE_UC_PROMISC, &vf->vf_states); } /** @@ -1586,6 +1587,8 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_IWARP)) { vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_IWARP; set_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states); + } else { + clear_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states); } if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 5ea42ad094bc..5efc4f92bb37 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -56,7 +56,6 @@ enum i40e_vf_states { I40E_VF_STATE_INIT = 0, I40E_VF_STATE_ACTIVE, I40E_VF_STATE_IWARPENA, - I40E_VF_STATE_FCOEENA, I40E_VF_STATE_DISABLED, I40E_VF_STATE_MC_PROMISC, I40E_VF_STATE_UC_PROMISC, From c53d11f669c0e7d0daf46a717b6712ad0b09de99 Mon Sep 17 00:00:00 2001 From: Alan Brady Date: Tue, 22 Aug 2017 06:57:53 -0400 Subject: [PATCH 1176/1322] i40e: fix client notify of VF reset Currently there is a bug in which the PF driver fails to inform clients of a VF reset which then causes clients to leak resources. The bug exists because we were incorrectly checking the I40E_VF_STATE_PRE_ENABLE bit. When a VF is first init we go through a reset to initialize variables and allocate resources but we don't want to inform clients of this first reset since the client isn't fully enabled yet so we set a state bit signifying we're in a "pre-enabled" client state. During the first reset we should be clearing the bit, allowing all following resets to notify the client of the reset when the bit is not set. This patch fixes the issue by negating the 'test_and_clear_bit' check to accurately reflect the behavior we want. 
Signed-off-by: Alan Brady Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 989a65d60ac9..04568137e029 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1050,8 +1050,8 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf) set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states); clear_bit(I40E_VF_STATE_DISABLED, &vf->vf_states); /* Do not notify the client during VF init */ - if (test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE, - &vf->vf_states)) + if (!test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE, + &vf->vf_states)) i40e_notify_client_of_vf_reset(pf, abs_vf_id); vf->num_vlan = 0; } From ab243ec9401d164531cc9bc07fb32231d72d1280 Mon Sep 17 00:00:00 2001 From: Scott Peterson Date: Tue, 22 Aug 2017 06:57:54 -0400 Subject: [PATCH 1177/1322] i40e: Stop dropping 802.1ad tags - eth proto 0x88a8 Enable i40e to pass traffic with VLAN tags using the 802.1ad ethernet protocol ID (0x88a8). This requires NIC firmware providing version 1.7 of the API. With older NIC firmware 802.1ad tagged packets will continue to be dropped. No VLAN offloads nor RSS are supported for 802.1ad VLANs. Signed-off-by: Scott Peterson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_adminq.c | 6 ++++++ .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 17 ++++++++++++++++- drivers/net/ethernet/intel/i40e/i40e_common.c | 6 +++++- drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++ drivers/net/ethernet/intel/i40e/i40e_type.h | 6 ++++++ .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h | 17 ++++++++++++++++- drivers/net/ethernet/intel/i40evf/i40e_type.h | 6 ++++++ 7 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c index 08f63226105a..9dcb2a961197 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c @@ -613,6 +613,12 @@ i40e_status i40e_init_adminq(struct i40e_hw *hw) hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE; } + /* The ability to RX (not drop) 802.1ad frames was added in API 1.7 */ + if (hw->aq.api_maj_ver > 1 || + (hw->aq.api_maj_ver == 1 && + hw->aq.api_min_ver >= 7)) + hw->flags |= I40E_HW_FLAG_802_1AD_CAPABLE; + if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) { ret_code = I40E_ERR_FIRMWARE_API_VERSION; goto init_adminq_free_arq; diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index ed7bbe14bc6e..4c85ea9cd89a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -775,7 +775,22 @@ struct i40e_aqc_set_switch_config { #define I40E_AQ_SET_SWITCH_CFG_PROMISC 0x0001 #define I40E_AQ_SET_SWITCH_CFG_L2_FILTER 0x0002 __le16 valid_flags; - u8 reserved[12]; + /* The ethertype in switch_tag is dropped on ingress and used + * internally by the switch. Set this to zero for the default + * of 0x88a8 (802.1ad). Should be zero for firmware API + * versions lower than 1.7. + */ + __le16 switch_tag; + /* The ethertypes in first_tag and second_tag are used to + * match the outer and inner VLAN tags (respectively) when HW + * double VLAN tagging is enabled via the set port parameters + * AQ command. 
Otherwise these are both ignored. Set them to + * zero for their defaults of 0x8100 (802.1Q). Should be zero + * for firmware API versions lower than 1.7. + */ + __le16 first_tag; + __le16 second_tag; + u8 reserved[6]; }; I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config); diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index a4838779de5d..60542beda7ad 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -2402,7 +2402,11 @@ enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw, i40e_aqc_opc_set_switch_config); scfg->flags = cpu_to_le16(flags); scfg->valid_flags = cpu_to_le16(valid_flags); - + if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) { + scfg->switch_tag = cpu_to_le16(hw->switch_tag); + scfg->first_tag = cpu_to_le16(hw->first_tag); + scfg->second_tag = cpu_to_le16(hw->second_tag); + } status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); return status; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 387f0863f794..3f9e89b054ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -11361,6 +11361,13 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw->bus.bus_id = pdev->bus->number; pf->instance = pfs_found; + /* Select something other than the 802.1ad ethertype for the + * switch to use internally and drop on ingress. + */ + hw->switch_tag = 0xffff; + hw->first_tag = ETH_P_8021AD; + hw->second_tag = ETH_P_8021Q; + INIT_LIST_HEAD(&pf->l3_flex_pit_list); INIT_LIST_HEAD(&pf->l4_flex_pit_list); diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 8b0b9f826b7f..4b32b1d38a66 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -610,9 +610,15 @@ struct i40e_hw { struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */ #define I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE BIT_ULL(0) +#define I40E_HW_FLAG_802_1AD_CAPABLE BIT_ULL(1) #define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2) u64 flags; + /* Used in set switch config AQ command */ + u16 switch_tag; + u16 first_tag; + u16 second_tag; + /* debug mask */ u32 debug_mask; char err_str[16]; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h index eee7ece42b39..ed5602f4bbcd 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h @@ -771,7 +771,22 @@ struct i40e_aqc_set_switch_config { #define I40E_AQ_SET_SWITCH_CFG_PROMISC 0x0001 #define I40E_AQ_SET_SWITCH_CFG_L2_FILTER 0x0002 __le16 valid_flags; - u8 reserved[12]; + /* The ethertype in switch_tag is dropped on ingress and used + * internally by the switch. Set this to zero for the default + * of 0x88a8 (802.1ad). Should be zero for firmware API + * versions lower than 1.7. + */ + __le16 switch_tag; + /* The ethertypes in first_tag and second_tag are used to + * match the outer and inner VLAN tags (respectively) when HW + * double VLAN tagging is enabled via the set port parameters + * AQ command. Otherwise these are both ignored. Set them to + * zero for their defaults of 0x8100 (802.1Q). Should be zero + * for firmware API versions lower than 1.7. 
+ */ + __le16 first_tag; + __le16 second_tag; + u8 reserved[6]; }; I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config); diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h index 48eacf5e73e4..9364b67fff9c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_type.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h @@ -568,6 +568,7 @@ struct i40e_hw { /* LLDP/DCBX Status */ u16 dcbx_status; +#define I40E_HW_FLAG_802_1AD_CAPABLE BIT_ULL(1) #define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2) /* DCBX info */ @@ -575,6 +576,11 @@ struct i40e_hw { struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */ struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */ + /* Used in set switch config AQ command */ + u16 switch_tag; + u16 first_tag; + u16 second_tag; + /* debug mask */ u32 debug_mask; char err_str[16]; From eefca20eb20c66b06cf5ed09b49b1a7caaa27b7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Oct 2017 12:20:51 -0700 Subject: [PATCH 1178/1322] socket, bpf: fix possible use after free Starting from linux-4.4, 3WHS no longer takes the listener lock. Since this time, we might hit a use-after-free in sk_filter_charge(), if the filter we got in the memcpy() of the listener content just happened to be replaced by a thread changing listener BPF filter. To fix this, we need to make sure the filter refcount is not already zero before incrementing it again. Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 12 ++++++++---- net/core/sock.c | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..74b8c91fb5f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -989,10 +989,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) diff --git a/net/core/sock.c b/net/core/sock.c index 7d55c05f449d..23953b741a41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1684,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new From 2b0b8499ae75df91455bbeb7491d45affc384fb0 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 12 Sep 2017 10:14:54 +0800 Subject: [PATCH 1179/1322] ftrace: Fix kmemleak in unregister_ftrace_graph The trampoline allocated by function tracer was overwriten by function_graph tracer, and caused a memory leak. 
The save_global_trampoline should have saved the previous trampoline in register_ftrace_graph() and restored it in unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was only used in unregister_ftrace_graph as default value 0, and it overwrote the previous trampoline's value. Causing the previous allocated trampoline to be lost. kmmeleak backtrace: kmemleak_vmalloc+0x77/0xc0 __vmalloc_node_range+0x1b5/0x2c0 module_alloc+0x7c/0xd0 arch_ftrace_update_trampoline+0xb5/0x290 ftrace_startup+0x78/0x210 register_ftrace_function+0x8b/0xd0 function_trace_init+0x4f/0x80 tracing_set_tracer+0xe6/0x170 tracing_set_trace_write+0x90/0xd0 __vfs_write+0x37/0x170 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 return_from_SYSCALL_64+0x0/0x6a [ Looking further into this, I found that this was left over from when the function and function graph tracers shared the same ftrace_ops. But in commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer together"), the two were separated, and the save_global_trampoline no longer was necessary (and it may have been broken back then too). -- Steven Rostedt ] Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com Cc: stable@vger.kernel.org Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together") Signed-off-by: Shu Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } From f39b536ce9248e9799ff900358d6f073ab2e6c55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Sep 2017 20:19:02 -0700 Subject: [PATCH 1180/1322] rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}() The read of ->dynticks_nmi_nesting in rcu_irq_enter() and rcu_irq_exit() is currently protected with READ_ONCE(). However, this protection is unnecessary because (1) ->dynticks_nmi_nesting is updated only by the current CPU, (2) Although NMI handlers can update this field, they reset it back to its old value before return, and (3) Interrupts are disabled, so nothing else can modify it. The value of ->dynticks_nmi_nesting is thus effectively constant, and so no protection is required. This commit therefore removes the READ_ONCE() protection from these two accesses. Link: http://lkml.kernel.org/r/20170926031902.GA2074@linux.vnet.ibm.com Reported-by: Linus Torvalds Signed-off-by: Paul E. 
McKenney Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 63bee8e1b193..c03152f7e458 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -890,7 +890,7 @@ void rcu_irq_exit(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -1027,7 +1027,7 @@ void rcu_irq_enter(void) rdtp = this_cpu_ptr(&rcu_dynticks); /* Page faults can happen in NMI handlers, so check... */ - if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + if (rdtp->dynticks_nmi_nesting) return; oldval = rdtp->dynticks_nesting; From 0b40f457488d966878eec413a91f27d9b21e6ce5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:14 -0700 Subject: [PATCH 1181/1322] fm10k: prepare_for_reset() when we lose PCIe Link If we lose PCIe link, such as when an unannounced PFLR event occurs, or when a device is surprise removed, we currently detach the device and close the netdev. This unfortunately leaves a lot of things still active, such as the msix_mbx_pf IRQ, and Tx/Rx resources. This can cause problems because the register reads will return potentially invalid values which may result in unknown driver behavior. Begin the process of resetting using fm10k_prepare_for_reset(), much in the same way as the suspend and resume cycle does. This will attempt to shutdown as much as possible, in order to prevent possible issues. A naive implementation for this has issues, because there are now multiple flows calling the reset logic and setting a reset bit. This would cause problems, because the "re-attach" routine might call fm10k_handle_reset() prior to the reset actually finishing. Instead, we'll add state bits to indicate which flow actually initiated the reset. For the general reset flow, we'll assume that if someone else is resetting that we do not need to handle it at all, so it does not need its own state bit. For the suspend case, we will simply issue a warning indicating that we are attempting to recover from this case when resuming. For the detached subtask, we'll simply refuse to re-attach until we've actually initiated a reset as part of that flow. Finally, we'll stop attempting to manage the mailbox subtask when we're detached, since there's nothing we can do if we don't have a PCIe address. Overall this produces a much cleaner shutdown and recovery cycle for a PCIe surprise remove event. 
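The heart of the change is the rule that only the flow which wins the RESETTING bit owns the reset. Below is a minimal, standalone C11 sketch of that gating pattern; the bit names and functions are simplified stand-ins for illustration, not the fm10k code itself:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum {
        RESETTING       = 1u << 0,
        RESET_DETACHED  = 1u << 1,
        RESET_SUSPENDED = 1u << 2,
};

static _Atomic unsigned int state;

/* Returns true only for the flow that actually started the reset. */
static bool prepare_for_reset(void)
{
        unsigned int old = atomic_fetch_or(&state, RESETTING);

        return !(old & RESETTING);
}

static void detach_subtask(void)
{
        /* Record that this flow initiated the reset, so only it re-attaches. */
        if (prepare_for_reset())
                atomic_fetch_or(&state, RESET_DETACHED);
}

static void prepare_suspend(void)
{
        if (prepare_for_reset())
                atomic_fetch_or(&state, RESET_SUSPENDED);
}

int main(void)
{
        detach_subtask();   /* wins the RESETTING bit, records the cause */
        prepare_suspend();  /* sees a reset in flight and backs off      */
        printf("state = 0x%x\n", atomic_load(&state));  /* prints 0x3 */
        return 0;
}

In the driver the same idea is expressed with test_and_set_bit() on interface->state, and the cause-specific bits (__FM10K_RESET_DETACHED, __FM10K_RESET_SUSPENDED) are what let the re-attach and resume paths tell their own reset apart from someone else's.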
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k.h | 2 + drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 105 ++++++++++++++----- 2 files changed, 80 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 689c413b7782..ba70c58ca920 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -270,6 +270,8 @@ enum fm10k_flags_t { enum fm10k_state_t { __FM10K_RESETTING, + __FM10K_RESET_DETACHED, + __FM10K_RESET_SUSPENDED, __FM10K_DOWN, __FM10K_SERVICE_SCHED, __FM10K_SERVICE_REQUEST, diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 9575f7c1862d..4e5e3e64beda 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -153,7 +153,15 @@ static void fm10k_service_timer(unsigned long data) fm10k_service_event_schedule(interface); } -static void fm10k_prepare_for_reset(struct fm10k_intfc *interface) +/** + * fm10k_prepare_for_reset - Prepare the driver and device for a pending reset + * @interface: fm10k private data structure + * + * This function prepares for a device reset by shutting as much down as we + * can. It does nothing and returns false if __FM10K_RESETTING was already set + * prior to calling this function. It returns true if it actually did work. + */ +static bool fm10k_prepare_for_reset(struct fm10k_intfc *interface) { struct net_device *netdev = interface->netdev; @@ -162,8 +170,9 @@ static void fm10k_prepare_for_reset(struct fm10k_intfc *interface) /* put off any impending NetWatchDogTimeout */ netif_trans_update(netdev); - while (test_and_set_bit(__FM10K_RESETTING, interface->state)) - usleep_range(1000, 2000); + /* Nothing to do if a reset is already in progress */ + if (test_and_set_bit(__FM10K_RESETTING, interface->state)) + return false; rtnl_lock(); @@ -181,6 +190,8 @@ static void fm10k_prepare_for_reset(struct fm10k_intfc *interface) interface->last_reset = jiffies + (10 * HZ); rtnl_unlock(); + + return true; } static int fm10k_handle_reset(struct fm10k_intfc *interface) @@ -189,6 +200,8 @@ static int fm10k_handle_reset(struct fm10k_intfc *interface) struct fm10k_hw *hw = &interface->hw; int err; + WARN_ON(!test_bit(__FM10K_RESETTING, interface->state)); + rtnl_lock(); pci_set_master(interface->pdev); @@ -267,35 +280,70 @@ static void fm10k_detach_subtask(struct fm10k_intfc *interface) struct net_device *netdev = interface->netdev; u32 __iomem *hw_addr; u32 value; + int err; - /* do nothing if device is still present or hw_addr is set */ + /* do nothing if netdev is still present or hw_addr is set */ if (netif_device_present(netdev) || interface->hw.hw_addr) return; + /* We've lost the PCIe register space, and can no longer access the + * device. Shut everything except the detach subtask down and prepare + * to reset the device in case we recover. If we actually prepare for + * reset, indicate that we're detached. + */ + if (fm10k_prepare_for_reset(interface)) + set_bit(__FM10K_RESET_DETACHED, interface->state); + /* check the real address space to see if we've recovered */ hw_addr = READ_ONCE(interface->uc_addr); value = readl(hw_addr); if (~value) { + /* Make sure the reset was initiated because we detached, + * otherwise we might race with a different reset flow. 
+ */ + if (!test_and_clear_bit(__FM10K_RESET_DETACHED, + interface->state)) + return; + + /* Restore the hardware address */ interface->hw.hw_addr = interface->uc_addr; + + /* PCIe link has been restored, and the device is active + * again. Restore everything and reset the device. + */ + err = fm10k_handle_reset(interface); + if (err) { + netdev_err(netdev, "Unable to reset device: %d\n", err); + interface->hw.hw_addr = NULL; + return; + } + + /* Re-attach the netdev */ netif_device_attach(netdev); - set_bit(FM10K_FLAG_RESET_REQUESTED, interface->flags); netdev_warn(netdev, "PCIe link restored, device now attached\n"); return; } - - rtnl_lock(); - - if (netif_running(netdev)) - dev_close(netdev); - - rtnl_unlock(); } -static void fm10k_reinit(struct fm10k_intfc *interface) +static void fm10k_reset_subtask(struct fm10k_intfc *interface) { int err; - fm10k_prepare_for_reset(interface); + if (!test_and_clear_bit(FM10K_FLAG_RESET_REQUESTED, + interface->flags)) + return; + + /* If another thread has already prepared to reset the device, we + * should not attempt to handle a reset here, since we'd race with + * that thread. This may happen if we suspend the device or if the + * PCIe link is lost. In this case, we'll just ignore the RESET + * request, as it will (eventually) be taken care of when the thread + * which actually started the reset is finished. + */ + if (!fm10k_prepare_for_reset(interface)) + return; + + netdev_err(interface->netdev, "Reset interface\n"); err = fm10k_handle_reset(interface); if (err) @@ -303,17 +351,6 @@ static void fm10k_reinit(struct fm10k_intfc *interface) "fm10k_handle_reset failed: %d\n", err); } -static void fm10k_reset_subtask(struct fm10k_intfc *interface) -{ - if (!test_and_clear_bit(FM10K_FLAG_RESET_REQUESTED, - interface->flags)) - return; - - netdev_err(interface->netdev, "Reset interface\n"); - - fm10k_reinit(interface); -} - /** * fm10k_configure_swpri_map - Configure Receive SWPRI to PC mapping * @interface: board private structure @@ -381,6 +418,10 @@ static void fm10k_watchdog_update_host_state(struct fm10k_intfc *interface) **/ static void fm10k_mbx_subtask(struct fm10k_intfc *interface) { + /* If we're resetting, bail out */ + if (test_bit(__FM10K_RESETTING, interface->state)) + return; + /* process upstream mailbox and update device state */ fm10k_watchdog_update_host_state(interface); @@ -630,9 +671,11 @@ static void fm10k_service_task(struct work_struct *work) interface = container_of(work, struct fm10k_intfc, service_task); + /* Check whether we're detached first */ + fm10k_detach_subtask(interface); + /* tasks run even when interface is down */ fm10k_mbx_subtask(interface); - fm10k_detach_subtask(interface); fm10k_reset_subtask(interface); /* tasks only run when interface is up */ @@ -2177,7 +2220,8 @@ static void fm10k_prepare_suspend(struct fm10k_intfc *interface) */ fm10k_stop_service_event(interface); - fm10k_prepare_for_reset(interface); + if (fm10k_prepare_for_reset(interface)) + set_bit(__FM10K_RESET_SUSPENDED, interface->state); } static int fm10k_handle_resume(struct fm10k_intfc *interface) @@ -2185,6 +2229,13 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) struct fm10k_hw *hw = &interface->hw; int err; + /* Even if we didn't properly prepare for reset in + * fm10k_prepare_suspend, we'll attempt to resume anyways. + */ + if (!test_and_clear_bit(__FM10K_RESET_SUSPENDED, interface->state)) + dev_warn(&interface->pdev->dev, + "Device was shut down as part of suspend... 
Attempting to recover\n"); + /* reset statistics starting values */ hw->mac.ops.rebind_hw_stats(hw, &interface->stats); From b4fcd43661df0d84cc4e030ab7a26533114889b9 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:15 -0700 Subject: [PATCH 1182/1322] fm10k: use spinlock to implement mailbox lock Lets not re-invent the locking wheel. Remove our bitlock and use a proper spinlock instead. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k.h | 15 +++++---------- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 3 +++ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index ba70c58ca920..74542e9d63c7 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -276,7 +276,6 @@ enum fm10k_state_t { __FM10K_SERVICE_SCHED, __FM10K_SERVICE_REQUEST, __FM10K_SERVICE_DISABLE, - __FM10K_MBX_LOCK, __FM10K_LINK_DOWN, __FM10K_UPDATING_STATS, /* This value must be last and determines the BITMAP size */ @@ -346,6 +345,8 @@ struct fm10k_intfc { struct fm10k_hw_stats stats; struct fm10k_hw hw; + /* Mailbox lock */ + spinlock_t mbx_lock; u32 __iomem *uc_addr; u32 __iomem *sw_addr; u16 msg_enable; @@ -386,23 +387,17 @@ struct fm10k_intfc { static inline void fm10k_mbx_lock(struct fm10k_intfc *interface) { - /* busy loop if we cannot obtain the lock as some calls - * such as ndo_set_rx_mode may be made in atomic context - */ - while (test_and_set_bit(__FM10K_MBX_LOCK, interface->state)) - udelay(20); + spin_lock(&interface->mbx_lock); } static inline void fm10k_mbx_unlock(struct fm10k_intfc *interface) { - /* flush memory to make sure state is correct */ - smp_mb__before_atomic(); - clear_bit(__FM10K_MBX_LOCK, interface->state); + spin_unlock(&interface->mbx_lock); } static inline int fm10k_mbx_trylock(struct fm10k_intfc *interface) { - return !test_and_set_bit(__FM10K_MBX_LOCK, interface->state); + return spin_trylock(&interface->mbx_lock); } /* fm10k_test_staterr - test bits in Rx descriptor status and error fields */ diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 4e5e3e64beda..240772ad5d69 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1921,6 +1921,9 @@ static int fm10k_sw_init(struct fm10k_intfc *interface, netdev_rss_key_fill(rss_key, sizeof(rss_key)); memcpy(interface->rssrk, rss_key, sizeof(rss_key)); + /* Initialize the mailbox lock */ + spin_lock_init(&interface->mbx_lock); + /* Start off interface as being down */ set_bit(__FM10K_DOWN, interface->state); set_bit(__FM10K_UPDATING_STATS, interface->state); From 8249c47c6ba48cd3eba7c3ca7f8e733ee815c39b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:16 -0700 Subject: [PATCH 1183/1322] fm10k: use generic PM hooks instead of legacy PCIe power hooks Replace the PCI specific legacy power management hooks with the new generic power management hooks which work properly for both suspend and hibernate. The new generic system is better and properly handles the lower level PCIe power management rather than forcing the driver to handle it. 
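The shape of the new wiring is roughly the sketch below, using a hypothetical "foo" driver with probe/remove and the ID table omitted for brevity (this is not the fm10k code); the key point is that the callbacks take a struct device and do no PCI power-state handling of their own:

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pm.h>

static int foo_suspend(struct device *dev)
{
        struct pci_dev *pdev = to_pci_dev(dev);

        /* Quiesce driver activity only; the PM core saves PCI state and
         * moves the device to a low power state on our behalf.
         */
        dev_info(&pdev->dev, "suspending\n");
        return 0;
}

static int foo_resume(struct device *dev)
{
        struct pci_dev *pdev = to_pci_dev(dev);

        dev_info(&pdev->dev, "resuming\n");
        return 0;
}

static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

static struct pci_driver foo_driver = {
        .name = "foo",
        .driver = {
                .pm = &foo_pm_ops,
        },
};

SIMPLE_DEV_PM_OPS() also maps the hibernation callbacks (freeze/thaw/poweroff/restore) onto the same two functions, which is why the legacy .suspend/.resume pair can simply be dropped.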
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 67 +++++++------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 240772ad5d69..aef39909e4a2 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -2264,36 +2264,19 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) #ifdef CONFIG_PM /** - * fm10k_resume - Restore device to pre-sleep state - * @pdev: PCI device information struct + * fm10k_resume - Generic PM resume hook + * @dev: generic device structure * - * fm10k_resume is called after the system has powered back up from a sleep - * state and is ready to resume operation. This function is meant to restore - * the device back to its pre-sleep state. + * Generic PM hook used when waking the device from a low power state after + * suspend or hibernation. This function does not need to handle lower PCIe + * device state as the stack takes care of that for us. **/ -static int fm10k_resume(struct pci_dev *pdev) +static int fm10k_resume(struct device *dev) { - struct fm10k_intfc *interface = pci_get_drvdata(pdev); + struct fm10k_intfc *interface = pci_get_drvdata(to_pci_dev(dev)); struct net_device *netdev = interface->netdev; struct fm10k_hw *hw = &interface->hw; - u32 err; - - pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); - - /* pci_restore_state clears dev->state_saved so call - * pci_save_state to restore it. - */ - pci_save_state(pdev); - - err = pci_enable_device_mem(pdev); - if (err) { - dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n"); - return err; - } - pci_set_master(pdev); - - pci_wake_from_d3(pdev, false); + int err; /* refresh hw_addr in case it was dropped */ hw->hw_addr = interface->uc_addr; @@ -2308,36 +2291,27 @@ static int fm10k_resume(struct pci_dev *pdev) } /** - * fm10k_suspend - Prepare the device for a system sleep state - * @pdev: PCI device information struct + * fm10k_suspend - Generic PM suspend hook + * @dev: generic device structure * - * fm10k_suspend is meant to shutdown the device prior to the system entering - * a sleep state. The fm10k hardware does not support wake on lan so the - * driver simply needs to shut down the device so it is in a low power state. + * Generic PM hook used when setting the device into a low power state for + * system suspend or hibernation. This function does not need to handle lower + * PCIe device state as the stack takes care of that for us. 
**/ -static int fm10k_suspend(struct pci_dev *pdev, - pm_message_t __always_unused state) +static int fm10k_suspend(struct device *dev) { - struct fm10k_intfc *interface = pci_get_drvdata(pdev); + struct fm10k_intfc *interface = pci_get_drvdata(to_pci_dev(dev)); struct net_device *netdev = interface->netdev; - int err = 0; netif_device_detach(netdev); fm10k_prepare_suspend(interface); - err = pci_save_state(pdev); - if (err) - return err; - - pci_disable_device(pdev); - pci_wake_from_d3(pdev, false); - pci_set_power_state(pdev, PCI_D3hot); - return 0; } #endif /* CONFIG_PM */ + /** * fm10k_io_error_detected - called when PCI error is detected * @pdev: Pointer to PCI device @@ -2447,15 +2421,18 @@ static const struct pci_error_handlers fm10k_err_handler = { .reset_done = fm10k_io_reset_done, }; +static SIMPLE_DEV_PM_OPS(fm10k_pm_ops, fm10k_suspend, fm10k_resume); + static struct pci_driver fm10k_driver = { .name = fm10k_driver_name, .id_table = fm10k_pci_tbl, .probe = fm10k_probe, .remove = fm10k_remove, #ifdef CONFIG_PM - .suspend = fm10k_suspend, - .resume = fm10k_resume, -#endif + .driver = { + .pm = &fm10k_pm_ops, + }, +#endif /* CONFIG_PM */ .sriov_configure = fm10k_iov_configure, .err_handler = &fm10k_err_handler }; From fc9173682dcf73cfe3324267424ef17e854bb444 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:17 -0700 Subject: [PATCH 1184/1322] fm10k: introduce a message queue for MAC/VLAN messages Under some circumstances, when dealing with a large number of MAC address or VLAN updates at once, the fm10k driver, particularly the VFs can overload the mailbox with too many messages at once. This results in a mailbox timeout, which causes the driver to initiate a reset. During the reset, we re-send all the same messages that originally caused the timeout. This results in a cycle of resets each triggering a future reset. To fix or avoid this, we introduce a workqueue item which monitors a queue of MAC and VLAN requests. These requests are queued to the end of the list, and we process as a FIFO periodically. Initially we only handle requests for the netdev, but we do handle unicast MAC addresses, multicast MAC addresses, and update VLAN requests. A future patch will add support to use this queue for handling MAC update requests from the VF<->PF mailbox. The MAC/VLAN work item will keep checking to make sure that each request does not overflow the mailbox and cause a timeout. If it might, then the work item will reschedule itself a short time later. This avoids any reset cycle, since we never send the message if the mailbox is not ready. As an alternative, we tried increasing the mailbox message FIFO, but this just delays the problem and results in needless memory waste on the system. Our new message queue is dynamically allocated so only uses as much memory as it needs. Additionally, it need not be contiguous like the Tx and Rx FIFOs. Note that this patch chose to only create a queue for MAC and VLAN messages, since these are the only messages sent in a large enough volume to cause the reset loop. Other messages are very unlikely to overflow the mailbox Tx FIFO so easily. 
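The pattern is easier to see stripped of driver detail. Below is a small, self-contained userspace C model of the same idea: a FIFO of requests drained only while the (here, simulated) mailbox has room, with anything that does not fit put back for a later run. The real driver uses a list_head protected by a spinlock and a delayed work item rather than this toy list:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
        int id;
        struct request *next;
};

static struct request *head, *tail;

static void enqueue(int id)
{
        struct request *r = calloc(1, sizeof(*r));

        if (!r)
                exit(1);
        r->id = id;
        if (tail)
                tail->next = r;
        else
                head = r;
        tail = r;
}

static struct request *dequeue(void)
{
        struct request *r = head;

        if (r) {
                head = r->next;
                if (!head)
                        tail = NULL;
        }
        return r;
}

static void requeue_front(struct request *r)
{
        r->next = head;
        head = r;
        if (!tail)
                tail = r;
}

/* Pretend the mailbox can only take a few messages per run. */
static bool mailbox_tx_ready(int sent_this_run)
{
        return sent_this_run < 3;
}

static void drain_queue(void)
{
        struct request *r;
        int sent = 0;

        while ((r = dequeue())) {
                if (!mailbox_tx_ready(sent)) {
                        requeue_front(r);  /* keep FIFO order, retry later */
                        printf("mailbox full, rescheduling\n");
                        return;
                }
                printf("sent request %d\n", r->id);
                sent++;
                free(r);
        }
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                enqueue(i);
        drain_queue();   /* sends 0..2, defers 3..4 */
        drain_queue();   /* a later invocation sends the rest */
        return 0;
}

Because an item that does not fit is pushed back onto the front of the list, ordering is preserved across retries, which matters if a set and a later clear for the same MAC/VLAN pair are both pending.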
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k.h | 39 ++++ .../net/ethernet/intel/fm10k/fm10k_netdev.c | 199 +++++++++++++---- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 201 ++++++++++++++++++ 3 files changed, 397 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 74542e9d63c7..40856bc0f3b9 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -248,6 +248,29 @@ struct fm10k_udp_port { __be16 port; }; +enum fm10k_macvlan_request_type { + FM10K_UC_MAC_REQUEST, + FM10K_MC_MAC_REQUEST, + FM10K_VLAN_REQUEST +}; + +struct fm10k_macvlan_request { + enum fm10k_macvlan_request_type type; + struct list_head list; + union { + struct fm10k_mac_request { + u8 addr[ETH_ALEN]; + u16 glort; + u16 vid; + } mac; + struct fm10k_vlan_request { + u32 vid; + u8 vsi; + } vlan; + }; + bool set; +}; + /* one work queue for entire driver */ extern struct workqueue_struct *fm10k_workqueue; @@ -276,6 +299,9 @@ enum fm10k_state_t { __FM10K_SERVICE_SCHED, __FM10K_SERVICE_REQUEST, __FM10K_SERVICE_DISABLE, + __FM10K_MACVLAN_SCHED, + __FM10K_MACVLAN_REQUEST, + __FM10K_MACVLAN_DISABLE, __FM10K_LINK_DOWN, __FM10K_UPDATING_STATS, /* This value must be last and determines the BITMAP size */ @@ -368,6 +394,12 @@ struct fm10k_intfc { struct list_head vxlan_port; struct list_head geneve_port; + /* MAC/VLAN update queue */ + struct list_head macvlan_requests; + struct delayed_work macvlan_task; + /* MAC/VLAN update queue lock */ + spinlock_t macvlan_lock; + #ifdef CONFIG_DEBUG_FS struct dentry *dbg_intfc; #endif /* CONFIG_DEBUG_FS */ @@ -487,6 +519,7 @@ void fm10k_up(struct fm10k_intfc *interface); void fm10k_down(struct fm10k_intfc *interface); void fm10k_update_stats(struct fm10k_intfc *interface); void fm10k_service_event_schedule(struct fm10k_intfc *interface); +void fm10k_macvlan_schedule(struct fm10k_intfc *interface); void fm10k_update_rx_drop_en(struct fm10k_intfc *interface); #ifdef CONFIG_NET_POLL_CONTROLLER void fm10k_netpoll(struct net_device *netdev); @@ -507,6 +540,12 @@ void fm10k_reset_rx_state(struct fm10k_intfc *); int fm10k_setup_tc(struct net_device *dev, u8 tc); int fm10k_open(struct net_device *netdev); int fm10k_close(struct net_device *netdev); +int fm10k_queue_vlan_request(struct fm10k_intfc *interface, u32 vid, + u8 vsi, bool set); +int fm10k_queue_mac_request(struct fm10k_intfc *interface, u16 glort, + const unsigned char *addr, u16 vid, bool set); +void fm10k_clear_macvlan_queue(struct fm10k_intfc *interface, + u16 glort, bool vlans); /* Ethtool */ void fm10k_set_ethtool_ops(struct net_device *dev); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 77d495fedced..81e4425f0529 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -758,11 +758,132 @@ static bool fm10k_host_mbx_ready(struct fm10k_intfc *interface) return (hw->mac.type == fm10k_mac_vf || interface->host_ready); } +/** + * fm10k_queue_vlan_request - Queue a VLAN update request + * @interface: the fm10k interface structure + * @vid: the VLAN vid + * @vsi: VSI index number + * @set: whether to set or clear + * + * This function queues up a VLAN update. For VFs, this must be sent to the + * managing PF over the mailbox. For PFs, we'll use the same handling so that + * it's similar to the VF. 
This avoids storming the PF<->VF mailbox with too + * many VLAN updates during reset. + */ +int fm10k_queue_vlan_request(struct fm10k_intfc *interface, + u32 vid, u8 vsi, bool set) +{ + struct fm10k_macvlan_request *request; + unsigned long flags; + + /* This must be atomic since we may be called while the netdev + * addr_list_lock is held + */ + request = kzalloc(sizeof(*request), GFP_ATOMIC); + if (!request) + return -ENOMEM; + + request->type = FM10K_VLAN_REQUEST; + request->vlan.vid = vid; + request->vlan.vsi = vsi; + request->set = set; + + spin_lock_irqsave(&interface->macvlan_lock, flags); + list_add_tail(&request->list, &interface->macvlan_requests); + spin_unlock_irqrestore(&interface->macvlan_lock, flags); + + fm10k_macvlan_schedule(interface); + + return 0; +} + +/** + * fm10k_queue_mac_request - Queue a MAC update request + * @interface: the fm10k interface structure + * @glort: the target glort for this update + * @addr: the address to update + * @vid: the vid to update + * @sync: whether to add or remove + * + * This function queues up a MAC request for sending to the switch manager. + * A separate thread monitors the queue and sends updates to the switch + * manager. Return 0 on success, and negative error code on failure. + **/ +int fm10k_queue_mac_request(struct fm10k_intfc *interface, u16 glort, + const unsigned char *addr, u16 vid, bool set) +{ + struct fm10k_macvlan_request *request; + unsigned long flags; + + /* This must be atomic since we may be called while the netdev + * addr_list_lock is held + */ + request = kzalloc(sizeof(*request), GFP_ATOMIC); + if (!request) + return -ENOMEM; + + if (is_multicast_ether_addr(addr)) + request->type = FM10K_MC_MAC_REQUEST; + else + request->type = FM10K_UC_MAC_REQUEST; + + ether_addr_copy(request->mac.addr, addr); + request->mac.glort = glort; + request->mac.vid = vid; + request->set = set; + + spin_lock_irqsave(&interface->macvlan_lock, flags); + list_add_tail(&request->list, &interface->macvlan_requests); + spin_unlock_irqrestore(&interface->macvlan_lock, flags); + + fm10k_macvlan_schedule(interface); + + return 0; +} + +/** + * fm10k_clear_macvlan_queue - Cancel pending updates for a given glort + * @interface: the fm10k interface structure + * @glort: the target glort to clear + * @vlans: true to clear VLAN messages, false to ignore them + * + * Cancel any outstanding MAC/VLAN requests for a given glort. This is + * expected to be called when a logical port goes down. 
+ **/ +void fm10k_clear_macvlan_queue(struct fm10k_intfc *interface, + u16 glort, bool vlans) + +{ + struct fm10k_macvlan_request *r, *tmp; + unsigned long flags; + + spin_lock_irqsave(&interface->macvlan_lock, flags); + + /* Free any outstanding MAC/VLAN requests for this interface */ + list_for_each_entry_safe(r, tmp, &interface->macvlan_requests, list) { + switch (r->type) { + case FM10K_MC_MAC_REQUEST: + case FM10K_UC_MAC_REQUEST: + /* Don't free requests for other interfaces */ + if (r->mac.glort != glort) + break; + /* fall through */ + case FM10K_VLAN_REQUEST: + if (vlans) { + list_del(&r->list); + kfree(r); + } + break; + } + } + + spin_unlock_irqrestore(&interface->macvlan_lock, flags); +} + static int fm10k_uc_vlan_unsync(struct net_device *netdev, const unsigned char *uc_addr) { struct fm10k_intfc *interface = netdev_priv(netdev); - struct fm10k_hw *hw = &interface->hw; u16 glort = interface->glort; u16 vid = interface->vid; bool set = !!(vid / VLAN_N_VID); @@ -771,10 +892,7 @@ static int fm10k_uc_vlan_unsync(struct net_device *netdev, /* drop any leading bits on the VLAN ID */ vid &= VLAN_N_VID - 1; - if (fm10k_host_mbx_ready(interface)) - err = hw->mac.ops.update_uc_addr(hw, glort, uc_addr, - vid, set, 0); - + err = fm10k_queue_mac_request(interface, glort, uc_addr, vid, set); if (err) return err; @@ -786,7 +904,6 @@ static int fm10k_mc_vlan_unsync(struct net_device *netdev, const unsigned char *mc_addr) { struct fm10k_intfc *interface = netdev_priv(netdev); - struct fm10k_hw *hw = &interface->hw; u16 glort = interface->glort; u16 vid = interface->vid; bool set = !!(vid / VLAN_N_VID); @@ -795,9 +912,7 @@ static int fm10k_mc_vlan_unsync(struct net_device *netdev, /* drop any leading bits on the VLAN ID */ vid &= VLAN_N_VID - 1; - if (fm10k_host_mbx_ready(interface)) - err = hw->mac.ops.update_mc_addr(hw, glort, mc_addr, vid, set); - + err = fm10k_queue_mac_request(interface, glort, mc_addr, vid, set); if (err) return err; @@ -855,18 +970,14 @@ static int fm10k_update_vid(struct net_device *netdev, u16 vid, bool set) /* only need to update the VLAN if not in promiscuous mode */ if (!(netdev->flags & IFF_PROMISC)) { - err = hw->mac.ops.update_vlan(hw, vid, 0, set); + err = fm10k_queue_vlan_request(interface, vid, 0, set); if (err) goto err_out; } - /* update our base MAC address if host's mailbox is ready */ - if (fm10k_host_mbx_ready(interface)) - err = hw->mac.ops.update_uc_addr(hw, interface->glort, - hw->mac.addr, vid, set, 0); - else - err = -EHOSTDOWN; - + /* Update our base MAC address */ + err = fm10k_queue_mac_request(interface, interface->glort, + hw->mac.addr, vid, set); if (err) goto err_out; @@ -910,7 +1021,6 @@ static u16 fm10k_find_next_vlan(struct fm10k_intfc *interface, u16 vid) static void fm10k_clear_unused_vlans(struct fm10k_intfc *interface) { - struct fm10k_hw *hw = &interface->hw; u32 vid, prev_vid; /* loop through and find any gaps in the table */ @@ -922,7 +1032,7 @@ static void fm10k_clear_unused_vlans(struct fm10k_intfc *interface) /* send request to clear multiple bits at a time */ prev_vid += (vid - prev_vid - 1) << FM10K_VLAN_LENGTH_SHIFT; - hw->mac.ops.update_vlan(hw, prev_vid, 0, false); + fm10k_queue_vlan_request(interface, prev_vid, 0, false); } } @@ -937,15 +1047,11 @@ static int __fm10k_uc_sync(struct net_device *dev, if (!is_valid_ether_addr(addr)) return -EADDRNOTAVAIL; - /* update table with current entries if host's mailbox is ready */ - if (!fm10k_host_mbx_ready(interface)) - return -EHOSTDOWN; - for (vid = hw->mac.default_vid ? 
fm10k_find_next_vlan(interface, 0) : 1; vid < VLAN_N_VID; vid = fm10k_find_next_vlan(interface, vid)) { - err = hw->mac.ops.update_uc_addr(hw, glort, addr, - vid, sync, 0); + err = fm10k_queue_mac_request(interface, glort, + addr, vid, sync); if (err) return err; } @@ -1002,15 +1108,18 @@ static int __fm10k_mc_sync(struct net_device *dev, struct fm10k_intfc *interface = netdev_priv(dev); struct fm10k_hw *hw = &interface->hw; u16 vid, glort = interface->glort; + s32 err; - /* update table with current entries if host's mailbox is ready */ - if (!fm10k_host_mbx_ready(interface)) - return 0; + if (!is_multicast_ether_addr(addr)) + return -EADDRNOTAVAIL; for (vid = hw->mac.default_vid ? fm10k_find_next_vlan(interface, 0) : 1; vid < VLAN_N_VID; vid = fm10k_find_next_vlan(interface, vid)) { - hw->mac.ops.update_mc_addr(hw, glort, addr, vid, sync); + err = fm10k_queue_mac_request(interface, glort, + addr, vid, sync); + if (err) + return err; } return 0; @@ -1050,7 +1159,8 @@ static void fm10k_set_rx_mode(struct net_device *dev) if (interface->xcast_mode != xcast_mode) { /* update VLAN table */ if (xcast_mode == FM10K_XCAST_MODE_PROMISC) - hw->mac.ops.update_vlan(hw, FM10K_VLAN_ALL, 0, true); + fm10k_queue_vlan_request(interface, FM10K_VLAN_ALL, + 0, true); if (interface->xcast_mode == FM10K_XCAST_MODE_PROMISC) fm10k_clear_unused_vlans(interface); @@ -1098,22 +1208,20 @@ void fm10k_restore_rx_state(struct fm10k_intfc *interface) interface->glort_count, true); /* update VLAN table */ - hw->mac.ops.update_vlan(hw, FM10K_VLAN_ALL, 0, - xcast_mode == FM10K_XCAST_MODE_PROMISC); + fm10k_queue_vlan_request(interface, FM10K_VLAN_ALL, 0, + xcast_mode == FM10K_XCAST_MODE_PROMISC); /* Add filter for VLAN 0 */ - hw->mac.ops.update_vlan(hw, 0, 0, true); + fm10k_queue_vlan_request(interface, 0, 0, true); /* update table with current entries */ for (vid = hw->mac.default_vid ? 
fm10k_find_next_vlan(interface, 0) : 1; vid < VLAN_N_VID; vid = fm10k_find_next_vlan(interface, vid)) { - hw->mac.ops.update_vlan(hw, vid, 0, true); + fm10k_queue_vlan_request(interface, vid, 0, true); - /* Update unicast entries if host's mailbox is ready */ - if (fm10k_host_mbx_ready(interface)) - hw->mac.ops.update_uc_addr(hw, glort, hw->mac.addr, - vid, true, 0); + fm10k_queue_mac_request(interface, glort, + hw->mac.addr, vid, true); } /* update xcast mode before synchronizing addresses if host's mailbox @@ -1140,6 +1248,13 @@ void fm10k_reset_rx_state(struct fm10k_intfc *interface) struct net_device *netdev = interface->netdev; struct fm10k_hw *hw = &interface->hw; + /* Wait for MAC/VLAN work to finish */ + while (test_bit(__FM10K_MACVLAN_SCHED, interface->state)) + usleep_range(1000, 2000); + + /* Cancel pending MAC/VLAN requests */ + fm10k_clear_macvlan_queue(interface, interface->glort, true); + fm10k_mbx_lock(interface); /* clear the logical port state on lower device if host's mailbox is @@ -1374,8 +1489,8 @@ static void *fm10k_dfwd_add_station(struct net_device *dev, if (fm10k_host_mbx_ready(interface)) { hw->mac.ops.update_xcast_mode(hw, glort, FM10K_XCAST_MODE_MULTI); - hw->mac.ops.update_uc_addr(hw, glort, sdev->dev_addr, - 0, true, 0); + fm10k_queue_mac_request(interface, glort, sdev->dev_addr, + 0, true); } fm10k_mbx_unlock(interface); @@ -1414,8 +1529,8 @@ static void fm10k_dfwd_del_station(struct net_device *dev, void *priv) if (fm10k_host_mbx_ready(interface)) { hw->mac.ops.update_xcast_mode(hw, glort, FM10K_XCAST_MODE_NONE); - hw->mac.ops.update_uc_addr(hw, glort, sdev->dev_addr, - 0, false, 0); + fm10k_queue_mac_request(interface, glort, sdev->dev_addr, + 0, false); } fm10k_mbx_unlock(interface); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index aef39909e4a2..58538ce997e1 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -91,6 +91,76 @@ static int fm10k_hw_ready(struct fm10k_intfc *interface) return FM10K_REMOVED(hw->hw_addr) ? -ENODEV : 0; } +/** + * fm10k_macvlan_schedule - Schedule MAC/VLAN queue task + * @interface: fm10k private interface structure + * + * Schedule the MAC/VLAN queue monitor task. If the MAC/VLAN task cannot be + * started immediately, request that it be restarted when possible. + */ +void fm10k_macvlan_schedule(struct fm10k_intfc *interface) +{ + /* Avoid processing the MAC/VLAN queue when the service task is + * disabled, or when we're resetting the device. + */ + if (!test_bit(__FM10K_MACVLAN_DISABLE, interface->state) && + !test_and_set_bit(__FM10K_MACVLAN_SCHED, interface->state)) { + clear_bit(__FM10K_MACVLAN_REQUEST, interface->state); + /* We delay the actual start of execution in order to allow + * multiple MAC/VLAN updates to accumulate before handling + * them, and to allow some time to let the mailbox drain + * between runs. + */ + queue_delayed_work(fm10k_workqueue, + &interface->macvlan_task, 10); + } else { + set_bit(__FM10K_MACVLAN_REQUEST, interface->state); + } +} + +/** + * fm10k_stop_macvlan_task - Stop the MAC/VLAN queue monitor + * @interface: fm10k private interface structure + * + * Wait until the MAC/VLAN queue task has stopped, and cancel any future + * requests. 
+ */ +static void fm10k_stop_macvlan_task(struct fm10k_intfc *interface) +{ + /* Disable the MAC/VLAN work item */ + set_bit(__FM10K_MACVLAN_DISABLE, interface->state); + + /* Make sure we waited until any current invocations have stopped */ + cancel_delayed_work_sync(&interface->macvlan_task); + + /* We set the __FM10K_MACVLAN_SCHED bit when we schedule the task. + * However, it may not be unset of the MAC/VLAN task never actually + * got a chance to run. Since we've canceled the task here, and it + * cannot be rescheuled right now, we need to ensure the scheduled bit + * gets unset. + */ + clear_bit(__FM10K_MACVLAN_SCHED, interface->state); +} + +/** + * fm10k_resume_macvlan_task - Restart the MAC/VLAN queue monitor + * @interface: fm10k private interface structure + * + * Clear the __FM10K_MACVLAN_DISABLE bit and, if a request occurred, schedule + * the MAC/VLAN work monitor. + */ +static void fm10k_resume_macvlan_task(struct fm10k_intfc *interface) +{ + /* Re-enable the MAC/VLAN work item */ + clear_bit(__FM10K_MACVLAN_DISABLE, interface->state); + + /* We might have received a MAC/VLAN request while disabled. If so, + * kick off the queue now. + */ + if (test_bit(__FM10K_MACVLAN_REQUEST, interface->state)) + fm10k_macvlan_schedule(interface); +} + void fm10k_service_event_schedule(struct fm10k_intfc *interface) { if (!test_bit(__FM10K_SERVICE_DISABLE, interface->state) && @@ -174,6 +244,12 @@ static bool fm10k_prepare_for_reset(struct fm10k_intfc *interface) if (test_and_set_bit(__FM10K_RESETTING, interface->state)) return false; + /* As the MAC/VLAN task will be accessing registers it must not be + * running while we reset. Although the task will not be scheduled + * once we start resetting it may already be running + */ + fm10k_stop_macvlan_task(interface); + rtnl_lock(); fm10k_iov_suspend(interface->pdev); @@ -258,6 +334,8 @@ static int fm10k_handle_reset(struct fm10k_intfc *interface) rtnl_unlock(); + fm10k_resume_macvlan_task(interface); + clear_bit(__FM10K_RESETTING, interface->state); return err; @@ -686,6 +764,112 @@ static void fm10k_service_task(struct work_struct *work) fm10k_service_event_complete(interface); } +/** + * fm10k_macvlan_task - send queued MAC/VLAN requests to switch manager + * @work: pointer to work_struct containing our data + * + * This work item handles sending MAC/VLAN updates to the switch manager. When + * the interface is up, it will attempt to queue mailbox messages to the + * switch manager requesting updates for MAC/VLAN pairs. If the Tx fifo of the + * mailbox is full, it will reschedule itself to try again in a short while. + * This ensures that the driver does not overload the switch mailbox with too + * many simultaneous requests, causing an unnecessary reset. 
+ **/ +static void fm10k_macvlan_task(struct work_struct *work) +{ + struct fm10k_macvlan_request *item; + struct fm10k_intfc *interface; + struct delayed_work *dwork; + struct list_head *requests; + struct fm10k_hw *hw; + unsigned long flags; + + dwork = to_delayed_work(work); + interface = container_of(dwork, struct fm10k_intfc, macvlan_task); + hw = &interface->hw; + requests = &interface->macvlan_requests; + + do { + /* Pop the first item off the list */ + spin_lock_irqsave(&interface->macvlan_lock, flags); + item = list_first_entry_or_null(requests, + struct fm10k_macvlan_request, + list); + if (item) + list_del_init(&item->list); + + spin_unlock_irqrestore(&interface->macvlan_lock, flags); + + /* We have no more items to process */ + if (!item) + goto done; + + fm10k_mbx_lock(interface); + + /* Check that we have plenty of space to send the message. We + * want to ensure that the mailbox stays low enough to avoid a + * change in the host state, otherwise we may see spurious + * link up / link down notifications. + */ + if (!hw->mbx.ops.tx_ready(&hw->mbx, FM10K_VFMBX_MSG_MTU + 5)) { + hw->mbx.ops.process(hw, &hw->mbx); + set_bit(__FM10K_MACVLAN_REQUEST, interface->state); + fm10k_mbx_unlock(interface); + + /* Put the request back on the list */ + spin_lock_irqsave(&interface->macvlan_lock, flags); + list_add(&item->list, requests); + spin_unlock_irqrestore(&interface->macvlan_lock, flags); + break; + } + + switch (item->type) { + case FM10K_MC_MAC_REQUEST: + hw->mac.ops.update_mc_addr(hw, + item->mac.glort, + item->mac.addr, + item->mac.vid, + item->set); + break; + case FM10K_UC_MAC_REQUEST: + hw->mac.ops.update_uc_addr(hw, + item->mac.glort, + item->mac.addr, + item->mac.vid, + item->set, + 0); + break; + case FM10K_VLAN_REQUEST: + hw->mac.ops.update_vlan(hw, + item->vlan.vid, + item->vlan.vsi, + item->set); + break; + default: + break; + } + + fm10k_mbx_unlock(interface); + + /* Free the item now that we've sent the update */ + kfree(item); + } while (true); + +done: + WARN_ON(!test_bit(__FM10K_MACVLAN_SCHED, interface->state)); + + /* flush memory to make sure state is correct */ + smp_mb__before_atomic(); + clear_bit(__FM10K_MACVLAN_SCHED, interface->state); + + /* If a MAC/VLAN request was scheduled since we started, we should + * re-schedule. However, there is no reason to re-schedule if there is + * no work to do. 
+ */ + if (test_bit(__FM10K_MACVLAN_REQUEST, interface->state)) + fm10k_macvlan_schedule(interface); +} + /** * fm10k_configure_tx_ring - Configure Tx ring after Reset * @interface: board private structure @@ -1918,11 +2102,15 @@ static int fm10k_sw_init(struct fm10k_intfc *interface, INIT_LIST_HEAD(&interface->vxlan_port); INIT_LIST_HEAD(&interface->geneve_port); + /* Initialize the MAC/VLAN queue */ + INIT_LIST_HEAD(&interface->macvlan_requests); + netdev_rss_key_fill(rss_key, sizeof(rss_key)); memcpy(interface->rssrk, rss_key, sizeof(rss_key)); /* Initialize the mailbox lock */ spin_lock_init(&interface->mbx_lock); + spin_lock_init(&interface->macvlan_lock); /* Start off interface as being down */ set_bit(__FM10K_DOWN, interface->state); @@ -2131,6 +2319,9 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent) (unsigned long)interface); INIT_WORK(&interface->service_task, fm10k_service_task); + /* Setup the MAC/VLAN queue */ + INIT_DELAYED_WORK(&interface->macvlan_task, fm10k_macvlan_task); + /* kick off service timer now, even when interface is down */ mod_timer(&interface->service_timer, (HZ * 2) + jiffies); @@ -2184,6 +2375,10 @@ static void fm10k_remove(struct pci_dev *pdev) del_timer_sync(&interface->service_timer); fm10k_stop_service_event(interface); + fm10k_stop_macvlan_task(interface); + + /* Remove all pending MAC/VLAN requests */ + fm10k_clear_macvlan_queue(interface, interface->glort, true); /* free netdev, this may bounce the interrupts due to setup_tc */ if (netdev->reg_state == NETREG_REGISTERED) @@ -2220,6 +2415,9 @@ static void fm10k_prepare_suspend(struct fm10k_intfc *interface) * a surprise remove if the PCIe device is disabled while we're * stopped. We stop the watchdog task until after we resume software * activity. + * + * Note that the MAC/VLAN task will be stopped as part of preparing + * for reset so we don't need to handle it here. */ fm10k_stop_service_event(interface); @@ -2259,6 +2457,9 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) /* restart the service task */ fm10k_start_service_event(interface); + /* Restart the MAC/VLAN request queue in-case of outstanding events */ + fm10k_macvlan_schedule(interface); + return err; } From 1f5c27e52857c9ba8f1ee4ed5093bee1a341f330 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:18 -0700 Subject: [PATCH 1185/1322] fm10k: use the MAC/VLAN queue for VF<->PF MAC/VLAN requests Now that we have a working MAC/VLAN queue for handling MAC/VLAN messages from the netdev, replace the default handler for the VF<->PF messages. This new handler is very similar to the default code, but uses the MAC/VLAN queue instead of sending the message directly. Unfortunately we can't easily re-use the default code, so we'll just replace the entire function. This ensures that a VF requesting a large number of VLANs or MAC addresses does not start a reset cycle, as explained in the commit which introduced the message queue. 
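Mechanically, the change amounts to registering a different function for the MAC/VLAN message ID in the mailbox handler table. A tiny illustrative sketch of that dispatch-by-ID pattern follows; the message IDs and handler names here are made up, not the driver's real values:

#include <stdio.h>

enum msg_id { MSG_MSIX, MSG_MAC_VLAN, MSG_LPORT_STATE, MSG_ID_MAX };

typedef int (*msg_handler)(const char *payload);

static int handle_msix(const char *p)        { printf("msix: %s\n", p); return 0; }
static int handle_lport_state(const char *p) { printf("lport: %s\n", p); return 0; }

/* Old behaviour: hand the request straight to the switch manager. */
static int mac_vlan_direct(const char *p)    { printf("direct: %s\n", p); return 0; }

/* New behaviour: park it on the MAC/VLAN queue for the work item. */
static int mac_vlan_queued(const char *p)    { printf("queued: %s\n", p); return 0; }

static const msg_handler handlers[MSG_ID_MAX] = {
        [MSG_MSIX]        = handle_msix,
        [MSG_MAC_VLAN]    = mac_vlan_queued,   /* was mac_vlan_direct */
        [MSG_LPORT_STATE] = handle_lport_state,
};

int main(void)
{
        mac_vlan_direct("add 00:11:22:33:44:55 vid 100");          /* old path */
        return handlers[MSG_MAC_VLAN]("add 00:11:22:33:44:55 vid 100");
}

The rest of the new handler keeps the validation from the default fm10k_iov_msg_mac_vlan_pf() (VLAN selection, locked-MAC and multicast-permission checks) and then calls fm10k_queue_mac_request() instead of the hw->mac.ops update, as the diff below shows.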
Signed-off-by: Jacob Keller Reviewed-by: Ngai-mint Kwan Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 132 ++++++++++++++++++- drivers/net/ethernet/intel/fm10k/fm10k_pf.c | 2 +- drivers/net/ethernet/intel/fm10k/fm10k_pf.h | 3 +- 3 files changed, 133 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index 03897720bf0b..4a17cc903eed 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -35,10 +35,133 @@ static s32 fm10k_iov_msg_error(struct fm10k_hw *hw, u32 **results, return fm10k_tlv_msg_error(hw, results, mbx); } +/** + * fm10k_iov_msg_queue_mac_vlan - Message handler for MAC/VLAN request from VF + * @hw: Pointer to hardware structure + * @results: Pointer array to message, results[0] is pointer to message + * @mbx: Pointer to mailbox information structure + * + * This function is a custom handler for MAC/VLAN requests from the VF. The + * assumption is that it is acceptable to directly hand off the message from + * the VF to the PF's switch manager. However, we use a MAC/VLAN message + * queue to avoid overloading the mailbox when a large number of requests + * come in. + **/ +static s32 fm10k_iov_msg_queue_mac_vlan(struct fm10k_hw *hw, u32 **results, + struct fm10k_mbx_info *mbx) +{ + struct fm10k_vf_info *vf_info = (struct fm10k_vf_info *)mbx; + struct fm10k_intfc *interface = hw->back; + u8 mac[ETH_ALEN]; + u32 *result; + int err = 0; + bool set; + u16 vlan; + u32 vid; + + /* we shouldn't be updating rules on a disabled interface */ + if (!FM10K_VF_FLAG_ENABLED(vf_info)) + err = FM10K_ERR_PARAM; + + if (!err && !!results[FM10K_MAC_VLAN_MSG_VLAN]) { + result = results[FM10K_MAC_VLAN_MSG_VLAN]; + + /* record VLAN id requested */ + err = fm10k_tlv_attr_get_u32(result, &vid); + if (err) + return err; + + set = !(vid & FM10K_VLAN_CLEAR); + vid &= ~FM10K_VLAN_CLEAR; + + /* if the length field has been set, this is a multi-bit + * update request. For multi-bit requests, simply disallow + * them when the pf_vid has been set. In this case, the PF + * should have already cleared the VLAN_TABLE, and if we + * allowed them, it could allow a rogue VF to receive traffic + * on a VLAN it was not assigned. In the single-bit case, we + * need to modify requests for VLAN 0 to use the default PF or + * SW vid when assigned. 
+ */ + + if (vid >> 16) { + /* prevent multi-bit requests when PF has + * administratively set the VLAN for this VF + */ + if (vf_info->pf_vid) + return FM10K_ERR_PARAM; + } else { + err = fm10k_iov_select_vid(vf_info, (u16)vid); + if (err < 0) + return err; + + vid = err; + } + + /* update VSI info for VF in regards to VLAN table */ + err = hw->mac.ops.update_vlan(hw, vid, vf_info->vsi, set); + } + + if (!err && !!results[FM10K_MAC_VLAN_MSG_MAC]) { + result = results[FM10K_MAC_VLAN_MSG_MAC]; + + /* record unicast MAC address requested */ + err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan); + if (err) + return err; + + /* block attempts to set MAC for a locked device */ + if (is_valid_ether_addr(vf_info->mac) && + !ether_addr_equal(mac, vf_info->mac)) + return FM10K_ERR_PARAM; + + set = !(vlan & FM10K_VLAN_CLEAR); + vlan &= ~FM10K_VLAN_CLEAR; + + err = fm10k_iov_select_vid(vf_info, vlan); + if (err < 0) + return err; + + vlan = (u16)err; + + /* Add this request to the MAC/VLAN queue */ + err = fm10k_queue_mac_request(interface, vf_info->glort, + mac, vlan, set); + } + + if (!err && !!results[FM10K_MAC_VLAN_MSG_MULTICAST]) { + result = results[FM10K_MAC_VLAN_MSG_MULTICAST]; + + /* record multicast MAC address requested */ + err = fm10k_tlv_attr_get_mac_vlan(result, mac, &vlan); + if (err) + return err; + + /* verify that the VF is allowed to request multicast */ + if (!(vf_info->vf_flags & FM10K_VF_FLAG_MULTI_ENABLED)) + return FM10K_ERR_PARAM; + + set = !(vlan & FM10K_VLAN_CLEAR); + vlan &= ~FM10K_VLAN_CLEAR; + + err = fm10k_iov_select_vid(vf_info, vlan); + if (err < 0) + return err; + + vlan = (u16)err; + + /* Add this request to the MAC/VLAN queue */ + err = fm10k_queue_mac_request(interface, vf_info->glort, + mac, vlan, set); + } + + return err; +} + static const struct fm10k_msg_data iov_mbx_data[] = { FM10K_TLV_MSG_TEST_HANDLER(fm10k_tlv_msg_test), FM10K_VF_MSG_MSIX_HANDLER(fm10k_iov_msg_msix_pf), - FM10K_VF_MSG_MAC_VLAN_HANDLER(fm10k_iov_msg_mac_vlan_pf), + FM10K_VF_MSG_MAC_VLAN_HANDLER(fm10k_iov_msg_queue_mac_vlan), FM10K_VF_MSG_LPORT_STATE_HANDLER(fm10k_iov_msg_lport_state_pf), FM10K_TLV_MSG_ERROR_HANDLER(fm10k_iov_msg_error), }; @@ -126,8 +249,10 @@ process_mbx: hw->mbx.ops.process(hw, &hw->mbx); /* verify port mapping is valid, if not reset port */ - if (vf_info->vf_flags && !fm10k_glort_valid_pf(hw, glort)) + if (vf_info->vf_flags && !fm10k_glort_valid_pf(hw, glort)) { hw->iov.ops.reset_lport(hw, vf_info); + fm10k_clear_macvlan_queue(interface, glort, false); + } /* reset VFs that have mailbox timed out */ if (!mbx->timeout) { @@ -190,6 +315,7 @@ void fm10k_iov_suspend(struct pci_dev *pdev) hw->iov.ops.reset_resources(hw, vf_info); hw->iov.ops.reset_lport(hw, vf_info); + fm10k_clear_macvlan_queue(interface, vf_info->glort, false); } } @@ -414,6 +540,8 @@ static inline void fm10k_reset_vf_info(struct fm10k_intfc *interface, /* disable LPORT for this VF which clears switch rules */ hw->iov.ops.reset_lport(hw, vf_info); + fm10k_clear_macvlan_queue(interface, vf_info->glort, false); + /* assign new MAC+VLAN for this VF */ hw->iov.ops.assign_default_mac_vlan(hw, vf_info); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c index 9e4fb3a44376..425d814aed4d 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c @@ -1186,7 +1186,7 @@ s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *hw, u32 **results, * Will report an error if the VLAN ID is out of range. 
For VID = 0, it will * return either the pf_vid or sw_vid depending on which one is set. */ -static s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid) +s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid) { if (!vid) return vf_info->pf_vid ? vf_info->pf_vid : vf_info->sw_vid; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h index 3336d3c10760..e04d41f1a532 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h @@ -1,5 +1,5 @@ /* Intel(R) Ethernet Switch Host Interface Driver - * Copyright(c) 2013 - 2016 Intel Corporation. + * Copyright(c) 2013 - 2017 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -114,6 +114,7 @@ extern const struct fm10k_tlv_attr fm10k_err_msg_attr[]; #define FM10K_PF_MSG_ERR_HANDLER(msg, func) \ FM10K_MSG_HANDLER(FM10K_PF_MSG_ID_##msg, fm10k_err_msg_attr, func) +s32 fm10k_iov_select_vid(struct fm10k_vf_info *vf_info, u16 vid); s32 fm10k_iov_msg_msix_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *); s32 fm10k_iov_msg_mac_vlan_pf(struct fm10k_hw *, u32 **, struct fm10k_mbx_info *); From ef57ab791c81b3a83c75a312b15b42f7440bb425 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 10 Jul 2017 13:23:19 -0700 Subject: [PATCH 1186/1322] fm10k: bump version number Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 189d52a8a605..5d56ed5ad7a6 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -28,7 +28,7 @@ #include "fm10k.h" -#define DRV_VERSION "0.21.7-k" +#define DRV_VERSION "0.22.1-k" #define DRV_SUMMARY "Intel(R) Ethernet Switch Host Interface Driver" const char fm10k_driver_version[] = DRV_VERSION; char fm10k_driver_name[] = "fm10k"; From c0ad8ef3df091ef179d78dccb810024612dcfa44 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 11 Aug 2017 09:17:15 -0700 Subject: [PATCH 1187/1322] fm10k: Fix misuse of net_ratelimit() Correct the backward logic using !net_ratelimit() Miscellanea: o Add a blank line before the error return label Signed-off-by: Joe Perches Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 5d56ed5ad7a6..dbd69310f263 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -806,9 +806,10 @@ static int fm10k_tso(struct fm10k_ring *tx_ring, tx_desc->mss = cpu_to_le16(skb_shinfo(skb)->gso_size); return 1; + err_vxlan: tx_ring->netdev->features &= ~NETIF_F_GSO_UDP_TUNNEL; - if (!net_ratelimit()) + if (net_ratelimit()) netdev_err(tx_ring->netdev, "TSO requested for unsupported tunnel, disabling offload\n"); return -1; From 87be98927eb0bfa5484dfbe5ba2f6b7f91dd9187 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 11 Aug 2017 11:14:37 -0700 Subject: [PATCH 1188/1322] fm10k: prefer %s and __func__ for diagnostic prints Don't hard code the function names in the diagnostic output when these reset related routines fail. 
Instead, use %s and __func__ so that future refactors don't need to change the print outs. Additionally, while we are here, add missing function header comments for the new reset_prepare and reset_done function handlers. Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 58538ce997e1..1e9ae3197b17 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -2588,11 +2588,18 @@ static void fm10k_io_resume(struct pci_dev *pdev) if (err) dev_warn(&pdev->dev, - "fm10k_io_resume failed: %d\n", err); + "%s failed: %d\n", __func__, err); else netif_device_attach(netdev); } +/** + * fm10k_io_reset_prepare - called when PCI function is about to be reset + * @pdev: Pointer to PCI device + * + * This callback is called when the PCI function is about to be reset, + * allowing the device driver to prepare for it. + */ static void fm10k_io_reset_prepare(struct pci_dev *pdev) { /* warn incase we have any active VF devices */ @@ -2602,6 +2609,13 @@ static void fm10k_io_reset_prepare(struct pci_dev *pdev) fm10k_prepare_suspend(pci_get_drvdata(pdev)); } +/** + * fm10k_io_reset_done - called when PCI function has finished resetting + * @pdev: Pointer to PCI device + * + * This callback is called just after the PCI function is reset, such as via + * /sys/class/net//device/reset or similar. + */ static void fm10k_io_reset_done(struct pci_dev *pdev) { struct fm10k_intfc *interface = pci_get_drvdata(pdev); @@ -2609,7 +2623,7 @@ static void fm10k_io_reset_done(struct pci_dev *pdev) if (err) { dev_warn(&pdev->dev, - "fm10k_io_reset_notify failed: %d\n", err); + "%s failed: %d\n", __func__, err); netif_device_detach(interface->netdev); } } From 3e256ac5b1ec307e5dd5a4c99fbdbc651446c738 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 11 Aug 2017 11:14:58 -0700 Subject: [PATCH 1189/1322] fm10k: fix mis-ordered parameters in declaration for .ndo_set_vf_bw We've had support for setting both a minimum and maximum bandwidth via .ndo_set_vf_bw since commit 883a9ccbae56 ("fm10k: Add support for SR-IOV to driver", 2014-09-20). Likely because we do not support minimum rates, the declaration mis-ordered the "unused" parameter, which causes warnings when analyzed with cppcheck. Fix this warning by properly declaring the min_rate and max_rate variables in the declaration and definition (rather than using "unused"). Also rename "rate" to max_rate so as to clarify that we only support setting the maximum rate. 
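For reference, a minimal sketch (not part of this patch) of how the handler is plugged into the stack: fm10k exposes it through the core .ndo_set_vf_rate hook, which is invoked as (netdev, vf_idx, min_tx_rate, max_tx_rate), so the declaration and definition must keep the minimum-rate parameter first even though this driver ignores it:

	/* illustrative only; the struct name below is made up for the example */
	static const struct net_device_ops fm10k_netdev_ops_sketch = {
		.ndo_set_vf_rate = fm10k_ndo_set_vf_bw,	/* min rate is unused */
	};
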
Signed-off-by: Jacob Keller Tested-by: Krishneil Singh Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/fm10k/fm10k.h | 4 ++-- drivers/net/ethernet/intel/fm10k/fm10k_iov.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index 40856bc0f3b9..46973fb234c5 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -562,8 +562,8 @@ s32 fm10k_iov_update_pvid(struct fm10k_intfc *interface, u16 glort, u16 pvid); int fm10k_ndo_set_vf_mac(struct net_device *netdev, int vf_idx, u8 *mac); int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, u8 qos, __be16 vlan_proto); -int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, int rate, - int unused); +int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, + int __always_unused min_rate, int max_rate); int fm10k_ndo_get_vf_config(struct net_device *netdev, int vf_idx, struct ifla_vf_info *ivi); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c index 4a17cc903eed..ea3ab24265ee 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c @@ -613,7 +613,7 @@ int fm10k_ndo_set_vf_vlan(struct net_device *netdev, int vf_idx, u16 vid, } int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, - int __always_unused unused, int rate) + int __always_unused min_rate, int max_rate) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_iov_data *iov_data = interface->iov_data; @@ -624,14 +624,15 @@ int fm10k_ndo_set_vf_bw(struct net_device *netdev, int vf_idx, return -EINVAL; /* rate limit cannot be less than 10Mbs or greater than link speed */ - if (rate && ((rate < FM10K_VF_TC_MIN) || rate > FM10K_VF_TC_MAX)) + if (max_rate && + (max_rate < FM10K_VF_TC_MIN || max_rate > FM10K_VF_TC_MAX)) return -EINVAL; /* store values */ - iov_data->vf_info[vf_idx].rate = rate; + iov_data->vf_info[vf_idx].rate = max_rate; /* update hardware configuration */ - hw->iov.ops.configure_tc(hw, vf_idx, rate); + hw->iov.ops.configure_tc(hw, vf_idx, max_rate); return 0; } From a047fbae23e1d94da28f81fb0f86fab4e473a094 Mon Sep 17 00:00:00 2001 From: Arjun Vynipadath Date: Tue, 3 Oct 2017 11:43:05 +0530 Subject: [PATCH 1190/1322] cxgb4: Update comment for min_mtu We have lost a comment for minimum mtu value set for netdevice with 'commit d894be57ca92 ("ethernet: use net core MTU range checking in more drivers"). Updating it accordingly. Signed-off-by: Arjun Vynipadath Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 13b636b0af5f..fe4cbe22d5d7 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -5024,7 +5024,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->priv_flags |= IFF_UNICAST_FLT; /* MTU range: 81 - 9600 */ - netdev->min_mtu = 81; + netdev->min_mtu = 81; /* accommodate SACK */ netdev->max_mtu = MAX_MTU; netdev->netdev_ops = &cxgb4_netdev_ops; From abf4bb6b63d0a54266f8e7eff3720c1974063971 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:06 +0200 Subject: [PATCH 1191/1322] skbuff: Add the offload_mr_fwd_mark field Similarly to the offload_fwd_mark field, the offload_mr_fwd_mark field is used to allow partial offloading of MFC multicast routes. Switchdev drivers can offload MFC multicast routes to the hardware by registering to the FIB notification chain. When one of the route output interfaces is not offload-able, i.e. has different parent ID, the route cannot be fully offloaded by the hardware. Examples to non-offload-able devices are a management NIC, dummy device, pimreg device, etc. Similar problem exists in the bridge module, as one bridge can hold interfaces with different parent IDs. At the bridge, the problem is solved by the offload_fwd_mark skb field. Currently, when a route cannot go through full offload, the only solution for a switchdev driver is not to offload it at all and let the packet go through slow path. Using the offload_mr_fwd_mark field, a driver can indicate that a packet was already forwarded by hardware to all the devices with the same parent ID as the input device. Further patches in this patch-set are going to enhance ipmr to skip multicast forwarding to devices with the same parent ID if a packets is marked with that field. The reason why the already existing "offload_fwd_mark" bit cannot be used is that a switchdev driver would want to make the distinction between a packet that has already gone through L2 forwarding but did not go through multicast forwarding, and a packet that has already gone through both L2 and multicast forwarding. For example: when a packet is ingressing from a switchport enslaved to a bridge, which is configured with multicast forwarding, the following scenarios are possible: - The packet can be trapped to the CPU due to exception while multicast forwarding (for example, MTU error). In that case, it had already gone through L2 forwarding in the hardware, thus A switchdev driver would want to set the skb->offload_fwd_mark and not the skb->offload_mr_fwd_mark. - The packet can also be trapped due to a pimreg/dummy device used as one of the output interfaces. In that case, it can go through both L2 and (partial) multicast forwarding inside the hardware, thus a switchdev driver would want to set both the skb->offload_fwd_mark and skb->offload_mr_fwd_mark. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 19e64bfb1a66..ada821466e88 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -772,6 +772,7 @@ struct sk_buff { __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; + __u8 offload_mr_fwd_mark:1; #endif #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; From 5d8b3e69fc5e5ccafc9db1251bb7c78a8622fddd Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:07 +0200 Subject: [PATCH 1192/1322] ipv4: ipmr: Add the parent ID field to VIF struct In order to allow the ipmr module to do partial multicast forwarding according to the device parent ID, add the device parent ID field to the VIF struct. This way, the forwarding path can use the parent ID field without invoking switchdev calls, which requires the RTNL lock. When a new VIF is added, set the device parent ID field in it by invoking the switchdev_port_attr_get call. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 1 + net/ipv4/ipmr.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/mroute.h b/include/linux/mroute.h index b072a84fbe1c..8242d05df35e 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -57,6 +57,7 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) struct vif_device { struct net_device *dev; /* Device we are using */ + struct netdev_phys_item_id dev_parent_id; /* Device parent ID */ unsigned long bytes_in,bytes_out; unsigned long pkt_in,pkt_out; /* Statistics */ unsigned long rate_limit; /* Traffic shaping (NI) */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a844738b38bd..1b161ada7ae6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include #include #include +#include struct ipmr_rule { struct fib_rule common; @@ -868,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -942,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; From a5bc9294d70fe85729bb343eef281ccbe78ff119 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:08 +0200 Subject: [PATCH 1193/1322] ipv4: ipmr: Don't forward packets already forwarded by hardware Change the ipmr module to not forward packets if: - The packet is marked with the offload_mr_fwd_mark, and - Both input interface and output interface share the same parent ID. This way, a packet can go through partial multicast forwarding in the hardware, where it will be forwarded only to the devices that share the same parent ID (AKA, reside inside the same hardware). The kernel will forward the packet to all other interfaces. 
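In rough pseudo-C (illustrative sketch only; same_hw_parent() is a made-up stand-in for the parent-ID comparison the patch performs), the per-egress-VIF decision described above is:

	/* hardware already forwarded this packet to every port behind the
	 * same parent ID, so skip software forwarding out of this VIF
	 */
	if (skb->offload_mr_fwd_mark && same_hw_parent(in_vif, out_vif))
		goto out_free;	/* free the clone instead of transmitting it */
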
To do this, add the ipmr_offload_forward helper, which per skb, ingress VIF and egress VIF, returns whether the forwarding was offloaded to hardware. The ipmr_queue_xmit frees the skb and does not forward it if the result is a true value. All the forwarding path code compiles out when the CONFIG_NET_SWITCHDEV is not set. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1b161ada7ae6..b3ee01b0551b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1859,10 +1859,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } +#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1883,6 +1906,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -2060,8 +2086,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -2072,9 +2098,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } From 267872435515185e2e600a721fdddeea90f96ffa Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:09 +0200 Subject: [PATCH 1194/1322] mlxsw: acl: Introduce ACL trap and forward action Use trap/discard flex action to implement trap and forward. The action will later be used for multicast routing, as the multicast routing mechanism is done using ACL flexible actions in Spectrum hardware. Using that action, it will be possible to implement a trap-and-forward route. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../mellanox/mlxsw/core_acl_flex_actions.c | 17 +++++++++++++++++ .../mellanox/mlxsw/core_acl_flex_actions.h | 2 ++ 2 files changed, 19 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c index bc55d0e76705..6a979a09ab72 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c @@ -676,6 +676,7 @@ enum mlxsw_afa_trapdisc_trap_action { MLXSW_ITEM32(afa, trapdisc, trap_action, 0x00, 24, 4); enum mlxsw_afa_trapdisc_forward_action { + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD = 1, MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD = 3, }; @@ -729,6 +730,22 @@ int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id) } EXPORT_SYMBOL(mlxsw_afa_block_append_trap); +int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block, + u16 trap_id) +{ + char *act = mlxsw_afa_block_append_action(block, + MLXSW_AFA_TRAPDISC_CODE, + MLXSW_AFA_TRAPDISC_SIZE); + + if (!act) + return -ENOBUFS; + mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP, + MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD, + trap_id); + return 0; +} +EXPORT_SYMBOL(mlxsw_afa_block_append_trap_and_forward); + /* Forwarding Action * ----------------- * Forwarding Action can be used to implement Policy Based Switching (PBS) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h index 06b0be432b8f..a8d3314c3a24 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h @@ -61,6 +61,8 @@ int mlxsw_afa_block_continue(struct mlxsw_afa_block *block); int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id); int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block); int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id); +int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block, + u16 trap_id); int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block, u8 local_port, bool in_port); int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block, From a0040c8c935548e1efb1a28f07f15d7ec7918055 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:10 +0200 Subject: [PATCH 1195/1322] mlxsw: spectrum: Add trap for multicast trap-and-forward routes When a multicast route is configured with trap-and-forward action, the packets should be marked with skb->offload_mr_fwd_mark, in order to prevent the packets from being forwarded again by the kernel ipmr module. Due to this, it is not possible to use the already existing multicast trap (MLXSW_TRAP_ID_ACL1) as the packet should be marked differently. Add the MLXSW_TRAP_ID_ACL2 which is for trap-and-forward multicast routes, and set the offload_mr_fwd_mark skb field in its handler. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 13 +++++++++++++ drivers/net/ethernet/mellanox/mlxsw/trap.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index e9b94430afed..3adf237c951a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3312,6 +3312,14 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port, return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); } +static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb, + u8 local_port, void *priv) +{ + skb->offload_mr_fwd_mark = 1; + skb->offload_fwd_mark = 1; + return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); +} + static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, void *priv) { @@ -3355,6 +3363,10 @@ out: MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action, \ _is_ctrl, SP_##_trap_group, DISCARD) +#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ + MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action, \ + _is_ctrl, SP_##_trap_group, DISCARD) + #define MLXSW_SP_EVENTL(_func, _trap_id) \ MLXSW_EVENTL(_func, _trap_id, SP_EVENT) @@ -3425,6 +3437,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { MLXSW_SP_RXL_MARK(IPV4_PIM, TRAP_TO_CPU, PIM, false), MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false), MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), + MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), }; static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index a98103539f6b..ec6cef8267ae 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -93,6 +93,8 @@ enum { MLXSW_TRAP_ID_ACL0 = 0x1C0, /* Multicast trap used for routes with trap action */ MLXSW_TRAP_ID_ACL1 = 0x1C1, + /* Multicast trap used for routes with trap-and-forward action */ + MLXSW_TRAP_ID_ACL2 = 0x1C2, MLXSW_TRAP_ID_MAX = 0x1FF }; From 607feadef89ac806df5a0be983afef77247e1541 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:11 +0200 Subject: [PATCH 1196/1322] mlxsw: spectrum: mr_tcam: Add trap-and-forward multicast route In addition to the current multicast route actions, which include trap route action and a forward route action, add the trap-and-forward multicast route action, and implement it in the multicast routing hardware logic. To implement that, add a trap-and-forward ACL action as the last action in the route flexible action set. The used trap is the ACL2 trap, which marks the packets with offload_mr_forward_mark, to prevent the packet from being forwarded again by the kernel. Note: At that stage the offloading logic does not support trap-and-forward multicast routes. This patch adds the support only in the hardware logic. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h | 1 + drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h index c851b237d253..5d26a122af49 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h @@ -42,6 +42,7 @@ enum mlxsw_sp_mr_route_action { MLXSW_SP_MR_ROUTE_ACTION_FORWARD, MLXSW_SP_MR_ROUTE_ACTION_TRAP, + MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD, }; enum mlxsw_sp_mr_route_prio { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c index cda9e9ad10e3..3ffb28dd4057 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c @@ -253,6 +253,7 @@ mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp, if (err) goto err; break; + case MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD: case MLXSW_SP_MR_ROUTE_ACTION_FORWARD: /* If we are about to append a multicast router action, commit * the erif_list. @@ -266,6 +267,13 @@ mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp, erif_list->kvdl_index); if (err) goto err; + + if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD) { + err = mlxsw_afa_block_append_trap_and_forward(afa_block, + MLXSW_TRAP_ID_ACL2); + if (err) + goto err; + } break; default: err = -EINVAL; From f60c254998de80feaec8e4122960ab64e8045214 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 3 Oct 2017 09:58:12 +0200 Subject: [PATCH 1197/1322] mlxsw: spectrum: mr: Support trap-and-forward routes Add the support of trap-and-forward route action in the multicast routing offloading logic. A route will be set to trap-and-forward action if one (or more) of its output interfaces is not offload-able, i.e. does not have a valid Spectrum RIF. This way, a route with mixed output VIFs list, which contains both offload-able and un-offload-able devices can go through partial offloading in hardware, and the rest will be done in the kernel ipmr module. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_mr.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c index 4aaf6ca1be7c..1f84bb8e9135 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -114,9 +114,9 @@ static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif) return mlxsw_sp_mr_vif_regular(vif) && vif->dev && vif->rif; } -static bool mlxsw_sp_mr_vif_rif_invalid(const struct mlxsw_sp_mr_vif *vif) +static bool mlxsw_sp_mr_vif_exists(const struct mlxsw_sp_mr_vif *vif) { - return mlxsw_sp_mr_vif_regular(vif) && vif->dev && !vif->rif; + return vif->dev; } static bool @@ -182,14 +182,13 @@ mlxsw_sp_mr_route_action(const struct mlxsw_sp_mr_route *mr_route) if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route)) return MLXSW_SP_MR_ROUTE_ACTION_TRAP; - /* If either one of the eVIFs is not regular (VIF of type pimreg or - * tunnel) or one of the VIFs has no matching RIF, trap the packet. + /* If one of the eVIFs has no RIF, trap-and-forward the route as there + * is some more routing to do in software too. 
*/ - list_for_each_entry(rve, &mr_route->evif_list, route_node) { - if (!mlxsw_sp_mr_vif_regular(rve->mr_vif) || - mlxsw_sp_mr_vif_rif_invalid(rve->mr_vif)) - return MLXSW_SP_MR_ROUTE_ACTION_TRAP; - } + list_for_each_entry(rve, &mr_route->evif_list, route_node) + if (mlxsw_sp_mr_vif_exists(rve->mr_vif) && !rve->mr_vif->rif) + return MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD; + return MLXSW_SP_MR_ROUTE_ACTION_FORWARD; } From ce024f42c2e28b6bce4ecc1e891b42f57f753892 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 3 Oct 2017 13:20:48 +0300 Subject: [PATCH 1198/1322] net: rtnetlink: fix info leak in RTM_GETSTATS call When RTM_GETSTATS was added the fields of its header struct were not all initialized when returning the result thus leaking 4 bytes of information to user-space per rtnl_fill_statsinfo call, so initialize them now. Thanks to Alexander Potapenko for the detailed report and bisection. Reported-by: Alexander Potapenko Fixes: 10c9ead9f3c6 ("rtnetlink: add new RTM_GETSTATS message to dump link stats") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..d4bcdcc68e92 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3854,6 +3854,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; From 161ae6b04d5da0de518ea600d8fffe61208e243c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 11:39:18 +0100 Subject: [PATCH 1199/1322] net: dsa: lan9303: make functions lan9303_mdio_phy_{read|write} static The functions lan9303_mdio_phy_write and lan9303_mdio_phy_read are local to the source and do not need to be in global scope, so make them static. Cleans up sparse warnings: symbol 'lan9303_mdio_phy_write' was not declared. Should it be static? symbol 'lan9303_mdio_phy_read' was not declared. Should it be static? Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/lan9303_mdio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/lan9303_mdio.c b/drivers/net/dsa/lan9303_mdio.c index fc16668a487f..0bc56b9900f9 100644 --- a/drivers/net/dsa/lan9303_mdio.c +++ b/drivers/net/dsa/lan9303_mdio.c @@ -67,14 +67,15 @@ static int lan9303_mdio_read(void *ctx, uint32_t reg, uint32_t *val) return 0; } -int lan9303_mdio_phy_write(struct lan9303 *chip, int phy, int reg, u16 val) +static int lan9303_mdio_phy_write(struct lan9303 *chip, int phy, int reg, + u16 val) { struct lan9303_mdio *sw_dev = dev_get_drvdata(chip->dev); return mdiobus_write_nested(sw_dev->device->bus, phy, reg, val); } -int lan9303_mdio_phy_read(struct lan9303 *chip, int phy, int reg) +static int lan9303_mdio_phy_read(struct lan9303 *chip, int phy, int reg) { struct lan9303_mdio *sw_dev = dev_get_drvdata(chip->dev); From 360cc342c9037ed487adf07212000f69c0ffa14d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 11:46:33 +0100 Subject: [PATCH 1200/1322] net: dsa: mt7530: make functions mt7530_phy_write static The function mt7530_phy_write is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warnings: symbol 'mt7530_phy_write' was not declared. Should it be static? 
Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index faa3b88d2206..034241696ce2 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -564,7 +564,8 @@ static int mt7530_phy_read(struct dsa_switch *ds, int port, int regnum) return mdiobus_read_nested(priv->bus, port, regnum); } -int mt7530_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) +static int mt7530_phy_write(struct dsa_switch *ds, int port, int regnum, + u16 val) { struct mt7530_priv *priv = ds->priv; From b508e0b6e47c85a095ef056f3de6ba9d396c490c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 3 Oct 2017 13:53:03 +0300 Subject: [PATCH 1201/1322] mlxsw: spectrum: Fix check for IS_ERR() instead of NULL mlxsw_afa_block_create() doesn't return error pointers, it returns NULL on error. Fixes: 0e14c7777acb ("mlxsw: spectrum: Add the multicast routing hardware logic") Signed-off-by: Dan Carpenter Acked-by: Yotam Gigi Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c index 3ffb28dd4057..3a61896ae4d8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c @@ -239,8 +239,8 @@ mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp, int err; afa_block = mlxsw_afa_block_create(mlxsw_sp->afa); - if (IS_ERR(afa_block)) - return afa_block; + if (!afa_block) + return ERR_PTR(-ENOMEM); err = mlxsw_afa_block_append_counter(afa_block, counter_index); if (err) From b5c7d4e54c9ab830e5c03f92377fe15cbae64d0d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 3 Oct 2017 13:53:41 +0300 Subject: [PATCH 1202/1322] mlxsw: spectrum: Add missing error code on allocation failure We accidentally return success if the kmalloc_array() call fails. Fixes: 0e14c7777acb ("mlxsw: spectrum: Add the multicast routing hardware logic") Signed-off-by: Dan Carpenter Acked-by: Yotam Gigi Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c index 3a61896ae4d8..39c21c70ac32 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c @@ -771,8 +771,10 @@ mlxsw_sp_mr_tcam_region_init(struct mlxsw_sp *mlxsw_sp, parman_prios = kmalloc_array(MLXSW_SP_MR_ROUTE_PRIO_MAX + 1, sizeof(*parman_prios), GFP_KERNEL); - if (!parman_prios) + if (!parman_prios) { + err = -ENOMEM; goto err_parman_prios_alloc; + } mr_tcam_region->parman_prios = parman_prios; for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++) From 63ba395cd7a52431cbb61658dad3beb5b24e9300 Mon Sep 17 00:00:00 2001 From: Aleksander Morgado Date: Wed, 27 Sep 2017 23:31:03 +0200 Subject: [PATCH 1203/1322] rndis_host: support Novatel Verizon USB730L MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat the ef/04/01 interface class/subclass/protocol combination used by the Novatel Verizon USB730L (1410:9030) as a possible RNDIS interface. 
T: Bus=01 Lev=02 Prnt=02 Port=01 Cnt=02 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 3 P: Vendor=1410 ProdID=9030 Rev=03.10 S: Manufacturer=Novatel Wireless S: Product=MiFi USB730L S: SerialNumber=0123456789ABCDEF C: #Ifs= 3 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host I: If#= 2 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid Once the network interface is brought up, the user just needs to run a DHCP client to get IP address and routing setup. As a side note, other Novatel Verizon USB730L models with the same vid:pid end up exposing a standard ECM interface which doesn't require any other kernel update to make it work. Signed-off-by: Aleksander Morgado Reviewed-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 11 ++++++++++- drivers/net/usb/rndis_host.c | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 677a85360db1..29c7e2ec0dcb 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -54,11 +54,19 @@ static int is_wireless_rndis(struct usb_interface_descriptor *desc) desc->bInterfaceProtocol == 3); } +static int is_novatel_rndis(struct usb_interface_descriptor *desc) +{ + return (desc->bInterfaceClass == USB_CLASS_MISC && + desc->bInterfaceSubClass == 4 && + desc->bInterfaceProtocol == 1); +} + #else #define is_rndis(desc) 0 #define is_activesync(desc) 0 #define is_wireless_rndis(desc) 0 +#define is_novatel_rndis(desc) 0 #endif @@ -150,7 +158,8 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf) */ rndis = (is_rndis(&intf->cur_altsetting->desc) || is_activesync(&intf->cur_altsetting->desc) || - is_wireless_rndis(&intf->cur_altsetting->desc)); + is_wireless_rndis(&intf->cur_altsetting->desc) || + is_novatel_rndis(&intf->cur_altsetting->desc)); memset(info, 0, sizeof(*info)); info->control = intf; diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index a151f267aebb..b807c91abe1d 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -632,6 +632,10 @@ static const struct usb_device_id products [] = { /* RNDIS for tethering */ USB_INTERFACE_INFO(USB_CLASS_WIRELESS_CONTROLLER, 1, 3), .driver_info = (unsigned long) &rndis_info, +}, { + /* Novatel Verizon USB730L */ + USB_INTERFACE_INFO(USB_CLASS_MISC, 4, 1), + .driver_info = (unsigned long) &rndis_info, }, { }, // END }; From 4d2c0cda07448ea6980f00102dc3964eb25e241c Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 27 Sep 2017 18:03:49 -0700 Subject: [PATCH 1204/1322] bonding: speed/duplex update at NETDEV_UP event Some NIC drivers don't have correct speed/duplex settings at the time they send NETDEV_UP notification and that messes up the bonding state. Especially 802.3ad mode which is very sensitive to these settings. In the current implementation we invoke bond_update_speed_duplex() when we receive NETDEV_UP, however, ignore the return value. If the values we get are invalid (UNKNOWN), then slave gets removed from the aggregator with speed and duplex set to UNKNOWN while link is still marked as UP. This patch fixes this scenario. Also 802.3ad mode is sensitive to these conditions while other modes are not, so making sure that it doesn't change the behavior for other modes. Signed-off-by: Mahesh Bandewar Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index d2e94b8559f0..b19dc033fb36 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3073,7 +3073,16 @@ static int bond_slave_netdev_event(unsigned long event, break; case NETDEV_UP: case NETDEV_CHANGE: - bond_update_speed_duplex(slave); + /* For 802.3ad mode only: + * Getting invalid Speed/Duplex values here will put slave + * in weird state. So mark it as link-down for the time + * being and let link-monitoring (miimon) set it right when + * correct speeds/duplex are available. + */ + if (bond_update_speed_duplex(slave) && + BOND_MODE(bond) == BOND_MODE_8023AD) + slave->link = BOND_LINK_DOWN; + if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); /* Fallthrough */ From 05946876f0c16f6fe1db692d575aba42b25f0811 Mon Sep 17 00:00:00 2001 From: David Wu Date: Sat, 30 Sep 2017 17:47:23 +0800 Subject: [PATCH 1205/1322] net: stmmac: dwmac-rk: Add RK3128 GMAC support Add constants and callback functions for the dwmac on rk3128 soc. As can be seen, the base structure is the same, only registers and the bits in them moved slightly. Signed-off-by: David Wu Signed-off-by: David S. Miller --- .../bindings/net/rockchip-dwmac.txt | 1 + .../net/ethernet/stmicro/stmmac/dwmac-rk.c | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt index 6af8eed1adeb..9c16ee2965a2 100644 --- a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt +++ b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt @@ -4,6 +4,7 @@ The device node has following properties. Required properties: - compatible: should be "rockchip,-gamc" + "rockchip,rk3128-gmac": found on RK312x SoCs "rockchip,rk3228-gmac": found on RK322x SoCs "rockchip,rk3288-gmac": found on RK3288 SoCs "rockchip,rk3328-gmac": found on RK3328 SoCs diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 99823f54696a..13133b30b575 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -83,6 +83,117 @@ struct rk_priv_data { (((tx) ? soc##_GMAC_TXCLK_DLY_ENABLE : soc##_GMAC_TXCLK_DLY_DISABLE) | \ ((rx) ? 
soc##_GMAC_RXCLK_DLY_ENABLE : soc##_GMAC_RXCLK_DLY_DISABLE)) +#define RK3128_GRF_MAC_CON0 0x0168 +#define RK3128_GRF_MAC_CON1 0x016c + +/* RK3128_GRF_MAC_CON0 */ +#define RK3128_GMAC_TXCLK_DLY_ENABLE GRF_BIT(14) +#define RK3128_GMAC_TXCLK_DLY_DISABLE GRF_CLR_BIT(14) +#define RK3128_GMAC_RXCLK_DLY_ENABLE GRF_BIT(15) +#define RK3128_GMAC_RXCLK_DLY_DISABLE GRF_CLR_BIT(15) +#define RK3128_GMAC_CLK_RX_DL_CFG(val) HIWORD_UPDATE(val, 0x7F, 7) +#define RK3128_GMAC_CLK_TX_DL_CFG(val) HIWORD_UPDATE(val, 0x7F, 0) + +/* RK3128_GRF_MAC_CON1 */ +#define RK3128_GMAC_PHY_INTF_SEL_RGMII \ + (GRF_BIT(6) | GRF_CLR_BIT(7) | GRF_CLR_BIT(8)) +#define RK3128_GMAC_PHY_INTF_SEL_RMII \ + (GRF_CLR_BIT(6) | GRF_CLR_BIT(7) | GRF_BIT(8)) +#define RK3128_GMAC_FLOW_CTRL GRF_BIT(9) +#define RK3128_GMAC_FLOW_CTRL_CLR GRF_CLR_BIT(9) +#define RK3128_GMAC_SPEED_10M GRF_CLR_BIT(10) +#define RK3128_GMAC_SPEED_100M GRF_BIT(10) +#define RK3128_GMAC_RMII_CLK_25M GRF_BIT(11) +#define RK3128_GMAC_RMII_CLK_2_5M GRF_CLR_BIT(11) +#define RK3128_GMAC_CLK_125M (GRF_CLR_BIT(12) | GRF_CLR_BIT(13)) +#define RK3128_GMAC_CLK_25M (GRF_BIT(12) | GRF_BIT(13)) +#define RK3128_GMAC_CLK_2_5M (GRF_CLR_BIT(12) | GRF_BIT(13)) +#define RK3128_GMAC_RMII_MODE GRF_BIT(14) +#define RK3128_GMAC_RMII_MODE_CLR GRF_CLR_BIT(14) + +static void rk3128_set_to_rgmii(struct rk_priv_data *bsp_priv, + int tx_delay, int rx_delay) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_PHY_INTF_SEL_RGMII | + RK3128_GMAC_RMII_MODE_CLR); + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON0, + DELAY_ENABLE(RK3128, tx_delay, rx_delay) | + RK3128_GMAC_CLK_RX_DL_CFG(rx_delay) | + RK3128_GMAC_CLK_TX_DL_CFG(tx_delay)); +} + +static void rk3128_set_to_rmii(struct rk_priv_data *bsp_priv) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_PHY_INTF_SEL_RMII | RK3128_GMAC_RMII_MODE); +} + +static void rk3128_set_rgmii_speed(struct rk_priv_data *bsp_priv, int speed) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + if (speed == 10) + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_CLK_2_5M); + else if (speed == 100) + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_CLK_25M); + else if (speed == 1000) + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_CLK_125M); + else + dev_err(dev, "unknown speed value for RGMII! speed=%d", speed); +} + +static void rk3128_set_rmii_speed(struct rk_priv_data *bsp_priv, int speed) +{ + struct device *dev = &bsp_priv->pdev->dev; + + if (IS_ERR(bsp_priv->grf)) { + dev_err(dev, "Missing rockchip,grf property\n"); + return; + } + + if (speed == 10) { + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_RMII_CLK_2_5M | + RK3128_GMAC_SPEED_10M); + } else if (speed == 100) { + regmap_write(bsp_priv->grf, RK3128_GRF_MAC_CON1, + RK3128_GMAC_RMII_CLK_25M | + RK3128_GMAC_SPEED_100M); + } else { + dev_err(dev, "unknown speed value for RMII! 
speed=%d", speed); + } +} + +static const struct rk_gmac_ops rk3128_ops = { + .set_to_rgmii = rk3128_set_to_rgmii, + .set_to_rmii = rk3128_set_to_rmii, + .set_rgmii_speed = rk3128_set_rgmii_speed, + .set_rmii_speed = rk3128_set_rmii_speed, +}; + #define RK3228_GRF_MAC_CON0 0x0900 #define RK3228_GRF_MAC_CON1 0x0904 @@ -1313,6 +1424,7 @@ static int rk_gmac_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(rk_gmac_pm_ops, rk_gmac_suspend, rk_gmac_resume); static const struct of_device_id rk_gmac_dwmac_match[] = { + { .compatible = "rockchip,rk3128-gmac", .data = &rk3128_ops }, { .compatible = "rockchip,rk3228-gmac", .data = &rk3228_ops }, { .compatible = "rockchip,rk3288-gmac", .data = &rk3288_ops }, { .compatible = "rockchip,rk3328-gmac", .data = &rk3328_ops }, From 6c5570016b972d9b1f0f6c2dca9cc0422b1f92bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 2 Oct 2017 23:50:05 +0200 Subject: [PATCH 1206/1322] net: core: decouple ifalias get/set from rtnl lock Device alias can be set by either rtnetlink (rtnl is held) or sysfs. rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose. Add an extra mutex for it and use rcu to protect concurrent accesses. This allows the sysfs path to not take rtnl and would later allow to not hold it when dumping ifalias. Based on suggestion from Eric Dumazet. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++++- net/core/dev.c | 52 +++++++++++++++++++++++++++++---------- net/core/net-sysfs.c | 17 ++++++------- net/core/rtnetlink.c | 13 ++++++++-- 4 files changed, 65 insertions(+), 25 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e1d6ef130611..d04424cfffba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -826,6 +826,11 @@ struct xfrmdev_ops { }; #endif +struct dev_ifalias { + struct rcu_head rcuhead; + char ifalias[]; +}; + /* * This structure defines the management hooks for network devices. 
* The following hooks can be defined; unless noted otherwise, they are @@ -1632,7 +1637,7 @@ enum netdev_priv_flags { struct net_device { char name[IFNAMSIZ]; struct hlist_node name_hlist; - char *ifalias; + struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one @@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); +int dev_get_alias(const struct net_device *, char *, size_t); int dev_change_net_namespace(struct net_device *, struct net *, const char *); int __dev_set_mtu(struct net_device *, int); int dev_set_mtu(struct net_device *, int); diff --git a/net/core/dev.c b/net/core/dev.c index e350c768d4b5..1770097cfd86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id); DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +static DEFINE_MUTEX(ifalias_mutex); + /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); @@ -1265,29 +1267,53 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; - memcpy(dev->ifalias, alias, len); - dev->ifalias[len] = 0; + mutex_lock(&ifalias_mutex); + rcu_swap_protected(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @alias: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 927a6dcbad96..51d5836d8fb9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); ret = dev_set_alias(netdev, buf, count); - rtnl_unlock(); return ret < 0 ? 
ret : len; } @@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; ssize_t ret = 0; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e6955da0d58d..3961f87cdc76 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1366,6 +1366,16 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) return nla_put_u32(skb, IFLA_LINK, ifindex); } +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev) { @@ -1425,8 +1435,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_changes)) || nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Oct 2017 15:37:20 -0700 Subject: [PATCH 1207/1322] bpf: fix bpf_tail_call() x64 JIT - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- arch/x86/net/bpf_jit_comp.c | 4 ++-- include/uapi/linux/bpf.h | 2 +- kernel/bpf/core.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c9573660d51..0554e8aef4d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ab5c402f98..f90860d1f897 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,7 +312,7 @@ union bpf_attr { * jump into another BPF program * @ctx: context pointer passed to next program * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run + * @index: 32-bit index inside array that selects specific program to run * Return: 0 on success or negative error * * int bpf_clone_redirect(skb, ifindex, flags) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..7b62df86be1d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1022,7 +1022,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; From 1ae2eaaa229bc350b6f38fbf4ab9c873532aecfb Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:08 -0300 Subject: [PATCH 1208/1322] sctp: silence warns on sctp_stream_init allocations As SCTP supports up to 65535 streams, that can lead to very large allocations in sctp_stream_init(). As Xin Long noticed, systems with small amounts of memory are more prone to not have enough memory and dump warnings on dmesg initiated by user actions. Thus, silence them. Also, if the reallocation of stream->out is not necessary, skip it and keep the memory we already have. Reported-by: Xin Long Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 63ea15503714..1afa95558083 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -40,9 +40,14 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, { int i; + gfp |= __GFP_NOWARN; + /* Initial stream->out size may be very big, so free it and alloc - * a new one with new outcnt to save memory. + * a new one with new outcnt to save memory if needed. 
*/ + if (outcnt == stream->outcnt) + goto in; + kfree(stream->out); stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); @@ -53,6 +58,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; +in: if (!incnt) return 0; From e090abd0d81c7bdbf0c0ba2224624be2b15ded0d Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:09 -0300 Subject: [PATCH 1209/1322] sctp: factor out stream->out allocation There is 1 place allocating it and 2 other reallocating. Move such procedures to a common function. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 52 +++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 1afa95558083..6d0e997d301f 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -35,6 +35,30 @@ #include #include +static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, + gfp_t gfp) +{ + struct sctp_stream_out *out; + + out = kmalloc_array(outcnt, sizeof(*out), gfp); + if (!out) + return -ENOMEM; + + if (stream->out) { + memcpy(out, stream->out, min(outcnt, stream->outcnt) * + sizeof(*out)); + kfree(stream->out); + } + + if (outcnt > stream->outcnt) + memset(out + stream->outcnt, 0, + (outcnt - stream->outcnt) * sizeof(*out)); + + stream->out = out; + + return 0; +} + int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { @@ -48,11 +72,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, if (outcnt == stream->outcnt) goto in; - kfree(stream->out); - - stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); - if (!stream->out) - return -ENOMEM; + i = sctp_stream_alloc_out(stream, outcnt, gfp); + if (i) + return i; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) @@ -276,15 +298,9 @@ int sctp_send_add_streams(struct sctp_association *asoc, } if (out) { - struct sctp_stream_out *streamout; - - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_KERNEL); - if (!streamout) + retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); + if (retval) goto out; - - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; } chunk = sctp_make_strreset_addstrm(asoc, out, in); @@ -682,10 +698,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_out *streamout; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; + int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || @@ -714,14 +730,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( if (!out || outcnt > SCTP_MAX_STREAM) goto out; - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_ATOMIC); - if (!streamout) + ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); + if (ret) goto out; - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; - chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; From 1fdb8d8fefe2e7320ea15a65051758a4c4332f05 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:10 -0300 Subject: [PATCH 1210/1322] sctp: factor out stream->in allocation There is 1 place 
allocating it and another reallocating. Move such procedures to a common function. v2: updated changelog Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/stream.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 6d0e997d301f..952437d656cc 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -59,6 +59,31 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, return 0; } +static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, + gfp_t gfp) +{ + struct sctp_stream_in *in; + + in = kmalloc_array(incnt, sizeof(*stream->in), gfp); + + if (!in) + return -ENOMEM; + + if (stream->in) { + memcpy(in, stream->in, min(incnt, stream->incnt) * + sizeof(*in)); + kfree(stream->in); + } + + if (incnt > stream->incnt) + memset(in + stream->incnt, 0, + (incnt - stream->incnt) * sizeof(*in)); + + stream->in = in; + + return 0; +} + int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { @@ -84,8 +109,8 @@ in: if (!incnt) return 0; - stream->in = kcalloc(incnt, sizeof(*stream->in), gfp); - if (!stream->in) { + i = sctp_stream_alloc_in(stream, incnt, gfp); + if (i) { kfree(stream->out); stream->out = NULL; return -ENOMEM; @@ -623,7 +648,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_in *streamin; __u32 request_seq, incnt; __u16 in, i; @@ -670,13 +694,9 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( if (!in || incnt > SCTP_MAX_STREAM) goto out; - streamin = krealloc(stream->in, incnt * sizeof(*streamin), - GFP_ATOMIC); - if (!streamin) + if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; - memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); - stream->in = streamin; stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; From f952be79cebd49d04154781d99408867a069d375 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:11 -0300 Subject: [PATCH 1211/1322] sctp: introduce struct sctp_stream_out_ext With the stream schedulers, sctp_stream_out will become too big to be allocated by kmalloc and as we need to allocate with BH disabled, we cannot use __vmalloc in sctp_stream_init(). This patch moves out the stats from sctp_stream_out to sctp_stream_out_ext, which will be allocated only when the application tries to sendmsg something on it. Just the introduction of sctp_stream_out_ext would already fix the issue described above by splitting the allocation in two. Moving the stats to it also reduces the pressure on the allocator as we will ask for less memory atomically when creating the socket and we will use GFP_KERNEL later. Then, for stream schedulers, we will just use sctp_stream_out_ext. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 10 ++++++++-- net/sctp/chunk.c | 6 +++--- net/sctp/outqueue.c | 4 ++-- net/sctp/socket.c | 27 +++++++++++++++++++++------ net/sctp/stream.c | 16 ++++++++++++++++ 5 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 0477945de1a3..9b2b30b3ba4d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,6 +84,7 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; +struct sctp_stream_out; #include @@ -380,6 +381,7 @@ struct sctp_sender_hb_info { int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); @@ -1315,11 +1317,15 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_out_ext { + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; +}; + struct sctp_stream_out { __u16 ssn; __u8 state; - __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; - __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct sctp_stream_out_ext *ext; }; struct sctp_stream_in { diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 3afac275ee82..7b261afc47b9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -311,10 +311,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && @@ -323,7 +323,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 2966ff400755..746b07b7937d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -366,7 +366,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -404,7 +404,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_stream_out *streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; - streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } msg_len -= SCTP_DATA_SNDSIZE(chk) + diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d4730ada7f32..d207734326b0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1927,6 +1927,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + /* Allocate sctp_stream_out_ext if not already done */ + if 
(unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); + if (err) + goto out_free; + } + if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); @@ -6645,7 +6652,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_stream_out *streamout; + struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; @@ -6668,21 +6675,29 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamout = &asoc->stream.out[params.sprstat_sid]; + streamoute = asoc->stream.out[params.sprstat_sid].ext; + if (!streamoute) { + /* Not allocated yet, means all stats are 0 */ + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + retval = 0; + goto out; + } + if (policy == SCTP_PR_SCTP_NONE) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += - streamout->abandoned_unsent[policy]; + streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += - streamout->abandoned_sent[policy]; + streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = - streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = - streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, ¶ms, len)) { diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 952437d656cc..055ca25bbc91 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -121,8 +121,24 @@ in: return 0; } +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_out_ext *soute; + + soute = kzalloc(sizeof(*soute), GFP_KERNEL); + if (!soute) + return -ENOMEM; + stream->out[sid].ext = soute; + + return 0; +} + void sctp_stream_free(struct sctp_stream *stream) { + int i; + + for (i = 0; i < stream->outcnt; i++) + kfree(stream->out[i].ext); kfree(stream->out); kfree(stream->in); } From 2fc019f790312e703efa1a44204c586112a430dc Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:12 -0300 Subject: [PATCH 1212/1322] sctp: introduce sctp_chunk_stream_no Add a helper to fetch the stream number from a given chunk. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9b2b30b3ba4d..c48f7999fe9b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -642,6 +642,11 @@ void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); +static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch) +{ + return ntohs(ch->subh.data_hdr->stream); +} + enum { SCTP_ADDR_NEW, /* new address added to assoc/ep */ SCTP_ADDR_SRC, /* address can be used as source */ From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:13 -0300 Subject: [PATCH 1213/1322] sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_sched.h | 72 +++++++++ include/net/sctp/structs.h | 15 +- include/uapi/linux/sctp.h | 6 + net/sctp/Makefile | 2 +- net/sctp/outqueue.c | 59 +++---- net/sctp/sm_sideeffect.c | 3 + net/sctp/stream.c | 88 ++++++++++- net/sctp/stream_sched.c | 270 ++++++++++++++++++++++++++++++++ 8 files changed, 477 insertions(+), 38 deletions(-) create mode 100644 include/net/sctp/stream_sched.h create mode 100644 net/sctp/stream_sched.c diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h new file mode 100644 index 000000000000..c676550a4c7d --- /dev/null +++ b/include/net/sctp/stream_sched.h @@ -0,0 +1,72 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 
2017 + * + * These are definitions used by the stream schedulers, defined in RFC + * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11) + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresses: + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#ifndef __sctp_stream_sched_h__ +#define __sctp_stream_sched_h__ + +struct sctp_sched_ops { + /* Property handling for a given stream */ + int (*set)(struct sctp_stream *stream, __u16 sid, __u16 value, + gfp_t gfp); + int (*get)(struct sctp_stream *stream, __u16 sid, __u16 *value); + + /* Init the specific scheduler */ + int (*init)(struct sctp_stream *stream); + /* Init a stream */ + int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); + /* Frees the entire thing */ + void (*free)(struct sctp_stream *stream); + + /* Enqueue a chunk */ + void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); + /* Dequeue a chunk */ + struct sctp_chunk *(*dequeue)(struct sctp_outq *q); + /* Called only if the chunk fit the packet */ + void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk); + /* Sched all chunks already enqueued */ + void (*sched_all)(struct sctp_stream *steam); + /* Unched all chunks already enqueued */ + void (*unsched_all)(struct sctp_stream *steam); +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched); +int sctp_sched_get_sched(struct sctp_association *asoc); +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp); +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value); +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); + +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); + +#endif /* __sctp_stream_sched_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c48f7999fe9b..3c22a30fd71b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,7 +84,6 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; -struct sctp_stream_out; #include @@ -531,8 +530,12 @@ struct sctp_chunk { /* How many times this chunk have been sent, for prsctp RTX policy */ int sent_count; - /* This is our link to the per-transport transmitted list. */ - struct list_head transmitted_list; + union { + /* This is our link to the per-transport transmitted list. */ + struct list_head transmitted_list; + /* List in specific stream outq */ + struct list_head stream_list; + }; /* This field is used by chunks that hold fragmented data. 
* For the first fragment this is the list that holds the rest of @@ -1019,6 +1022,9 @@ struct sctp_outq { /* Data pending that has never been transmitted. */ struct list_head out_chunk_list; + /* Stream scheduler being used */ + struct sctp_sched_ops *sched; + unsigned int out_qlen; /* Total length of queued data chunks. */ /* Error of send failed, may used in SCTP_SEND_FAILED event. */ @@ -1325,6 +1331,7 @@ struct sctp_inithdr_host { struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct list_head outq; /* chunks enqueued by this stream */ }; struct sctp_stream_out { @@ -1342,6 +1349,8 @@ struct sctp_stream { struct sctp_stream_in *in; __u16 outcnt; __u16 incnt; + /* Current stream being sent, if any */ + struct sctp_stream_out *out_curr; }; #define SCTP_STREAM_CLOSED 0x00 diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6217ff8500a1..4487e7625ddb 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1088,4 +1088,10 @@ struct sctp_add_streams { uint16_t sas_outstrms; }; +/* SCTP Stream schedulers */ +enum sctp_sched_type { + SCTP_SS_FCFS, + SCTP_SS_MAX = SCTP_SS_FCFS +}; + #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 70f1b570bab9..0f6e6d1d69fd 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o + offload.o stream_sched.o sctp_probe-y := probe.o diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 746b07b7937d..4db012aa25f7 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -50,6 +50,7 @@ #include #include +#include /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, - struct sctp_chunk *ch) + struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch = NULL; - - if (!list_empty(&q->out_chunk_list)) { - struct list_head *entry = q->out_chunk_list.next; - - ch = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); - q->out_qlen -= ch->skb->len; - } - return ch; + return q->sched->dequeue(q); } + /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add_tail(&ch->stream_list, &oute->outq); } /* @@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); + sctp_sched_set_sched(asoc, SCTP_SS_FCFS); } /* Free the outqueue structure and any related pending chunks. 
@@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q) /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); @@ -391,13 +400,14 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; + q->sched->unsched_all(&asoc->stream); + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; - list_del_init(&chk->list); - q->out_qlen -= chk->skb->len; + sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { @@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, break; } + q->sched->sched_all(&asoc->stream); + return msg_len; } @@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); - /* RFC 2960 6.5 Every DATA chunk MUST carry a valid - * stream identifier. - */ - if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) { - - /* Mark as failed send. */ - sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); - if (asoc->peer.prsctp_capable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - asoc->sent_cnt_removable--; - sctp_chunk_free(chunk); - continue; - } - /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) else asoc->stats.oodchunks++; + /* Only now it's safe to consider this + * chunk as sent, sched-wise. + */ + sctp_sched_dequeue_done(q, chunk); + break; default: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e6a2974e020e..402bfbb888cd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -50,6 +50,7 @@ #include #include #include +#include static int sctp_cmd_interpreter(enum sctp_event event_type, union sctp_subtype subtype, @@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); + + asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 055ca25bbc91..5ea33a2c453b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -32,8 +32,61 @@ * Xin Long */ +#include #include #include +#include + +/* Migrates chunks from stream queues to new stream queues if needed, + * but not across associations. Also, removes those chunks to streams + * higher than the new max. 
+ */ +static void sctp_stream_outq_migrate(struct sctp_stream *stream, + struct sctp_stream *new, __u16 outcnt) +{ + struct sctp_association *asoc; + struct sctp_chunk *ch, *temp; + struct sctp_outq *outq; + int i; + + asoc = container_of(stream, struct sctp_association, stream); + outq = &asoc->outqueue; + + list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { + __u16 sid = sctp_chunk_stream_no(ch); + + if (sid < outcnt) + continue; + + sctp_sched_dequeue_common(outq, ch); + /* No need to call dequeue_done here because + * the chunks are not scheduled by now. + */ + + /* Mark as failed send. */ + sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; + + sctp_chunk_free(ch); + } + + if (new) { + /* Here we actually move the old ext stuff into the new + * buffer, because we want to keep it. Then + * sctp_stream_update will swap ->out pointers. + */ + for (i = 0; i < outcnt; i++) { + kfree(new->out[i].ext); + new->out[i].ext = stream->out[i].ext; + stream->out[i].ext = NULL; + } + } + + for (i = outcnt; i < stream->outcnt; i++) + kfree(stream->out[i].ext); +} static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) @@ -87,7 +140,8 @@ static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - int i; + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i, ret = 0; gfp |= __GFP_NOWARN; @@ -97,6 +151,11 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, if (outcnt == stream->outcnt) goto in; + /* Filter out chunks queued on streams that won't exist anymore */ + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, NULL, outcnt); + sched->sched_all(stream); + i = sctp_stream_alloc_out(stream, outcnt, gfp); if (i) return i; @@ -105,20 +164,27 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; + sched->init(stream); + in: if (!incnt) - return 0; + goto out; i = sctp_stream_alloc_in(stream, incnt, gfp); if (i) { - kfree(stream->out); - stream->out = NULL; - return -ENOMEM; + ret = -ENOMEM; + goto free; } stream->incnt = incnt; + goto out; - return 0; +free: + sched->free(stream); + kfree(stream->out); + stream->out = NULL; +out: + return ret; } int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) @@ -130,13 +196,15 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) return -ENOMEM; stream->out[sid].ext = soute; - return 0; + return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } void sctp_stream_free(struct sctp_stream *stream) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; + sched->free(stream); for (i = 0; i < stream->outcnt; i++) kfree(stream->out[i].ext); kfree(stream->out); @@ -156,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; @@ -163,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) stream->outcnt = new->outcnt; stream->incnt = new->incnt; + sched->sched_all(stream); + new->out = NULL; new->in = 
NULL; } diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c new file mode 100644 index 000000000000..40a9a9de2b98 --- /dev/null +++ b/net/sctp/stream_sched.c @@ -0,0 +1,270 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* First Come First Serve (a.k.a. FIFO) + * RFC DRAFT ndata Section 3.1 + */ +static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, + __u16 value, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = 0; + return 0; +} + +static int sctp_sched_fcfs_init(struct sctp_stream *stream) +{ + return 0; +} + +static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + return 0; +} + +static void sctp_sched_fcfs_free(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ +} + +static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_chunk *ch = NULL; + struct list_head *entry; + + if (list_empty(&q->out_chunk_list)) + goto out; + + if (stream->out_curr) { + ch = list_entry(stream->out_curr->ext->outq.next, + struct sctp_chunk, stream_list); + } else { + entry = q->out_chunk_list.next; + ch = list_entry(entry, struct sctp_chunk, list); + } + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *chunk) +{ +} + +static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) +{ +} + +static struct sctp_sched_ops sctp_sched_fcfs = { + .set = sctp_sched_fcfs_set, + .get = sctp_sched_fcfs_get, + .init = sctp_sched_fcfs_init, + .init_sid = sctp_sched_fcfs_init_sid, + .free = sctp_sched_fcfs_free, + .enqueue = sctp_sched_fcfs_enqueue, + .dequeue = sctp_sched_fcfs_dequeue, + .dequeue_done = sctp_sched_fcfs_dequeue_done, + .sched_all = sctp_sched_fcfs_sched_all, + .unsched_all = sctp_sched_fcfs_unsched_all, +}; + +/* API to other parts of the stack */ + +struct sctp_sched_ops *sctp_sched_ops[] = { + &sctp_sched_fcfs, +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched) +{ + struct sctp_sched_ops *n = sctp_sched_ops[sched]; + struct sctp_sched_ops *old = asoc->outqueue.sched; + struct sctp_datamsg *msg = NULL; + struct sctp_chunk *ch; + int i, ret = 0; + + if (old 
== n) + return ret; + + if (sched > SCTP_SS_MAX) + return -EINVAL; + + if (old) { + old->free(&asoc->stream); + + /* Give the next scheduler a clean slate. */ + for (i = 0; i < asoc->stream.outcnt; i++) { + void *p = asoc->stream.out[i].ext; + + if (!p) + continue; + + p += offsetofend(struct sctp_stream_out_ext, outq); + memset(p, 0, sizeof(struct sctp_stream_out_ext) - + offsetofend(struct sctp_stream_out_ext, outq)); + } + } + + asoc->outqueue.sched = n; + n->init(&asoc->stream); + for (i = 0; i < asoc->stream.outcnt; i++) { + if (!asoc->stream.out[i].ext) + continue; + + ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + if (ret) + goto err; + } + + /* We have to requeue all chunks already queued. */ + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + if (ch->msg == msg) + continue; + msg = ch->msg; + n->enqueue(&asoc->outqueue, msg); + } + + return ret; + +err: + n->free(&asoc->stream); + asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ + + return ret; +} + +int sctp_sched_get_sched(struct sctp_association *asoc) +{ + int i; + + for (i = 0; i <= SCTP_SS_MAX; i++) + if (asoc->outqueue.sched == sctp_sched_ops[i]) + return i; + + return 0; +} + +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) { + int ret; + + ret = sctp_stream_init_ext(&asoc->stream, sid); + if (ret) + return ret; + } + + return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); +} + +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) + return 0; + + return asoc->outqueue.sched->get(&asoc->stream, sid, value); +} + +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) +{ + if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + struct sctp_stream_out *sout; + __u16 sid; + + /* datamsg is not finish, so save it as current one, + * in case application switch scheduler or a higher + * priority stream comes in. + */ + sid = sctp_chunk_stream_no(ch); + sout = &q->asoc->stream.out[sid]; + q->asoc->stream.out_curr = sout; + return; + } + + q->asoc->stream.out_curr = NULL; + q->sched->dequeue_done(q, ch); +} + +/* Auxiliary functions for the schedulers */ +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) +{ + list_del_init(&ch->list); + list_del_init(&ch->stream_list); + q->out_qlen -= ch->skb->len; +} + +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + INIT_LIST_HEAD(&stream->out[sid].ext->outq); + return sched->init_sid(stream, sid, gfp); +} + +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + + return asoc->outqueue.sched; +} From 13aa8770fe42d246c6f3a8eb814b85bccb428011 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:14 -0300 Subject: [PATCH 1214/1322] sctp: add sockopt to get/set stream scheduler As defined per RFC Draft ndata Section 4.3.2, named as SCTP_STREAM_SCHEDULER. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 75 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 4487e7625ddb..0050f10087d2 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -122,6 +122,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RESET_ASSOC 120 #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 +#define SCTP_STREAM_SCHEDULER 123 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d207734326b0..ae35dbf2810f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -79,6 +79,7 @@ #include #include #include +#include /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); @@ -3914,6 +3915,36 @@ out: return retval; } +static int sctp_setsockopt_scheduler(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_SS_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_sched(asoc, params.assoc_value); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4095,6 +4126,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_setsockopt_scheduler(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6793,6 +6827,43 @@ out: return retval; } +static int sctp_getsockopt_scheduler(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = sctp_sched_get_sched(asoc); + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6975,6 +7046,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_getsockopt_scheduler(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; From 0ccdf3c7fdeda511b10def19505178a9d2d3fccd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:15 -0300 Subject: [PATCH 1215/1322] sctp: add sockopt to get/set stream scheduler parameters As defined per RFC Draft ndata Section 4.3.3, named as SCTP_STREAM_SCHEDULER_VALUE. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- include/uapi/linux/sctp.h | 7 ++++ net/sctp/socket.c | 77 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 0050f10087d2..00ac417d2c4f 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -123,6 +123,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_ADD_STREAMS 121 #define SCTP_SOCKOPT_PEELOFF_FLAGS 122 #define SCTP_STREAM_SCHEDULER 123 +#define SCTP_STREAM_SCHEDULER_VALUE 124 /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 @@ -815,6 +816,12 @@ struct sctp_assoc_value { uint32_t assoc_value; }; +struct sctp_stream_value { + sctp_assoc_t assoc_id; + uint16_t stream_id; + uint16_t stream_value; +}; + /* * 7.2.2 Peer Address Information * diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ae35dbf2810f..88c28421ec15 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3945,6 +3945,34 @@ out: return retval; } +static int sctp_setsockopt_scheduler_value(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_stream_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_value(asoc, params.stream_id, + params.stream_value, GFP_KERNEL); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4129,6 +4157,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_STREAM_SCHEDULER: retval = sctp_setsockopt_scheduler(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6864,6 +6895,48 @@ out: return retval; } +static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + retval = sctp_sched_get_value(asoc, params.stream_id, + ¶ms.stream_value); + if (retval) + goto out; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7050,6 +7123,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_scheduler(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_getsockopt_scheduler_value(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:16 -0300 Subject: [PATCH 1216/1322] sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. 
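As an illustration only (not part of this patch), a minimal userspace sketch of driving this scheduler through the SCTP_STREAM_SCHEDULER and SCTP_STREAM_SCHEDULER_VALUE socket options added earlier in this series could look like the following. It assumes headers that already expose these constants and the sctp_assoc_value/sctp_stream_value structures; the socket, association id and priority values are arbitrary examples, and a lower stream_value means the stream is served earlier:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Hypothetical helper: switch one association to the priority scheduler
 * and favour stream 0 over stream 1 (a lower value is served first).
 */
static int use_prio_scheduler(int sd, sctp_assoc_t assoc_id)
{
	struct sctp_assoc_value av;
	struct sctp_stream_value sv;

	memset(&av, 0, sizeof(av));
	av.assoc_id = assoc_id;
	av.assoc_value = SCTP_SS_PRIO;
	if (setsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER,
		       &av, sizeof(av)) < 0) {
		perror("SCTP_STREAM_SCHEDULER");
		return -1;
	}

	memset(&sv, 0, sizeof(sv));
	sv.assoc_id = assoc_id;
	sv.stream_id = 0;
	sv.stream_value = 0;		/* highest precedence */
	if (setsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER_VALUE,
		       &sv, sizeof(sv)) < 0) {
		perror("SCTP_STREAM_SCHEDULER_VALUE");
		return -1;
	}

	sv.stream_id = 1;
	sv.stream_value = 10;		/* served after stream 0 */
	return setsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER_VALUE,
			  &sv, sizeof(sv));
}

Inside the kernel, the bookkeeping for each configured priority lives in struct sctp_stream_priorities.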
This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 24 +++ include/uapi/linux/sctp.h | 3 +- net/sctp/Makefile | 2 +- net/sctp/stream_sched.c | 3 + net/sctp/stream_sched_prio.c | 347 +++++++++++++++++++++++++++++++++++ 5 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 net/sctp/stream_sched_prio.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 3c22a30fd71b..40eb8d66a37c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1328,10 +1328,27 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_priorities { + /* List of priorities scheduled */ + struct list_head prio_sched; + /* List of streams scheduled */ + struct list_head active; + /* The next stream stream in line */ + struct sctp_stream_out_ext *next; + __u16 prio; +}; + struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; struct list_head outq; /* chunks enqueued by this stream */ + union { + struct { + /* Scheduled streams list */ + struct list_head prio_list; + struct sctp_stream_priorities *prio_head; + }; + }; }; struct sctp_stream_out { @@ -1351,6 +1368,13 @@ struct sctp_stream { __u16 incnt; /* Current stream being sent, if any */ struct sctp_stream_out *out_curr; + union { + /* Fields used by priority scheduler */ + struct { + /* List of priorities scheduled */ + struct list_head prio_list; + }; + }; }; #define SCTP_STREAM_CLOSED 0x00 diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 00ac417d2c4f..850fa8b29d7e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1099,7 +1099,8 @@ struct sctp_add_streams { /* SCTP Stream schedulers */ enum sctp_sched_type { SCTP_SS_FCFS, - SCTP_SS_MAX = SCTP_SS_FCFS + SCTP_SS_PRIO, + SCTP_SS_MAX = SCTP_SS_PRIO }; #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0f6e6d1d69fd..647c9cfd4e95 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o stream_sched.o + 
offload.o stream_sched.o stream_sched_prio.o sctp_probe-y := probe.o diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 40a9a9de2b98..115ddb765169 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -121,8 +121,11 @@ static struct sctp_sched_ops sctp_sched_fcfs = { /* API to other parts of the stack */ +extern struct sctp_sched_ops sctp_sched_prio; + struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, + &sctp_sched_prio, }; int sctp_sched_set_sched(struct sctp_association *asoc, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c new file mode 100644 index 000000000000..384dbf3c8760 --- /dev/null +++ b/net/sctp/stream_sched_prio.c @@ -0,0 +1,347 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* Priority handling + * RFC DRAFT ndata section 3.4 + */ + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); + +static struct sctp_stream_priorities *sctp_sched_prio_new_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + + p = kmalloc(sizeof(*p), gfp); + if (!p) + return NULL; + + INIT_LIST_HEAD(&p->prio_sched); + INIT_LIST_HEAD(&p->active); + p->next = NULL; + p->prio = prio; + + return p; +} + +static struct sctp_stream_priorities *sctp_sched_prio_get_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + int i; + + /* Look into scheduled priorities first, as they are sorted and + * we can find it fast IF it's scheduled. + */ + list_for_each_entry(p, &stream->prio_list, prio_sched) { + if (p->prio == prio) + return p; + if (p->prio > prio) + break; + } + + /* No luck. So we search on all streams now. */ + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + + p = stream->out[i].ext->prio_head; + if (!p) + /* Means all other streams won't be initialized + * as well. + */ + break; + if (p->prio == prio) + return p; + } + + /* If not even there, allocate a new one. 
*/ + return sctp_sched_prio_new_head(stream, prio, gfp); +} + +static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) +{ + struct list_head *pos; + + pos = p->next->prio_list.next; + if (pos == &p->active) + pos = pos->next; + p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); +} + +static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) +{ + bool scheduled = false; + + if (!list_empty(&soute->prio_list)) { + struct sctp_stream_priorities *prio_head = soute->prio_head; + + /* Scheduled */ + scheduled = true; + + if (prio_head->next == soute) + /* Try to move to the next stream */ + sctp_sched_prio_next_stream(prio_head); + + list_del_init(&soute->prio_list); + + /* Also unsched the priority if this was the last stream */ + if (list_empty(&prio_head->active)) { + list_del_init(&prio_head->prio_sched); + /* If there is no stream left, clear next */ + prio_head->next = NULL; + } + } + + return scheduled; +} + +static void sctp_sched_prio_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + struct sctp_stream_priorities *prio, *prio_head; + + prio_head = soute->prio_head; + + /* Nothing to do if already scheduled */ + if (!list_empty(&soute->prio_list)) + return; + + /* Schedule the stream. If there is a next, we schedule the new + * one before it, so it's the last in round robin order. + * If there isn't, we also have to schedule the priority. + */ + if (prio_head->next) { + list_add(&soute->prio_list, prio_head->next->prio_list.prev); + return; + } + + list_add(&soute->prio_list, &prio_head->active); + prio_head->next = soute; + + list_for_each_entry(prio, &stream->prio_list, prio_sched) { + if (prio->prio > prio_head->prio) { + list_add(&prio_head->prio_sched, prio->prio_sched.prev); + return; + } + } + + list_add_tail(&prio_head->prio_sched, &stream->prio_list); +} + +static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out_ext *soute = sout->ext; + struct sctp_stream_priorities *prio_head, *old; + bool reschedule = false; + int i; + + prio_head = sctp_sched_prio_get_head(stream, prio, gfp); + if (!prio_head) + return -ENOMEM; + + reschedule = sctp_sched_prio_unsched(soute); + old = soute->prio_head; + soute->prio_head = prio_head; + if (reschedule) + sctp_sched_prio_sched(stream, soute); + + if (!old) + /* Happens when we set the priority for the first time */ + return 0; + + for (i = 0; i < stream->outcnt; i++) { + soute = stream->out[i].ext; + if (soute && soute->prio_head == old) + /* It's still in use, nothing else to do here. */ + return 0; + } + + /* No hits, we are good to free it. */ + kfree(old); + + return 0; +} + +static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = stream->out[sid].ext->prio_head->prio; + return 0; +} + +static int sctp_sched_prio_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->prio_list); + + return 0; +} + +static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + return sctp_sched_prio_set(stream, sid, 0, gfp); +} + +static void sctp_sched_prio_free(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *prio, *n; + LIST_HEAD(list); + int i; + + /* As we don't keep a list of priorities, to avoid multiple + * frees we have to do it in 3 steps: + * 1. unsched everyone, so the lists are free to use in 2. + * 2. 
build the list of the priorities + * 3. free the list + */ + sctp_sched_prio_unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + prio = stream->out[i].ext->prio_head; + if (prio && list_empty(&prio->prio_sched)) + list_add(&prio->prio_sched, &list); + } + list_for_each_entry_safe(prio, n, &list, prio_sched) { + list_del_init(&prio->prio_sched); + kfree(prio); + } +} + +static void sctp_sched_prio_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_prio_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next. It's easy, it's either the current + * one or the first chunk on the next active stream. + */ + if (stream->out_curr) { + soute = stream->out_curr->ext; + } else { + prio = list_entry(stream->prio_list.next, + struct sctp_stream_priorities, prio_sched); + soute = prio->next; + } + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream on + * this priority. + */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + prio = soute->prio_head; + + sctp_sched_prio_next_stream(prio); + + if (list_empty(&soute->outq)) + sctp_sched_prio_unsched(soute); +} + +static void sctp_sched_prio_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out *sout; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + sout = &stream->out[sid]; + if (sout->ext) + sctp_sched_prio_sched(stream, sout->ext); + } +} + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *p, *tmp; + struct sctp_stream_out_ext *soute, *souttmp; + + list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) + list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) + sctp_sched_prio_unsched(soute); +} + +struct sctp_sched_ops sctp_sched_prio = { + .set = sctp_sched_prio_set, + .get = sctp_sched_prio_get, + .init = sctp_sched_prio_init, + .init_sid = sctp_sched_prio_init_sid, + .free = sctp_sched_prio_free, + .enqueue = sctp_sched_prio_enqueue, + .dequeue = sctp_sched_prio_dequeue, + .dequeue_done = sctp_sched_prio_dequeue_done, + .sched_all = sctp_sched_prio_sched_all, + .unsched_all = sctp_sched_prio_unsched_all, +}; From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:17 -0300 Subject: [PATCH 1217/1322] sctp: introduce round robin stream scheduler This patch introduces RFC Draft ndata section 3.2 Priority Based Scheduler (SCTP_SS_RR). 
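Purely as a hedged usage sketch (again not part of the patch), switching an existing association to this round-robin scheduler and reading the setting back could look like this, assuming the same headers and constants as in the earlier sketch:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

/* Hypothetical helper: request round-robin scheduling and confirm it. */
static int use_rr_scheduler(int sd, sctp_assoc_t assoc_id)
{
	struct sctp_assoc_value av = {
		.assoc_id	= assoc_id,
		.assoc_value	= SCTP_SS_RR,
	};
	socklen_t len = sizeof(av);

	if (setsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER,
		       &av, sizeof(av)) < 0)
		return -1;

	/* assoc_value is expected to report SCTP_SS_RR afterwards. */
	if (getsockopt(sd, IPPROTO_SCTP, SCTP_STREAM_SCHEDULER,
		       &av, &len) < 0)
		return -1;

	return av.assoc_value == SCTP_SS_RR ? 0 : -1;
}

On the kernel side the scheduler itself stays small.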
Works by maintaining a list of enqueued streams and tracking the last one used to send data. When the datamsg is done, it switches to the next stream. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 11 ++ include/uapi/linux/sctp.h | 3 +- net/sctp/Makefile | 3 +- net/sctp/stream_sched.c | 2 + net/sctp/stream_sched_rr.c | 201 +++++++++++++++++++++++++++++++++++++ 5 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 net/sctp/stream_sched_rr.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 40eb8d66a37c..16f949eef52f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1348,6 +1348,10 @@ struct sctp_stream_out_ext { struct list_head prio_list; struct sctp_stream_priorities *prio_head; }; + /* Fields used by RR scheduler */ + struct { + struct list_head rr_list; + }; }; }; @@ -1374,6 +1378,13 @@ struct sctp_stream { /* List of priorities scheduled */ struct list_head prio_list; }; + /* Fields used by RR scheduler */ + struct { + /* List of streams scheduled */ + struct list_head rr_list; + /* The next stream stream in line */ + struct sctp_stream_out_ext *rr_next; + }; }; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 850fa8b29d7e..6cd7d416ca40 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -1100,7 +1100,8 @@ struct sctp_add_streams { enum sctp_sched_type { SCTP_SS_FCFS, SCTP_SS_PRIO, - SCTP_SS_MAX = SCTP_SS_PRIO + SCTP_SS_RR, + SCTP_SS_MAX = SCTP_SS_RR }; #endif /* _UAPI_SCTP_H */ diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 647c9cfd4e95..bf90c5397719 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o stream_sched.o stream_sched_prio.o + offload.o stream_sched.o stream_sched_prio.o \ + stream_sched_rr.o sctp_probe-y := probe.o diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 115ddb765169..03513a9fa110 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -122,10 +122,12 @@ static struct sctp_sched_ops sctp_sched_fcfs = { /* API to other parts of the stack */ extern struct sctp_sched_ops sctp_sched_prio; +extern struct sctp_sched_ops sctp_sched_rr; struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, &sctp_sched_prio, + &sctp_sched_rr, }; int sctp_sched_set_sched(struct sctp_association *asoc, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c new file mode 100644 index 000000000000..7612a438c5b9 --- /dev/null +++ b/net/sctp/stream_sched_rr.c @@ -0,0 +1,201 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#include +#include +#include +#include + +/* Priority handling + * RFC DRAFT ndata section 3.2 + */ +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream); + +static void sctp_sched_rr_next_stream(struct sctp_stream *stream) +{ + struct list_head *pos; + + pos = stream->rr_next->rr_list.next; + if (pos == &stream->rr_list) + pos = pos->next; + stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list); +} + +static void sctp_sched_rr_unsched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (stream->rr_next == soute) + /* Try to move to the next stream */ + sctp_sched_rr_next_stream(stream); + + list_del_init(&soute->rr_list); + + /* If we have no other stream queued, clear next */ + if (list_empty(&stream->rr_list)) + stream->rr_next = NULL; +} + +static void sctp_sched_rr_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (!list_empty(&soute->rr_list)) + /* Already scheduled. */ + return; + + /* Schedule the stream */ + list_add_tail(&soute->rr_list, &stream->rr_list); + + if (!stream->rr_next) + stream->rr_next = soute; +} + +static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + return 0; +} + +static int sctp_sched_rr_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->rr_list); + stream->rr_next = NULL; + + return 0; +} + +static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + + return 0; +} + +static void sctp_sched_rr_free(struct sctp_stream *stream) +{ + sctp_sched_rr_unsched_all(stream); +} + +static void sctp_sched_rr_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_rr_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next */ + if (stream->out_curr) + soute = stream->out_curr->ext; + else + soute = stream->rr_next; + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + + sctp_sched_rr_next_stream(&q->asoc->stream); + + if (list_empty(&soute->outq)) + sctp_sched_rr_unsched(&q->asoc->stream, soute); +} + +static void sctp_sched_rr_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out_ext *soute; 
+ struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + soute = stream->out[sid].ext; + if (soute) + sctp_sched_rr_sched(stream, soute); + } +} + +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_out_ext *soute, *tmp; + + list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list) + sctp_sched_rr_unsched(stream, soute); +} + +struct sctp_sched_ops sctp_sched_rr = { + .set = sctp_sched_rr_set, + .get = sctp_sched_rr_get, + .init = sctp_sched_rr_init, + .init_sid = sctp_sched_rr_init_sid, + .free = sctp_sched_rr_free, + .enqueue = sctp_sched_rr_enqueue, + .dequeue = sctp_sched_rr_dequeue, + .dequeue_done = sctp_sched_rr_dequeue_done, + .sched_all = sctp_sched_rr_sched_all, + .unsched_all = sctp_sched_rr_unsched_all, +}; From 8ee912dab95f1483156b6e994004bfcc3158d798 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:14:15 -0700 Subject: [PATCH 1218/1322] alpha: fix build failures The build of alpha allmodconfig is giving error: arch/alpha/include/asm/mmu_context.h: In function 'ev5_switch_mm': arch/alpha/include/asm/mmu_context.h:160:2: error: implicit declaration of function 'task_thread_info'; did you mean 'init_thread_info'? [-Werror=implicit-function-declaration] The file 'mmu_context.h' needed an extra header file. Link: http://lkml.kernel.org/r/1505668810-7497-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/mmu_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h index 384bd47b5187..45c020a0fe76 100644 --- a/arch/alpha/include/asm/mmu_context.h +++ b/arch/alpha/include/asm/mmu_context.h @@ -8,6 +8,7 @@ */ #include +#include #include #include From 630cc2b30a42c70628368a412beb4a5e5dd71abe Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:14:18 -0700 Subject: [PATCH 1219/1322] kernel/params.c: align add_sysfs_param documentation with code This parameter is named kp, so the documentation should use that. Fixes: 9b473de87209 ("param: Fix duplicate module prefixes") Link: http://lkml.kernel.org/r/20170919142656.64aea59e@endymion Signed-off-by: Jean Delvare Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..1cd8f1a895a8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs + * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and From e00e5a26e3d37b47ce8ca59611db1e6135790ef8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 16:14:21 -0700 Subject: [PATCH 1220/1322] scripts/spelling.txt: add more spelling mistakes to spelling.txt Here are some of the more spelling mistakes and typos that I've found while fixing up spelling mistakes in kernel error message text over the past eight weeks. 
[akpm@linux-foundation.org: s/|/||/, per Joe] Link: http://lkml.kernel.org/r/20170919090818.5989-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: Masahiro Yamada Cc: Stephen Boyd Cc: Joe Perches Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 400ef35169c5..aa0cc49ad1ad 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -53,6 +53,7 @@ acumulator||accumulator adapater||adapter addional||additional additionaly||additionally +additonal||additional addres||address adddress||address addreses||addresses @@ -67,6 +68,8 @@ adviced||advised afecting||affecting againt||against agaist||against +aggreataon||aggregation +aggreation||aggregation albumns||albums alegorical||allegorical algined||aligned @@ -80,6 +83,8 @@ aligment||alignment alignement||alignment allign||align alligned||aligned +alllocate||allocate +alloated||allocated allocatote||allocate allocatrd||allocated allocte||allocate @@ -171,6 +176,7 @@ availale||available availavility||availability availble||available availiable||available +availible||available avalable||available avaliable||available aysnc||async @@ -203,6 +209,7 @@ broadcat||broadcast cacluated||calculated caculation||calculation calender||calendar +calescing||coalescing calle||called callibration||calibration calucate||calculate @@ -210,6 +217,7 @@ calulate||calculate cancelation||cancellation cancle||cancel capabilites||capabilities +capabilty||capability capabitilies||capabilities capatibilities||capabilities capapbilities||capabilities @@ -302,6 +310,7 @@ containts||contains contaisn||contains contant||contact contence||contents +continious||continuous continous||continuous continously||continuously continueing||continuing @@ -393,6 +402,7 @@ differrence||difference diffrent||different diffrentiate||differentiate difinition||definition +dimesions||dimensions diplay||display direectly||directly disassocation||disassociation @@ -449,6 +459,7 @@ equiped||equipped equivelant||equivalent equivilant||equivalent eror||error +errorr||error estbalishment||establishment etsablishment||establishment etsbalishment||establishment @@ -481,6 +492,7 @@ failied||failed faillure||failure failue||failure failuer||failure +failng||failing faireness||fairness falied||failed faliure||failure @@ -493,6 +505,7 @@ fetaure||feature fetaures||features fileystem||filesystem fimware||firmware +firware||firmware finanize||finalize findn||find finilizes||finalizes @@ -502,6 +515,7 @@ folloing||following followign||following followings||following follwing||following +fonud||found forseeable||foreseeable forse||force fortan||fortran @@ -532,6 +546,7 @@ grabing||grabbing grahical||graphical grahpical||graphical grapic||graphic +grranted||granted guage||gauge guarenteed||guaranteed guarentee||guarantee @@ -543,6 +558,7 @@ happend||happened harware||hardware heirarchically||hierarchically helpfull||helpful +hybernate||hibernate hierachy||hierarchy hierarchie||hierarchy howver||however @@ -565,16 +581,19 @@ implemenation||implementation implementaiton||implementation implementated||implemented implemention||implementation +implementd||implemented implemetation||implementation implemntation||implementation implentation||implementation implmentation||implementation implmenting||implementing +incative||inactive incomming||incoming incompatabilities||incompatibilities 
incompatable||incompatible inconsistant||inconsistent increas||increase +incremeted||incremented incrment||increment indendation||indentation indended||intended @@ -619,6 +638,7 @@ interger||integer intermittant||intermittent internel||internal interoprability||interoperability +interuupt||interrupt interrface||interface interrrupt||interrupt interrup||interrupt @@ -638,8 +658,10 @@ intrrupt||interrupt intterrupt||interrupt intuative||intuitive invaid||invalid +invald||invalid invalde||invalid invalide||invalid +invalidiate||invalidate invalud||invalid invididual||individual invokation||invocation @@ -713,6 +735,7 @@ misformed||malformed mispelled||misspelled mispelt||misspelt mising||missing +mismactch||mismatch missmanaged||mismanaged missmatch||mismatch miximum||maximum @@ -731,6 +754,7 @@ multidimensionnal||multidimensional multple||multiple mumber||number muticast||multicast +mutilcast||multicast mutiple||multiple mutli||multi nams||names @@ -834,6 +858,7 @@ posible||possible positon||position possibilites||possibilities powerfull||powerful +preample||preamble preapre||prepare preceeded||preceded preceeding||preceding @@ -1059,6 +1084,7 @@ sturcture||structure subdirectoires||subdirectories suble||subtle substract||subtract +submition||submission succesfully||successfully succesful||successful successed||succeeded @@ -1078,6 +1104,7 @@ suppoted||supported suppported||supported suppport||support supress||suppress +surpressed||suppressed surpresses||suppresses susbsystem||subsystem suspeneded||suspended @@ -1091,6 +1118,7 @@ swithced||switched swithcing||switching swithed||switched swithing||switching +swtich||switch symetric||symmetric synax||syntax synchonized||synchronized @@ -1111,7 +1139,9 @@ therfore||therefore thier||their threds||threads threshhold||threshold +thresold||threshold throught||through +troughput||throughput thses||these tiggered||triggered tipically||typically @@ -1120,6 +1150,7 @@ tmis||this torerable||tolerable tramsmitted||transmitted tramsmit||transmit +tranasction||transaction tranfer||transfer transciever||transceiver transferd||transferred @@ -1133,6 +1164,7 @@ trasmission||transmission treshold||threshold trigerring||triggering trun||turn +tunning||tuning ture||true tyep||type udpate||update @@ -1199,6 +1231,7 @@ visiters||visitors vitual||virtual wakeus||wakeups wating||waiting +wiat||wait wether||whether whataver||whatever whcih||which From fa87b91c94a8fd2c8502b6761be2d08a8e9bcf55 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 3 Oct 2017 16:14:24 -0700 Subject: [PATCH 1221/1322] include/linux/mm.h: fix typo in VM_MPX definition There's a typo in recent change of VM_MPX definition. We want it to be VM_HIGH_ARCH_4, not VM_HIGH_ARCH_BIT_4. This bug does cause visible regressions. In arch_vma_name the vmflags are tested against VM_MPX. With the incorrect value of VM_MPX, a number of vmas (such as the stack) test positive and end up being marked as "[mpx]" in /proc/N/maps instead of their correct names. This confuses tools like rr which expect to be able to find familiar vmas. Fixes: df3735c5b40f ("x86,mpx: make mpx depend on x86-64 to free up VMA flag") Link: http://lkml.kernel.org/r/20170918140253.36856-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Reviewed-by: Rik van Riel Cc: Dave Hansen Cc: Kyle Huey Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f8c10d336e42..065d99deb847 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -240,7 +240,7 @@ extern unsigned int kobjsize(const void *objp); #if defined(CONFIG_X86_INTEL_MPX) /* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_HIGH_ARCH_BIT_4 +# define VM_MPX VM_HIGH_ARCH_4 #else # define VM_MPX VM_NONE #endif From 4b22927f0cbd58303aac689e378d20bf56267a39 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 3 Oct 2017 16:14:27 -0700 Subject: [PATCH 1222/1322] ksm: fix unlocked iteration over vmas in cmp_and_merge_page() In this place mm is unlocked, so vmas or list may change. Down read mmap_sem to protect them from modifications. Link: http://lkml.kernel.org/r/150512788393.10691.8868381099691121308.stgit@localhost.localdomain Fixes: e86c59b1b12d ("mm/ksm: improve deduplication of zero pages with colouring") Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Reviewed-by: Andrea Arcangeli Cc: Minchan Kim Cc: zhong jiang Cc: Ingo Molnar Cc: Claudio Imbrenda Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/ksm.c b/mm/ksm.c index 15dd7415f7b3..6cb60f46cce5 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1990,6 +1990,7 @@ static void stable_tree_append(struct rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) { + struct mm_struct *mm = rmap_item->mm; struct rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct stable_node *stable_node; @@ -2062,9 +2063,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) if (ksm_use_zero_pages && (checksum == zero_checksum)) { struct vm_area_struct *vma; - vma = find_mergeable_vma(rmap_item->mm, rmap_item->address); + down_read(&mm->mmap_sem); + vma = find_mergeable_vma(mm, rmap_item->address); err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); + up_read(&mm->mmap_sem); /* * In case of failure, the page was not really empty, so we * need to continue. Otherwise we're done. From 19bfbe22f59a207417b2679e7e83c180419c9ec5 Mon Sep 17 00:00:00 2001 From: Alexandru Moise <00moses.alexander00@gmail.com> Date: Tue, 3 Oct 2017 16:14:31 -0700 Subject: [PATCH 1223/1322] mm, hugetlb, soft_offline: save compound page order before page migration This fixes a bug in madvise() where if you'd try to soft offline a hugepage via madvise(), while walking the address range you'd end up, using the wrong page offset due to attempting to get the compound order of a former but presently not compound page, due to dissolving the huge page (since commit c3114a84f7f9: "mm: hugetlb: soft-offline: dissolve source hugepage after successful migration"). As a result I ended up with all my free pages except one being offlined. Link: http://lkml.kernel.org/r/20170912204306.GA12053@gmail.com Fixes: c3114a84f7f9 ("mm: hugetlb: soft-offline: dissolve source hugepage after successful migration") Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Anshuman Khandual Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Hillf Danton Cc: Shaohua Li Cc: Mike Rapoport Cc: "Kirill A. 
Shutemov" Cc: Mel Gorman Cc: David Rientjes Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 21261ff0466f..25bade36e9ca 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -625,18 +625,26 @@ static int madvise_inject_error(int behavior, { struct page *page; struct zone *zone; + unsigned int order; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += PAGE_SIZE << - compound_order(compound_head(page))) { + + for (; start < end; start += PAGE_SIZE << order) { int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; + /* + * When soft offlining hugepages, after migrating the page + * we dissolve it, therefore in the second loop "page" will + * no longer be a compound page, and order will be 0. + */ + order = compound_order(compound_head(page)); + if (PageHWPoison(page)) { put_page(page); continue; From b78412b8300a8453b78d2c1b0b925b66493bb011 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:34 -0700 Subject: [PATCH 1224/1322] sh: sh7722: remove nonexistent GPIO_PTQ7 to fix pinctrl registration Patch series "sh: sh7722/sh7757i/sh7264/sh7269: Fix pinctrl registration", v2. Magnus Damm reported that on sh7722/Migo-R, pinctrl registration fails with: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. Apparently GPIO_PTQ7 was defined in the enum, but never used. If enum values are defined, but never used, pinmux_pins[] contains (zero-filled) holes. Hence such entries are treated as pin zero, which was registered before, and pinctrl registration fails. I can't see how this ever worked, as at the time of commit f5e25ae52fef ("sh-pfc: Add sh7722 pinmux support"), pinmux_gpios[] in drivers/pinctrl/sh-pfc/pfc-sh7722.c already had the hole, and drivers/pinctrl/core.c already had the check. Some scripting revealed a few more broken drivers: - sh7757 has four holes, due to nonexistent GPIO_PT[JLNQ]7_RESV. - sh7264 and sh7269 define GPIO_PH[0-7], but don't use it with PINMUX_GPIO(). Patch 1 fixes the issue on sh7722, and was tested. Patches 3-4 should fix the issue on the other 3 SoCs, but was untested due to lack of hardware. This patch (of 4): On sh7722/Migo-R, pinctrl registration fails with: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. As GPIO_PTQ7 is defined in the enum, but never used, pinmux_pins[] contains a (zero-filled) hole. Hence this entry is treated as pin zero, which was registered before, and pinctrl registration fails. According to the datasheet, port PTQ7 does not exist. Hence remove GPIO_PTQ7 from the enum to fix this. 
Link: http://lkml.kernel.org/r/1505205657-18012-2-git-send-email-geert+renesas@glider.be Fixes: 8d7b5b0af7e070b9 ("sh: Add sh7722 pinmux code") Signed-off-by: Geert Uytterhoeven Reported-by: Magnus Damm Reviewed-by: Laurent Pinchart Tested-by: Jacopo Mondi Cc: Rich Felker Cc: Yoshihiro Shimoda Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh4/cpu/sh7722.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/include/cpu-sh4/cpu/sh7722.h b/arch/sh/include/cpu-sh4/cpu/sh7722.h index 3bb74e534d0f..78961ab78a5a 100644 --- a/arch/sh/include/cpu-sh4/cpu/sh7722.h +++ b/arch/sh/include/cpu-sh4/cpu/sh7722.h @@ -67,7 +67,7 @@ enum { GPIO_PTN3, GPIO_PTN2, GPIO_PTN1, GPIO_PTN0, /* PTQ */ - GPIO_PTQ7, GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4, + GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4, GPIO_PTQ3, GPIO_PTQ2, GPIO_PTQ1, GPIO_PTQ0, /* PTR */ From d8ce38f69843a56da044e56b6c16aecfbc3c6e39 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:37 -0700 Subject: [PATCH 1225/1322] sh: sh7757: remove nonexistent GPIO_PT[JLNQ]7_RESV to fix pinctrl registration Commit 3810e96056ff ("sh: modify pinmux for SH7757 2nd cut") renamed GPIO_PT[JLNQ]7 to GPIO_PT[JLNQ]7_RESV, and removed the existing users from the pinmux_pins[] array. However, pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. Hence entries were not really removed, but replaced by (zero-filled) holes. Such entries are treated as pin zero, which was registered before, thus leading to pinctrl registration failures, as seen on sh7722: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 Remove GPIO_PT[JLNQ]7_RESV from the enum to fix this. 
Link: http://lkml.kernel.org/r/1505205657-18012-3-git-send-email-geert+renesas@glider.be Fixes: 3810e96056ffddf6 ("sh: modify pinmux for SH7757 2nd cut") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Cc: Jacopo Mondi Cc: Magnus Damm Cc: Rich Felker Cc: Yoshihiro Shimoda Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh4/cpu/sh7757.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sh/include/cpu-sh4/cpu/sh7757.h b/arch/sh/include/cpu-sh4/cpu/sh7757.h index 5340f3bc1863..b40fb541e72a 100644 --- a/arch/sh/include/cpu-sh4/cpu/sh7757.h +++ b/arch/sh/include/cpu-sh4/cpu/sh7757.h @@ -40,7 +40,7 @@ enum { /* PTJ */ GPIO_PTJ0, GPIO_PTJ1, GPIO_PTJ2, GPIO_PTJ3, - GPIO_PTJ4, GPIO_PTJ5, GPIO_PTJ6, GPIO_PTJ7_RESV, + GPIO_PTJ4, GPIO_PTJ5, GPIO_PTJ6, /* PTK */ GPIO_PTK0, GPIO_PTK1, GPIO_PTK2, GPIO_PTK3, @@ -48,7 +48,7 @@ enum { /* PTL */ GPIO_PTL0, GPIO_PTL1, GPIO_PTL2, GPIO_PTL3, - GPIO_PTL4, GPIO_PTL5, GPIO_PTL6, GPIO_PTL7_RESV, + GPIO_PTL4, GPIO_PTL5, GPIO_PTL6, /* PTM */ GPIO_PTM0, GPIO_PTM1, GPIO_PTM2, GPIO_PTM3, @@ -56,7 +56,7 @@ enum { /* PTN */ GPIO_PTN0, GPIO_PTN1, GPIO_PTN2, GPIO_PTN3, - GPIO_PTN4, GPIO_PTN5, GPIO_PTN6, GPIO_PTN7_RESV, + GPIO_PTN4, GPIO_PTN5, GPIO_PTN6, /* PTO */ GPIO_PTO0, GPIO_PTO1, GPIO_PTO2, GPIO_PTO3, @@ -68,7 +68,7 @@ enum { /* PTQ */ GPIO_PTQ0, GPIO_PTQ1, GPIO_PTQ2, GPIO_PTQ3, - GPIO_PTQ4, GPIO_PTQ5, GPIO_PTQ6, GPIO_PTQ7_RESV, + GPIO_PTQ4, GPIO_PTQ5, GPIO_PTQ6, /* PTR */ GPIO_PTR0, GPIO_PTR1, GPIO_PTR2, GPIO_PTR3, From eae3df7e82318d798f45dedf111e241805ec7a4a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:41 -0700 Subject: [PATCH 1226/1322] sh: sh7264: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration Pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. If enum values are defined, but never used, pinmux_pins[] contains (zero-filled) holes. Such entries are treated as pin zero, which was registered before, thus leading to pinctrl registration failures, as seen on sh7722: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 Remove GPIO_PH[0-7] from the enum to fix this. 
Link: http://lkml.kernel.org/r/1505205657-18012-4-git-send-email-geert+renesas@glider.be Fixes: 41797f75486d8ca3 ("sh: Add pinmux for sh7264") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Cc: Jacopo Mondi Cc: Magnus Damm Cc: Rich Felker Cc: Yoshihiro Shimoda Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh2a/cpu/sh7264.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7264.h b/arch/sh/include/cpu-sh2a/cpu/sh7264.h index 4d1ef6d74bd6..2ae0e938b657 100644 --- a/arch/sh/include/cpu-sh2a/cpu/sh7264.h +++ b/arch/sh/include/cpu-sh2a/cpu/sh7264.h @@ -43,9 +43,7 @@ enum { GPIO_PG7, GPIO_PG6, GPIO_PG5, GPIO_PG4, GPIO_PG3, GPIO_PG2, GPIO_PG1, GPIO_PG0, - /* Port H */ - GPIO_PH7, GPIO_PH6, GPIO_PH5, GPIO_PH4, - GPIO_PH3, GPIO_PH2, GPIO_PH1, GPIO_PH0, + /* Port H - Port H does not have a Data Register */ /* Port I - not on device */ From d9d73e81fe82fdf4ee65a48c26531edc04108349 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Oct 2017 16:14:44 -0700 Subject: [PATCH 1227/1322] sh: sh7269: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration Pinmux_pins[] is initialized through PINMUX_GPIO(), using designated array initializers, where the GPIO_* enums serve as indices. If enum values are defined, but never used, pinmux_pins[] contains (zero-filled) holes. Such entries are treated as pin zero, which was registered before, thus leading to pinctrl registration failures, as seen on sh7722: sh-pfc pfc-sh7722: pin 0 already registered sh-pfc pfc-sh7722: error during pin registration sh-pfc pfc-sh7722: could not register: -22 sh-pfc: probe of pfc-sh7722 failed with error -22 Remove GPIO_PH[0-7] from the enum to fix this. Link: http://lkml.kernel.org/r/1505205657-18012-5-git-send-email-geert+renesas@glider.be Fixes: ef0fa5331a73e479 ("sh: Add pinmux for sh7269") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Cc: Yoshinori Sato Cc: Rich Felker Cc: Magnus Damm Cc: Yoshihiro Shimoda Cc: Jacopo Mondi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/cpu-sh2a/cpu/sh7269.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7269.h b/arch/sh/include/cpu-sh2a/cpu/sh7269.h index 2a0ca8780f0d..13c495a9fc00 100644 --- a/arch/sh/include/cpu-sh2a/cpu/sh7269.h +++ b/arch/sh/include/cpu-sh2a/cpu/sh7269.h @@ -45,9 +45,7 @@ enum { GPIO_PG7, GPIO_PG6, GPIO_PG5, GPIO_PG4, GPIO_PG3, GPIO_PG2, GPIO_PG1, GPIO_PG0, - /* Port H */ - GPIO_PH7, GPIO_PH6, GPIO_PH5, GPIO_PH4, - GPIO_PH3, GPIO_PH2, GPIO_PH1, GPIO_PH0, + /* Port H - Port H does not have a Data Register */ /* Port I - not on device */ From d5567c9df1ef001b2a7e6684b3b3498371ee4cae Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Tue, 3 Oct 2017 16:14:47 -0700 Subject: [PATCH 1228/1322] z3fold: fix potential race in z3fold_reclaim_page It is possible that on a (partially) unsuccessful page reclaim, kref_put() called in z3fold_reclaim_page() does not yield page release, but the page is released shortly afterwards by another thread. Then z3fold_reclaim_page() would try to list_add() that (released) page again which is obviously a bug. To avoid that, spin_lock() has to be taken earlier, before the kref_put() call mentioned earlier. 
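Reduced to a sketch (hypothetical names, not the actual z3fold internals; it assumes release_obj() does not itself take pool->lock), the corrected ordering is:

    spin_lock(&pool->lock);
    if (kref_put(&obj->refcount, release_obj)) {
            /* Last reference dropped: the object is gone, never touch or
             * re-list it after this point.
             */
            spin_unlock(&pool->lock);
            return 0;
    }
    /* Still referenced: safe to put it back on the LRU while the same
     * lock is held, so nobody can have freed it in between.
     */
    list_add(&obj->lru, &pool->lru);
    spin_unlock(&pool->lock);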
Link: http://lkml.kernel.org/r/20170913162937.bfff21c7d12b12a5f47639fd@gmail.com Signed-off-by: Vitaly Wool Cc: Dan Streetman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 486550df32be..b04fa3ba1bf2 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -875,16 +875,18 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) goto next; } next: + spin_lock(&pool->lock); if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { + spin_unlock(&pool->lock); free_z3fold_page(page); return 0; } } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { atomic64_dec(&pool->pages_nr); + spin_unlock(&pool->lock); return 0; } - spin_lock(&pool->lock); /* * Add to the beginning of LRU. From 4d4bbd8526a8fbeb2c090ea360211fceff952383 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:14:50 -0700 Subject: [PATCH 1229/1322] mm, oom_reaper: skip mm structs with mmu notifiers Andrea has noticed that the oom_reaper doesn't invalidate the range via mmu notifiers (mmu_notifier_invalidate_range_start/end) and that can corrupt the memory of the kvm guest for example. tlb_flush_mmu_tlbonly already invokes mmu notifiers but that is not sufficient as per Andrea: "mmu_notifier_invalidate_range cannot be used in replacement of mmu_notifier_invalidate_range_start/end. For KVM mmu_notifier_invalidate_range is a noop and rightfully so. A MMU notifier implementation has to implement either ->invalidate_range method or the invalidate_range_start/end methods, not both. And if you implement invalidate_range_start/end like KVM is forced to do, calling mmu_notifier_invalidate_range in common code is a noop for KVM. For those MMU notifiers that can get away only implementing ->invalidate_range, the ->invalidate_range is implicitly called by mmu_notifier_invalidate_range_end(). And only those secondary MMUs that share the same pagetable with the primary MMU (like AMD iommuv2) can get away only implementing ->invalidate_range" As the callback is allowed to sleep and the implementation is out of hand of the MM it is safer to simply bail out if there is an mmu notifier registered. In order to not fail too early make the mm_has_notifiers check under the oom_lock and have a little nap before failing to give the current oom victim some more time to exit. 
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170913113427.2291-1-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 5 +++++ mm/oom_kill.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 7b2e31b1745a..6866e8126982 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -400,6 +400,11 @@ extern void mmu_notifier_synchronize(void); #else /* CONFIG_MMU_NOTIFIER */ +static inline int mm_has_notifiers(struct mm_struct *mm) +{ + return 0; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 99736e026712..dee0f75c3013 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include "internal.h" @@ -494,6 +495,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } + /* + * If the mm has notifiers then we would need to invalidate them around + * unmap_page_range and that is risky because notifiers can sleep and + * what they do is basically undeterministic. So let's have a short + * sleep to give the oom victim some more time. + * TODO: we really want to get rid of this ugly hack and make sure that + * notifiers cannot block for unbounded amount of time and add + * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + */ + if (mm_has_notifiers(mm)) { + up_read(&mm->mmap_sem); + schedule_timeout_idle(HZ); + goto unlock_oom; + } + /* * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't * work on the mm anymore. The check for MMF_OOM_SKIP must run From 72f0184c8a00c70179cfed6266e2e06b4d400065 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:14:53 -0700 Subject: [PATCH 1230/1322] mm, memcg: remove hotplug locking from try_charge The following lockdep splat has been noticed during LTP testing ====================================================== WARNING: possible circular locking dependency detected 4.13.0-rc3-next-20170807 #12 Not tainted ------------------------------------------------------ a.out/4771 is trying to acquire lock: (cpu_hotplug_lock.rw_sem){++++++}, at: [] drain_all_stock.part.35+0x18/0x140 but task is already holding lock: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x175/0x530 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++++}: lock_acquire+0xc9/0x230 __might_fault+0x70/0xa0 _copy_to_user+0x23/0x70 filldir+0xa7/0x110 xfs_dir2_sf_getdents.isra.10+0x20c/0x2c0 [xfs] xfs_readdir+0x1fa/0x2c0 [xfs] xfs_file_readdir+0x30/0x40 [xfs] iterate_dir+0x17a/0x1a0 SyS_getdents+0xb0/0x160 entry_SYSCALL_64_fastpath+0x1f/0xbe -> #2 (&type->i_mutex_dir_key#3){++++++}: lock_acquire+0xc9/0x230 down_read+0x51/0xb0 lookup_slow+0xde/0x210 walk_component+0x160/0x250 link_path_walk+0x1a6/0x610 path_openat+0xe4/0xd50 do_filp_open+0x91/0x100 file_open_name+0xf5/0x130 filp_open+0x33/0x50 kernel_read_file_from_path+0x39/0x80 _request_firmware+0x39f/0x880 request_firmware_direct+0x37/0x50 request_microcode_fw+0x64/0xe0 reload_store+0xf7/0x180 dev_attr_store+0x18/0x30 sysfs_kf_write+0x44/0x60 kernfs_fop_write+0x113/0x1a0 __vfs_write+0x37/0x170 vfs_write+0xc7/0x1c0 SyS_write+0x58/0xc0 do_syscall_64+0x6c/0x1f0 return_from_SYSCALL_64+0x0/0x7a -> #1 (microcode_mutex){+.+.+.}: lock_acquire+0xc9/0x230 __mutex_lock+0x88/0x960 mutex_lock_nested+0x1b/0x20 microcode_init+0xbb/0x208 do_one_initcall+0x51/0x1a9 kernel_init_freeable+0x208/0x2a7 kernel_init+0xe/0x104 ret_from_fork+0x2a/0x40 -> #0 (cpu_hotplug_lock.rw_sem){++++++}: __lock_acquire+0x153c/0x1550 lock_acquire+0xc9/0x230 cpus_read_lock+0x4b/0x90 drain_all_stock.part.35+0x18/0x140 try_charge+0x3ab/0x6e0 mem_cgroup_try_charge+0x7f/0x2c0 shmem_getpage_gfp+0x25f/0x1050 shmem_fault+0x96/0x200 __do_fault+0x1e/0xa0 __handle_mm_fault+0x9c3/0xe00 handle_mm_fault+0x16e/0x380 __do_page_fault+0x24a/0x530 do_page_fault+0x30/0x80 page_fault+0x28/0x30 other info that might help us debug this: Chain exists of: cpu_hotplug_lock.rw_sem --> &type->i_mutex_dir_key#3 --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&type->i_mutex_dir_key#3); lock(&mm->mmap_sem); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** 2 locks held by a.out/4771: #0: (&mm->mmap_sem){++++++}, at: [] __do_page_fault+0x175/0x530 #1: (percpu_charge_mutex){+.+...}, at: [] try_charge+0x397/0x6e0 The problem is very similar to the one fixed by commit a459eeb7b852 ("mm, page_alloc: do not depend on cpu hotplug locks inside the allocator"). We are taking hotplug locks while we can be sitting on top of basically arbitrary locks. This just calls for problems. We can get rid of {get,put}_online_cpus, fortunately. We do not have to be worried about races with memory hotplug because drain_local_stock, which is called from both the WQ draining and the memory hotplug contexts, is always operating on the local cpu stock with IRQs disabled. The only thing to be careful about is that the target memcg doesn't vanish while we are still in drain_all_stock so take a reference on it. Link: http://lkml.kernel.org/r/20170913090023.28322-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Artem Savkov Tested-by: Artem Savkov Cc: Johannes Weiner Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15af3da5af02..696c6529e900 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1777,6 +1777,10 @@ static void drain_local_stock(struct work_struct *dummy) struct memcg_stock_pcp *stock; unsigned long flags; + /* + * The only protection from memory hotplug vs. 
drain_stock races is + * that we always operate on local CPU stock here with IRQ disabled + */ local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); @@ -1821,27 +1825,33 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) /* If someone's already draining, avoid adding running more workers. */ if (!mutex_trylock(&percpu_charge_mutex)) return; - /* Notify other cpus that system-wide "drain" is running */ - get_online_cpus(); + /* + * Notify other cpus that system-wide "drain" is running + * We do not care about races with the cpu hotplug because cpu down + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. + */ curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; memcg = stock->cached; - if (!memcg || !stock->nr_pages) + if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) continue; - if (!mem_cgroup_is_descendant(memcg, root_memcg)) + if (!mem_cgroup_is_descendant(memcg, root_memcg)) { + css_put(&memcg->css); continue; + } if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); else schedule_work_on(cpu, &stock->work); } + css_put(&memcg->css); } put_cpu(); - put_online_cpus(); mutex_unlock(&percpu_charge_mutex); } From 3f2eb0287ebd62ec8d6d544f830285302279e6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Tue, 3 Oct 2017 16:14:57 -0700 Subject: [PATCH 1231/1322] mm/memcg: avoid page count check for zone device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix for 4.14, zone device page always have an elevated refcount of one and thus page count sanity check in uncharge_page() is inappropriate for them. [mhocko@suse.com: nano-optimize VM_BUG_ON in uncharge_page] Link: http://lkml.kernel.org/r/20170914190011.5217-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Michal Hocko Reported-by: Evgeny Baskakov Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 696c6529e900..d5f3a62887cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5658,7 +5658,8 @@ static void uncharge_batch(const struct uncharge_gather *ug) static void uncharge_page(struct page *page, struct uncharge_gather *ug) { VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); + VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && + !PageHWPoison(page) , page); if (!page->mem_cgroup) return; From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 3 Oct 2017 16:15:00 -0700 Subject: [PATCH 1232/1322] android: binder: drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. 
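Reduced to a sketch, the isolate-callback pattern used for this (hypothetical callback and do_sleeping_cleanup() helper, illustrative only, not the actual binder code) looks like:

    static enum lru_status my_isolate(struct list_head *item,
                                      struct list_lru_one *lru,
                                      spinlock_t *lock, void *cb_arg)
    {
            list_lru_isolate(lru, item);    /* take the item off the list  */
            spin_unlock(lock);              /* allow sleeping work below   */

            do_sleeping_cleanup(item);      /* e.g. zap_page_range()       */

            spin_lock(lock);                /* caller expects it held back */
            return LRU_REMOVED_RETRY;       /* lock was dropped: restart   */
    }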
Also restore mmput_async, which was initially introduced in commit ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom reaper context"), and was removed in commit 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently"). Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Signed-off-by: Sherry Yang Signed-off-by: Greg Kroah-Hartman Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Acked-by: Michal Hocko Cc: Martijn Coenen Cc: Todd Kjos Cc: Riley Andrews Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Hillf Danton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Hoeun Ryu Cc: Christopher Lameter Cc: Vegard Nossum Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/android/binder_alloc.c | 18 ++++++++++++------ include/linux/sched/mm.h | 6 ++++++ kernel/fork.c | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 8fe165844e47..064f5e31ec55 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + list_lru_isolate(lru, item); + spin_unlock(lock); + + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE); @@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3a19c253bdb1..ae53e413fb13 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. 
Can + * be called from the atomic context as well + */ +void mmput_async(struct mm_struct *); +#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..e702cb9ffbd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, + async_put_work); + + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} +#endif + /** * set_mm_exe_file - change a reference to the mm's executable file * From 6818600ff094ca255a7fe31838ad50c29656c3c5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 3 Oct 2017 16:15:03 -0700 Subject: [PATCH 1233/1322] mm,compaction: serialize waitqueue_active() checks (for real) Andrea brought to my attention that the L->{L,S} guarantees are completely bogus for this case. I was looking at the diagram, from the offending commit, when that _is_ the race, we had the load reordered already. What we need is at least S->L semantics, thus simply use wq_has_sleeper() to serialize the call for good. Link: http://lkml.kernel.org/r/20170914175313.GB811@linux-80c1.suse Fixes: 46acef048a6 (mm,compaction: serialize waitqueue_active() checks) Signed-off-by: Davidlohr Bueso Reported-by: Andrea Parri Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index fb548e4c7bd4..03d31a875341 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1999,17 +1999,14 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - /* - * Pairs with implicit barrier in wait_event_freezable() - * such that wakeups are not missed in the lockless - * waitqueue_active() call. - */ - smp_acquire__after_ctrl_dep(); - if (pgdat->kcompactd_classzone_idx > classzone_idx) pgdat->kcompactd_classzone_idx = classzone_idx; - if (!waitqueue_active(&pgdat->kcompactd_wait)) + /* + * Pairs with implicit barrier in wait_event_freezable() + * such that wakeups are not missed. + */ + if (!wq_has_sleeper(&pgdat->kcompactd_wait)) return; if (!kcompactd_node_suitable(pgdat)) From 3552935742e0d5f0dafd823736f45bdaa7ba672c Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Tue, 3 Oct 2017 16:15:06 -0700 Subject: [PATCH 1234/1322] z3fold: fix stale list handling Fix the situation when clear_bit() is called for page->private before the page pointer is actually assigned. While at it, remove work_busy() check because it is costly and does not give 100% guarantee anyway. 
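The bug pattern, reduced to its essence (hypothetical fragment, not the actual z3fold code):

    struct page *page;                              /* not assigned yet */

    clear_bit(NEEDS_COMPACTING, &page->private);    /* dereferences an uninitialized pointer */
    /* ... */
    page = virt_to_page(zhdr);                      /* the assignment only happens here */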
Signed-off-by: Vitaly Wool Cc: Dan Streetman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index b04fa3ba1bf2..b2ba2ba585f3 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -250,6 +250,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); @@ -303,7 +304,6 @@ static void free_pages_work(struct work_struct *w) list_del(&zhdr->buddy); if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) continue; - clear_bit(NEEDS_COMPACTING, &page->private); spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); free_z3fold_page(page); @@ -624,10 +624,8 @@ lookup: * stale pages list. cancel_work_sync() can sleep so we must make * sure it won't be called in case we're in atomic context. */ - if (zhdr && (can_sleep || !work_pending(&zhdr->work) || - !unlikely(work_busy(&zhdr->work)))) { + if (zhdr && (can_sleep || !work_pending(&zhdr->work))) { list_del(&zhdr->buddy); - clear_bit(NEEDS_COMPACTING, &page->private); spin_unlock(&pool->stale_lock); if (can_sleep) cancel_work_sync(&zhdr->work); From 57148a64e823bb1f49112fa52a92a7f372cda892 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Oct 2017 16:15:10 -0700 Subject: [PATCH 1235/1322] mm: meminit: mark init_reserved_page as __meminit The function is called from __meminit context and calls other __meminit functions but isn't it self mark as such today: WARNING: vmlinux.o(.text.unlikely+0x4516): Section mismatch in reference from the function init_reserved_page() to the function .meminit.text:early_pfn_to_nid() The function init_reserved_page() references the function __meminit early_pfn_to_nid(). This is often because init_reserved_page lacks a __meminit annotation or the annotation of early_pfn_to_nid is wrong. On most compilers, we don't notice this because the function gets inlined all the time. Adding __meminit here fixes the harmless warning for the old versions and is generally the correct annotation. Link: http://lkml.kernel.org/r/20170915193149.901180-1-arnd@arndb.de Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd") Signed-off-by: Arnd Bergmann Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c841af88836a..38d165a87860 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void init_reserved_page(unsigned long pfn) +static void __meminit init_reserved_page(unsigned long pfn) { pg_data_t *pgdat; int nid, zid; From 31d1e130f4a0f8f629a460167569577cac9b17c1 Mon Sep 17 00:00:00 2001 From: Ioan Nicu Date: Tue, 3 Oct 2017 16:15:13 -0700 Subject: [PATCH 1236/1322] rapidio: remove global irq spinlocks from the subsystem Locking of config and doorbell operations should be done only if the underlying hardware requires it. This patch removes the global spinlocks from the rapidio subsystem and moves them to the mport drivers (fsl_rio and tsi721), only to the necessary places. 
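Sketched against a hypothetical mport driver (my_hw_maint_read() and my_hw_local_read() are made-up helpers, illustrative only), the new split looks like this:

    static DEFINE_SPINLOCK(my_maint_lock);  /* driver-private, no longer subsystem-wide */

    /* Maintenance transactions share one set of outbound window registers,
     * so they still need mutual exclusion inside the driver.
     */
    static int my_cread(struct rio_mport *mport, int index, u16 destid,
                        u8 hopcount, u32 offset, int len, u32 *data)
    {
            unsigned long flags;
            int ret;

            spin_lock_irqsave(&my_maint_lock, flags);
            ret = my_hw_maint_read(mport, destid, hopcount, offset, len, data);
            spin_unlock_irqrestore(&my_maint_lock, flags);
            return ret;
    }

    /* A local config read is a single atomic MMIO access: no lock at all. */
    static int my_lcread(struct rio_mport *mport, int index, u32 offset,
                         int len, u32 *data)
    {
            *data = my_hw_local_read(mport, offset);
            return 0;
    }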
For example, local config space read and write operations (lcread/lcwrite) are atomic in all existing drivers, so there should be no need for locking, while the cread/cwrite operations which generate maintenance transactions need to be synchronized with a lock. Later, each driver could chose to use a per-port lock instead of a global one, or even more granular locking. Link: http://lkml.kernel.org/r/20170824113023.GD50104@nokia.com Signed-off-by: Ioan Nicu Signed-off-by: Frank Kunz Acked-by: Alexandre Bounine Cc: Matt Porter Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/fsl_rio.c | 17 ++++++++++++-- arch/powerpc/sysdev/fsl_rmu.c | 8 +++++++ drivers/rapidio/devices/tsi721.c | 7 ++++++ drivers/rapidio/rio-access.c | 40 ++++---------------------------- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 9234be1e66f5..5011ffea4e4b 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -71,6 +71,8 @@ #define RIWAR_WRTYP_ALLOC 0x00006000 #define RIWAR_SIZE_MASK 0x0000003F +static DEFINE_SPINLOCK(fsl_rio_config_lock); + #define __fsl_read_rio_config(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op" %1,0(%2)\n" \ @@ -184,6 +186,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, u8 hopcount, u32 offset, int len, u32 *val) { struct rio_priv *priv = mport->priv; + unsigned long flags; u8 *data; u32 rval, err = 0; @@ -197,6 +200,8 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, if (offset > (0x1000000 - len) || !IS_ALIGNED(offset, len)) return -EINVAL; + spin_lock_irqsave(&fsl_rio_config_lock, flags); + out_be32(&priv->maint_atmu_regs->rowtar, (destid << 22) | (hopcount << 12) | (offset >> 12)); out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10)); @@ -213,6 +218,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, __fsl_read_rio_config(rval, data, err, "lwz"); break; default: + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); return -EINVAL; } @@ -221,6 +227,7 @@ fsl_rio_config_read(struct rio_mport *mport, int index, u16 destid, err, destid, hopcount, offset); } + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); *val = rval; return err; @@ -244,7 +251,10 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid, u8 hopcount, u32 offset, int len, u32 val) { struct rio_priv *priv = mport->priv; + unsigned long flags; u8 *data; + int ret = 0; + pr_debug ("fsl_rio_config_write:" " index %d destid %d hopcount %d offset %8.8x len %d val %8.8x\n", @@ -255,6 +265,8 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid, if (offset > (0x1000000 - len) || !IS_ALIGNED(offset, len)) return -EINVAL; + spin_lock_irqsave(&fsl_rio_config_lock, flags); + out_be32(&priv->maint_atmu_regs->rowtar, (destid << 22) | (hopcount << 12) | (offset >> 12)); out_be32(&priv->maint_atmu_regs->rowtear, (destid >> 10)); @@ -271,10 +283,11 @@ fsl_rio_config_write(struct rio_mport *mport, int index, u16 destid, out_be32((u32 *) data, val); break; default: - return -EINVAL; + ret = -EINVAL; } + spin_unlock_irqrestore(&fsl_rio_config_lock, flags); - return 0; + return ret; } static void fsl_rio_inbound_mem_init(struct rio_priv *priv) diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c index ab7a74c75be8..88b35a3dcdc5 100644 --- a/arch/powerpc/sysdev/fsl_rmu.c 
+++ b/arch/powerpc/sysdev/fsl_rmu.c @@ -104,6 +104,8 @@ #define DOORBELL_MESSAGE_SIZE 0x08 +static DEFINE_SPINLOCK(fsl_rio_doorbell_lock); + struct rio_msg_regs { u32 omr; u32 osr; @@ -626,9 +628,13 @@ err_out: int fsl_rio_doorbell_send(struct rio_mport *mport, int index, u16 destid, u16 data) { + unsigned long flags; + pr_debug("fsl_doorbell_send: index %d destid %4.4x data %4.4x\n", index, destid, data); + spin_lock_irqsave(&fsl_rio_doorbell_lock, flags); + /* In the serial version silicons, such as MPC8548, MPC8641, * below operations is must be. */ @@ -638,6 +644,8 @@ int fsl_rio_doorbell_send(struct rio_mport *mport, out_be32(&dbell->dbell_regs->oddatr, (index << 20) | data); out_be32(&dbell->dbell_regs->odmr, 0x00000001); + spin_unlock_irqrestore(&fsl_rio_doorbell_lock, flags); + return 0; } diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 315a4be8dc1e..9a68914100ad 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -51,6 +51,8 @@ module_param(mbox_sel, byte, S_IRUGO); MODULE_PARM_DESC(mbox_sel, "RIO Messaging MBOX Selection Mask (default: 0x0f = all)"); +static DEFINE_SPINLOCK(tsi721_maint_lock); + static void tsi721_omsg_handler(struct tsi721_device *priv, int ch); static void tsi721_imsg_handler(struct tsi721_device *priv, int ch); @@ -124,12 +126,15 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, void __iomem *regs = priv->regs + TSI721_DMAC_BASE(priv->mdma.ch_id); struct tsi721_dma_desc *bd_ptr; u32 rd_count, swr_ptr, ch_stat; + unsigned long flags; int i, err = 0; u32 op = do_wr ? MAINT_WR : MAINT_RD; if (offset > (RIO_MAINT_SPACE_SZ - len) || (len != sizeof(u32))) return -EINVAL; + spin_lock_irqsave(&tsi721_maint_lock, flags); + bd_ptr = priv->mdma.bd_base; rd_count = ioread32(regs + TSI721_DMAC_DRDCNT); @@ -197,7 +202,9 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, */ swr_ptr = ioread32(regs + TSI721_DMAC_DSWP); iowrite32(swr_ptr, regs + TSI721_DMAC_DSRP); + err_out: + spin_unlock_irqrestore(&tsi721_maint_lock, flags); return err; } diff --git a/drivers/rapidio/rio-access.c b/drivers/rapidio/rio-access.c index a3824baca2e5..3ee9af83b638 100644 --- a/drivers/rapidio/rio-access.c +++ b/drivers/rapidio/rio-access.c @@ -13,17 +13,9 @@ #include #include -/* - * These interrupt-safe spinlocks protect all accesses to RIO - * configuration space and doorbell access. - */ -static DEFINE_SPINLOCK(rio_config_lock); -static DEFINE_SPINLOCK(rio_doorbell_lock); - /* * Wrappers for all RIO configuration access functions. They just check - * alignment, do locking and call the low-level functions pointed to - * by rio_mport->ops. + * alignment and call the low-level functions pointed to by rio_mport->ops. 
*/ #define RIO_8_BAD 0 @@ -44,13 +36,10 @@ int __rio_local_read_config_##size \ (struct rio_mport *mport, u32 offset, type *value) \ { \ int res; \ - unsigned long flags; \ u32 data = 0; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ res = mport->ops->lcread(mport, mport->id, offset, len, &data); \ *value = (type)data; \ - spin_unlock_irqrestore(&rio_config_lock, flags); \ return res; \ } @@ -67,13 +56,8 @@ int __rio_local_read_config_##size \ int __rio_local_write_config_##size \ (struct rio_mport *mport, u32 offset, type value) \ { \ - int res; \ - unsigned long flags; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ - res = mport->ops->lcwrite(mport, mport->id, offset, len, value);\ - spin_unlock_irqrestore(&rio_config_lock, flags); \ - return res; \ + return mport->ops->lcwrite(mport, mport->id, offset, len, value);\ } RIO_LOP_READ(8, u8, 1) @@ -104,13 +88,10 @@ int rio_mport_read_config_##size \ (struct rio_mport *mport, u16 destid, u8 hopcount, u32 offset, type *value) \ { \ int res; \ - unsigned long flags; \ u32 data = 0; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ res = mport->ops->cread(mport, mport->id, destid, hopcount, offset, len, &data); \ *value = (type)data; \ - spin_unlock_irqrestore(&rio_config_lock, flags); \ return res; \ } @@ -127,13 +108,9 @@ int rio_mport_read_config_##size \ int rio_mport_write_config_##size \ (struct rio_mport *mport, u16 destid, u8 hopcount, u32 offset, type value) \ { \ - int res; \ - unsigned long flags; \ if (RIO_##size##_BAD) return RIO_BAD_SIZE; \ - spin_lock_irqsave(&rio_config_lock, flags); \ - res = mport->ops->cwrite(mport, mport->id, destid, hopcount, offset, len, value); \ - spin_unlock_irqrestore(&rio_config_lock, flags); \ - return res; \ + return mport->ops->cwrite(mport, mport->id, destid, hopcount, \ + offset, len, value); \ } RIO_OP_READ(8, u8, 1) @@ -162,14 +139,7 @@ EXPORT_SYMBOL_GPL(rio_mport_write_config_32); */ int rio_mport_send_doorbell(struct rio_mport *mport, u16 destid, u16 data) { - int res; - unsigned long flags; - - spin_lock_irqsave(&rio_doorbell_lock, flags); - res = mport->ops->dsend(mport, mport->id, destid, data); - spin_unlock_irqrestore(&rio_doorbell_lock, flags); - - return res; + return mport->ops->dsend(mport, mport->id, destid, data); } EXPORT_SYMBOL_GPL(rio_mport_send_doorbell); From a872eb2131e91ce7c89a8888974a5e22a272b12f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 3 Oct 2017 16:15:16 -0700 Subject: [PATCH 1237/1322] mm: fix RODATA_TEST failure "rodata_test: test data was not read only" On powerpc, RODATA_TEST fails with message the following messages: Freeing unused kernel memory: 528K rodata_test: test data was not read only This is because GCC allocates it to .data section: c0695034 g O .data 00000004 rodata_test_data Since commit 056b9d8a7692 ("mm: remove rodata_test_data export, add pr_fmt"), rodata_test_data is used only inside rodata_test.c By declaring it static, it gets properly allocated into .rodata section instead of .data: c04df710 l O .rodata 00000004 rodata_test_data Fixes: 056b9d8a7692 ("mm: remove rodata_test_data export, add pr_fmt") Link: http://lkml.kernel.org/r/20170921093729.1080368AC1@po15668-vm-win7.idsi0.si.c-s.fr Signed-off-by: Christophe Leroy Cc: Kees Cook Cc: Jinbum Park Cc: Segher Boessenkool Cc: David Laight Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rodata_test.c | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 6bb4deb12e78..d908c8769b48 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -14,7 +14,7 @@ #include #include -const int rodata_test_data = 0xC3; +static const int rodata_test_data = 0xC3; void rodata_test(void) { From ae94264ed4b0cf7cd887947650db4c69acb62072 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 3 Oct 2017 16:15:19 -0700 Subject: [PATCH 1238/1322] zram: fix null dereference of handle In testing I found handle passed to zs_map_object in __zram_bvec_read is NULL so eh kernel goes oops in pin_object(). The reason is there is no routine to check the slot's freeing after getting the slot's lock. This patch fixes it. [minchan@kernel.org: v2] Link: http://lkml.kernel.org/r/1505887347-10881-1-git-send-email-minchan@kernel.org Link: http://lkml.kernel.org/r/1505788488-26723-1-git-send-email-minchan@kernel.org Fixes: 1f7319c74275 ("zram: partial IO refactoring") Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 36 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 2981c27d3aae..f149d3e61234 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -766,27 +766,6 @@ static void zram_slot_unlock(struct zram *zram, u32 index) bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); } -static bool zram_same_page_read(struct zram *zram, u32 index, - struct page *page, - unsigned int offset, unsigned int len) -{ - zram_slot_lock(zram, index); - if (unlikely(!zram_get_handle(zram, index) || - zram_test_flag(zram, index, ZRAM_SAME))) { - void *mem; - - zram_slot_unlock(zram, index); - mem = kmap_atomic(page); - zram_fill_page(mem + offset, len, - zram_get_element(zram, index)); - kunmap_atomic(mem); - return true; - } - zram_slot_unlock(zram, index); - - return false; -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -884,11 +863,20 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zram_slot_unlock(zram, index); } - if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE)) - return 0; - zram_slot_lock(zram, index); handle = zram_get_handle(zram, index); + if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { + unsigned long value; + void *mem; + + value = handle ? zram_get_element(zram, index) : 0; + mem = kmap_atomic(page); + zram_fill_page(mem, PAGE_SIZE, value); + kunmap_atomic(mem); + zram_slot_unlock(zram, index); + return 0; + } + size = zram_get_obj_size(zram, index); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); From 5bdfca6435b8294490ffb5b7c8b7d8eac3814b06 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:15:23 -0700 Subject: [PATCH 1239/1322] m32r: define CPU_BIG_ENDIAN The build of m32r allmodconfig is giving lots of build warnings about: include/linux/byteorder/big_endian.h:7:2: warning: #warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN [-Wcpp] #warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN Define CPU_BIG_ENDIAN like the way CPU_LITTLE_ENDIAN is defined. 
Link: http://lkml.kernel.org/r/1505678083-10320-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 87cde1e4b38c..0777f3a8a1f3 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -194,6 +194,10 @@ config TIMER_DIVIDE int "Timer divider (integer)" default "128" +config CPU_BIG_ENDIAN + bool "Generate big endian code" + default n + config CPU_LITTLE_ENDIAN bool "Generate little endian code" default n From f4e222c56c83b2aed7cc2b329fca7435508eefa1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 3 Oct 2017 16:15:25 -0700 Subject: [PATCH 1240/1322] mm: have filemap_check_and_advance_wb_err clear AS_EIO/AS_ENOSPC Eryu noticed that he could sometimes get a leftover error reported when it shouldn't be on fsync with ext2 and non-journalled ext4. The problem is that writeback_single_inode still uses filemap_fdatawait. That picks up a previously set AS_EIO flag, which would ordinarily have been cleared before. Since we're mostly using this function as a replacement for filemap_check_errors, have filemap_check_and_advance_wb_err clear AS_EIO and AS_ENOSPC when reporting an error. That should allow the new function to better emulate the behavior of the old with respect to these flags. Link: http://lkml.kernel.org/r/20170922133331.28812-1-jlayton@kernel.org Signed-off-by: Jeff Layton Reported-by: Eryu Guan Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index db250d0e0565..594d73fef8b4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -620,6 +620,14 @@ int file_check_and_advance_wb_err(struct file *file) trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } + + /* + * We're mostly using this function as a drop in replacement for + * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect + * that the legacy code would have had on these flags. + */ + clear_bit(AS_EIO, &mapping->flags); + clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); From 24c92eb7dce0a299b8e1a8c5fa585844a53bf7f0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Oct 2017 16:15:29 -0700 Subject: [PATCH 1241/1322] mm: avoid marking swap cached page as lazyfree MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear SwapBacked). There is no lock to prevent the page is added to swap cache between these two steps by page reclaim. Page reclaim could add the page to swap cache and unmap the page. After page reclaim, the page is added back to lru. At that time, we probably start draining per-cpu pagevec and mark the page lazyfree. So the page could be in a state with SwapBacked cleared and PG_swapcache set. Next time there is a refault in the virtual address, do_swap_page can find the page from swap cache but the page has PageSwapCache false because SwapBacked isn't set, so do_swap_page will bail out and do nothing. The task will keep running into fault handler. 
Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Link: http://lkml.kernel.org/r/6537ef3814398c0073630b03f176263bc81f0902.1506446061.git.shli@fb.com Signed-off-by: Shaohua Li Reported-by: Artem Savkov Tested-by: Artem Savkov Reviewed-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Hillf Danton Cc: Hugh Dickins Cc: Mel Gorman Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 9295ae960d66..a77d68f2c1b6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -575,7 +575,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && - !PageUnevictable(page)) { + !PageSwapCache(page) && !PageUnevictable(page)) { bool active = PageActive(page); del_page_from_lru_list(page, lruvec, @@ -665,7 +665,7 @@ void deactivate_file_page(struct page *page) void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && - !PageUnevictable(page)) { + !PageSwapCache(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); From 9625456cc76391b7f3f2809579126542a8ed4d39 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 3 Oct 2017 16:15:32 -0700 Subject: [PATCH 1242/1322] mm: fix data corruption caused by lazyfree page MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear SwapBacked). There is no lock to prevent the page is added to swap cache between these two steps by page reclaim. If page reclaim finds such page, it will simply add the page to swap cache without pageout the page to swap because the page is marked as clean. Next time, page fault will read data from the swap slot which doesn't have the original data, so we have a data corruption. To fix issue, we mark the page dirty and pageout the page. However, we shouldn't dirty all pages which is clean and in swap cache. swapin page is swap cache and clean too. So we only dirty page which is added into swap cache in page reclaim, which shouldn't be swapin page. As Minchan suggested, simply dirty the page in add_to_swap can do the job. Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages") Link: http://lkml.kernel.org/r/08c84256b007bf3f63c91d94383bd9eb6fee2daa.1506446061.git.shli@fb.com Signed-off-by: Shaohua Li Reported-by: Artem Savkov Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/swap_state.c b/mm/swap_state.c index 71ce2d1ccbf7..ed91091d1e68 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -242,6 +242,17 @@ int add_to_swap(struct page *page) * clear SWAP_HAS_CACHE flag. */ goto fail; + /* + * Normally the page will be dirtied in unmap because its pte should be + * dirty. A special case is MADV_FREE page. The page'e pte could have + * dirty bit cleared but the page's SwapBacked bit is still set because + * clearing the dirty bit and SwapBacked bit has no lock protected. For + * such page, unmap will not set dirty bit for it, so page reclaim will + * not write the page out. This can cause data corruption when the page + * is swap in later. Always setting the dirty bit for the page solves + * the problem. 
+ */ + set_page_dirty(page); return 1; From 7d790d2da386a52cfebcf0c898ba927bece9d4ab Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Tue, 3 Oct 2017 16:15:35 -0700 Subject: [PATCH 1243/1322] mm/device-public-memory: fix edge case in _vm_normal_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With device public pages at the end of my memory space, I'm getting output from _vm_normal_page(): BUG: Bad page map in process migrate_pages pte:c0800001ffff0d06 pmd:f95d3000 addr:00007fff89330000 vm_flags:00100073 anon_vma:c0000000fa899320 mapping: (null) index:7fff8933 file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 0 PID: 13963 Comm: migrate_pages Tainted: P B OE 4.14.0-rc1-wip #155 Call Trace: dump_stack+0xb0/0xf4 (unreliable) print_bad_pte+0x28c/0x340 _vm_normal_page+0xc0/0x140 zap_pte_range+0x664/0xc10 unmap_page_range+0x318/0x670 unmap_vmas+0x74/0xe0 exit_mmap+0xe8/0x1f0 mmput+0xac/0x1f0 do_exit+0x348/0xcd0 do_group_exit+0x5c/0xf0 SyS_exit_group+0x1c/0x20 system_call+0x58/0x6c The pfn causing this is the very last one. Correct the bounds check accordingly. Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU") Link: http://lkml.kernel.org/r/1506092178-20351-1-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Reviewed-by: Jérôme Glisse Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index ec4e15494901..a728bed16c20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -845,7 +845,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, * vm_normal_page() so that we do not have to special case all * call site of vm_normal_page(). */ - if (likely(pfn < highest_memmap_pfn)) { + if (likely(pfn <= highest_memmap_pfn)) { struct page *page = pfn_to_page(pfn); if (is_device_public_page(page)) { From 384632e67e0829deb8015ee6ad916b180049d252 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 3 Oct 2017 16:15:38 -0700 Subject: [PATCH 1244/1322] userfaultfd: non-cooperative: fix fork use after free When reading the event from the uffd, we put it on a temporary fork_event list to detect if we can still access it after releasing and retaking the event_wqh.lock. If fork aborts and removes the event from the fork_event all is fine as long as we're still in the userfault read context and fork_event head is still alive. We've to put the event allocated in the fork kernel stack, back from fork_event list-head to the event_wqh head, before returning from userfaultfd_ctx_read, because the fork_event head lifetime is limited to the userfaultfd_ctx_read stack lifetime. Forgetting to move the event back to its event_wqh place then results in __remove_wait_queue(&ctx->event_wqh, &ewq->wq); in userfaultfd_event_wait_completion to remove it from a head that has been already freed from the reader stack. This could only happen if resolve_userfault_fork failed (for example if there are no file descriptors available to allocate the fork uffd). If it succeeded it was put back correctly. Furthermore, after find_userfault_evt receives a fork event, the forked userfault context in fork_nctx and uwq->msg.arg.reserved.reserved1 can be released by the fork thread as soon as the event_wqh.lock is released. Taking a reference on the fork_nctx before dropping the lock prevents an use after free in resolve_userfault_fork(). 
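In outline this is the usual take-a-reference-before-dropping-the-lock pattern; a simplified sketch only (the real code below also has to requeue the event on event_wqh and handle the case where the fork side aborted):

	spin_lock(&ctx->event_wqh.lock);
	fork_nctx = (struct userfaultfd_ctx *)(unsigned long)
			uwq->msg.arg.reserved.reserved1;
	userfaultfd_ctx_get(fork_nctx);		/* pin it before dropping the lock */
	spin_unlock(&ctx->event_wqh.lock);

	ret = resolve_userfault_fork(ctx, fork_nctx, msg);
	/* ... put the event back on event_wqh, see the full hunk below ... */
	if (ret)
		userfaultfd_ctx_put(fork_nctx);	/* nobody else took over the reference */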
If the fork side aborted and it already released everything, we still try to succeed resolve_userfault_fork(), if possible. Fixes: 893e26e61d04eac9 ("userfaultfd: non-cooperative: Add fork() event") Link: http://lkml.kernel.org/r/20170920180413.26713-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Mark Rutland Tested-by: Mark Rutland Cc: Pavel Emelyanov Cc: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 66 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ef4b48d1ea42..1c713fd5b3e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -588,6 +588,12 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, break; if (ACCESS_ONCE(ctx->released) || fatal_signal_pending(current)) { + /* + * &ewq->wq may be queued in fork_event, but + * __remove_wait_queue ignores the head + * parameter. It would be a problem if it + * didn't. + */ __remove_wait_queue(&ctx->event_wqh, &ewq->wq); if (ewq->msg.event == UFFD_EVENT_FORK) { struct userfaultfd_ctx *new; @@ -1061,6 +1067,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, (unsigned long) uwq->msg.arg.reserved.reserved1; list_move(&uwq->wq.entry, &fork_event); + /* + * fork_nctx can be freed as soon as + * we drop the lock, unless we take a + * reference on it. + */ + userfaultfd_ctx_get(fork_nctx); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1091,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + /* + * The fork thread didn't abort, so we can + * drop the temporary refcount. + */ + userfaultfd_ctx_put(fork_nctx); - if (!ret) { - spin_lock(&ctx->event_wqh.lock); - if (!list_empty(&fork_event)) { - uwq = list_first_entry(&fork_event, - typeof(*uwq), - wq.entry); - list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->event_wqh, &uwq->wq); + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.entry); + /* + * If fork_event list wasn't empty and in turn + * the event wasn't already released by fork + * (the event is allocated on fork kernel + * stack), put the event back to its place in + * the event_wq. fork_event head will be freed + * as soon as we return so the event cannot + * stay queued there no matter the current + * "ret" value. + */ + list_del(&uwq->wq.entry); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + + /* + * Leave the event in the waitqueue and report + * error to userland if we failed to resolve + * the userfault fork. + */ + if (likely(!ret)) userfaultfd_event_complete(ctx, uwq); - } - spin_unlock(&ctx->event_wqh.lock); + } else { + /* + * Here the fork thread aborted and the + * refcount from the fork thread on fork_nctx + * has already been released. We still hold + * the reference we took before releasing the + * lock above. If resolve_userfault_fork + * failed we've to drop it because the + * fork_nctx has to be freed in such case. If + * it succeeded we'll hold it because the new + * uffd references it. 
+ */ + if (ret) + userfaultfd_ctx_put(fork_nctx); } + spin_unlock(&ctx->event_wqh.lock); } return ret; From c2315c187fa0d3ab363fdebe22718170b40473e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:42 -0700 Subject: [PATCH 1245/1322] exec: load_script: kill the onstack interp[BINPRM_BUF_SIZE] array Patch series "exec: binfmt_misc: fix use-after-free, kill iname[BINPRM_BUF_SIZE]". It looks like this code was always wrong, then commit 948b701a607f ("binfmt_misc: add persistent opened binary handler for containers") added more problems. This patch (of 6): load_script() can simply use i_name instead, it points into bprm->buf[] and nobody can change this memory until we call prepare_binprm(). The only complication is that we need to also change the signature of bprm_change_interp() but this change looks good too. While at it, do whitespace/style cleanups. NOTE: the real motivation for this change is that people want to increase BINPRM_BUF_SIZE, we need to change load_misc_binary() too but this looks more complicated because afaics it is very buggy. Link: http://lkml.kernel.org/r/20170918163446.GA26793@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Travis Gummels Cc: Ben Woodard Cc: Jim Foraker Cc: Cc: Al Viro Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_script.c | 17 +++++++++-------- fs/exec.c | 2 +- include/linux/binfmts.h | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index afdf4e3cafc2..7cde3f46ad26 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm) const char *i_arg, *i_name; char *cp; struct file *file; - char interp[BINPRM_BUF_SIZE]; int retval; if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) @@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm) break; } for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); - if (*cp == '\0') + if (*cp == '\0') return -ENOEXEC; /* No interpreter name found */ i_name = cp; i_arg = NULL; @@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm) *cp++ = '\0'; if (*cp) i_arg = cp; - strcpy (interp, i_name); /* * OK, we've parsed out the interpreter name and * (optional) argument. @@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm) if (retval) return retval; retval = copy_strings_kernel(1, &bprm->interp, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; if (i_arg) { retval = copy_strings_kernel(1, &i_arg, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; } retval = copy_strings_kernel(1, &i_name, bprm); - if (retval) return retval; + if (retval) + return retval; bprm->argc++; - retval = bprm_change_interp(interp, bprm); + retval = bprm_change_interp(i_name, bprm); if (retval < 0) return retval; /* * OK, now restart the process with the interpreter's dentry. */ - file = open_exec(interp); + file = open_exec(i_name); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/exec.c b/fs/exec.c index ac34d9724684..5470d3c1892a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -int bprm_change_interp(char *interp, struct linux_binprm *bprm) +int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. 
*/ if (bprm->interp != bprm->filename) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index fb44d6180ca0..18d05b5491f3 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -131,7 +131,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location); -extern int bprm_change_interp(char *interp, struct linux_binprm *bprm); +extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm); extern int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm); extern int prepare_bprm_creds(struct linux_binprm *bprm); From baba1b29731c79d605100087b8f02f9e1cf5a344 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:45 -0700 Subject: [PATCH 1246/1322] exec: binfmt_misc: don't nullify Node->dentry in kill_node() kill_node() nullifies/checks Node->dentry to avoid double free. This complicates the next changes and this is very confusing: - we do not need to check dentry != NULL under entries_lock, kill_node() is always called under inode_lock(d_inode(root)) and we rely on this inode_lock() anyway, without this lock the MISC_FMT_OPEN_FILE cleanup could race with itself. - if kill_inode() was already called and ->dentry == NULL we should not even try to close e->interp_file. We can change bm_entry_write() to simply check !list_empty(list) before kill_node. Again, we rely on inode_lock(), in particular it saves us from the race with bm_status_write(), another caller of kill_node(). Link: http://lkml.kernel.org/r/20170922143641.GA17210@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ce7181ea60fa..6451e7520e05 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -603,11 +603,7 @@ static void kill_node(Node *e) struct dentry *dentry; write_lock(&entries_lock); - dentry = e->dentry; - if (dentry) { - list_del_init(&e->list); - e->dentry = NULL; - } + list_del_init(&e->list); write_unlock(&entries_lock); if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { @@ -615,12 +611,11 @@ static void kill_node(Node *e) e->interp_file = NULL; } - if (dentry) { - drop_nlink(d_inode(dentry)); - d_drop(dentry); - dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); - } + dentry = e->dentry; + drop_nlink(d_inode(dentry)); + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); } /* / */ @@ -665,7 +660,8 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, root = file_inode(file)->i_sb->s_root; inode_lock(d_inode(root)); - kill_node(e); + if (!list_empty(&e->list)) + kill_node(e); inode_unlock(d_inode(root)); break; @@ -794,7 +790,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, inode_lock(d_inode(root)); while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); + kill_node(list_first_entry(&entries, Node, list)); inode_unlock(d_inode(root)); break; From 83f918274e4b841d6fb817861ea0c896fba0c179 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:48 -0700 Subject: [PATCH 1247/1322] exec: binfmt_misc: shift filp_close(interp_file) from kill_node() to bm_evict_inode() To ensure that 
load_misc_binary() can't use the partially destroyed Node, see also the next patch. The current logic looks wrong in any case, once we close interp_file it doesn't make any sense to delay kfree(inode->i_private), this Node is no longer valid. Even if the MISC_FMT_OPEN_FILE/interp_file checks were not racy (they are), load_misc_binary() should not try to reopen ->interpreter if MISC_FMT_OPEN_FILE is set but ->interp_file is NULL. And I can't understand why do we use filp_close(), not fput(). Link: http://lkml.kernel.org/r/20170922143644.GA17216@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 6451e7520e05..203598ccb40a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -594,8 +594,13 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { + Node *e = inode->i_private; + + if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) + filp_close(e->interp_file, NULL); + clear_inode(inode); - kfree(inode->i_private); + kfree(e); } static void kill_node(Node *e) @@ -606,11 +611,6 @@ static void kill_node(Node *e) list_del_init(&e->list); write_unlock(&entries_lock); - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { - filp_close(e->interp_file, NULL); - e->interp_file = NULL; - } - dentry = e->dentry; drop_nlink(d_inode(dentry)); d_drop(dentry); From eb23aa0317eb1f08e8d9d36b8753d42f03b32764 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:51 -0700 Subject: [PATCH 1248/1322] exec: binfmt_misc: remove the confusing e->interp_file != NULL checks If MISC_FMT_OPEN_FILE flag is set e->interp_file must be valid or we have a bug which should not be silently ignored. Link: http://lkml.kernel.org/r/20170922143647.GA17222@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 203598ccb40a..babfe7bd56f0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -205,7 +205,7 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto error; - if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) { + if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = filp_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); @@ -596,7 +596,7 @@ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) + if (e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); clear_inode(inode); From 43a4f2619038002f48c78698c42c05692d4b4eb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:55 -0700 Subject: [PATCH 1249/1322] exec: binfmt_misc: fix race between load_misc_binary() and kill_node() load_misc_binary() makes a local copy of fmt->interpreter under entries_lock to avoid the race with kill_node() but this is not enough; the whole Node can be freed after we drop entries_lock, not only the ->interpreter string. Add dget/dput(fmt->dentry) to ensure bm_evict_inode() can't destroy/free this Node. 
Link: http://lkml.kernel.org/r/20170922143650.GA17227@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Travis Gummels Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index babfe7bd56f0..f5f8c2541790 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -138,20 +138,23 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = -ENOEXEC; if (!enabled) - goto ret; + return retval; /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); - if (fmt) + if (fmt) { + dget(fmt->dentry); strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + } read_unlock(&entries_lock); if (!fmt) - goto ret; + return retval; /* Need to be able to load the file after exec */ + retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) - return -ENOENT; + goto ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); @@ -238,6 +241,7 @@ static int load_misc_binary(struct linux_binprm *bprm) goto error; ret: + dput(fmt->dentry); return retval; error: if (fd_binary > 0) From 50097f74934e3ec8fb1e6f3087568b958972817d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:58 -0700 Subject: [PATCH 1250/1322] exec: binfmt_misc: kill the onstack iname[BINPRM_BUF_SIZE] array After the previous change "fmt" can't go away, we can kill iname/iname_addr and use fmt->interpreter. Link: http://lkml.kernel.org/r/20170922143653.GA17232@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f5f8c2541790..2a46762def31 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -54,7 +54,7 @@ typedef struct { int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ - char *interpreter; /* filename of interpreter */ + const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; @@ -131,8 +131,6 @@ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; - char iname[BINPRM_BUF_SIZE]; - const char *iname_addr = iname; int retval; int fd_binary = -1; @@ -143,10 +141,8 @@ static int load_misc_binary(struct linux_binprm *bprm) /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); - if (fmt) { + if (fmt) dget(fmt->dentry); - strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); - } read_unlock(&entries_lock); if (!fmt) return retval; @@ -198,13 +194,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &iname_addr, bprm); + retval = copy_strings_kernel(1, &fmt->interpreter, bprm); if (retval < 0) goto error; bprm->argc++; /* Update interp in case binfmt_script needs it. 
*/ - retval = bprm_change_interp(iname, bprm); + retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto error; @@ -213,7 +209,7 @@ static int load_misc_binary(struct linux_binprm *bprm) if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { - interp_file = open_exec(iname); + interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) From 8cb5d7482810b7eb1bb05bf4f71bc93ce35e5896 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 16:16:01 -0700 Subject: [PATCH 1251/1322] lib/lz4: make arrays static const, reduces object code size Don't populate the read-only arrays dec32table and dec64table on the stack, instead make them both static const. Makes the object code smaller by over 10K bytes: Before: text data bss dec hex filename 31500 0 0 31500 7b0c lib/lz4/lz4_decompress.o After: text data bss dec hex filename 20237 176 0 20413 4fbd lib/lz4/lz4_decompress.o (gcc version 7.2.0 x86_64) Link: http://lkml.kernel.org/r/20170921221939.20820-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Christophe JAILLET Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Arnd Bergmann Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lz4/lz4_decompress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index bd3574312b82..141734d255e4 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -85,8 +85,8 @@ static FORCE_INLINE int LZ4_decompress_generic( const BYTE * const lowLimit = lowPrefix - dictSize; const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; - const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; - const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; + static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; + static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; const int safeDecode = (endOnInput == endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); From 7240767450d6d8380fb153e2998a1bb4ede7b029 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Oct 2017 16:16:04 -0700 Subject: [PATCH 1252/1322] include/linux/bitfield.h: remove 32bit from FIELD_GET comment block I do not see anything that restricts this macro to 32 bit width. Link: http://lkml.kernel.org/r/1505921975-23379-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Jakub Kicinski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 8b9d6fff002d..f2deb71958b2 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -92,7 +92,7 @@ /** * FIELD_GET() - extract a bitfield element * @_mask: shifted mask defining the field's length and position - * @_reg: 32bit value of entire bitfield + * @_reg: value of entire bitfield * * FIELD_GET() extracts the field specified by @_mask from the * bitfield passed in as @_reg by masking and shifting it down. From 3181c38e4df257852a0c0a53552fd5c869402886 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 3 Oct 2017 16:16:07 -0700 Subject: [PATCH 1253/1322] kernel/sysctl.c: remove duplicate UINT_MAX check on do_proc_douintvec_conv() do_proc_douintvec_conv() has two UINT_MAX checks, we can remove one. 
This has no functional changes other than fixing a compiler warning: kernel/sysctl.c:2190]: (warning) Identical condition '*lvalp>UINT_MAX', second condition is always false Fixes: 4f2fec00afa60 ("sysctl: simplify unsigned int support") Link: http://lkml.kernel.org/r/20170919072918.12066-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reported-by: David Binderman Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 423554ad3610..4da9e622471f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2186,8 +2186,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > UINT_MAX) - return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; From f80c7dab95a1f0f968acbafe4426ee9525b6f6ab Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 3 Oct 2017 16:16:10 -0700 Subject: [PATCH 1254/1322] mm: memcontrol: use vmalloc fallback for large kmem memcg arrays For quick per-memcg indexing, slab caches and list_lru structures maintain linear arrays of descriptors. As the number of concurrent memory cgroups in the system goes up, this requires large contiguous allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for every existing slab cache and list_lru, which can easily fail on loaded systems. E.g.: mkdir: page allocation failure: order:5, mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null) CPU: 1 PID: 6399 Comm: mkdir Not tainted 4.13.0-mm1-00065-g720bbe532b7c-dirty #481 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: ? __alloc_pages_direct_compact+0x4c/0x110 __alloc_pages_nodemask+0xf50/0x1430 alloc_pages_current+0x60/0xc0 kmalloc_order_trace+0x29/0x1b0 __kmalloc+0x1f4/0x320 memcg_update_all_list_lrus+0xca/0x2e0 mem_cgroup_css_alloc+0x612/0x670 cgroup_apply_control_enable+0x19e/0x360 cgroup_mkdir+0x322/0x490 kernfs_iop_mkdir+0x55/0x80 vfs_mkdir+0xd0/0x120 SyS_mkdirat+0x6c/0xe0 SyS_mkdir+0x14/0x20 entry_SYSCALL_64_fastpath+0x18/0xad Mem-Info: active_anon:2965 inactive_anon:19 isolated_anon:0 active_file:100270 inactive_file:98846 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 slab_reclaimable:7328 slab_unreclaimable:16402 mapped:771 shmem:52 pagetables:278 bounce:0 free:13718 free_pcp:0 free_cma:0 This output is from an artificial reproducer, but we have repeatedly observed order-7 failures in production in the Facebook fleet. These systems become useless as they cannot run more jobs, even though there is plenty of memory to allocate 128 individual pages. Use kvmalloc and kvzalloc to fall back to vmalloc space if these arrays prove too large for allocating them physically contiguous. 
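A minimal sketch of the fallback pattern the patch switches to (simplified from the mm/slab_common.c side, error paths elided):

	struct memcg_cache_array *arr;
	size_t size = sizeof(*arr) + memcg_nr_cache_ids * sizeof(void *);

	arr = kvzalloc(size, GFP_KERNEL);	/* kmalloc first, vmalloc fallback */
	if (!arr)
		return -ENOMEM;
	/* ... install the array ... */
	kvfree(arr);				/* correct for either backing */

One detail worth noting: kfree_rcu() ends in kfree(), which is wrong for a vmalloc-backed array, so the update path in the hunk below has to switch to an explicit call_rcu() callback that frees with kvfree().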
Link: http://lkml.kernel.org/r/20170918184919.20644-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Josef Bacik Acked-by: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 12 ++++++------ mm/slab_common.c | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 7a40fa2be858..f141f0c80ff3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -325,12 +325,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) { int size = memcg_nr_cache_ids; - nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); if (!nlru->memcg_lrus) return -ENOMEM; if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { - kfree(nlru->memcg_lrus); + kvfree(nlru->memcg_lrus); return -ENOMEM; } @@ -340,7 +340,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) { __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); - kfree(nlru->memcg_lrus); + kvfree(nlru->memcg_lrus); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -351,12 +351,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, BUG_ON(old_size > new_size); old = nlru->memcg_lrus; - new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; if (__memcg_init_list_lru_node(new, old_size, new_size)) { - kfree(new); + kvfree(new); return -ENOMEM; } @@ -373,7 +373,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, nlru->memcg_lrus = new; spin_unlock_irq(&nlru->lock); - kfree(old); + kvfree(old); return 0; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 904a83be82de..80164599ca5d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -165,9 +165,9 @@ static int init_memcg_params(struct kmem_cache *s, if (!memcg_nr_cache_ids) return 0; - arr = kzalloc(sizeof(struct memcg_cache_array) + - memcg_nr_cache_ids * sizeof(void *), - GFP_KERNEL); + arr = kvzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void *), + GFP_KERNEL); if (!arr) return -ENOMEM; @@ -178,15 +178,23 @@ static int init_memcg_params(struct kmem_cache *s, static void destroy_memcg_params(struct kmem_cache *s) { if (is_root_cache(s)) - kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); + kvfree(rcu_access_pointer(s->memcg_params.memcg_caches)); +} + +static void free_memcg_params(struct rcu_head *rcu) +{ + struct memcg_cache_array *old; + + old = container_of(rcu, struct memcg_cache_array, rcu); + kvfree(old); } static int update_memcg_params(struct kmem_cache *s, int new_array_size) { struct memcg_cache_array *old, *new; - new = kzalloc(sizeof(struct memcg_cache_array) + - new_array_size * sizeof(void *), GFP_KERNEL); + new = kvzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; @@ -198,7 +206,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size) rcu_assign_pointer(s->memcg_params.memcg_caches, new); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, free_memcg_params); return 0; } From a70e43a59de9316e6fbad3b65557d0a24c099aca Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 3 Oct 2017 16:16:13 -0700 Subject: [PATCH 1255/1322] lib/idr.c: fix comment for idr_replace() idr_replace() returns the old value 
on success, not 0. Link: http://lkml.kernel.org/r/20170918162642.37511-1-ebiggers3@gmail.com Signed-off-by: Eric Biggers Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index f9adf4805fd7..edd9b2be1651 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -146,8 +146,8 @@ EXPORT_SYMBOL(idr_get_next_ext); * idr_alloc() and idr_remove() (as long as the ID being removed is not * the one being replaced!). * - * Returns: 0 on success. %-ENOENT indicates that @id was not found. - * %-EINVAL indicates that @id or @ptr were not valid. + * Returns: the old value on success. %-ENOENT indicates that @id was not + * found. %-EINVAL indicates that @id or @ptr were not valid. */ void *idr_replace(struct idr *idr, void *ptr, int id) { From f64ac5e6e30668216cf489d73ba8a96e372d78c6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:16 -0700 Subject: [PATCH 1256/1322] mm, memory_hotplug: add scheduling point to __add_pages Patch series "mm, memory_hotplug: fix few soft lockups in memory hotadd". Johannes has noticed few soft lockups when adding a large nvdimm device. All of them were caused by a long loop without any explicit cond_resched which is a problem for !PREEMPT kernels. The fix is quite straightforward. Just make sure that cond_resched gets called from time to time. This patch (of 3): __add_pages gets a pfn range to add and there is no upper bound for a single call. This is usually a memory block aligned size for the regular memory hotplug - smaller sizes are usual for memory balloning drivers, or the whole NUMA node for physical memory online. There is no explicit scheduling point in that code path though. This can lead to long latencies while __add_pages is executed and we have even seen a soft lockup report during nvdimm initialization with !PREEMPT kernel NMI watchdog: BUG: soft lockup - CPU#11 stuck for 23s! [kworker/u641:3:832] [...] Workqueue: events_unbound async_run_entry_fn task: ffff881809270f40 ti: ffff881809274000 task.ti: ffff881809274000 RIP: _raw_spin_unlock_irqrestore+0x11/0x20 RSP: 0018:ffff881809277b10 EFLAGS: 00000286 [...] Call Trace: sparse_add_one_section+0x13d/0x18e __add_pages+0x10a/0x1d0 arch_add_memory+0x4a/0xc0 devm_memremap_pages+0x29d/0x430 pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 DWARF2 unwinder stuck at ret_from_fork+0x3f/0x70 Fix this by adding cond_resched once per each memory section in the given pfn range. Each section is constant amount of work which itself is not too expensive but many of them will just add up. 
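The shape of the fix is the standard one for long loops on !PREEMPT kernels: an explicit scheduling point once per bounded unit of work. Roughly (a simplified sketch of the __add_pages() loop):

	unsigned long i;
	int err = 0;

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
		if (err && (err != -EEXIST))
			break;		/* anything other than "already present" is fatal */
		err = 0;
		cond_resched();		/* one section is a bounded amount of work */
	}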
Link: http://lkml.kernel.org/r/20170918121410.24466-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e882cb6da994..23d5bd968950 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -328,6 +328,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, if (err && (err != -EEXIST)) break; err = 0; + cond_resched(); } vmemmap_populate_print_last(); out: From 9b6e63cbf85b89b2dbffa4955dbf2df8250e5375 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:19 -0700 Subject: [PATCH 1257/1322] mm, page_alloc: add scheduling point to memmap_init_zone memmap_init_zone gets a pfn range to initialize and it can be really large resulting in a soft lockup on non-preemptible kernels NMI watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [kworker/u642:5:1720] [...] task: ffff88ecd7e902c0 ti: ffff88eca4e50000 task.ti: ffff88eca4e50000 RIP: move_pfn_range_to_zone+0x185/0x1d0 [...] Call Trace: devm_memremap_pages+0x2c7/0x430 pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 Fix this by adding a scheduling point once per page block. Link: http://lkml.kernel.org/r/20170918121410.24466-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 38d165a87860..77e4d3c5c57b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5367,6 +5367,7 @@ not_early: __init_single_page(page, pfn, zone, nid); set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); } else { __init_single_pfn(pfn, zone, nid); } From 1fdcce6e16c54facc4f0688630d3b9ecfcaa411f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 3 Oct 2017 16:16:23 -0700 Subject: [PATCH 1258/1322] memremap: add scheduling point to devm_memremap_pages devm_memremap_pages is initializing struct pages in for_each_device_pfn and that can take quite some time. We have even seen a soft lockup triggering on a non preemptive kernel NMI watchdog: BUG: soft lockup - CPU#61 stuck for 22s! [kworker/u641:11:1808] [...] RIP: 0010:[] [] devm_memremap_pages+0x327/0x430 [...] Call Trace: pmem_attach_disk+0x2fd/0x3f0 [nd_pmem] nvdimm_bus_probe+0x64/0x110 [libnvdimm] driver_probe_device+0x1f7/0x420 bus_for_each_drv+0x52/0x80 __device_attach+0xb0/0x130 bus_probe_device+0x87/0xa0 device_add+0x3fc/0x5f0 nd_async_device_register+0xe/0x40 [libnvdimm] async_run_entry_fn+0x43/0x150 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xc7/0xe0 ret_from_fork+0x3f/0x70 fix this by adding cond_resched every 1024 pages. 
Link: http://lkml.kernel.org/r/20170918121410.24466-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; - int error, nid, is_ram; + int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, list_del(&page->lru); page->pgmap = pgmap; percpu_ref_get(ref); + if (!(++i % 1024)) + cond_resched(); } devres_add(dev, page_map); return __va(res->start); From c9653850c90d6050197bc76ba672e00a7771aea5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 3 Oct 2017 16:16:26 -0700 Subject: [PATCH 1259/1322] kernel/kcmp.c: drop branch leftover typo The else branch been left over and escaped the source code refresh. Not a problem but better clean it up. Fixes: 0791e3644e5e ("kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files") Link: http://lkml.kernel.org/r/20170917165838.GA1887@uranus.lan Reported-by: Eugene Syromiatnikov Signed-off-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..055bb2962a0b 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1, if (filp_epoll) { filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); - } else + } if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); From 1dd2bfc86818ddbc95f98e312e7704350223fd7d Mon Sep 17 00:00:00 2001 From: YASUAKI ISHIMATSU Date: Tue, 3 Oct 2017 16:16:29 -0700 Subject: [PATCH 1260/1322] mm/memory_hotplug: change pfn_to_section_nr/section_nr_to_pfn macro to inline function pfn_to_section_nr() and section_nr_to_pfn() are defined as macro. pfn_to_section_nr() has no issue even if it is defined as macro. But section_nr_to_pfn() has overflow issue if sec is defined as int. section_nr_to_pfn() just shifts sec by PFN_SECTION_SHIFT. If sec is defined as unsigned long, section_nr_to_pfn() returns pfn as 64 bit value. But if sec is defined as int, section_nr_to_pfn() returns pfn as 32 bit value. __remove_section() calculates start_pfn using section_nr_to_pfn() and scn_nr defined as int. So if hot-removed memory address is over 16TB, overflow issue occurs and section_nr_to_pfn() does not calculate correct pfn. To make callers use proper arg, the patch changes the macros to inline functions. 
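The truncation is easy to see in isolation. A small userspace illustration (PFN_SECTION_SHIFT taken as 15, its typical value with 4KB pages and 128MB sections; the exact shift is architecture dependent):

#include <stdio.h>

#define PFN_SECTION_SHIFT 15
#define section_nr_to_pfn_macro(sec)	((sec) << PFN_SECTION_SHIFT)	/* old style */

static inline unsigned long section_nr_to_pfn(unsigned long sec)	/* new style */
{
	return sec << PFN_SECTION_SHIFT;
}

int main(void)
{
	int scn_nr = 1 << 17;	/* section number at the 16TB boundary */

	/* the int shift overflows; on a typical LP64 build this prints 0 */
	printf("macro : %#lx\n", (unsigned long)section_nr_to_pfn_macro(scn_nr));
	printf("inline: %#lx\n", section_nr_to_pfn(scn_nr));
	return 0;
}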
Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory") Link: http://lkml.kernel.org/r/e643a387-e573-6bbf-d418-c60c8ee3d15e@gmail.com Signed-off-by: Yasuaki Ishimatsu Acked-by: Michal Hocko Cc: Xishi Qiu Cc: Reza Arbab Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 ++++++++-- mm/memory_hotplug.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 356a814e7c8e..c8f89417740b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1094,8 +1094,14 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif -#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) -#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) +static inline unsigned long pfn_to_section_nr(unsigned long pfn) +{ + return pfn >> PFN_SECTION_SHIFT; +} +static inline unsigned long section_nr_to_pfn(unsigned long sec) +{ + return sec << PFN_SECTION_SHIFT; +} #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 23d5bd968950..efd1ad37bb57 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -551,7 +551,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, return ret; scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn(scn_nr); + start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); sparse_remove_one_section(zone, ms, map_offset); From d09b0137d204bebeaafed672bc5a244e9ac92edb Mon Sep 17 00:00:00 2001 From: YASUAKI ISHIMATSU Date: Tue, 3 Oct 2017 16:16:32 -0700 Subject: [PATCH 1261/1322] mm/memory_hotplug: define find_{smallest|biggest}_section_pfn as unsigned long find_{smallest|biggest}_section_pfn()s find the smallest/biggest section and return the pfn of the section. But the functions are defined as int. So the functions always return 0x00000000 - 0xffffffff. It means if memory address is over 16TB, the functions does not work correctly. To handle 64 bit value, the patch defines find_{smallest|biggest}_section_pfn() as unsigned long. Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory") Link: http://lkml.kernel.org/r/d9d5593a-d0a4-c4be-ab08-493df59a85c6@gmail.com Signed-off-by: Yasuaki Ishimatsu Acked-by: Michal Hocko Cc: Xishi Qiu Cc: Reza Arbab Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index efd1ad37bb57..d4b5f29906b9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(__add_pages); #ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ -static int find_smallest_section_pfn(int nid, struct zone *zone, +static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { @@ -363,7 +363,7 @@ static int find_smallest_section_pfn(int nid, struct zone *zone, } /* find the biggest valid pfn in the range [start_pfn, end_pfn). 
*/ -static int find_biggest_section_pfn(int nid, struct zone *zone, +static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { From 90ceb2a3ad868f800eb1c9f4ede650daddd94b77 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:35 -0700 Subject: [PATCH 1262/1322] kernel/params.c: fix the maximum length in param_get_string The length parameter of strlcpy() is supposed to reflect the size of the target buffer, not of the source string. Harmless in this case as the buffer is PAGE_SIZE long and the source string is always much shorter than this, but conceptually wrong, so let's fix it. Link: http://lkml.kernel.org/r/20170928162515.24846b4f@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/params.c b/kernel/params.c index 1cd8f1a895a8..8283ba045f4f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); + return strlcpy(buffer, kps->string, PAGE_SIZE); } EXPORT_SYMBOL(param_get_string); From 96802e6b1dbf29d3012b39503c5dd6d9d8e82955 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:38 -0700 Subject: [PATCH 1263/1322] kernel/params.c: fix an overflow in param_attr_show Function param_attr_show could overflow the buffer it is operating on. The buffer size is PAGE_SIZE, and the string returned by attribute->param->ops->get is generated by scnprintf(buffer, PAGE_SIZE, ...) so it could be PAGE_SIZE - 1 long, with the terminating '\0' at the very end of the buffer. Calling strcat(..., "\n") on this isn't safe, as the '\0' will be replaced by '\n' (OK) and then another '\0' will be added past the end of the buffer (not OK.) Simply add the trailing '\n' when writing the attribute contents to the buffer originally. This is safe, and also faster. Credits to Teradata for discovering this issue. Link: http://lkml.kernel.org/r/20170928162602.60c379c7@endymion Signed-off-by: Jean Delvare Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 8283ba045f4f..0cca488263dd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -224,7 +224,7 @@ char *parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return scnprintf(buffer, PAGE_SIZE, format, \ + return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ @@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); + return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 
'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); @@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { + /* Replace \n with comma */ if (i) - buffer[off++] = ','; + buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); @@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, PAGE_SIZE); + return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); @@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr, kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } return count; } From e0596c80f442d1e1221c17dbb71b2aed43909221 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 3 Oct 2017 16:16:41 -0700 Subject: [PATCH 1264/1322] kernel/params.c: improve STANDARD_PARAM_DEF readability Align the parameters passed to STANDARD_PARAM_DEF for clarity. Link: http://lkml.kernel.org/r/20170928162728.756143cc@endymion Signed-off-by: Jean Delvare Suggested-by: Ingo Molnar Acked-by: Ingo Molnar Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 0cca488263dd..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -236,14 +236,14 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { From 656d61ce9666209c4c4a13c71902d3ee70d1ff6f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 3 Oct 2017 16:16:45 -0700 Subject: [PATCH 1265/1322] lib/ratelimit.c: use deferred printk() version printk_ratelimit() invokes ___ratelimit() which may invoke a normal printk() (pr_warn() in this particular case) to warn about suppressed output. 
Given that printk_ratelimit() may be called from anywhere, that pr_warn() is dangerous - it may end up deadlocking the system. Fix ___ratelimit() by using deferred printk(). Sasha reported the following lockdep error: : Unregister pv shared memory for cpu 8 : select_fallback_rq: 3 callbacks suppressed : process 8583 (trinity-c78) no longer affine to cpu8 : : ====================================================== : WARNING: possible circular locking dependency detected : 4.14.0-rc2-next-20170927+ #252 Not tainted : ------------------------------------------------------ : migration/8/62 is trying to acquire lock: : (&port_lock_key){-.-.}, at: serial8250_console_write() : : but task is already holding lock: : (&rq->lock){-.-.}, at: sched_cpu_dying() : : which lock already depends on the new lock. : : : the existing dependency chain (in reverse order) is: : : -> #3 (&rq->lock){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock() : task_fork_fair() : sched_fork() : copy_process.part.31() : _do_fork() : kernel_thread() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #2 (&p->pi_lock){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : try_to_wake_up() : default_wake_function() : woken_wake_function() : __wake_up_common() : __wake_up_common_lock() : __wake_up() : tty_wakeup() : tty_port_default_wakeup() : tty_port_tty_wakeup() : uart_write_wakeup() : serial8250_tx_chars() : serial8250_handle_irq.part.25() : serial8250_default_handle_irq() : serial8250_interrupt() : __handle_irq_event_percpu() : handle_irq_event_percpu() : handle_irq_event() : handle_level_irq() : handle_irq() : do_IRQ() : ret_from_intr() : native_safe_halt() : default_idle() : arch_cpu_idle() : default_idle_call() : do_idle() : cpu_startup_entry() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #1 (&tty->write_wait){-.-.}: : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : __wake_up_common_lock() : __wake_up() : tty_wakeup() : tty_port_default_wakeup() : tty_port_tty_wakeup() : uart_write_wakeup() : serial8250_tx_chars() : serial8250_handle_irq.part.25() : serial8250_default_handle_irq() : serial8250_interrupt() : __handle_irq_event_percpu() : handle_irq_event_percpu() : handle_irq_event() : handle_level_irq() : handle_irq() : do_IRQ() : ret_from_intr() : native_safe_halt() : default_idle() : arch_cpu_idle() : default_idle_call() : do_idle() : cpu_startup_entry() : rest_init() : start_kernel() : x86_64_start_reservations() : x86_64_start_kernel() : verify_cpu() : : -> #0 (&port_lock_key){-.-.}: : check_prev_add() : __lock_acquire() : lock_acquire() : _raw_spin_lock_irqsave() : serial8250_console_write() : univ8250_console_write() : console_unlock() : vprintk_emit() : vprintk_default() : vprintk_func() : printk() : ___ratelimit() : __printk_ratelimit() : select_fallback_rq() : sched_cpu_dying() : cpuhp_invoke_callback() : take_cpu_down() : multi_cpu_stop() : cpu_stopper_thread() : smpboot_thread_fn() : kthread() : ret_from_fork() : : other info that might help us debug this: : : Chain exists of: : &port_lock_key --> &p->pi_lock --> &rq->lock : : Possible unsafe locking scenario: : : CPU0 CPU1 : ---- ---- : lock(&rq->lock); : lock(&p->pi_lock); : lock(&rq->lock); : lock(&port_lock_key); : : *** DEADLOCK *** : : 4 locks held by migration/8/62: : #0: (&p->pi_lock){-.-.}, at: sched_cpu_dying() : #1: (&rq->lock){-.-.}, at: sched_cpu_dying() : #2: 
(printk_ratelimit_state.lock){....}, at: ___ratelimit() : #3: (console_lock){+.+.}, at: vprintk_emit() : : stack backtrace: : CPU: 8 PID: 62 Comm: migration/8 Not tainted 4.14.0-rc2-next-20170927+ #252 : Call Trace: : dump_stack() : print_circular_bug() : check_prev_add() : ? add_lock_to_list.isra.26() : ? check_usage() : ? kvm_clock_read() : ? kvm_sched_clock_read() : ? sched_clock() : ? check_preemption_disabled() : __lock_acquire() : ? __lock_acquire() : ? add_lock_to_list.isra.26() : ? debug_check_no_locks_freed() : ? memcpy() : lock_acquire() : ? serial8250_console_write() : _raw_spin_lock_irqsave() : ? serial8250_console_write() : serial8250_console_write() : ? serial8250_start_tx() : ? lock_acquire() : ? memcpy() : univ8250_console_write() : console_unlock() : ? __down_trylock_console_sem() : vprintk_emit() : vprintk_default() : vprintk_func() : printk() : ? show_regs_print_info() : ? lock_acquire() : ___ratelimit() : __printk_ratelimit() : select_fallback_rq() : sched_cpu_dying() : ? sched_cpu_starting() : ? rcutree_dying_cpu() : ? sched_cpu_starting() : cpuhp_invoke_callback() : ? cpu_disable_common() : take_cpu_down() : ? trace_hardirqs_off_caller() : ? cpuhp_invoke_callback() : multi_cpu_stop() : ? __this_cpu_preempt_check() : ? cpu_stop_queue_work() : cpu_stopper_thread() : ? cpu_stop_create() : smpboot_thread_fn() : ? sort_range() : ? schedule() : ? __kthread_parkme() : kthread() : ? sort_range() : ? kthread_create_on_node() : ret_from_fork() : process 9121 (trinity-c78) no longer affine to cpu8 : smpboot: CPU 8 is now offline Link: http://lkml.kernel.org/r/20170928120405.18273-1-sergey.senozhatsky@gmail.com Fixes: 6b1d174b0c27b ("ratelimit: extend to print suppressed messages on release") Signed-off-by: Sergey Senozhatsky Reported-by: Sasha Levin Reviewed-by: Petr Mladek Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ratelimit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 08f8043cac61..d01f47135239 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -48,7 +48,9 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (time_is_before_jiffies(rs->begin + rs->interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { - pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + printk_deferred(KERN_WARNING + "%s: %d callbacks suppressed\n", + func, rs->missed); rs->missed = 0; } } From d22e3d69ee1a3f83ff7cc943af63de48b6156dcf Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:16:49 -0700 Subject: [PATCH 1266/1322] m32r: fix build failure The allmodconfig build of m32r is failing with the error: lib/mpi/mpih-div.o: In function 'mpihelp_divrem': mpih-div.c:(.text+0x40): undefined reference to 'abort' mpih-div.c:(.text+0x40): relocation truncated to fit: R_M32R_26_PCREL_RELA against undefined symbol 'abort' The function 'abort' was never defined for the m32r architecture. Create 'abort' as is done in other arch like 'arm' and 'unicore32'. 
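For reference, the arm/unicore32 pattern the commit message points to is roughly the sketch below: trap via BUG(), fall back to panic() if that returns, and export the symbol so modules that end up referencing abort() can also resolve it. This mirrors the m32r hunk in the diff that follows rather than describing anything new.

  #include <linux/bug.h>
  #include <linux/kernel.h>
  #include <linux/export.h>

  void abort(void)
  {
          BUG();

          /* if that doesn't kill us, halt */
          panic("Oops failed to kill thread");
  }
  EXPORT_SYMBOL(abort);
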
Link: http://lkml.kernel.org/r/1506727220-6108-1-git-send-email-sudip.mukherjee@codethink.co.uk Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/kernel/traps.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index 647dd94a0c39..72b96f282689 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -114,6 +114,15 @@ static void set_eit_vector_entries(void) _flush_cache_copyback_all(); } +void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} +EXPORT_SYMBOL(abort); + void __init trap_init(void) { set_eit_vector_entries(); From a08ffbef4ab799351d0610bbdbeaa1ee746b9065 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 3 Oct 2017 16:16:51 -0700 Subject: [PATCH 1267/1322] checkpatch: fix ignoring cover-letter logic Currently running checkpatch on a directory with a cover-letter.patch file reports the following error: ----------------------------------------- patches/smp-v2/v2-0000-cover-letter.patch ----------------------------------------- ERROR: Does not appear to be a unified-diff format patch The logic to suppress the unified-diff check for cover letters is there but is checking $file instead of $filename. Fix the variable to use the correct one. Link: http://lkml.kernel.org/r/20170909090406.31523-1-shorne@gmail.com Signed-off-by: Stafford Horne Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index dd2c262aebbf..8b80bac055e4 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6390,7 +6390,7 @@ sub process { exit(0); } - if (!$is_patch && $file !~ /cover-letter\.patch$/) { + if (!$is_patch && $filename !~ /cover-letter\.patch$/) { ERROR("NOT_UNIFIED_DIFF", "Does not appear to be a unified-diff format patch\n"); } From 32e57c29e3c038ac802b7cc214a8795a4234055f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 3 Oct 2017 16:16:54 -0700 Subject: [PATCH 1268/1322] include/linux/fs.h: fix comment about struct address_space Before commit 9c5d760b8d22 ("mm: split gfp_mask and mapping flags into separate fields") the private_* fields of struct adrress_space were grouped together and using "ditto" in comments describing the last fields was correct. With introduction of gpf_mask between private_lock and private_list "ditto" references the wrong description. Fix it by using the elaborate description. 
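To make the mixup concrete, here is an abridged, purely illustrative excerpt (the struct name is invented; see include/linux/fs.h for the real struct address_space): once gfp_mask sits between the two private_* members, a "ditto" on private_list reads as "implicit gfp mask for allocations" instead of "for use by the address_space".

  #include <linux/spinlock.h>
  #include <linux/list.h>
  #include <linux/gfp.h>

  /* Abridged illustration only; not the real layout of struct address_space. */
  struct address_space_excerpt {
          spinlock_t       private_lock;  /* for use by the address_space */
          gfp_t            gfp_mask;      /* implicit gfp mask for allocations */
          struct list_head private_list;  /* was "ditto" -- now ambiguous */
          void            *private_data;  /* ditto */
  };
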
Link: http://lkml.kernel.org/r/1507009987-8746-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 339e73742e73..13dab191a23e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -403,7 +403,7 @@ struct address_space { unsigned long flags; /* error bits */ spinlock_t private_lock; /* for use by the address_space */ gfp_t gfp_mask; /* implicit gfp mask for allocations */ - struct list_head private_list; /* ditto */ + struct list_head private_list; /* for use by the address_space */ void *private_data; /* ditto */ errseq_t wb_err; } __attribute__((aligned(sizeof(long)))) __randomize_layout; From e4c77f8b9b213c6315faba109c03b0db873db200 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Sep 2017 17:47:50 +0200 Subject: [PATCH 1269/1322] ARM: defconfig: FRAMEBUFFER_CONSOLE can no longer be =m It is no longer possible to load this at runtime, so let's change the few remaining users to have it built-in all the time. arch/arm/configs/zeus_defconfig:115:warning: symbol value 'm' invalid for FRAMEBUFFER_CONSOLE arch/arm/configs/viper_defconfig:116:warning: symbol value 'm' invalid for FRAMEBUFFER_CONSOLE arch/arm/configs/pxa_defconfig:474:warning: symbol value 'm' invalid for FRAMEBUFFER_CONSOLE Reported-by: kernelci.org bot Fixes: 6104c37094e7 ("fbcon: Make fbcon a built-time depency for fbdev") Signed-off-by: Arnd Bergmann Signed-off-by: Olof Johansson --- arch/arm/configs/pxa_defconfig | 2 +- arch/arm/configs/viper_defconfig | 2 +- arch/arm/configs/zeus_defconfig | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index 64e3a2a8cede..d5e1370ec303 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -471,7 +471,7 @@ CONFIG_LCD_PLATFORM=m CONFIG_LCD_TOSA=m CONFIG_BACKLIGHT_PWM=m CONFIG_BACKLIGHT_TOSA=m -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y CONFIG_LOGO=y CONFIG_SOUND=m diff --git a/arch/arm/configs/viper_defconfig b/arch/arm/configs/viper_defconfig index 44d4fa57ba0a..070e5074f1ee 100644 --- a/arch/arm/configs/viper_defconfig +++ b/arch/arm/configs/viper_defconfig @@ -113,7 +113,7 @@ CONFIG_FB_PXA_PARAMETERS=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BACKLIGHT_PWM=m # CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m diff --git a/arch/arm/configs/zeus_defconfig b/arch/arm/configs/zeus_defconfig index 8d4c0c926c34..09e7050d5653 100644 --- a/arch/arm/configs/zeus_defconfig +++ b/arch/arm/configs/zeus_defconfig @@ -112,7 +112,7 @@ CONFIG_FB_PXA=m CONFIG_FB_PXA_PARAMETERS=y CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y CONFIG_SOUND=m CONFIG_SND=m From 0694b2ee87ee1a6d83acf1a66b92c8e64ceb38f2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 17 Sep 2017 16:26:18 +0200 Subject: [PATCH 1270/1322] ARM: defconfig: update Gemini defconfig This updates the Gemini defconfig with drivers merged for v4.13 or v4.14: - ATA driver is merged - DMA driver is merged - RTC driver gets selected from default Kconfig Signed-off-by: Linus Walleij Signed-off-by: Olof Johansson --- arch/arm/configs/gemini_defconfig | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/arch/arm/configs/gemini_defconfig b/arch/arm/configs/gemini_defconfig index d2d75fa664a6..2a63fa10c813 100644 --- a/arch/arm/configs/gemini_defconfig +++ b/arch/arm/configs/gemini_defconfig @@ -32,6 +32,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_LOWLEVEL is not set CONFIG_ATA=y +CONFIG_PATA_FTIDE010=y CONFIG_INPUT_EVDEV=y CONFIG_KEYBOARD_GPIO=y # CONFIG_INPUT_MOUSE is not set @@ -55,8 +56,8 @@ CONFIG_LEDS_GPIO=y CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GEMINI=y CONFIG_DMADEVICES=y +CONFIG_AMBA_PL08X=y # CONFIG_DNOTIFY is not set CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y From acd669a8f67ed47f5edd385741486cc7a259a446 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Tue, 3 Oct 2017 11:10:53 +0530 Subject: [PATCH 1271/1322] cxgb4: add new T6 pci device id's Add 0x6085 T6 device id. Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 37d90d63e4a3..633e9751a25e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -202,6 +202,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x6082), /* Custom T6225-CR SFP28 */ CH_PCI_ID_TABLE_FENTRY(0x6083), /* Custom T62100-CR QSFP28 */ CH_PCI_ID_TABLE_FENTRY(0x6084), /* Custom T64100-CR QSFP28 */ + CH_PCI_ID_TABLE_FENTRY(0x6085), /* Custom T6240-SO */ CH_PCI_DEVICE_ID_TABLE_DEFINE_END; #endif /* __T4_PCI_ID_TBL_H__ */ From 57e7ba04d422c3d41c8426380303ec9b7533ded9 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 19 Sep 2017 09:39:08 -0700 Subject: [PATCH 1272/1322] lsm: fix smack_inode_removexattr and xattr_getsecurity memleak security_inode_getsecurity() provides the text string value of a security attribute. It does not provide a "secctx". The code in xattr_getsecurity() that calls security_inode_getsecurity() and then calls security_release_secctx() happened to work because SElinux and Smack treat the attribute and the secctx the same way. It fails for cap_inode_getsecurity(), because that module has no secctx that ever needs releasing. It turns out that Smack is the one that's doing things wrong by not allocating memory when instructed to do so by the "alloc" parameter. The fix is simple enough. Change the security_release_secctx() to kfree() because it isn't a secctx being returned by security_inode_getsecurity(). Change Smack to allocate the string when told to do so. Note: this also fixes memory leaks for LSMs which implement inode_getsecurity but not release_secctx, such as capabilities. 
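A minimal sketch of the contract the fix restores (the function name and label value below are invented for illustration; this is not the Smack code): when "alloc" is true, an inode_getsecurity implementation returns a kmalloc'd copy that the caller owns and later releases with kfree(), which is why xattr_getsecurity() must not pair it with security_release_secctx().

  #include <linux/fs.h>
  #include <linux/slab.h>
  #include <linux/string.h>

  /* Illustrative only: shows the "alloc" contract, not a real LSM hook. */
  int example_inode_getsecurity(struct inode *inode, const char *name,
                                void **buffer, bool alloc)
  {
          const char *label = "example-label";    /* stand-in for the stored label */

          if (alloc) {
                  *buffer = kstrdup(label, GFP_KERNEL);   /* caller frees with kfree() */
                  if (*buffer == NULL)
                          return -ENOMEM;
          }
          return strlen(label);
  }
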
Signed-off-by: Casey Schaufler Reported-by: Konstantin Khlebnikov Cc: stable@vger.kernel.org Signed-off-by: James Morris --- fs/xattr.c | 2 +- security/smack/smack_lsm.c | 59 +++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 4424f7fecf14..61cd28ba25f3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -250,7 +250,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value, } memcpy(value, buffer, len); out: - security_release_secctx(buffer, len); + kfree(buffer); out_noalloc: return len; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 319add31b4a4..286171a16ed2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1473,7 +1473,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) * @inode: the object * @name: attribute name * @buffer: where to put the result - * @alloc: unused + * @alloc: duplicate memory * * Returns the size of the attribute or an error code */ @@ -1486,43 +1486,38 @@ static int smack_inode_getsecurity(struct inode *inode, struct super_block *sbp; struct inode *ip = (struct inode *)inode; struct smack_known *isp; - int ilen; - int rc = 0; - if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) { + if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) isp = smk_of_inode(inode); - ilen = strlen(isp->smk_known); - *buffer = isp->smk_known; - return ilen; + else { + /* + * The rest of the Smack xattrs are only on sockets. + */ + sbp = ip->i_sb; + if (sbp->s_magic != SOCKFS_MAGIC) + return -EOPNOTSUPP; + + sock = SOCKET_I(ip); + if (sock == NULL || sock->sk == NULL) + return -EOPNOTSUPP; + + ssp = sock->sk->sk_security; + + if (strcmp(name, XATTR_SMACK_IPIN) == 0) + isp = ssp->smk_in; + else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) + isp = ssp->smk_out; + else + return -EOPNOTSUPP; } - /* - * The rest of the Smack xattrs are only on sockets. - */ - sbp = ip->i_sb; - if (sbp->s_magic != SOCKFS_MAGIC) - return -EOPNOTSUPP; - - sock = SOCKET_I(ip); - if (sock == NULL || sock->sk == NULL) - return -EOPNOTSUPP; - - ssp = sock->sk->sk_security; - - if (strcmp(name, XATTR_SMACK_IPIN) == 0) - isp = ssp->smk_in; - else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) - isp = ssp->smk_out; - else - return -EOPNOTSUPP; - - ilen = strlen(isp->smk_known); - if (rc == 0) { - *buffer = isp->smk_known; - rc = ilen; + if (alloc) { + *buffer = kstrdup(isp->smk_known, GFP_KERNEL); + if (*buffer == NULL) + return -ENOMEM; } - return rc; + return strlen(isp->smk_known); } From 4edd8121e555acbee63578abeaf73026d055bbb4 Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Wed, 4 Oct 2017 11:42:00 +0200 Subject: [PATCH 1273/1322] ARM: dts: stm32: Fix STMPE1600 binding on stm32429i-eval board To declare gpio interrupt line for STMPE1600, 2 possibilities are offered: -use gpio binding (and then the gpiolib interface inside driver) -use interrupt binding as each gpio-controller are also interrupt controller on stm32f429. In STMPE 1600 node both (gpio and interrupt) bindings are defined. This patch fixes this issue and use only interrupt binding. 
Fixes: c04b2e72af8d ("ARM: dts: stm32: Enable STMPE1600 gpio expander of STM32F429-EVAL board") Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32429i-eval.dts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/stm32429i-eval.dts b/arch/arm/boot/dts/stm32429i-eval.dts index 97b1c2321ba9..5bdb90b2ae72 100644 --- a/arch/arm/boot/dts/stm32429i-eval.dts +++ b/arch/arm/boot/dts/stm32429i-eval.dts @@ -202,10 +202,8 @@ stmpe1600: stmpe1600@42 { compatible = "st,stmpe1600"; reg = <0x42>; - irq-gpio = <&gpioi 8 0>; - irq-trigger = <3>; interrupts = <8 3>; - interrupt-parent = <&exti>; + interrupt-parent = <&gpioi>; interrupt-controller; wakeup-source; From 2aaae13a9db7897a007c5d7bb46cacfb37dffacd Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Wed, 4 Oct 2017 15:34:48 +0200 Subject: [PATCH 1274/1322] ARM: dts: stm32: use right pinctrl compatible for stm32f469 Currently, same stm32f429-pinctrl driver is used for stm32f429 and stm32f469. As pin map is different between those 2 MCUs, a stm32f469-pinctrl driver has been recently added. This patch -allows to use stm32f469-pinctrl driver for stm32f469 boards -reworks stm32 devicetree files to fit with stm32f429 / stm32f469 In the same time it fixes an issue when only MACH_STM32F469 flag is selected in menuconfig. Fixes: d28bcd53fa90 ("ARM: stm32: Introduce MACH_STM32F469 flag") Reported-by: Nicolas Pitre Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32429i-eval.dts | 1 + arch/arm/boot/dts/stm32f4-pinctrl.dtsi | 343 +++++++++++++++++++++++ arch/arm/boot/dts/stm32f429-disco.dts | 1 + arch/arm/boot/dts/stm32f429-pinctrl.dtsi | 95 +++++++ arch/arm/boot/dts/stm32f429.dtsi | 297 -------------------- arch/arm/boot/dts/stm32f469-disco.dts | 1 + arch/arm/boot/dts/stm32f469-pinctrl.dtsi | 96 +++++++ 7 files changed, 537 insertions(+), 297 deletions(-) create mode 100644 arch/arm/boot/dts/stm32f4-pinctrl.dtsi create mode 100644 arch/arm/boot/dts/stm32f429-pinctrl.dtsi create mode 100644 arch/arm/boot/dts/stm32f469-pinctrl.dtsi diff --git a/arch/arm/boot/dts/stm32429i-eval.dts b/arch/arm/boot/dts/stm32429i-eval.dts index 5bdb90b2ae72..293ecb957227 100644 --- a/arch/arm/boot/dts/stm32429i-eval.dts +++ b/arch/arm/boot/dts/stm32429i-eval.dts @@ -47,6 +47,7 @@ /dts-v1/; #include "stm32f429.dtsi" +#include "stm32f429-pinctrl.dtsi" #include #include diff --git a/arch/arm/boot/dts/stm32f4-pinctrl.dtsi b/arch/arm/boot/dts/stm32f4-pinctrl.dtsi new file mode 100644 index 000000000000..7f3560c0211d --- /dev/null +++ b/arch/arm/boot/dts/stm32f4-pinctrl.dtsi @@ -0,0 +1,343 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/ { + soc { + pinctrl: pin-controller { + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x40020000 0x3000>; + interrupt-parent = <&exti>; + st,syscfg = <&syscfg 0x8>; + pins-are-numbered; + + gpioa: gpio@40020000 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x0 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>; + st,bank-name = "GPIOA"; + }; + + gpiob: gpio@40020400 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x400 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>; + st,bank-name = "GPIOB"; + }; + + gpioc: gpio@40020800 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x800 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>; + st,bank-name = "GPIOC"; + }; + + gpiod: gpio@40020c00 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0xc00 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOD)>; + st,bank-name = "GPIOD"; + }; + + gpioe: gpio@40021000 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1000 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOE)>; + st,bank-name = "GPIOE"; + }; + + gpiof: gpio@40021400 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1400 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOF)>; + st,bank-name = "GPIOF"; + }; + + gpiog: gpio@40021800 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1800 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOG)>; + st,bank-name = "GPIOG"; + }; + + gpioh: gpio@40021c00 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x1c00 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOH)>; + st,bank-name = "GPIOH"; + }; + + gpioi: gpio@40022000 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x2000 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOI)>; + st,bank-name = "GPIOI"; + }; + + gpioj: gpio@40022400 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x2400 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOJ)>; + st,bank-name = "GPIOJ"; + }; + + 
gpiok: gpio@40022800 { + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + reg = <0x2800 0x400>; + clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOK)>; + st,bank-name = "GPIOK"; + }; + + usart1_pins_a: usart1@0 { + pins1 { + pinmux = ; + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; + bias-disable; + }; + }; + + usart3_pins_a: usart3@0 { + pins1 { + pinmux = ; + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; + bias-disable; + }; + }; + + usbotg_fs_pins_a: usbotg_fs@0 { + pins { + pinmux = , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <2>; + }; + }; + + usbotg_fs_pins_b: usbotg_fs@1 { + pins { + pinmux = , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <2>; + }; + }; + + usbotg_hs_pins_a: usbotg_hs@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <2>; + }; + }; + + ethernet_mii: mii@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + , + , + ; + slew-rate = <2>; + }; + }; + + adc3_in8_pin: adc@200 { + pins { + pinmux = ; + }; + }; + + pwm1_pins: pwm@1 { + pins { + pinmux = , + , + ; + }; + }; + + pwm3_pins: pwm@3 { + pins { + pinmux = , + ; + }; + }; + + i2c1_pins: i2c1@0 { + pins { + pinmux = , + ; + bias-disable; + drive-open-drain; + slew-rate = <3>; + }; + }; + + ltdc_pins: ltdc@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + slew-rate = <2>; + }; + }; + + dcmi_pins: dcmi@0 { + pins { + pinmux = , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + bias-disable; + drive-push-pull; + slew-rate = <3>; + }; + }; + }; + }; +}; diff --git a/arch/arm/boot/dts/stm32f429-disco.dts b/arch/arm/boot/dts/stm32f429-disco.dts index c66d617e4245..5ceb2cf3777f 100644 --- a/arch/arm/boot/dts/stm32f429-disco.dts +++ b/arch/arm/boot/dts/stm32f429-disco.dts @@ -47,6 +47,7 @@ /dts-v1/; #include "stm32f429.dtsi" +#include "stm32f429-pinctrl.dtsi" #include / { diff --git a/arch/arm/boot/dts/stm32f429-pinctrl.dtsi b/arch/arm/boot/dts/stm32f429-pinctrl.dtsi new file mode 100644 index 000000000000..3e7a17d9112e --- /dev/null +++ b/arch/arm/boot/dts/stm32f429-pinctrl.dtsi @@ -0,0 +1,95 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "stm32f4-pinctrl.dtsi" + +/ { + soc { + pinctrl: pin-controller { + compatible = "st,stm32f429-pinctrl"; + + gpioa: gpio@40020000 { + gpio-ranges = <&pinctrl 0 0 16>; + }; + + gpiob: gpio@40020400 { + gpio-ranges = <&pinctrl 0 16 16>; + }; + + gpioc: gpio@40020800 { + gpio-ranges = <&pinctrl 0 32 16>; + }; + + gpiod: gpio@40020c00 { + gpio-ranges = <&pinctrl 0 48 16>; + }; + + gpioe: gpio@40021000 { + gpio-ranges = <&pinctrl 0 64 16>; + }; + + gpiof: gpio@40021400 { + gpio-ranges = <&pinctrl 0 80 16>; + }; + + gpiog: gpio@40021800 { + gpio-ranges = <&pinctrl 0 96 16>; + }; + + gpioh: gpio@40021c00 { + gpio-ranges = <&pinctrl 0 112 16>; + }; + + gpioi: gpio@40022000 { + gpio-ranges = <&pinctrl 0 128 16>; + }; + + gpioj: gpio@40022400 { + gpio-ranges = <&pinctrl 0 144 16>; + }; + + gpiok: gpio@40022800 { + gpio-ranges = <&pinctrl 0 160 8>; + }; + }; + }; +}; diff --git a/arch/arm/boot/dts/stm32f429.dtsi b/arch/arm/boot/dts/stm32f429.dtsi index dd7e99b1f43b..5b36eb114ddc 100644 --- a/arch/arm/boot/dts/stm32f429.dtsi +++ b/arch/arm/boot/dts/stm32f429.dtsi @@ -47,7 +47,6 @@ #include "skeleton.dtsi" #include "armv7-m.dtsi" -#include #include #include @@ -591,302 +590,6 @@ status = "disabled"; }; - pinctrl: pin-controller { - #address-cells = <1>; - #size-cells = <1>; - compatible = "st,stm32f429-pinctrl"; - ranges = <0 0x40020000 0x3000>; - interrupt-parent = <&exti>; - st,syscfg = <&syscfg 0x8>; - pins-are-numbered; - - gpioa: gpio@40020000 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x0 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>; - st,bank-name = "GPIOA"; - }; - - gpiob: gpio@40020400 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x400 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>; - st,bank-name = "GPIOB"; - }; - - gpioc: gpio@40020800 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x800 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>; - st,bank-name = "GPIOC"; - }; - - gpiod: gpio@40020c00 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0xc00 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOD)>; - st,bank-name = "GPIOD"; - }; - - gpioe: gpio@40021000 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - 
#interrupt-cells = <2>; - reg = <0x1000 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOE)>; - st,bank-name = "GPIOE"; - }; - - gpiof: gpio@40021400 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1400 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOF)>; - st,bank-name = "GPIOF"; - }; - - gpiog: gpio@40021800 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1800 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOG)>; - st,bank-name = "GPIOG"; - }; - - gpioh: gpio@40021c00 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x1c00 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOH)>; - st,bank-name = "GPIOH"; - }; - - gpioi: gpio@40022000 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x2000 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOI)>; - st,bank-name = "GPIOI"; - }; - - gpioj: gpio@40022400 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x2400 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOJ)>; - st,bank-name = "GPIOJ"; - }; - - gpiok: gpio@40022800 { - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; - reg = <0x2800 0x400>; - clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOK)>; - st,bank-name = "GPIOK"; - }; - - usart1_pins_a: usart1@0 { - pins1 { - pinmux = ; - bias-disable; - drive-push-pull; - slew-rate = <0>; - }; - pins2 { - pinmux = ; - bias-disable; - }; - }; - - usart3_pins_a: usart3@0 { - pins1 { - pinmux = ; - bias-disable; - drive-push-pull; - slew-rate = <0>; - }; - pins2 { - pinmux = ; - bias-disable; - }; - }; - - usbotg_fs_pins_a: usbotg_fs@0 { - pins { - pinmux = , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <2>; - }; - }; - - usbotg_fs_pins_b: usbotg_fs@1 { - pins { - pinmux = , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <2>; - }; - }; - - usbotg_hs_pins_a: usbotg_hs@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <2>; - }; - }; - - ethernet_mii: mii@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - , - , - ; - slew-rate = <2>; - }; - }; - - adc3_in8_pin: adc@200 { - pins { - pinmux = ; - }; - }; - - pwm1_pins: pwm@1 { - pins { - pinmux = , - , - ; - }; - }; - - pwm3_pins: pwm@3 { - pins { - pinmux = , - ; - }; - }; - - i2c1_pins: i2c1@0 { - pins { - pinmux = , - ; - bias-disable; - drive-open-drain; - slew-rate = <3>; - }; - }; - - ltdc_pins: ltdc@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - , - ; - slew-rate = <2>; - }; - }; - - dcmi_pins: dcmi@0 { - pins { - pinmux = , - , - , - , - , - , - , - , - , - , - , - , - , - , - ; - bias-disable; - drive-push-pull; - slew-rate = <3>; - }; - }; - }; - crc: crc@40023000 { compatible = "st,stm32f4-crc"; reg = <0x40023000 0x400>; diff --git a/arch/arm/boot/dts/stm32f469-disco.dts b/arch/arm/boot/dts/stm32f469-disco.dts index 6ae1f037f3f0..c18acbe4cf4e 100644 --- a/arch/arm/boot/dts/stm32f469-disco.dts +++ b/arch/arm/boot/dts/stm32f469-disco.dts @@ -47,6 +47,7 @@ /dts-v1/; #include "stm32f429.dtsi" +#include "stm32f469-pinctrl.dtsi" / { model = "STMicroelectronics STM32F469i-DISCO board"; diff --git a/arch/arm/boot/dts/stm32f469-pinctrl.dtsi b/arch/arm/boot/dts/stm32f469-pinctrl.dtsi new file mode 100644 index 
000000000000..fff542662eea --- /dev/null +++ b/arch/arm/boot/dts/stm32f469-pinctrl.dtsi @@ -0,0 +1,96 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "stm32f4-pinctrl.dtsi" + +/ { + soc { + pinctrl: pin-controller { + compatible = "st,stm32f469-pinctrl"; + + gpioa: gpio@40020000 { + gpio-ranges = <&pinctrl 0 0 16>; + }; + + gpiob: gpio@40020400 { + gpio-ranges = <&pinctrl 0 16 16>; + }; + + gpioc: gpio@40020800 { + gpio-ranges = <&pinctrl 0 32 16>; + }; + + gpiod: gpio@40020c00 { + gpio-ranges = <&pinctrl 0 48 16>; + }; + + gpioe: gpio@40021000 { + gpio-ranges = <&pinctrl 0 64 16>; + }; + + gpiof: gpio@40021400 { + gpio-ranges = <&pinctrl 0 80 16>; + }; + + gpiog: gpio@40021800 { + gpio-ranges = <&pinctrl 0 96 16>; + }; + + gpioh: gpio@40021c00 { + gpio-ranges = <&pinctrl 0 112 16>; + }; + + gpioi: gpio@40022000 { + gpio-ranges = <&pinctrl 0 128 16>; + }; + + gpioj: gpio@40022400 { + gpio-ranges = <&pinctrl 0 144 6>, + <&pinctrl 12 156 4>; + }; + + gpiok: gpio@40022800 { + gpio-ranges = <&pinctrl 3 163 5>; + }; + }; + }; +}; From 783874b050768d361239e444ba0fa396bb6d463f Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 13 Sep 2017 15:45:56 +0200 Subject: [PATCH 1275/1322] dm crypt: reject sector_size feature if device length is not aligned to it If a crypt mapping uses optional sector_size feature, additional restrictions to mapped device segment size must be applied in constructor, otherwise the device activation will fail later. 
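Stated as a standalone helper for clarity, the constraint the constructor now enforces looks like the sketch below; it mirrors the hunk in the diff and assumes the dm-crypt compilation context for sector_t and SECTOR_SHIFT, and the helper itself is illustrative, not part of the patch. ti->len is counted in 512-byte sectors, so a 4096-byte sector_size requires the length to be a multiple of 8.

  /* Illustrative helper mirroring the new check in crypt_ctr_optional(). */
  static bool crypt_len_is_aligned(sector_t len, unsigned int sector_size)
  {
          /* sector_size is a power of two >= 512; len is in 512-byte units */
          return (len & ((sector_size >> SECTOR_SHIFT) - 1)) == 0;
  }

  /* Constructor usage would be along the lines of:
   *   if (!crypt_len_is_aligned(ti->len, cc->sector_size))
   *           return -EINVAL;
   */
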
Fixes: 8f0009a225 ("dm crypt: optionally support larger encryption sector size") Cc: stable@vger.kernel.org # 4.12+ Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 75341fdca4b6..96ab46512e1f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2585,6 +2585,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar ti->error = "Invalid feature value for sector_size"; return -EINVAL; } + if (ti->len & ((cc->sector_size >> SECTOR_SHIFT) - 1)) { + ti->error = "Device size is not multiple of sector_size feature"; + return -EINVAL; + } cc->sector_shift = __ffs(cc->sector_size) - SECTOR_SHIFT; } else if (!strcasecmp(opt_string, "iv_large_sectors")) set_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); From 4d86d38186271438ef002c5ae6e04836f01bf8bf Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 4 Oct 2017 09:54:27 +0200 Subject: [PATCH 1276/1322] ravb: RX checksum offload Add support for RX checksum offload. This is enabled by default and may be disabled and re-enabled using ethtool: # ethtool -K eth0 rx off # ethtool -K eth0 rx on The RAVB provides a simple checksumming scheme which appears to be completely compatible with CHECKSUM_COMPLETE: sum of all packet data after the L2 header is appended to packet data; this may be trivially read by the driver and used to update the skb accordingly. In terms of performance throughput is close to gigabit line-rate both with and without RX checksum offload enabled. Perf output, however, appears to indicate that significantly less time is spent in do_csum(). This is as expected. Test results with RX checksum offload enabled: # /usr/bin/perf_3.16 record -o /run/perf.data -a netperf -t TCP_MAERTS -H 10.4.3.162 MIGRATED TCP MAERTS TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.4.3.162 () port 0 AF_INET : demo enable_enobufs failed: getprotobyname Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.00 937.54 Summary of output of perf report: 18.28% ksoftirqd/0 [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 10.34% ksoftirqd/0 [kernel.kallsyms] [k] __pi_memcpy 9.83% ksoftirqd/0 [kernel.kallsyms] [k] ravb_poll 7.89% ksoftirqd/0 [kernel.kallsyms] [k] skb_put 4.01% ksoftirqd/0 [kernel.kallsyms] [k] dev_gro_receive 3.37% netperf [kernel.kallsyms] [k] __arch_copy_to_user 3.17% swapper [kernel.kallsyms] [k] arch_cpu_idle 2.55% swapper [kernel.kallsyms] [k] tick_nohz_idle_enter 2.04% ksoftirqd/0 [kernel.kallsyms] [k] __pi___inval_dcache_area 2.03% swapper [kernel.kallsyms] [k] _raw_spin_unlock_irq 1.96% ksoftirqd/0 [kernel.kallsyms] [k] __netdev_alloc_skb 1.59% ksoftirqd/0 [kernel.kallsyms] [k] __slab_alloc.isra.83 Test results without RX checksum offload enabled: # /usr/bin/perf_3.16 record -o /run/perf.data -a netperf -t TCP_MAERTS -H 10.4.3.162 MIGRATED TCP MAERTS TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.4.3.162 () port 0 AF_INET : demo enable_enobufs failed: getprotobyname Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 
10^6bits/sec 87380 16384 16384 10.00 940.20 Summary of output of perf report: 17.10% ksoftirqd/0 [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 10.99% ksoftirqd/0 [kernel.kallsyms] [k] __pi_memcpy 8.87% ksoftirqd/0 [kernel.kallsyms] [k] ravb_poll 8.16% ksoftirqd/0 [kernel.kallsyms] [k] skb_put 7.42% ksoftirqd/0 [kernel.kallsyms] [k] do_csum 3.91% ksoftirqd/0 [kernel.kallsyms] [k] dev_gro_receive 2.31% swapper [kernel.kallsyms] [k] arch_cpu_idle 2.16% ksoftirqd/0 [kernel.kallsyms] [k] __pi___inval_dcache_area 2.14% ksoftirqd/0 [kernel.kallsyms] [k] __netdev_alloc_skb 1.93% netperf [kernel.kallsyms] [k] __arch_copy_to_user 1.79% swapper [kernel.kallsyms] [k] tick_nohz_idle_enter 1.63% ksoftirqd/0 [kernel.kallsyms] [k] __slab_alloc.isra.83 Above results collected on an R-Car Gen 3 Salvator-X/r8a7796 ES1.0. Also tested on a R-Car Gen 3 Salvator-X/r8a7795 ES1.0. By inspection this also appears to be compatible with the ravb found on R-Car Gen 2 SoCs, however, this patch is currently untested on such hardware. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 55 +++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index fdf30bfa403b..a8822a756e08 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -403,8 +403,9 @@ static void ravb_emac_init(struct net_device *ndev) /* Receive frame limit set register */ ravb_write(ndev, ndev->mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN, RFLR); - /* PAUSE prohibition */ + /* EMAC Mode: PAUSE prohibition; Duplex; RX Checksum; TX; RX */ ravb_write(ndev, ECMR_ZPF | (priv->duplex ? ECMR_DM : 0) | + (ndev->features & NETIF_F_RXCSUM ? ECMR_RCSC : 0) | ECMR_TE | ECMR_RE, ECMR); ravb_set_rate(ndev); @@ -520,6 +521,19 @@ static void ravb_get_tx_tstamp(struct net_device *ndev) } } +static void ravb_rx_csum(struct sk_buff *skb) +{ + u8 *hw_csum; + + /* The hardware checksum is 2 bytes appended to packet data */ + if (unlikely(skb->len < 2)) + return; + hw_csum = skb_tail_pointer(skb) - 2; + skb->csum = csum_unfold((__force __sum16)get_unaligned_le16(hw_csum)); + skb->ip_summed = CHECKSUM_COMPLETE; + skb_trim(skb, skb->len - 2); +} + /* Packet receive function for Ethernet AVB */ static bool ravb_rx(struct net_device *ndev, int *quota, int q) { @@ -587,8 +601,11 @@ static bool ravb_rx(struct net_device *ndev, int *quota, int q) ts.tv_nsec = le32_to_cpu(desc->ts_n); shhwtstamps->hwtstamp = timespec64_to_ktime(ts); } + skb_put(skb, pkt_len); skb->protocol = eth_type_trans(skb, ndev); + if (ndev->features & NETIF_F_RXCSUM) + ravb_rx_csum(skb); napi_gro_receive(&priv->napi[q], skb); stats->rx_packets++; stats->rx_bytes += pkt_len; @@ -1842,6 +1859,38 @@ static int ravb_do_ioctl(struct net_device *ndev, struct ifreq *req, int cmd) return phy_mii_ioctl(phydev, req, cmd); } +static void ravb_set_rx_csum(struct net_device *ndev, bool enable) +{ + struct ravb_private *priv = netdev_priv(ndev); + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + /* Disable TX and RX */ + ravb_rcv_snd_disable(ndev); + + /* Modify RX Checksum setting */ + ravb_modify(ndev, ECMR, ECMR_RCSC, enable ? 
ECMR_RCSC : 0); + + /* Enable TX and RX */ + ravb_rcv_snd_enable(ndev); + + spin_unlock_irqrestore(&priv->lock, flags); +} + +static int ravb_set_features(struct net_device *ndev, + netdev_features_t features) +{ + netdev_features_t changed = ndev->features ^ features; + + if (changed & NETIF_F_RXCSUM) + ravb_set_rx_csum(ndev, features & NETIF_F_RXCSUM); + + ndev->features = features; + + return 0; +} + static const struct net_device_ops ravb_netdev_ops = { .ndo_open = ravb_open, .ndo_stop = ravb_close, @@ -1853,6 +1902,7 @@ static const struct net_device_ops ravb_netdev_ops = { .ndo_do_ioctl = ravb_do_ioctl, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, + .ndo_set_features = ravb_set_features, }; /* MDIO bus init function */ @@ -2004,6 +2054,9 @@ static int ravb_probe(struct platform_device *pdev) if (!ndev) return -ENOMEM; + ndev->features = NETIF_F_RXCSUM; + ndev->hw_features = NETIF_F_RXCSUM; + pm_runtime_enable(&pdev->dev); pm_runtime_get_sync(&pdev->dev); From 20e883204f0268b423799781e5efed786f8a11ba Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 13:56:50 +0200 Subject: [PATCH 1277/1322] net: core: fix kerneldoc comment net/core/dev.c:1306: warning: No description found for parameter 'name' net/core/dev.c:1306: warning: Excess function parameter 'alias' description in 'dev_get_alias' Fixes: 6c5570016b97 ("net: core: decouple ifalias get/set from rtnl lock") Reported-by: kbuild test robot Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 1770097cfd86..454f05441546 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1295,7 +1295,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) /** * dev_get_alias - get ifalias of a device * @dev: device - * @alias: buffer to store name of ifalias + * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go From ebf6b13142f947be576b40edce214788dfe1d3e3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Oct 2017 14:20:37 +0100 Subject: [PATCH 1278/1322] cxgb4vf: make a couple of functions static The functions t4vf_link_down_rc_str and t4vf_handle_get_port_info are local to the source and do not need to be in global scope, so make them static. Cleans up sparse warnings: symbol 't4vf_link_down_rc_str' was not declared. Should it be static? symbol 't4vf_handle_get_port_info' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c index a8d94963b4d0..67aec59a14e6 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c @@ -1812,7 +1812,7 @@ int t4vf_eth_eq_free(struct adapter *adapter, unsigned int eqid) * * Returns a string representation of the Link Down Reason Code. */ -const char *t4vf_link_down_rc_str(unsigned char link_down_rc) +static const char *t4vf_link_down_rc_str(unsigned char link_down_rc) { static const char * const reason[] = { "Link Down", @@ -1838,8 +1838,8 @@ const char *t4vf_link_down_rc_str(unsigned char link_down_rc) * * Processes a GET_PORT_INFO FW reply message. 
*/ -void t4vf_handle_get_port_info(struct port_info *pi, - const struct fw_port_cmd *cmd) +static void t4vf_handle_get_port_info(struct port_info *pi, + const struct fw_port_cmd *cmd) { int action = FW_PORT_CMD_ACTION_G(be32_to_cpu(cmd->action_to_len16)); struct adapter *adapter = pi->adapter; From e774d96b7d2c3489bfb5bbdc2b65ed41cd68d3d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:55:29 +0200 Subject: [PATCH 1279/1322] rtnetlink: remove slave_validate callback no users in the tree. Signed-off-by: Florian Westphal Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 3 --- net/core/rtnetlink.c | 6 ------ 2 files changed, 9 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 21837ca68ecc..6520993ff449 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -93,9 +93,6 @@ struct rtnl_link_ops { int slave_maxtype; const struct nla_policy *slave_policy; - int (*slave_validate)(struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack); int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, struct nlattr *tb[], diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3961f87cdc76..b63c5759641f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2631,12 +2631,6 @@ replay: return err; slave_data = slave_attr; } - if (m_ops->slave_validate) { - err = m_ops->slave_validate(tb, slave_data, - extack); - if (err < 0) - return err; - } } if (dev) { From 5c45121dc39026ab2139910e57cf933fd57d30f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:58:49 +0200 Subject: [PATCH 1280/1322] rtnetlink: remove __rtnl_af_unregister switch the only caller to rtnl_af_unregister. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 -- net/core/rtnetlink.c | 14 +------------- net/ipv6/addrconf.c | 4 ++-- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6520993ff449..e3ca8e2e3103 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -151,8 +151,6 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void __rtnl_af_unregister(struct rtnl_af_ops *ops); - void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b63c5759641f..3fb1ca33cba4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -475,18 +475,6 @@ void rtnl_af_register(struct rtnl_af_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_af_register); -/** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); - /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. 
* @ops: struct rtnl_af_ops * to unregister @@ -494,7 +482,7 @@ EXPORT_SYMBOL_GPL(__rtnl_af_unregister); void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del(&ops->list); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f553f72d0bee..837418ff2d4b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6618,9 +6618,9 @@ void addrconf_cleanup(void) unregister_pernet_subsys(&addrconf_ops); ipv6_addr_label_cleanup(); - rtnl_lock(); + rtnl_af_unregister(&inet6_ops); - __rtnl_af_unregister(&inet6_ops); + rtnl_lock(); /* clean dev list */ for_each_netdev(&init_net, dev) { From e9b871ee098dfb91780e13bfc6e91e82c7b4ad73 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 16:22:59 +0200 Subject: [PATCH 1281/1322] selftests: rtnetlink: try concurrent change of ifalias to make sure this is serialized correctly. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- tools/testing/selftests/net/rtnetlink.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 62c87da92770..e8c86c416ed0 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -278,6 +278,12 @@ kci_test_ifalias() ip link show "$devdummy" | grep -q "alias $namewant" check_fail $? + for i in $(seq 1 100); do + uuidgen > "$syspathname" & + done + + wait + # re-add the alias -- kernel should free mem when dummy dev is removed ip link set dev "$devdummy" alias "$namewant" check_err $? From c818fa9e288be5be7e360c33cf4f5e30f9fa206e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 10:48:35 -0700 Subject: [PATCH 1282/1322] net: cache skb_shinfo() in skb_try_coalesce() Compiler does not really know that skb_shinfo(to|from) are constants in skb_try_coalesce(), lets cache their values to shrink code. We might even take care of skb_zcopy() calls later. $ size net/core/skbuff.o.before net/core/skbuff.o text data bss dec hex filename 40727 1298 0 42025 a429 net/core/skbuff.o.before 40631 1298 0 41929 a3c9 net/core/skbuff.o Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d98c2e3ce2bf..822a90e56aea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4767,6 +4767,7 @@ EXPORT_SYMBOL(kfree_skb_partial); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { + struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; @@ -4781,7 +4782,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return true; } - if (skb_has_frag_list(to) || skb_has_frag_list(from)) + to_shinfo = skb_shinfo(to); + from_shinfo = skb_shinfo(from); + if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; @@ -4790,8 +4793,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, struct page *page; unsigned int offset; - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) @@ -4802,12 +4805,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); @@ -4815,19 +4818,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, WARN_ON_ONCE(delta < len); - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + memcpy(to_shinfo->frags + to_shinfo->nr_frags, + from_shinfo->frags, + from_shinfo->nr_frags * sizeof(skb_frag_t)); + to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) - skb_shinfo(from)->nr_frags = 0; + from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); to->truesize += delta; to->len += len; From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: [PATCH 1283/1322] bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. 
Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 46 ++-- include/linux/bpf.h | 32 +++ include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 42 +++- kernel/bpf/cgroup.c | 465 ++++++++++++++++++++++++++----------- kernel/bpf/core.c | 31 +++ kernel/bpf/syscall.c | 37 ++- kernel/cgroup/cgroup.c | 28 ++- 8 files changed, 515 insertions(+), 168 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d41d40ac3efd..102e56fbb6de 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -14,27 +14,42 @@ struct bpf_sock_ops_kern; extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +struct bpf_prog_list { + struct list_head node; + struct bpf_prog *prog; +}; + +struct bpf_prog_array; + struct cgroup_bpf { - /* - * Store two sets of bpf_prog pointers, one for programs that are - * pinned directly to this cgroup, and one for those that are effective - * when this cgroup is accessed. + /* array of effective progs in this cgroup */ + struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; + + /* attached progs to this cgroup and attach flags + * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will + * have either zero or one element + * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ - struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; - struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE]; - bool disallow_override[MAX_BPF_ATTACH_TYPE]; + struct list_head progs[MAX_BPF_ATTACH_TYPE]; + u32 flags[MAX_BPF_ATTACH_TYPE]; + + /* temp storage for effective prog array used by prog_attach/detach */ + struct bpf_prog_array __rcu *inactive; }; void cgroup_bpf_put(struct cgroup *cgrp); -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); +int cgroup_bpf_inherit(struct cgroup *cgrp); -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool overridable); +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); -/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable); +/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct 
cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} -static inline void cgroup_bpf_inherit(struct cgroup *cgrp, - struct cgroup *parent) {} +static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 252f4bc9eb25..a6964b75f070 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +/* an array of programs to be executed under rcu_lock. + * + * Typical usage: + * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); + * + * the structure returned by bpf_prog_array_alloc() should be populated + * with program pointers and the last pointer must be NULL. + * The user has to keep refcnt on the program and make sure the program + * is removed from the array before bpf_prog_put(). + * The 'struct bpf_prog_array *' should only be replaced with xchg() + * since other cpus are walking the array of pointers in parallel. + */ +struct bpf_prog_array { + struct rcu_head rcu; + struct bpf_prog *progs[0]; +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); + +#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog **_prog; \ + u32 _ret = 1; \ + rcu_read_lock(); \ + _prog = rcu_dereference(array)->progs; \ + for (; *_prog; _prog++) \ + _ret &= func(*_prog, ctx); \ + rcu_read_unlock(); \ + _ret; \ + }) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/linux/filter.h b/include/linux/filter.h index 911d454af107..2d2db394b0ca 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -481,7 +481,7 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define BPF_PROG_RUN(filter, ctx) (*(filter)->bpf_func)(ctx, (filter)->insnsi) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6d2137b4cf38..762f74bc6c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,11 +143,47 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command - * to the given target_fd cgroup the descendent cgroup will be able to - * override effective bpf program that was inherited from this cgroup +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. 
They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..6b7500bbdb53 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; - if (prog) { - bpf_prog_put(prog); + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); } } +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; + } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. 
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); +} + /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. 
- * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + u32 old_flags; + int err; - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup + if (!hierarchy_allows_attach(cgrp, type, flags)) + return -EPERM; + + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. + * Disallow attaching multi-prog if overridable or none */ return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; - old_prog = cgrp->bpf.prog[type]; + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; - } - - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - - cgrp->bpf.prog[type] = prog; - - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; } + pl->prog = prog; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + old_flags = cgrp->bpf.flags[type]; + 
cgrp->bpf.flags[type] = flags; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. 
Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; } /** @@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; + int ret; - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; + int ret; - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..6b49e1991ae7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +static struct { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +} empty_prog_array = { + .null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +{ + if (prog_cnt) + return kzalloc(sizeof(struct bpf_prog_array) + + sizeof(struct bpf_prog *) * (prog_cnt + 1), + flags); + + return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ + if (!progs || + progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + return; + kfree_rcu(progs, rcu); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b927da66f653..51bee695d32c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return 0; } +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; switch (attr->attach_type) { @@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, - attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); if (ret) bpf_prog_put(prog); cgroup_put(cgrp); @@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; + struct bpf_prog *prog; struct cgroup *cgrp; int ret; @@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; case BPF_CGROUP_SOCK_OPS: - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sockmap_get_from_fd(attr, false); - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + cgroup_put(cgrp); return ret; 
} diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..57eb866ae78d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: [PATCH 1284/1322] bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. 
Miller --- include/linux/bpf-cgroup.h | 4 ++++ include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 13 +++++++++++ kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 +++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 +++++++++ 7 files changed, 148 insertions(+) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 102e56fbb6de..359b6f5d3d90 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6964b75f070..a67daea731ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -260,6 +260,9 @@ struct bpf_prog_array { struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs); +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt); #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ ({ \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 762f74bc6c47..cb2b9f95160a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -211,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -289,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6b7500bbdb53..e88abc0865d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -384,6 +384,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. 
*/ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b49e1991ae7..eba966c09053 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 51bee695d32c..0048cb24ba7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1568,6 +1599,9 @@ 
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57eb866ae78d..269512b94a94 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ From 390ee7e29fc8e6e90d3065b00afb047c4ee552f9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:23 -0700 Subject: [PATCH 1285/1322] bpf: enforce return code for cgroup-bpf programs with addition of tnum logic the verifier got smart enough and we can enforce return codes at program load time. For now do so for cgroup-bpf program types. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 40 ++++++++++++ tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cf9b72c59a0..52b022310f6a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3073,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int check_return_code(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *reg; + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + break; + default: + return 0; + } + + reg = &env->cur_state.regs[BPF_REG_0]; + if (reg->type != SCALAR_VALUE) { + verbose("At program exit the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(range, reg->var_off)) { + verbose("At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("has value %s", tn_buf); + } else { + verbose("has unknown scalar value"); + } + verbose(" should have been 0 or 1\n"); + return -EINVAL; + } + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -3863,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env) return -EACCES; } + err = check_return_code(env); + if (err) + return err; process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 290d5056c165..cc91d0159f43 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6892,6 +6892,78 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_XDP, }, + { + "bpf_exit with invalid return code. test1", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has value (0x0; 0xffffffff)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. 
test2", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has value (0x0; 0x3)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test5", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has value (0x2; 0x0)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test6", + .insns = { + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R0 is not a known value (ctx)", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, + { + "bpf_exit with invalid return code. test7", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4), + BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R0 has unknown scalar value", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 244d20efdb68c5c1a26c667baeb232ea163e2f69 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:24 -0700 Subject: [PATCH 1286/1322] libbpf: introduce bpf_prog_detach2() introduce bpf_prog_detach2() that takes one more argument prog_fd vs bpf_prog_detach() that takes only attach_fd and type. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- tools/lib/bpf/bpf.c | 12 ++++++++++++ tools/lib/bpf/bpf.h | 1 + 2 files changed, 13 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index daf624e4c720..d4b6ba8292ee 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -291,6 +291,18 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type) return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); } +int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + + return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); +} + int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, __u32 *duration) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 118d00535a0d..afd64727c9cf 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -66,6 +66,7 @@ int bpf_obj_get(const char *pathname); int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); +int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, __u32 *duration); From 39323e788cb672adba8709ca407bd6763aae577d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:25 -0700 Subject: [PATCH 1287/1322] samples/bpf: add multi-prog cgroup test case create 5 cgroups, attach 6 progs and check that progs are executed as: cgrp1 (MULTI progs A, B) -> cgrp2 (OVERRIDE prog C) -> cgrp3 (MULTI prog D) -> cgrp4 (OVERRIDE prog E) -> cgrp5 (NONE prog F) the event in cgrp5 triggers execution of F,D,A,B in that order. if prog F is detached, the execution is E,D,A,B if prog F and D are detached, the execution is E,A,B if prog F, E and D are detached, the execution is C,A,B Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/cgroup_helpers.c | 4 +- samples/bpf/test_cgrp2_attach2.c | 188 ++++++++++++++++++++++++++++++- 2 files changed, 185 insertions(+), 7 deletions(-) diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c index 9d1be9426401..88bdcf4b1670 100644 --- a/samples/bpf/cgroup_helpers.c +++ b/samples/bpf/cgroup_helpers.c @@ -56,7 +56,7 @@ int setup_cgroup_environment(void) return 1; } - if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) { + if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) { log_err("mount cgroup2"); return 1; } @@ -163,7 +163,7 @@ int create_and_get_cgroup(char *path) format_cgroup_path(cgroup_path, path); if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup"); + log_err("mkdiring cgroup %s .. 
%s", path, cgroup_path); return 0; } diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 3049b1f26267..9a9f6836e5e9 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -30,7 +30,7 @@ #define FOO "/foo" #define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1" +#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" char bpf_log_buf[BPF_LOG_BUF_SIZE]; @@ -55,8 +55,7 @@ static int prog_load(int verdict) return ret; } - -int main(int argc, char **argv) +static int test_foo_bar(void) { int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; @@ -189,8 +188,187 @@ out: close(bar); cleanup_cgroup_environment(); if (!rc) - printf("PASS\n"); + printf("### override:PASS\n"); else - printf("FAIL\n"); + printf("### override:FAIL\n"); return rc; } + +static int map_fd = -1; + +static int prog_load_cnt(int verdict, int val) +{ + if (map_fd < 0) + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + struct bpf_insn prog[] = { + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + int ret; + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + + if (ret < 0) { + log_err("Loading program"); + printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); + return 0; + } + return ret; +} + + +static int test_multiprog(void) +{ + int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; + int drop_prog, allow_prog[6] = {}, rc = 0; + unsigned long long value; + int i = 0; + + for (i = 0; i < 6; i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if (!allow_prog[i]) + goto err; + } + drop_prog = prog_load_cnt(0, 1); + if (!drop_prog) + goto err; + + if (setup_cgroup_environment()) + goto err; + + cg1 = create_and_get_cgroup("/cg1"); + if (!cg1) + goto err; + cg2 = create_and_get_cgroup("/cg1/cg2"); + if (!cg2) + goto err; + cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); + if (!cg3) + goto err; + cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); + if (!cg4) + goto err; + cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); + if (!cg5) + goto err; + + if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) + goto err; + + if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog to cg1"); + goto err; + } + if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Unexpected success attaching the same prog to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog2 to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to cg2"); + goto err; + } + if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog to cg3"); + goto err; + } + if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 
1)) { + log_err("Attaching prog to cg4"); + goto err; + } + if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching prog to cg5"); + goto err; + } + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 32); + + /* detach bottom program and ping again */ + if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg5"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 16); + + /* detach 3rd from bottom program and ping again */ + errno = 0; + if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Unexpected success on detach from cg3"); + goto err; + } + if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching from cg3"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 16); + + /* detach 2nd from bottom program and ping again */ + if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg4"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 4); + goto out; +err: + rc = 1; + +out: + for (i = 0; i < 6; i++) + if (allow_prog[i] > 0) + close(allow_prog[i]); + close(cg1); + close(cg2); + close(cg3); + close(cg4); + close(cg5); + cleanup_cgroup_environment(); + if (!rc) + printf("### multi:PASS\n"); + else + printf("### multi:FAIL\n"); + return rc; +} + +int main(int argc, char **argv) +{ + int rc = 0; + + rc = test_foo_bar(); + if (rc) + return rc; + + return test_multiprog(); +} From defd9c476fa6b01b4eb5450452bfd202138decb7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:26 -0700 Subject: [PATCH 1288/1322] libbpf: sync bpf.h tools/include/uapi/linux/bpf.h got out of sync with actual kernel header. Update it. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/include/uapi/linux/bpf.h | 55 ++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6d2137b4cf38..cb2b9f95160a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -92,6 +92,7 @@ enum bpf_cmd { BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, }; enum bpf_map_type { @@ -143,11 +144,47 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command - * to the given target_fd cgroup the descendent cgroup will be able to - * override effective bpf program that was inherited from this cgroup +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. 
+ * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the * verifier will perform strict alignment checking as if the kernel @@ -175,6 +212,9 @@ enum bpf_attach_type { /* Specify numa node during map creation */ #define BPF_F_NUMA_NODE (1U << 2) +/* flags for BPF_PROG_QUERY */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + #define BPF_OBJ_NAME_LEN 16U union bpf_attr { @@ -253,6 +293,15 @@ union bpf_attr { __u32 info_len; __aligned_u64 info; } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + } query; } __attribute__((aligned(8))); /* BPF helper function descriptions: From 5d0cbf9b6c11964bf2f4041a5b0b6f81cb169736 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:27 -0700 Subject: [PATCH 1289/1322] libbpf: add support for BPF_PROG_QUERY add support for BPF_PROG_QUERY command to libbpf Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- tools/lib/bpf/bpf.c | 20 ++++++++++++++++++++ tools/lib/bpf/bpf.h | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index d4b6ba8292ee..5128677e4117 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -303,6 +303,26 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); } +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +{ + union bpf_attr attr; + int ret; + + bzero(&attr, sizeof(attr)); + attr.query.target_fd = target_fd; + attr.query.attach_type = type; + attr.query.query_flags = query_flags; + attr.query.prog_cnt = *prog_cnt; + attr.query.prog_ids = ptr_to_u64(prog_ids); + + ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr)); + if (attach_flags) + *attach_flags = attr.query.attach_flags; + *prog_cnt = attr.query.prog_cnt; + return ret; +} + int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, __u32 *duration) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index afd64727c9cf..6534889e2b2f 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -75,5 +75,6 @@ int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); int bpf_prog_get_fd_by_id(__u32 id); int bpf_map_get_fd_by_id(__u32 id); int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); - +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); #endif From dfc069998ebb010f910dfec379dab4f44d331980 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:28 -0700 Subject: [PATCH 1290/1322] samples/bpf: use bpf_prog_query() interface use BPF_PROG_QUERY command to strengthen test coverage Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/test_cgrp2_attach2.c | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 9a9f6836e5e9..3e8232cc04a8 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -236,6 +236,7 @@ static int prog_load_cnt(int verdict, int val) static int test_multiprog(void) { + __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; int drop_prog, allow_prog[6] = {}, rc = 0; unsigned long long value; @@ -304,6 +305,32 @@ static int test_multiprog(void) assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); assert(value == 1 + 2 + 8 + 32); + /* query the number of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + NULL, NULL, &prog_cnt) == 0); + assert(prog_cnt == 4); + /* retrieve prog_ids of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 4); + assert(attach_flags == 0); + saved_prog_id = prog_ids[0]; + /* check enospc handling */ + prog_ids[0] = 0; + prog_cnt = 2; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == -1 && + errno == ENOSPC); + assert(prog_cnt == 4); + /* check that prog_ids are returned even when buffer is too small */ + assert(prog_ids[0] == saved_prog_id); + /* retrieve prog_id of single attached prog in cg5 */ + prog_ids[0] = 0; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 1); + assert(prog_ids[0] == saved_prog_id); + /* detach bottom program and ping again */ if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { log_err("Detaching prog from cg5"); @@ -341,6 +368,15 @@ static int test_multiprog(void) assert(system(PING_CMD) == 0); assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); assert(value == 1 + 2 + 4); + + prog_cnt = 4; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 3); + assert(attach_flags == 0); + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 0); goto out; err: rc = 1; From e0a86312874e36033cd94fb977dd603a292875c8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 4 Oct 2017 23:10:59 +0100 Subject: [PATCH 1291/1322] Update James Hogan's email address Update my imgtec.com and personal email address to my kernel.org one in a few places as MIPS will soon no longer be part of Imagination Technologies, and add mappings in .mailcap so get_maintainer.pl reports the right address. 
Signed-off-by: James Hogan Signed-off-by: Linus Torvalds --- .mailmap | 2 ++ Documentation/ABI/testing/sysfs-power | 2 +- MAINTAINERS | 6 +++--- drivers/i2c/busses/i2c-img-scb.c | 2 +- drivers/media/rc/ir-sharp-decoder.c | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.mailmap b/.mailmap index 5273cfd70ad6..c7b10caecc4e 100644 --- a/.mailmap +++ b/.mailmap @@ -68,6 +68,8 @@ Jacob Shin James Bottomley James Bottomley James E Wilson +James Hogan +James Hogan James Ketrenos Javi Merino diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 713cab1d5f12..a1d1612f3651 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -127,7 +127,7 @@ Description: What; /sys/power/pm_trace_dev_match Date: October 2010 -Contact: James Hogan +Contact: James Hogan Description: The /sys/power/pm_trace_dev_match file contains the name of the device associated with the last PM event point saved in the RTC diff --git a/MAINTAINERS b/MAINTAINERS index 65b0c88d5ee0..3f05fc6961ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6738,7 +6738,7 @@ F: Documentation/devicetree/bindings/auxdisplay/img-ascii-lcd.txt F: drivers/auxdisplay/img-ascii-lcd.c IMGTEC IR DECODER DRIVER -M: James Hogan +M: James Hogan S: Maintained F: drivers/media/rc/img-ir/ @@ -7562,7 +7562,7 @@ F: arch/arm64/include/asm/kvm* F: arch/arm64/kvm/ KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips) -M: James Hogan +M: James Hogan L: linux-mips@linux-mips.org S: Supported F: arch/mips/include/uapi/asm/kvm* @@ -8885,7 +8885,7 @@ F: Documentation/devicetree/bindings/media/meson-ao-cec.txt T: git git://linuxtv.org/media_tree.git METAG ARCHITECTURE -M: James Hogan +M: James Hogan L: linux-metag@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/jhogan/metag.git S: Odd Fixes diff --git a/drivers/i2c/busses/i2c-img-scb.c b/drivers/i2c/busses/i2c-img-scb.c index 84fb35f6837f..eb1d91b986fd 100644 --- a/drivers/i2c/busses/i2c-img-scb.c +++ b/drivers/i2c/busses/i2c-img-scb.c @@ -1459,6 +1459,6 @@ static struct platform_driver img_scb_i2c_driver = { }; module_platform_driver(img_scb_i2c_driver); -MODULE_AUTHOR("James Hogan "); +MODULE_AUTHOR("James Hogan "); MODULE_DESCRIPTION("IMG host I2C driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index ed43a4212479..129b558acc92 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -245,5 +245,5 @@ module_init(ir_sharp_decode_init); module_exit(ir_sharp_decode_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("James Hogan "); +MODULE_AUTHOR("James Hogan "); MODULE_DESCRIPTION("Sharp IR protocol decoder"); From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 3 Oct 2017 13:53:23 +0200 Subject: [PATCH 1292/1322] dev: advertise the new nsid when the netns iface changes x-netns interfaces are bound to two netns: the link netns and the upper netns. Usually, this kind of interfaces is created in the link netns and then moved to the upper netns. At the end, the interface is visible only in the upper netns. The link nsid is advertised via netlink in the upper netns, thus the user always knows where is the link part. There is no such mechanism in the link netns. When the interface is moved to another netns, the user cannot "follow" it. This patch adds a new netlink attribute which helps to follow an interface which moves to another netns. 
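As a rough illustration of the intended consumer (not part of the patch itself), a link monitor in the old netns could follow the device by watching RTM_DELLINK notifications for the new attribute. The snippet below is standard rtnetlink boilerplate; it assumes uapi headers that already carry IFLA_NEW_NETNSID, and the buffer size and error handling are kept minimal.

/* Sketch: follow a device that moves to another netns via IFLA_NEW_NETNSID. */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>

static void handle_dellink(struct nlmsghdr *nlh)
{
	struct ifinfomsg *ifi = NLMSG_DATA(nlh);
	int len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
	struct rtattr *rta;

	for (rta = IFLA_RTA(ifi); RTA_OK(rta, len); rta = RTA_NEXT(rta, len))
		if (rta->rta_type == IFLA_NEW_NETNSID)
			printf("ifindex %d moved, new nsid %d\n",
			       ifi->ifi_index, *(int *)RTA_DATA(rta));
}

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK, .nl_groups = RTMGRP_LINK };
	struct nlmsghdr *nlh;
	char buf[8192];
	int fd, n;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return 1;

	while ((n = recv(fd, buf, sizeof(buf), 0)) > 0)
		for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, n);
		     nlh = NLMSG_NEXT(nlh, n))
			if (nlh->nlmsg_type == RTM_DELLINK)
				handle_dellink(nlh);
	return 0;
}

The advertised value is the id of the destination namespace as seen from the old namespace, per the peernet2id()/peernet2id_alloc() calls in the dev_change_net_namespace() hunk below.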
When the interface is unregistered, the new nsid is advertised. If the interface is a x-netns interface (ie rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed. CC: Jason A. Donenfeld Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 4 +++- include/uapi/linux/if_link.h | 1 + net/core/dev.c | 11 ++++++++--- net/core/rtnetlink.c | 31 ++++++++++++++++++++++--------- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index dea59c8eec54..1251638e60d3 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, - gfp_t flags); + gfp_t flags, int *new_nsid); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea87bd708ee9..cd580fc0e58f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -158,6 +158,7 @@ enum { IFLA_PAD, IFLA_XDP, IFLA_EVENT, + IFLA_NEW_NETNSID, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 454f05441546..bffc75429184 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -7204,7 +7205,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL); + GFP_KERNEL, NULL); /* * Flush the unicast and multicast chains @@ -8291,7 +8292,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err; + int err, new_nsid; ASSERT_RTNL(); @@ -8347,7 +8348,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) + new_nsid = peernet2id_alloc(dev_net(dev), net); + else + new_nsid = peernet2id(dev_net(dev), net); + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* * Flush the unicast and multicast chains diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3fb1ca33cba4..1ee98b1369d5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -915,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1384,7 +1385,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb, static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event) + u32 event, int *new_nsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1472,6 +1473,10 @@ static int rtnl_fill_ifinfo(struct 
sk_buff *skb, struct net_device *dev, if (rtnl_fill_link_netnsid(skb, dev)) goto nla_put_failure; + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; + if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) goto nla_put_failure; @@ -1701,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask, 0, NULL); if (err < 0) { if (likely(skb->len)) @@ -2808,7 +2813,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2893,7 +2898,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags) + u32 event, gfp_t flags, int *new_nsid) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2904,7 +2909,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, + new_nsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2927,14 +2933,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags) + gfp_t flags, int *new_nsid) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -2942,10 +2948,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } EXPORT_SYMBOL(rtmsg_ifinfo); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid); +} + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -4321,7 +4334,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL); + GFP_KERNEL, NULL); break; default: break; From e769fcec6bc4bdd1b0e2cf817680148f9c40b1c4 Mon Sep 17 00:00:00 2001 From: Vishakha Narvekar Date: Tue, 3 Oct 2017 16:13:29 -0400 Subject: [PATCH 1293/1322] net: 8021q: skip packets if the vlan is down If the vlan is down, free the packet instead of proceeding with other processing, or counting it as received. If vlan interfaces are used as slaves for bonding, with arp monitoring for connectivity, if the rx counter is seen to be incrementing, then the bond device will not observe that the interface is down. CC: David S. 
Miller Signed-off-by: Vishakha Narvekar Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. From 51d0c04795a4b5d9a188336884887a9d394a94b0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:45 -0700 Subject: [PATCH 1294/1322] net: Add extack to netdev_notifier_info Add netlink_ext_ack to netdev_notifier_info to allow notifier handlers to return errors to userspace. Clean up the initialization in dev.c such that extack is easily added in subsequent patches where relevant. Specifically, remove the init call in call_netdevice_notifiers_info and have callers initalize on stack when info is declared. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++- net/core/dev.c | 77 +++++++++++++++++++++++---------------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d04424cfffba..05fcaba4b0d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2309,7 +2309,8 @@ int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); struct netdev_notifier_info { - struct net_device *dev; + struct net_device *dev; + struct netlink_ext_ack *extack; }; struct netdev_notifier_change_info { @@ -2334,6 +2335,7 @@ static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; + info->extack = NULL; } static inline struct net_device * @@ -2342,6 +2344,12 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) return info->dev; } +static inline struct netlink_ext_ack * +netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) +{ + return info->extack; +} + int call_netdevice_notifiers(unsigned long val, struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index bffc75429184..e27a6bc0ac4d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -163,7 +163,6 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); @@ -1339,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; - change_info.flags_changed = 0; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } @@ -1563,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - 
netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } @@ -1690,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); */ static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info) { ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -1709,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -6278,7 +6279,15 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; int ret = 0; ASSERT_RTNL(); @@ -6296,12 +6305,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - changeupper_info.upper_dev = upper_dev; - changeupper_info.master = master; - changeupper_info.linking = true; - changeupper_info.upper_info = upper_info; - - ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6312,7 +6316,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6376,20 +6380,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; ASSERT_RTNL(); - changeupper_info.upper_dev = upper_dev; changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; - changeupper_info.linking = false; - call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -6405,11 +6413,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info info; + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); - call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); @@ -6535,11 +6545,13 @@ EXPORT_SYMBOL(dev_get_nest_level); void netdev_lower_state_changed(struct net_device 
*lower_dev, void *lower_state_info) { - struct netdev_notifier_changelowerstate_info changelowerstate_info; + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; - call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); @@ -6830,11 +6842,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:46 -0700 Subject: [PATCH 1295/1322] net: Add extack to ndo_add_slave Pass extack to do_set_master and down to ndo_add_slave Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 5 +++-- drivers/net/bonding/bond_options.c | 2 +- drivers/net/team/team.c | 3 ++- drivers/net/vrf.c | 3 ++- include/linux/netdevice.h | 3 ++- include/net/bonding.h | 3 ++- net/batman-adv/soft-interface.c | 3 ++- net/bridge/br_device.c | 3 ++- net/core/rtnetlink.c | 10 ++++++---- 9 files changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b19dc033fb36..78feb94a36db 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1328,7 +1328,8 @@ void bond_lower_state_changed(struct slave *slave) } /* enslave device to bond device */ -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; @@ -3492,7 +3493,7 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd switch (cmd) { case BOND_ENSLAVE_OLD: case SIOCBONDENSLAVE: - res = bond_enslave(bond_dev, slave_dev); + res = bond_enslave(bond_dev, slave_dev, NULL); break; case BOND_RELEASE_OLD: case SIOCBONDRELEASE: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 5931aa2fe997..8a9b085c2a98 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1383,7 +1383,7 @@ static int bond_option_slaves_set(struct bonding *bond, switch (command[0]) { case '+': netdev_dbg(bond->dev, "Adding slave %s\n", dev->name); - ret = bond_enslave(bond->dev, dev); + ret = bond_enslave(bond->dev, dev, NULL); break; case '-': diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index ae53e899259f..4359d45aa131 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1914,7 +1914,8 @@ static int team_netpoll_setup(struct net_device *dev, } #endif -static int team_add_slave(struct net_device *dev, struct net_device *port_dev) +static int team_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { struct team *team = 
netdev_priv(dev); int err; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index cc18b7b11612..4a082ef53533 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -788,7 +788,8 @@ err: return ret; } -static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) +static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) return -EINVAL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 05fcaba4b0d9..368a5064a487 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1246,7 +1246,8 @@ struct net_device_ops { u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, - struct net_device *slave_dev); + struct net_device *slave_dev, + struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); netdev_features_t (*ndo_fix_features)(struct net_device *dev, diff --git a/include/net/bonding.h b/include/net/bonding.h index b2e68657a216..2860cc66c2bb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -596,7 +596,8 @@ void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c2c986746d0b..e7d5fbb6ad53 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -867,7 +867,8 @@ free_bat_counters: * Return: 0 if successful or error otherwise. 
*/ static int batadv_softif_slave_add(struct net_device *dev, - struct net_device *slave_dev) + struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; struct net *net = dev_net(dev); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f6b6a92f1c48..cb0131d70ab1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -320,7 +320,8 @@ void br_netpoll_disable(struct net_bridge_port *p) #endif -static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee98b1369d5..c5ee429bcce9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1957,7 +1957,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; @@ -1982,7 +1983,7 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { @@ -2115,7 +2116,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2753,7 +2754,8 @@ replay: goto out_unregister; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), + extack); if (err) goto out_unregister; } From 42ab19ee90292993370a30ad242599d75a3b749e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:47 -0700 Subject: [PATCH 1296/1322] net: Add extack to upper device linking Add extack arg to netdev_upper_dev_link and netdev_master_upper_dev_link Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- .../net/ethernet/qualcomm/rmnet/rmnet_config.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 2 +- drivers/net/macsec.c | 2 +- drivers/net/macvlan.c | 7 ++++--- drivers/net/macvtap.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/usb/qmi_wwan.c | 2 +- drivers/net/vrf.c | 7 ++++--- include/linux/if_macvlan.h | 3 ++- include/linux/netdevice.h | 6 ++++-- net/8021q/vlan.c | 6 +++--- net/8021q/vlan.h | 2 +- net/8021q/vlan_netlink.c | 2 +- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 15 ++++++++++----- net/openvswitch/vport-netdev.c | 3 ++- 19 files changed, 44 insertions(+), 32 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 78feb94a36db..bc92307c2082 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1217,14 +1217,15 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) } } -static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave) +static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, + struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; int err; lag_upper_info.tx_type = bond_lag_tx_type(bond); err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, - &lag_upper_info); + &lag_upper_info, extack); if (err) return err; rtmsg_ifinfo(RTM_NEWLINK, slave->dev, IFF_SLAVE, GFP_KERNEL); @@ -1710,7 +1711,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, goto err_detach; } - res = bond_master_upper_dev_link(bond, new_slave); + res = bond_master_upper_dev_link(bond, new_slave, extack); if (res) { netdev_dbg(bond_dev, "Error %d calling bond_master_upper_dev_link\n", res); goto err_unregister; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 98f22551eb45..1af326a60cbb 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -178,7 +178,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (err) goto err1; - err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack); if (err) goto err2; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index f300ae61c6c6..dfb986421ec6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1748,7 +1748,7 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto rx_handler_failed; } - ret = netdev_upper_dev_link(vf_netdev, ndev); + ret = netdev_upper_dev_link(vf_netdev, ndev, NULL); if (ret != 0) { netdev_err(vf_netdev, "can not set master device %s (err = %d)\n", diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c74893c1e620..57c3856bab05 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -584,7 +584,7 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, if (err < 0) goto remove_ida; - err = netdev_upper_dev_link(phy_dev, dev); + err = netdev_upper_dev_link(phy_dev, dev, extack); if (err) { goto unregister_netdev; } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 98e4deaa3a6a..ccbe4eaffe4d 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3244,7 +3244,7 @@ static int macsec_newlink(struct net *net, struct 
net_device *dev, &macsec_netdev_addr_lock_key, macsec_get_nest_level(dev)); - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 1ffe77e95d46..858bd66511a2 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1344,7 +1344,8 @@ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, } int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; @@ -1433,7 +1434,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; - err = netdev_upper_dev_link(lowerdev, dev); + err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; @@ -1456,7 +1457,7 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - return macvlan_common_newlink(src_net, dev, tb, data); + return macvlan_common_newlink(src_net, dev, tb, data, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index c2d0ea2fb019..f62aea2fcfa9 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -105,7 +105,7 @@ static int macvtap_newlink(struct net *src_net, struct net_device *dev, /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ - err = macvlan_common_newlink(src_net, dev, tb, data); + err = macvlan_common_newlink(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 4359d45aa131..a468439969df 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1112,7 +1112,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port) lag_upper_info.tx_type = team->mode->lag_tx_type; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, - &lag_upper_info); + &lag_upper_info, NULL); if (err) return err; port->dev->priv_flags |= IFF_TEAM_PORT; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 8c3733608271..db7279d5b250 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -221,7 +221,7 @@ static int qmimux_register_device(struct net_device *real_dev, u8 mux_id) /* Account for reference in struct qmimux_priv_priv */ dev_hold(real_dev); - err = netdev_upper_dev_link(real_dev, new_dev); + err = netdev_upper_dev_link(real_dev, new_dev, NULL); if (err) goto out_unregister_netdev; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 4a082ef53533..77d0655a0250 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -764,7 +764,8 @@ static void cycle_netdev(struct net_device *dev) } } -static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) +static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, + struct netlink_ext_ack *extack) { int ret; @@ -775,7 +776,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) return -EOPNOTSUPP; port_dev->priv_flags |= IFF_L3MDEV_SLAVE; - ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); + ret = 
netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; @@ -794,7 +795,7 @@ static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) return -EINVAL; - return do_vrf_add_slave(dev, port_dev); + return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index c9ec1343d187..10e319f41fb1 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -72,7 +72,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, extern void macvlan_common_setup(struct net_device *dev); extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]); + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack); extern void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 368a5064a487..31bb3010c69b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3919,10 +3919,12 @@ void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); -int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); +int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, + struct netlink_ext_ack *extack); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info); + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..71c3e045505b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev, return 0; } -int register_vlan_dev(struct net_device *dev) +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; @@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; @@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; - err = register_vlan_dev(new_dev); + err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index df8bd65dd370..94f8eed9f9b3 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); -int register_vlan_dev(struct net_device *dev); +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct 
list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 5e831de3103e..6e7c5a6a7930 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - return register_vlan_dev(dev); + return register_vlan_dev(dev, extack); } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..f7b413b9297e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL); + soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..0a3fd727048d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -540,7 +540,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, NULL); if (err) goto err5; diff --git a/net/core/dev.c b/net/core/dev.c index e27a6bc0ac4d..fcddccb6be41 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6277,11 +6277,13 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, + .extack = extack, }, .upper_dev = upper_dev, .master = master, @@ -6341,9 +6343,11 @@ rollback: * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -6362,10 +6366,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info); + upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; From de3baa3ed72f09f8d2ba797645d052cd5f5de27d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:48 -0700 Subject: [PATCH 1297/1322] net: vrf: Add extack messages for enslave errors Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/vrf.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 77d0655a0250..0b54f553228e 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -772,8 +772,11 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ - if (port_dev == dev_net(dev)->loopback_dev) + if (port_dev == dev_net(dev)->loopback_dev) { + NL_SET_ERR_MSG(extack, + "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; + } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); @@ -792,7 +795,13 @@ err: static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { - if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) + if (netif_is_l3_master(port_dev)) { + NL_SET_ERR_MSG(extack, + "Can not enslave an L3 master device to a VRF"); + return -EINVAL; + } + + if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); From 759088bda21f4887c645579418e8b478eb570d78 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:49 -0700 Subject: [PATCH 1298/1322] net: bonding: Add extack messages for some enslave failures A number of bond_enslave errors are logged using the netdev_err API. Return those messages to userspace via the extack facility. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index bc92307c2082..172eeeb68152 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1348,12 +1348,14 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, /* already in-use? */ if (netdev_is_rx_handler_busy(slave_dev)) { + NL_SET_ERR_MSG(extack, "Device is in use and cannot be enslaved"); netdev_err(bond_dev, "Error: Device is in use and cannot be enslaved\n"); return -EBUSY; } if (bond_dev == slave_dev) { + NL_SET_ERR_MSG(extack, "Cannot enslave bond to itself."); netdev_err(bond_dev, "cannot enslave bond to itself.\n"); return -EPERM; } @@ -1364,6 +1366,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, netdev_dbg(bond_dev, "%s is NETIF_F_VLAN_CHALLENGED\n", slave_dev->name); if (vlan_uses_dev(bond_dev)) { + NL_SET_ERR_MSG(extack, "Can not enslave VLAN challenged device to VLAN enabled bond"); netdev_err(bond_dev, "Error: cannot enslave VLAN challenged slave %s on VLAN enabled bond %s\n", slave_dev->name, bond_dev->name); return -EPERM; @@ -1383,6 +1386,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, * enslaving it; the old ifenslave will not. 
*/ if (slave_dev->flags & IFF_UP) { + NL_SET_ERR_MSG(extack, "Device can not be enslaved while up"); netdev_err(bond_dev, "%s is up - this may be due to an out of date ifenslave\n", slave_dev->name); return -EPERM; @@ -1423,6 +1427,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, bond_dev); } } else if (bond_dev->type != slave_dev->type) { + NL_SET_ERR_MSG(extack, "Device type is different from other slaves"); netdev_err(bond_dev, "%s ether type (%d) is different from other slaves (%d), can not enslave it\n", slave_dev->name, slave_dev->type, bond_dev->type); return -EINVAL; @@ -1430,6 +1435,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, if (slave_dev->type == ARPHRD_INFINIBAND && BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { + NL_SET_ERR_MSG(extack, "Only active-backup mode is supported for infiniband slaves"); netdev_warn(bond_dev, "Type (%d) supports only active-backup mode\n", slave_dev->type); res = -EOPNOTSUPP; @@ -1445,6 +1451,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, bond->params.fail_over_mac = BOND_FOM_ACTIVE; netdev_warn(bond_dev, "Setting fail_over_mac to active for active-backup mode\n"); } else { + NL_SET_ERR_MSG(extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active"); netdev_err(bond_dev, "The slave device specified does not support setting the MAC address, but fail_over_mac is not set to active\n"); res = -EOPNOTSUPP; goto err_undo_flags; From ca752be006013ac6f19721280f44c91ef07ad3d1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:50 -0700 Subject: [PATCH 1299/1322] net: bridge: Pass extack to down to netdev_master_upper_dev_link Pass extack arg to br_add_if. Add messages for a couple of failures and pass arg to netdev_master_upper_dev_link. Signed-off-by: David Ahern Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 2 +- net/bridge/br_if.c | 15 +++++++++++---- net/bridge/br_ioctl.c | 2 +- net/bridge/br_private.h | 3 ++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index cb0131d70ab1..7acb77c9bd65 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -326,7 +326,7 @@ static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, { struct net_bridge *br = netdev_priv(dev); - return br_add_if(br, slave_dev); + return br_add_if(br, slave_dev, extack); } static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 0a3fd727048d..59a74a414e20 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -480,7 +480,8 @@ netdev_features_t br_features_recompute(struct net_bridge *br, } /* called with RTNL */ -int br_add_if(struct net_bridge *br, struct net_device *dev) +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; @@ -500,16 +501,22 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return -EINVAL; /* No bridging of bridges */ - if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) + if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { + NL_SET_ERR_MSG(extack, + "Can not enslave a bridge to a bridge"); return -ELOOP; + } /* Device is already being bridged */ if (br_port_exists(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. 
wireless) */ - if (dev->priv_flags & IFF_DONT_BRIDGE) + if (dev->priv_flags & IFF_DONT_BRIDGE) { + NL_SET_ERR_MSG(extack, + "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; + } p = new_nbp(br, dev); if (IS_ERR(p)) @@ -540,7 +547,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 66cd98772051..8f29103935a3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -98,7 +98,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) return -EINVAL; if (isadd) - ret = br_add_if(br, dev); + ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 020c709a017f..ab4df24f7bba 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -566,7 +566,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, void br_port_carrier_check(struct net_bridge_port *p); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); -int br_add_if(struct net_bridge *br, struct net_device *dev); +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); int br_min_mtu(const struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, From e58376e1df2aaffbf12753959142a50f824c46ea Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:51 -0700 Subject: [PATCH 1300/1322] mlxsw: spectrum: Add extack messages for enslave failures mlxsw fails device enslavement for a number of reasons. Use the extack facility to return an error message to the user stating why the enslave is failing. Messages are prefixed with "spectrum" so users know it is a constraint imposed by the hardware driver. For example: $ ip li add br0.11 link br0 type vlan id 11 $ ip li set swp11 master br0 Error: spectrum: Enslaving a port to a device that already has an upper device is not supported. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 3adf237c951a..5cd4df08ce97 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4019,14 +4019,21 @@ static int mlxsw_sp_lag_index_get(struct mlxsw_sp *mlxsw_sp, static bool mlxsw_sp_master_lag_check(struct mlxsw_sp *mlxsw_sp, struct net_device *lag_dev, - struct netdev_lag_upper_info *lag_upper_info) + struct netdev_lag_upper_info *lag_upper_info, + struct netlink_ext_ack *extack) { u16 lag_id; - if (mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id) != 0) + if (mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id) != 0) { + NL_SET_ERR_MSG(extack, + "spectrum: Exceeded number of supported LAG devices"); return false; - if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) + } + if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) { + NL_SET_ERR_MSG(extack, + "spectrum: LAG device using unsupported Tx type"); return false; + } return true; } @@ -4231,6 +4238,7 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, { struct netdev_notifier_changeupper_info *info; struct mlxsw_sp_port *mlxsw_sp_port; + struct netlink_ext_ack *extack; struct net_device *upper_dev; struct mlxsw_sp *mlxsw_sp; int err = 0; @@ -4238,6 +4246,7 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, mlxsw_sp_port = netdev_priv(dev); mlxsw_sp = mlxsw_sp_port->mlxsw_sp; info = ptr; + extack = netdev_notifier_info_to_extack(&info->info); switch (event) { case NETDEV_PRECHANGEUPPER: @@ -4245,25 +4254,43 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, if (!is_vlan_dev(upper_dev) && !netif_is_lag_master(upper_dev) && !netif_is_bridge_master(upper_dev) && - !netif_is_ovs_master(upper_dev)) + !netif_is_ovs_master(upper_dev)) { + NL_SET_ERR_MSG(extack, + "spectrum: Unknown upper device type"); return -EINVAL; + } if (!info->linking) break; - if (netdev_has_any_upper_dev(upper_dev)) + if (netdev_has_any_upper_dev(upper_dev)) { + NL_SET_ERR_MSG(extack, + "spectrum: Enslaving a port to a device that already has an upper device is not supported"); return -EINVAL; + } if (netif_is_lag_master(upper_dev) && !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev, - info->upper_info)) + info->upper_info, extack)) return -EINVAL; - if (netif_is_lag_master(upper_dev) && vlan_uses_dev(dev)) + if (netif_is_lag_master(upper_dev) && vlan_uses_dev(dev)) { + NL_SET_ERR_MSG(extack, + "spectrum: Master device is a LAG master and this device has a VLAN"); return -EINVAL; + } if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) && - !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) + !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) { + NL_SET_ERR_MSG(extack, + "spectrum: Can not put a VLAN on a LAG port"); return -EINVAL; - if (netif_is_ovs_master(upper_dev) && vlan_uses_dev(dev)) + } + if (netif_is_ovs_master(upper_dev) && vlan_uses_dev(dev)) { + NL_SET_ERR_MSG(extack, + "spectrum: Master device is an OVS master and this device has a VLAN"); return -EINVAL; - if (netif_is_ovs_port(dev) && is_vlan_dev(upper_dev)) + } + if (netif_is_ovs_port(dev) && is_vlan_dev(upper_dev)) { + NL_SET_ERR_MSG(extack, + "spectrum: Can not put a VLAN on an OVS port"); return -EINVAL; + } break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; From 
a92bb546cff0b31d96d5919ebfeda9c678756b42 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 4 Oct 2017 20:10:03 -0700 Subject: [PATCH 1301/1322] tools: rename tools/net directory to tools/bpf We currently only have BPF tools in the tools/net directory. We are about to add more BPF tools there, not necessarily networking related, rename the directory and related Makefile targets to bpf. Suggested-by: Daniel Borkmann Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- MAINTAINERS | 3 +-- tools/Makefile | 14 +++++++------- tools/{net => bpf}/Makefile | 0 tools/{net => bpf}/bpf_asm.c | 0 tools/{net => bpf}/bpf_dbg.c | 0 tools/{net => bpf}/bpf_exp.l | 0 tools/{net => bpf}/bpf_exp.y | 0 tools/{net => bpf}/bpf_jit_disasm.c | 0 8 files changed, 8 insertions(+), 9 deletions(-) rename tools/{net => bpf}/Makefile (100%) rename tools/{net => bpf}/bpf_asm.c (100%) rename tools/{net => bpf}/bpf_dbg.c (100%) rename tools/{net => bpf}/bpf_exp.l (100%) rename tools/{net => bpf}/bpf_exp.y (100%) rename tools/{net => bpf}/bpf_jit_disasm.c (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 5231392cf4bd..004816a585b8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2725,7 +2725,7 @@ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ -F: tools/net/bpf* +F: tools/bpf/ F: tools/testing/selftests/bpf/ BROADCOM B44 10/100 ETHERNET DRIVER @@ -9416,7 +9416,6 @@ F: include/uapi/linux/in.h F: include/uapi/linux/net.h F: include/uapi/linux/netdevice.h F: include/uapi/linux/net_namespace.h -F: tools/net/ F: tools/testing/selftests/net/ F: lib/random32.c diff --git a/tools/Makefile b/tools/Makefile index 9dfede37c8ff..df6fcb293fbc 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -19,7 +19,7 @@ help: @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' @echo ' liblockdep - user-space wrapper for kernel locking-validator' - @echo ' net - misc networking tools' + @echo ' bpf - misc BPF tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' @echo ' spi - spi tools' @@ -57,7 +57,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -cgroup firewire hv guest spi usb virtio vm net iio gpio objtool leds: FORCE +cgroup firewire hv guest spi usb virtio vm bpf iio gpio objtool leds: FORCE $(call descend,$@) liblockdep: FORCE @@ -91,7 +91,7 @@ kvm_stat: FORCE all: acpi cgroup cpupower gpio hv firewire liblockdep \ perf selftests spi turbostat usb \ - virtio vm net x86_energy_perf_policy \ + virtio vm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat acpi_install: @@ -100,7 +100,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install firewire_install gpio_install hv_install iio_install perf_install spi_install usb_install virtio_install vm_install net_install objtool_install: +cgroup_install firewire_install gpio_install hv_install iio_install perf_install spi_install usb_install virtio_install vm_install bpf_install objtool_install: $(call descend,$(@:_install=),install) liblockdep_install: @@ -124,7 +124,7 @@ kvm_stat_install: install: acpi_install cgroup_install cpupower_install gpio_install \ hv_install firewire_install iio_install liblockdep_install \ perf_install selftests_install turbostat_install usb_install \ - virtio_install vm_install net_install x86_energy_perf_policy_install \ + virtio_install vm_install bpf_install 
x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install acpi_clean: @@ -133,7 +133,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: +cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean: $(call descend,$(@:_clean=),clean) liblockdep_clean: @@ -169,7 +169,7 @@ build_clean: clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ - vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ + vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \ gpio_clean objtool_clean leds_clean diff --git a/tools/net/Makefile b/tools/bpf/Makefile similarity index 100% rename from tools/net/Makefile rename to tools/bpf/Makefile diff --git a/tools/net/bpf_asm.c b/tools/bpf/bpf_asm.c similarity index 100% rename from tools/net/bpf_asm.c rename to tools/bpf/bpf_asm.c diff --git a/tools/net/bpf_dbg.c b/tools/bpf/bpf_dbg.c similarity index 100% rename from tools/net/bpf_dbg.c rename to tools/bpf/bpf_dbg.c diff --git a/tools/net/bpf_exp.l b/tools/bpf/bpf_exp.l similarity index 100% rename from tools/net/bpf_exp.l rename to tools/bpf/bpf_exp.l diff --git a/tools/net/bpf_exp.y b/tools/bpf/bpf_exp.y similarity index 100% rename from tools/net/bpf_exp.y rename to tools/bpf/bpf_exp.y diff --git a/tools/net/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c similarity index 100% rename from tools/net/bpf_jit_disasm.c rename to tools/bpf/bpf_jit_disasm.c From 71bb428fe2c19512ac671d5ee16ef3e73e1b49a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 4 Oct 2017 20:10:04 -0700 Subject: [PATCH 1302/1322] tools: bpf: add bpftool Add a simple tool for querying and updating BPF objects on the system. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- tools/bpf/Makefile | 18 +- tools/bpf/bpftool/Makefile | 80 ++++ tools/bpf/bpftool/common.c | 216 ++++++++++ tools/bpf/bpftool/jit_disasm.c | 87 ++++ tools/bpf/bpftool/main.c | 212 ++++++++++ tools/bpf/bpftool/main.h | 99 +++++ tools/bpf/bpftool/map.c | 744 +++++++++++++++++++++++++++++++++ tools/bpf/bpftool/prog.c | 456 ++++++++++++++++++++ 8 files changed, 1909 insertions(+), 3 deletions(-) create mode 100644 tools/bpf/bpftool/Makefile create mode 100644 tools/bpf/bpftool/common.c create mode 100644 tools/bpf/bpftool/jit_disasm.c create mode 100644 tools/bpf/bpftool/main.c create mode 100644 tools/bpf/bpftool/main.h create mode 100644 tools/bpf/bpftool/map.c create mode 100644 tools/bpf/bpftool/prog.c diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index ddf888010652..325a35e1c28e 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -3,6 +3,7 @@ prefix = /usr CC = gcc LEX = flex YACC = bison +MAKE = make CFLAGS += -Wall -O2 CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include @@ -13,7 +14,7 @@ CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include %.lex.c: %.l $(LEX) -o $@ $< -all : bpf_jit_disasm bpf_dbg bpf_asm +all: bpf_jit_disasm bpf_dbg bpf_asm bpftool bpf_jit_disasm : CFLAGS += -DPACKAGE='bpf_jit_disasm' bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl @@ -26,10 +27,21 @@ bpf_asm : LDLIBS = bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o bpf_exp.lex.o : bpf_exp.yacc.c -clean : +clean: bpftool_clean rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.* -install : +install: bpftool_install install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm install bpf_dbg $(prefix)/bin/bpf_dbg install bpf_asm $(prefix)/bin/bpf_asm + +bpftool: + $(MAKE) -C bpftool + +bpftool_install: + $(MAKE) -C bpftool install + +bpftool_clean: + $(MAKE) -C bpftool clean + +.PHONY: bpftool FORCE diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile new file mode 100644 index 000000000000..a7151f47fb40 --- /dev/null +++ b/tools/bpf/bpftool/Makefile @@ -0,0 +1,80 @@ +include ../../scripts/Makefile.include + +include ../../scripts/utilities.mak + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) +endif + +ifneq ($(objtree),) +#$(info Determined 'objtree' to be $(objtree)) +endif + +ifneq ($(OUTPUT),) +#$(info Determined 'OUTPUT' to be $(OUTPUT)) +# Adding $(OUTPUT) as a directory to look for source files, +# because use generated output files as sources dependency +# for flex/bison parsers. 
+VPATH += $(OUTPUT) +export VPATH +endif + +ifeq ($(V),1) + Q = +else + Q = @ +endif + +BPF_DIR = $(srctree)/tools/lib/bpf/ + +ifneq ($(OUTPUT),) + BPF_PATH=$(OUTPUT) +else + BPF_PATH=$(BPF_DIR) +endif + +LIBBPF = $(BPF_PATH)libbpf.a + +$(LIBBPF): FORCE + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) + +$(LIBBPF)-clean: + $(call QUIET_CLEAN, libbpf) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) clean >/dev/null + +prefix = /usr + +CC = gcc + +CFLAGS += -O2 +CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow +CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf +LIBS = -lelf -lbfd -lopcodes $(LIBBPF) + +include $(wildcard *.d) + +all: $(OUTPUT)bpftool + +SRCS=$(wildcard *.c) +OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) + +$(OUTPUT)bpftool: $(OBJS) $(LIBBPF) + $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS) + +$(OUTPUT)%.o: %.c + $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< + +clean: $(LIBBPF)-clean + $(call QUIET_CLEAN, bpftool) + $(Q)rm -rf $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d + +install: + install $(OUTPUT)bpftool $(prefix)/sbin/bpftool + +FORCE: + +.PHONY: all clean FORCE +.DEFAULT_GOAL := all diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c new file mode 100644 index 000000000000..df8396a0c400 --- /dev/null +++ b/tools/bpf/bpftool/common.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Author: Jakub Kicinski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +static bool is_bpffs(char *path) +{ + struct statfs st_fs; + + if (statfs(path, &st_fs) < 0) + return false; + + return (unsigned long)st_fs.f_type == BPF_FS_MAGIC; +} + +int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type) +{ + enum bpf_obj_type type; + int fd; + + fd = bpf_obj_get(path); + if (fd < 0) { + err("bpf obj get (%s): %s\n", path, + errno == EACCES && !is_bpffs(dirname(path)) ? 
+ "directory not in bpf file system (bpffs)" : + strerror(errno)); + return -1; + } + + type = get_fd_type(fd); + if (type < 0) { + close(fd); + return type; + } + if (type != exp_type) { + err("incorrect object type: %s\n", get_fd_type_name(type)); + close(fd); + return -1; + } + + return fd; +} + +int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)) +{ + unsigned int id; + char *endptr; + int err; + int fd; + + if (!is_prefix(*argv, "id")) { + err("expected 'id' got %s\n", *argv); + return -1; + } + NEXT_ARG(); + + id = strtoul(*argv, &endptr, 0); + if (*endptr) { + err("can't parse %s as ID\n", *argv); + return -1; + } + NEXT_ARG(); + + if (argc != 1) + usage(); + + fd = get_fd_by_id(id); + if (fd < 0) { + err("can't get prog by id (%u): %s\n", id, strerror(errno)); + return -1; + } + + err = bpf_obj_pin(fd, *argv); + close(fd); + if (err) { + err("can't pin the object (%s): %s\n", *argv, + errno == EACCES && !is_bpffs(dirname(*argv)) ? + "directory not in bpf file system (bpffs)" : + strerror(errno)); + return -1; + } + + return 0; +} + +const char *get_fd_type_name(enum bpf_obj_type type) +{ + static const char * const names[] = { + [BPF_OBJ_UNKNOWN] = "unknown", + [BPF_OBJ_PROG] = "prog", + [BPF_OBJ_MAP] = "map", + }; + + if (type < 0 || type >= ARRAY_SIZE(names) || !names[type]) + return names[BPF_OBJ_UNKNOWN]; + + return names[type]; +} + +int get_fd_type(int fd) +{ + char path[PATH_MAX]; + char buf[512]; + ssize_t n; + + snprintf(path, sizeof(path), "/proc/%d/fd/%d", getpid(), fd); + + n = readlink(path, buf, sizeof(buf)); + if (n < 0) { + err("can't read link type: %s\n", strerror(errno)); + return -1; + } + if (n == sizeof(path)) { + err("can't read link type: path too long!\n"); + return -1; + } + + if (strstr(buf, "bpf-map")) + return BPF_OBJ_MAP; + else if (strstr(buf, "bpf-prog")) + return BPF_OBJ_PROG; + + return BPF_OBJ_UNKNOWN; +} + +char *get_fdinfo(int fd, const char *key) +{ + char path[PATH_MAX]; + char *line = NULL; + size_t line_n = 0; + ssize_t n; + FILE *fdi; + + snprintf(path, sizeof(path), "/proc/%d/fdinfo/%d", getpid(), fd); + + fdi = fopen(path, "r"); + if (!fdi) { + err("can't open fdinfo: %s\n", strerror(errno)); + return NULL; + } + + while ((n = getline(&line, &line_n, fdi))) { + char *value; + int len; + + if (!strstr(line, key)) + continue; + + fclose(fdi); + + value = strchr(line, '\t'); + if (!value || !value[1]) { + err("malformed fdinfo!?\n"); + free(line); + return NULL; + } + value++; + + len = strlen(value); + memmove(line, value, len); + line[len - 1] = '\0'; + + return line; + } + + err("key '%s' not found in fdinfo\n", key); + free(line); + fclose(fdi); + return NULL; +} diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c new file mode 100644 index 000000000000..70e480b59e9d --- /dev/null +++ b/tools/bpf/bpftool/jit_disasm.c @@ -0,0 +1,87 @@ +/* + * Based on: + * + * Minimal BPF JIT image disassembler + * + * Disassembles BPF JIT compiler emitted opcodes back to asm insn's for + * debugging or verification purposes. 
+ * + * Copyright 2013 Daniel Borkmann + * Licensed under the GNU General Public License, version 2.0 (GPLv2) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void get_exec_path(char *tpath, size_t size) +{ + ssize_t len; + char *path; + + snprintf(tpath, size, "/proc/%d/exe", (int) getpid()); + tpath[size - 1] = 0; + + path = strdup(tpath); + assert(path); + + len = readlink(path, tpath, size - 1); + assert(len > 0); + tpath[len] = 0; + + free(path); +} + +void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes) +{ + disassembler_ftype disassemble; + struct disassemble_info info; + int count, i, pc = 0; + char tpath[256]; + bfd *bfdf; + + if (!len) + return; + + memset(tpath, 0, sizeof(tpath)); + get_exec_path(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + assert(bfdf); + assert(bfd_check_format(bfdf, bfd_object)); + + init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + info.buffer = image; + info.buffer_length = len; + + disassemble_init_for_target(&info); + + disassemble = disassembler(bfdf); + assert(disassemble); + + do { + printf("%4x:\t", pc); + + count = disassemble(pc, &info); + + if (opcodes) { + printf("\n\t"); + for (i = 0; i < count; ++i) + printf("%02x ", (uint8_t) image[pc + i]); + } + printf("\n"); + + pc += count; + } while (count > 0 && pc < len); + + bfd_close(bfdf); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c new file mode 100644 index 000000000000..e02d00d6e00b --- /dev/null +++ b/tools/bpf/bpftool/main.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/* Author: Jakub Kicinski */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +const char *bin_name; +static int last_argc; +static char **last_argv; +static int (*last_do_help)(int argc, char **argv); + +void usage(void) +{ + last_do_help(last_argc - 1, last_argv + 1); + + exit(-1); +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %s OBJECT { COMMAND | help }\n" + " %s batch file FILE\n" + "\n" + " OBJECT := { prog | map }\n", + bin_name, bin_name); + + return 0; +} + +int cmd_select(const struct cmd *cmds, int argc, char **argv, + int (*help)(int argc, char **argv)) +{ + unsigned int i; + + last_argc = argc; + last_argv = argv; + last_do_help = help; + + if (argc < 1 && cmds[0].func) + return cmds[0].func(argc, argv); + + for (i = 0; cmds[i].func; i++) + if (is_prefix(*argv, cmds[i].cmd)) + return cmds[i].func(argc - 1, argv + 1); + + help(argc - 1, argv + 1); + + return -1; +} + +bool is_prefix(const char *pfx, const char *str) +{ + if (!pfx) + return false; + if (strlen(str) < strlen(pfx)) + return false; + + return !memcmp(str, pfx, strlen(pfx)); +} + +void print_hex(void *arg, unsigned int n, const char *sep) +{ + unsigned char *data = arg; + unsigned int i; + + for (i = 0; i < n; i++) { + const char *pfx = ""; + + if (!i) + /* nothing */; + else if (!(i % 16)) + printf("\n"); + else if (!(i % 8)) + printf(" "); + else + pfx = sep; + + printf("%s%02hhx", i ? pfx : "", data[i]); + } +} + +static int do_batch(int argc, char **argv); + +static const struct cmd cmds[] = { + { "help", do_help }, + { "batch", do_batch }, + { "prog", do_prog }, + { "map", do_map }, + { 0 } +}; + +static int do_batch(int argc, char **argv) +{ + unsigned int lines = 0; + char *n_argv[4096]; + char buf[65536]; + int n_argc; + FILE *fp; + int err; + + if (argc < 2) { + err("too few parameters for batch\n"); + return -1; + } else if (!is_prefix(*argv, "file")) { + err("expected 'file', got: %s\n", *argv); + return -1; + } else if (argc > 2) { + err("too many parameters for batch\n"); + return -1; + } + NEXT_ARG(); + + fp = fopen(*argv, "r"); + if (!fp) { + err("Can't open file (%s): %s\n", *argv, strerror(errno)); + return -1; + } + + while (fgets(buf, sizeof(buf), fp)) { + if (strlen(buf) == sizeof(buf) - 1) { + errno = E2BIG; + break; + } + + n_argc = 0; + n_argv[n_argc] = strtok(buf, " \t\n"); + + while (n_argv[n_argc]) { + n_argc++; + if (n_argc == ARRAY_SIZE(n_argv)) { + err("line %d has too many arguments, skip\n", + lines); + n_argc = 0; + break; + } + n_argv[n_argc] = strtok(NULL, " \t\n"); + } + + if (!n_argc) + continue; + + err = cmd_select(cmds, n_argc, n_argv, do_help); + if (err) + goto err_close; + + lines++; + } + + if (errno && errno != ENOENT) { + perror("reading batch file failed"); + err = -1; + } else { + info("processed %d lines\n", lines); + err = 0; + } +err_close: + fclose(fp); + + return err; +} + +int main(int argc, char **argv) +{ + bin_name = argv[0]; + NEXT_ARG(); + + bfd_init(); + + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h new file mode 100644 index 000000000000..85d2d7870a58 --- /dev/null +++ b/tools/bpf/bpftool/main.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. 
You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Author: Jakub Kicinski */ + +#ifndef __BPF_TOOL_H +#define __BPF_TOOL_H + +#include +#include +#include + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +#define err(msg...) fprintf(stderr, "Error: " msg) +#define warn(msg...) fprintf(stderr, "Warning: " msg) +#define info(msg...) fprintf(stderr, msg) + +#define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) + +#define min(a, b) \ + ({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _b : _a; }) +#define max(a, b) \ + ({ typeof(a) _a = (a); typeof(b) _b = (b); _a < _b ? _b : _a; }) + +#define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); }) +#define NEXT_ARGP() ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); }) +#define BAD_ARG() ({ err("what is '%s'?\n", *argv); -1; }) + +#define BPF_TAG_FMT "%02hhx:%02hhx:%02hhx:%02hhx:" \ + "%02hhx:%02hhx:%02hhx:%02hhx" + +#define HELP_SPEC_PROGRAM \ + "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG }" + +enum bpf_obj_type { + BPF_OBJ_UNKNOWN, + BPF_OBJ_PROG, + BPF_OBJ_MAP, +}; + +extern const char *bin_name; + +bool is_prefix(const char *pfx, const char *str); +void print_hex(void *arg, unsigned int n, const char *sep); +void usage(void) __attribute__((noreturn)); + +struct cmd { + const char *cmd; + int (*func)(int argc, char **argv); +}; + +int cmd_select(const struct cmd *cmds, int argc, char **argv, + int (*help)(int argc, char **argv)); + +int get_fd_type(int fd); +const char *get_fd_type_name(enum bpf_obj_type type); +char *get_fdinfo(int fd, const char *key); +int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type); +int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)); + +int do_prog(int argc, char **arg); +int do_map(int argc, char **arg); + +int prog_parse_fd(int *argc, char ***argv); + +void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes); + +#endif diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c new file mode 100644 index 000000000000..0528a5379e6c --- /dev/null +++ b/tools/bpf/bpftool/map.c @@ -0,0 +1,744 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. 
+ * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Author: Jakub Kicinski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +static const char * const map_type_name[] = { + [BPF_MAP_TYPE_UNSPEC] = "unspec", + [BPF_MAP_TYPE_HASH] = "hash", + [BPF_MAP_TYPE_ARRAY] = "array", + [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", + [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", + [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", + [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", + [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", + [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", + [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", + [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", + [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", + [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", + [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", + [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_SOCKMAP] = "sockmap", +}; + +static unsigned int get_possible_cpus(void) +{ + static unsigned int result; + char buf[128]; + long int n; + char *ptr; + int fd; + + if (result) + return result; + + fd = open("/sys/devices/system/cpu/possible", O_RDONLY); + if (fd < 0) { + err("can't open sysfs possible cpus\n"); + exit(-1); + } + + n = read(fd, buf, sizeof(buf)); + if (n < 2) { + err("can't read sysfs possible cpus\n"); + exit(-1); + } + close(fd); + + if (n == sizeof(buf)) { + err("read sysfs possible cpus overflow\n"); + exit(-1); + } + + ptr = buf; + n = 0; + while (*ptr && *ptr != '\n') { + unsigned int a, b; + + if (sscanf(ptr, "%u-%u", &a, &b) == 2) { + n += b - a + 1; + + ptr = strchr(ptr, '-') + 1; + } else if (sscanf(ptr, "%u", &a) == 1) { + n++; + } else { + assert(0); + } + + while (isdigit(*ptr)) + ptr++; + if (*ptr == ',') + ptr++; + } + + result = n; + + return result; +} + +static bool map_is_per_cpu(__u32 type) +{ + return type == BPF_MAP_TYPE_PERCPU_HASH || + type == BPF_MAP_TYPE_PERCPU_ARRAY || + type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool map_is_map_of_maps(__u32 type) +{ + return type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static bool map_is_map_of_progs(__u32 type) +{ + return type == BPF_MAP_TYPE_PROG_ARRAY; +} + +static void *alloc_value(struct bpf_map_info *info) +{ + if (map_is_per_cpu(info->type)) + return malloc(info->value_size * get_possible_cpus()); + else + return malloc(info->value_size); +} + +static int map_parse_fd(int *argc, char ***argv) +{ + int fd; + + if (is_prefix(**argv, 
"id")) { + unsigned int id; + char *endptr; + + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + err("can't parse %s as ID\n", **argv); + return -1; + } + NEXT_ARGP(); + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) + err("get map by id (%u): %s\n", id, strerror(errno)); + return fd; + } else if (is_prefix(**argv, "pinned")) { + char *path; + + NEXT_ARGP(); + + path = **argv; + NEXT_ARGP(); + + return open_obj_pinned_any(path, BPF_OBJ_MAP); + } + + err("expected 'id' or 'pinned', got: '%s'?\n", **argv); + return -1; +} + +static int +map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len) +{ + int err; + int fd; + + fd = map_parse_fd(argc, argv); + if (fd < 0) + return -1; + + err = bpf_obj_get_info_by_fd(fd, info, info_len); + if (err) { + err("can't get map info: %s\n", strerror(errno)); + close(fd); + return err; + } + + return fd; +} + +static void print_entry(struct bpf_map_info *info, unsigned char *key, + unsigned char *value) +{ + if (!map_is_per_cpu(info->type)) { + bool single_line, break_names; + + break_names = info->key_size > 16 || info->value_size > 16; + single_line = info->key_size + info->value_size <= 24 && + !break_names; + + printf("key:%c", break_names ? '\n' : ' '); + print_hex(key, info->key_size, " "); + + printf(single_line ? " " : "\n"); + + printf("value:%c", break_names ? '\n' : ' '); + print_hex(value, info->value_size, " "); + + printf("\n"); + } else { + unsigned int i, n; + + n = get_possible_cpus(); + + printf("key:\n"); + print_hex(key, info->key_size, " "); + printf("\n"); + for (i = 0; i < n; i++) { + printf("value (CPU %02d):%c", + i, info->value_size > 16 ? '\n' : ' '); + print_hex(value + i * info->value_size, + info->value_size, " "); + printf("\n"); + } + } +} + +static char **parse_bytes(char **argv, const char *name, unsigned char *val, + unsigned int n) +{ + unsigned int i = 0; + char *endptr; + + while (i < n && argv[i]) { + val[i] = strtoul(argv[i], &endptr, 0); + if (*endptr) { + err("error parsing byte: %s\n", argv[i]); + break; + } + i++; + } + + if (i != n) { + err("%s expected %d bytes got %d\n", name, n, i); + return NULL; + } + + return argv + i; +} + +static int parse_elem(char **argv, struct bpf_map_info *info, + void *key, void *value, __u32 key_size, __u32 value_size, + __u32 *flags, __u32 **value_fd) +{ + if (!*argv) { + if (!key && !value) + return 0; + err("did not find %s\n", key ? 
"key" : "value"); + return -1; + } + + if (is_prefix(*argv, "key")) { + if (!key) { + if (key_size) + err("duplicate key\n"); + else + err("unnecessary key\n"); + return -1; + } + + argv = parse_bytes(argv + 1, "key", key, key_size); + if (!argv) + return -1; + + return parse_elem(argv, info, NULL, value, key_size, value_size, + flags, value_fd); + } else if (is_prefix(*argv, "value")) { + int fd; + + if (!value) { + if (value_size) + err("duplicate value\n"); + else + err("unnecessary value\n"); + return -1; + } + + argv++; + + if (map_is_map_of_maps(info->type)) { + int argc = 2; + + if (value_size != 4) { + err("value smaller than 4B for map in map?\n"); + return -1; + } + if (!argv[0] || !argv[1]) { + err("not enough value arguments for map in map\n"); + return -1; + } + + fd = map_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + *value_fd = value; + **value_fd = fd; + } else if (map_is_map_of_progs(info->type)) { + int argc = 2; + + if (value_size != 4) { + err("value smaller than 4B for map of progs?\n"); + return -1; + } + if (!argv[0] || !argv[1]) { + err("not enough value arguments for map of progs\n"); + return -1; + } + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + *value_fd = value; + **value_fd = fd; + } else { + argv = parse_bytes(argv, "value", value, value_size); + if (!argv) + return -1; + } + + return parse_elem(argv, info, key, NULL, key_size, value_size, + flags, NULL); + } else if (is_prefix(*argv, "any") || is_prefix(*argv, "noexist") || + is_prefix(*argv, "exist")) { + if (!flags) { + err("flags specified multiple times: %s\n", *argv); + return -1; + } + + if (is_prefix(*argv, "any")) + *flags = BPF_ANY; + else if (is_prefix(*argv, "noexist")) + *flags = BPF_NOEXIST; + else if (is_prefix(*argv, "exist")) + *flags = BPF_EXIST; + + return parse_elem(argv + 1, info, key, value, key_size, + value_size, NULL, value_fd); + } + + err("expected key or value, got: %s\n", *argv); + return -1; +} + +static int show_map_close(int fd, struct bpf_map_info *info) +{ + char *memlock; + + memlock = get_fdinfo(fd, "memlock"); + close(fd); + + printf("%u: ", info->id); + if (info->type < ARRAY_SIZE(map_type_name)) + printf("%s ", map_type_name[info->type]); + else + printf("type %u ", info->type); + + if (*info->name) + printf("name %s ", info->name); + + printf("flags 0x%x\n", info->map_flags); + printf("\tkey %uB value %uB max_entries %u", + info->key_size, info->value_size, info->max_entries); + + if (memlock) + printf(" memlock %sB", memlock); + free(memlock); + + printf("\n"); + + return 0; +} + +static int do_show(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 id = 0; + int err; + int fd; + + if (argc == 2) { + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + return show_map_close(fd, &info); + } + + if (argc) + return BAD_ARG(); + + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + err("can't get next map: %s\n", strerror(errno)); + if (errno == EINVAL) + err("kernel too old?\n"); + return -1; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + err("can't get map by id (%u): %s\n", + id, strerror(errno)); + return -1; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err("can't get map info: %s\n", strerror(errno)); + close(fd); + return -1; + } + + show_map_close(fd, &info); + } + + return errno == ENOENT ? 
0 : -1; +} + +static int do_dump(int argc, char **argv) +{ + void *key, *value, *prev_key; + unsigned int num_elems = 0; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int err; + int fd; + + if (argc != 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) { + err("Dumping maps of maps and program maps not supported\n"); + close(fd); + return -1; + } + + key = malloc(info.key_size); + value = alloc_value(&info); + if (!key || !value) { + err("mem alloc failed\n"); + err = -1; + goto exit_free; + } + + prev_key = NULL; + while (true) { + err = bpf_map_get_next_key(fd, prev_key, key); + if (err) { + if (errno == ENOENT) + err = 0; + break; + } + + if (!bpf_map_lookup_elem(fd, key, value)) { + print_entry(&info, key, value); + } else { + info("can't lookup element with key: "); + print_hex(key, info.key_size, " "); + printf("\n"); + } + + prev_key = key; + num_elems++; + } + + printf("Found %u element%s\n", num_elems, num_elems != 1 ? "s" : ""); + +exit_free: + free(key); + free(value); + close(fd); + + return err; +} + +static int do_update(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 *value_fd = NULL; + __u32 flags = BPF_ANY; + void *key, *value; + int fd, err; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + key = malloc(info.key_size); + value = alloc_value(&info); + if (!key || !value) { + err("mem alloc failed"); + err = -1; + goto exit_free; + } + + err = parse_elem(argv, &info, key, value, info.key_size, + info.value_size, &flags, &value_fd); + if (err) + goto exit_free; + + err = bpf_map_update_elem(fd, key, value, flags); + if (err) { + err("update failed: %s\n", strerror(errno)); + goto exit_free; + } + +exit_free: + if (value_fd) + close(*value_fd); + free(key); + free(value); + close(fd); + + return err; +} + +static int do_lookup(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key, *value; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + key = malloc(info.key_size); + value = alloc_value(&info); + if (!key || !value) { + err("mem alloc failed"); + err = -1; + goto exit_free; + } + + err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL); + if (err) + goto exit_free; + + err = bpf_map_lookup_elem(fd, key, value); + if (!err) { + print_entry(&info, key, value); + } else if (errno == ENOENT) { + printf("key:\n"); + print_hex(key, info.key_size, " "); + printf("\n\nNot found\n"); + } else { + err("lookup failed: %s\n", strerror(errno)); + } + +exit_free: + free(key); + free(value); + close(fd); + + return err; +} + +static int do_getnext(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key, *nextkey; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + key = malloc(info.key_size); + nextkey = malloc(info.key_size); + if (!key || !nextkey) { + err("mem alloc failed"); + err = -1; + goto exit_free; + } + + if (argc) { + err = parse_elem(argv, &info, key, NULL, info.key_size, 0, + NULL, NULL); + if (err) + goto exit_free; + } else { + free(key); + key = NULL; + } + + err = bpf_map_get_next_key(fd, key, nextkey); + if (err) { + err("can't get next key: 
%s\n", strerror(errno)); + goto exit_free; + } + + if (key) { + printf("key:\n"); + print_hex(key, info.key_size, " "); + printf("\n"); + } else { + printf("key: None\n"); + } + + printf("next key:\n"); + print_hex(nextkey, info.key_size, " "); + printf("\n"); + +exit_free: + free(nextkey); + free(key); + close(fd); + + return err; +} + +static int do_delete(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + key = malloc(info.key_size); + if (!key) { + err("mem alloc failed"); + err = -1; + goto exit_free; + } + + err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL); + if (err) + goto exit_free; + + err = bpf_map_delete_elem(fd, key); + if (err) + err("delete failed: %s\n", strerror(errno)); + +exit_free: + free(key); + close(fd); + + return err; +} + +static int do_pin(int argc, char **argv) +{ + return do_pin_any(argc, argv, bpf_map_get_fd_by_id); +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %s %s show [MAP]\n" + " %s %s dump MAP\n" + " %s %s update MAP key BYTES value VALUE [UPDATE_FLAGS]\n" + " %s %s lookup MAP key BYTES\n" + " %s %s getnext MAP [key BYTES]\n" + " %s %s delete MAP key BYTES\n" + " %s %s pin MAP FILE\n" + " %s %s help\n" + "\n" + " MAP := { id MAP_ID | pinned FILE }\n" + " " HELP_SPEC_PROGRAM "\n" + " VALUE := { BYTES | MAP | PROG }\n" + " UPDATE_FLAGS := { any | exist | noexist }\n" + "", + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "help", do_help }, + { "dump", do_dump }, + { "update", do_update }, + { "lookup", do_lookup }, + { "getnext", do_getnext }, + { "delete", do_delete }, + { "pin", do_pin }, + { 0 } +}; + +int do_map(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c new file mode 100644 index 000000000000..421ba89ce86a --- /dev/null +++ b/tools/bpf/bpftool/prog.c @@ -0,0 +1,456 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Author: Jakub Kicinski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +static const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", + [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", + [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", + [BPF_PROG_TYPE_XDP] = "xdp", + [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", + [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", + [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", + [BPF_PROG_TYPE_LWT_IN] = "lwt_in", + [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", + [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", + [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", + [BPF_PROG_TYPE_SK_SKB] = "sk_skb", +}; + +static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) +{ + struct timespec real_time_ts, boot_time_ts; + time_t wallclock_secs; + struct tm load_tm; + + buf[--size] = '\0'; + + if (clock_gettime(CLOCK_REALTIME, &real_time_ts) || + clock_gettime(CLOCK_BOOTTIME, &boot_time_ts)) { + perror("Can't read clocks"); + snprintf(buf, size, "%llu", nsecs / 1000000000); + return; + } + + wallclock_secs = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + + nsecs / 1000000000; + + if (!localtime_r(&wallclock_secs, &load_tm)) { + snprintf(buf, size, "%llu", nsecs / 1000000000); + return; + } + + strftime(buf, size, "%b %d/%H:%M", &load_tm); +} + +static int prog_fd_by_tag(unsigned char *tag) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + unsigned int id = 0; + int err; + int fd; + + while (true) { + err = bpf_prog_get_next_id(id, &id); + if (err) { + err("%s\n", strerror(errno)); + return -1; + } + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) { + err("can't get prog by id (%u): %s\n", + id, strerror(errno)); + return -1; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err("can't get prog info (%u): %s\n", + id, strerror(errno)); + close(fd); + return -1; + } + + if (!memcmp(tag, info.tag, BPF_TAG_SIZE)) + return fd; + + close(fd); + } +} + +int prog_parse_fd(int *argc, char ***argv) +{ + int fd; + + if (is_prefix(**argv, "id")) { + unsigned int id; + char *endptr; + + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + err("can't parse %s as ID\n", **argv); + return -1; + } + NEXT_ARGP(); + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) + err("get by id (%u): %s\n", id, strerror(errno)); + return fd; + } else if (is_prefix(**argv, "tag")) { + unsigned char tag[BPF_TAG_SIZE]; + + NEXT_ARGP(); + + if (sscanf(**argv, BPF_TAG_FMT, tag, tag + 1, tag + 2, + tag + 3, tag + 4, tag + 5, tag + 6, tag + 7) + != BPF_TAG_SIZE) { + err("can't parse tag\n"); + return -1; + } + NEXT_ARGP(); + + return prog_fd_by_tag(tag); + } else if (is_prefix(**argv, "pinned")) { + char *path; + + NEXT_ARGP(); + + path = **argv; + NEXT_ARGP(); + + return open_obj_pinned_any(path, BPF_OBJ_PROG); + } + + err("expected 'id', 'tag' or 'pinned', got: '%s'?\n", **argv); + return -1; +} + +static void show_prog_maps(int fd, u32 num_maps) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + __u32 map_ids[num_maps]; + unsigned int i; + int err; + + 
info.nr_map_ids = num_maps; + info.map_ids = ptr_to_u64(map_ids); + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err || !info.nr_map_ids) + return; + + printf(" map_ids "); + for (i = 0; i < info.nr_map_ids; i++) + printf("%u%s", map_ids[i], + i == info.nr_map_ids - 1 ? "" : ","); +} + +static int show_prog(int fd) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + char *memlock; + int err; + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err("can't get prog info: %s\n", strerror(errno)); + return -1; + } + + printf("%u: ", info.id); + if (info.type < ARRAY_SIZE(prog_type_name)) + printf("%s ", prog_type_name[info.type]); + else + printf("type %u ", info.type); + + if (*info.name) + printf("name %s ", info.name); + + printf("tag "); + print_hex(info.tag, BPF_TAG_SIZE, ":"); + printf("\n"); + + if (info.load_time) { + char buf[32]; + + print_boot_time(info.load_time, buf, sizeof(buf)); + + /* Piggy back on load_time, since 0 uid is a valid one */ + printf("\tloaded_at %s uid %u\n", buf, info.created_by_uid); + } + + printf("\txlated %uB", info.xlated_prog_len); + + if (info.jited_prog_len) + printf(" jited %uB", info.jited_prog_len); + else + printf(" not jited"); + + memlock = get_fdinfo(fd, "memlock"); + if (memlock) + printf(" memlock %sB", memlock); + free(memlock); + + if (info.nr_map_ids) + show_prog_maps(fd, info.nr_map_ids); + + printf("\n"); + + return 0; +} + +static int do_show(int argc, char **argv) +{ __u32 id = 0; + int err; + int fd; + + if (argc == 2) { + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + return show_prog(fd); + } + + if (argc) + return BAD_ARG(); + + while (true) { + err = bpf_prog_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + err("can't get next program: %s\n", strerror(errno)); + if (errno == EINVAL) + err("kernel too old?\n"); + return -1; + } + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) { + err("can't get prog by id (%u): %s\n", + id, strerror(errno)); + return -1; + } + + err = show_prog(fd); + close(fd); + if (err) + return err; + } + + return 0; +} + +static int do_dump(int argc, char **argv) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + bool can_disasm = false; + unsigned int buf_size; + char *filepath = NULL; + bool opcodes = false; + unsigned char *buf; + __u32 *member_len; + __u64 *member_ptr; + ssize_t n; + int err; + int fd; + + if (is_prefix(*argv, "jited")) { + member_len = &info.jited_prog_len; + member_ptr = &info.jited_prog_insns; + can_disasm = true; + } else if (is_prefix(*argv, "xlated")) { + member_len = &info.xlated_prog_len; + member_ptr = &info.xlated_prog_insns; + } else { + err("expected 'xlated' or 'jited', got: %s\n", *argv); + return -1; + } + NEXT_ARG(); + + if (argc < 2) + usage(); + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + if (is_prefix(*argv, "file")) { + NEXT_ARG(); + if (!argc) { + err("expected file path\n"); + return -1; + } + + filepath = *argv; + NEXT_ARG(); + } else if (is_prefix(*argv, "opcodes")) { + opcodes = true; + NEXT_ARG(); + } + + if (!filepath && !can_disasm) { + err("expected 'file' got %s\n", *argv); + return -1; + } + if (argc) { + usage(); + return -1; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + err("can't get prog info: %s\n", strerror(errno)); + return -1; + } + + if (!*member_len) { + info("no instructions returned\n"); + close(fd); + return 0; + } + + buf_size = *member_len; + + buf = malloc(buf_size); + if (!buf) { + err("mem alloc failed\n"); 
+ close(fd); + return -1; + } + + memset(&info, 0, sizeof(info)); + + *member_ptr = ptr_to_u64(buf); + *member_len = buf_size; + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + close(fd); + if (err) { + err("can't get prog info: %s\n", strerror(errno)); + goto err_free; + } + + if (*member_len > buf_size) { + info("too many instructions returned\n"); + goto err_free; + } + + if (filepath) { + fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + err("can't open file %s: %s\n", filepath, + strerror(errno)); + goto err_free; + } + + n = write(fd, buf, *member_len); + close(fd); + if (n != *member_len) { + err("error writing output file: %s\n", + n < 0 ? strerror(errno) : "short write"); + goto err_free; + } + } else { + disasm_print_insn(buf, *member_len, opcodes); + } + + free(buf); + + return 0; + +err_free: + free(buf); + return -1; +} + +static int do_pin(int argc, char **argv) +{ + return do_pin_any(argc, argv, bpf_prog_get_fd_by_id); +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %s %s show [PROG]\n" + " %s %s dump xlated PROG file FILE\n" + " %s %s dump jited PROG [file FILE] [opcodes]\n" + " %s %s pin PROG FILE\n" + " %s %s help\n" + "\n" + " " HELP_SPEC_PROGRAM "\n" + "", + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "dump", do_dump }, + { "pin", do_pin }, + { 0 } +}; + +int do_prog(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} From ff69c21a85a4ed3d8dcf79dd750a0a9eda4fca83 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 4 Oct 2017 20:10:05 -0700 Subject: [PATCH 1303/1322] tools: bpftool: add documentation Add documentation for bpftool. Separate files for each subcommand. Use rst format. Documentation is compiled into man pages using rst2man. Signed-off-by: David Beckett Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- tools/bpf/bpftool/Documentation/Makefile | 34 ++++++ .../bpf/bpftool/Documentation/bpftool-map.rst | 110 ++++++++++++++++++ .../bpftool/Documentation/bpftool-prog.rst | 79 +++++++++++++ tools/bpf/bpftool/Documentation/bpftool.rst | 34 ++++++ tools/bpf/bpftool/Makefile | 6 + 5 files changed, 263 insertions(+) create mode 100644 tools/bpf/bpftool/Documentation/Makefile create mode 100644 tools/bpf/bpftool/Documentation/bpftool-map.rst create mode 100644 tools/bpf/bpftool/Documentation/bpftool-prog.rst create mode 100644 tools/bpf/bpftool/Documentation/bpftool.rst diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile new file mode 100644 index 000000000000..bde77d7c4390 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -0,0 +1,34 @@ +include ../../../scripts/Makefile.include +include ../../../scripts/utilities.mak + +INSTALL ?= install +RM ?= rm -f + +# Make the path relative to DESTDIR, not prefix +ifndef DESTDIR +prefix?=$(HOME) +endif +mandir ?= $(prefix)/share/man +man8dir = $(mandir)/man8 + +MAN8_RST = $(wildcard *.rst) + +_DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) +DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) + +man: man8 +man8: $(DOC_MAN8) + +$(OUTPUT)%.8: %.rst + rst2man $< > $@ + +clean: + $(call QUIET_CLEAN, Documentation) $(RM) $(DOC_MAN8) + +install: man + $(call QUIET_INSTALL, Documentation-man) \ + $(INSTALL) -d -m 755 $(DESTDIR)$(man8dir); \ + $(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir); + +.PHONY: man man8 clean install +.DEFAULT_GOAL := man diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst new file mode 100644 index 000000000000..ff63e89e4b6c --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -0,0 +1,110 @@ +================ +bpftool-map +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF maps +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** **map** *COMMAND* + + *COMMANDS* := + { show | dump | update | lookup | getnext | delete | pin | help } + +MAP COMMANDS +============= + +| **bpftool** map show [*MAP*] +| **bpftool** map dump *MAP* +| **bpftool** map update *MAP* key *BYTES* value *VALUE* [*UPDATE_FLAGS*] +| **bpftool** map lookup *MAP* key *BYTES* +| **bpftool** map getnext *MAP* [key *BYTES*] +| **bpftool** map delete *MAP* key *BYTES* +| **bpftool** map pin *MAP* *FILE* +| **bpftool** map help +| +| *MAP* := { id MAP_ID | pinned FILE } +| *VALUE* := { BYTES | MAP | PROGRAM } +| *UPDATE_FLAGS* := { any | exist | noexist } + +DESCRIPTION +=========== + **bpftool map show** [*MAP*] + Show information about loaded maps. If *MAP* is specified + show information only about given map, otherwise list all + maps currently loaded on the system. + + Output will start with map ID followed by map type and + zero or more named attributes (depending on kernel version). + + **bpftool map dump** *MAP* + Dump all entries in a given *MAP*. + + **bpftool map update** *MAP* **key** *BYTES* **value** *VALUE* [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. + + *UPDATE_FLAGS* can be one of: **any** update existing entry + or add if doesn't exit; **exist** update only if entry already + exists; **noexist** update only if entry doesn't exist. + + **bpftool map lookup** *MAP* **key** *BYTES* + Lookup **key** in the map. 
+ + **bpftool map getnext** *MAP* [**key** *BYTES*] + Get next key. If *key* is not specified, get first key. + + **bpftool map delete** *MAP* **key** *BYTES* + Remove entry from the map. + + **bpftool map pin** *MAP* *FILE* + Pin map *MAP* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. + + **bpftool map help** + Print short help message. + +EXAMPLES +======== +**# bpftool map show** +:: + + 10: hash name some_map flags 0x0 + key 4B value 8B max_entries 2048 memlock 167936B + +**# bpftool map update id 10 key 13 00 07 00 value 02 00 00 00 01 02 03 04** + +**# bpftool map lookup id 10 key 0 1 2 3** + +:: + + key: 00 01 02 03 value: 00 01 02 03 04 05 06 07 + + +**# bpftool map dump id 10** +:: + + key: 00 01 02 03 value: 00 01 02 03 04 05 06 07 + key: 0d 00 07 00 value: 02 00 00 00 01 02 03 04 + Found 2 elements + +**# bpftool map getnext id 10 key 0 1 2 3** +:: + + key: + 00 01 02 03 + next key: + 0d 00 07 00 + +| +| **# mount -t bpf none /sys/fs/bpf/** +| **# bpftool map pin id 10 /sys/fs/bpf/map** +| **# bpftool map del pinned /sys/fs/bpf/map key 13 00 07 00** + +SEE ALSO +======== + **bpftool**\ (8), **bpftool-prog**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst new file mode 100644 index 000000000000..57fc4b9924ea --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -0,0 +1,79 @@ +================ +bpftool-prog +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF progs +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + +| **bpftool** prog show [*PROG*] +| **bpftool** prog dump xlated *PROG* file *FILE* +| **bpftool** prog dump jited *PROG* [file *FILE*] [opcodes] +| **bpftool** prog pin *PROG* *FILE* +| **bpftool** prog help +| +| *PROG* := { id *PROG_ID* | pinned *FILE* | tag *PROG_TAG* } + +DESCRIPTION +=========== + **bpftool prog show** [*PROG*] + Show information about loaded programs. If *PROG* is + specified show information only about given program, otherwise + list all programs currently loaded on the system. + + Output will start with program ID followed by program type and + zero or more named attributes (depending on kernel version). + + **bpftool prog dump xlated** *PROG* **file** *FILE* + Dump eBPF instructions of the program from the kernel to a + file. + + **bpftool prog dump jited** *PROG* [**file** *FILE*] [**opcodes**] + Dump jited image (host machine code) of the program. + If *FILE* is specified image will be written to a file, + otherwise it will be disassembled and printed to stdout. + + **opcodes** controls if raw opcodes will be printed. + + **bpftool prog pin** *PROG* *FILE* + Pin program *PROG* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. + + **bpftool prog help** + Print short help message. 
+ +EXAMPLES +======== +**# bpftool prog show** +:: + + 10: xdp name some_prog tag 00:5a:3d:21:23:62:0c:8b + loaded_at Sep 29/20:11 uid 0 + xlated 528B jited 370B memlock 4096B map_ids 10 + +| +| **# bpftool prog dump xlated id 10 file /tmp/t** +| **# ls -l /tmp/t** +| -rw------- 1 root root 560 Jul 22 01:42 /tmp/t + +| +| **# bpftool prog dum jited pinned /sys/fs/bpf/prog** + +:: + + push %rbp + mov %rsp,%rbp + sub $0x228,%rsp + sub $0x28,%rbp + mov %rbx,0x0(%rbp) + + + +SEE ALSO +======== + **bpftool**\ (8), **bpftool-map**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst new file mode 100644 index 000000000000..f1df1893fb54 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -0,0 +1,34 @@ +================ +BPFTOOL +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF programs and maps +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** *OBJECT* { *COMMAND* | help } + + **bpftool** batch file *FILE* + + *OBJECT* := { **map** | **program** } + + *MAP-COMMANDS* := + { show | dump | update | lookup | getnext | delete | pin | help } + + *PROG-COMMANDS* := { show | dump jited | dump xlated | pin | help } + +DESCRIPTION +=========== + *bpftool* allows for inspection and simple modification of BPF objects + on the system. + + Note that format of the output of all tools is not guaranteed to be + stable and should not be depended upon. + +SEE ALSO +======== + **bpftool-map**\ (8), **bpftool-prog**\ (8) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index a7151f47fb40..8705ee44664d 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -74,6 +74,12 @@ clean: $(LIBBPF)-clean install: install $(OUTPUT)bpftool $(prefix)/sbin/bpftool +doc: + $(Q)$(MAKE) -C Documentation/ + +doc-install: + $(Q)$(MAKE) -C Documentation/ install + FORCE: .PHONY: all clean FORCE From 41dcf197ad5373a7dd0a4b6572aec2e3ec6a0e49 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Mon, 2 Oct 2017 17:17:35 -0500 Subject: [PATCH 1304/1322] dm raid: fix incorrect status output at the end of a "recover" process There are three important fields that indicate the overall health and status of an array: dev_health, sync_ratio, and sync_action. They tell us the condition of the devices in the array, and the degree to which the array is synchronized. This commit fixes a condition that is reported incorrectly. When a member of the array is being rebuilt or a new device is added, the "recover" process is used to synchronize it with the rest of the array. When the process is complete, but the sync thread hasn't yet been reaped, it is possible for the state of MD to be: mddev->recovery = [ MD_RECOVERY_RUNNING MD_RECOVERY_RECOVER MD_RECOVERY_DONE ] curr_resync_completed = (but not MaxSector) and all rdevs to be In_sync. This causes the 'array_in_sync' output parameter that is passed to rs_get_progress() to be computed incorrectly and reported as 'false' -- or not in-sync. This in turn causes the dev_health status characters to be reported as all 'a', rather than the proper 'A'. This can cause erroneous output for several seconds at a time when tools will want to be checking the condition due to events that are raised at the end of a sync process. Fix this by properly calculating the 'array_in_sync' return parameter in rs_get_progress(). 
Also, remove an unnecessary intermediate 'recovery_cp' variable in rs_get_progress(). Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 1 + drivers/md/dm-raid.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 4a0a7469fdd7..32df07e29f68 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -344,3 +344,4 @@ Version History (wrong raid10_copies/raid10_format sequence) 1.11.1 Add raid4/5/6 journal write-back support via journal_mode option 1.12.1 fix for MD deadlock between mddev_suspend() and md_write_start() available +1.13.0 Fix dev_health status at end of "recover" (was 'a', now 'A') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5bfe285ea9d1..43094ea89e37 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3297,11 +3297,10 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, static sector_t rs_get_progress(struct raid_set *rs, sector_t resync_max_sectors, bool *array_in_sync) { - sector_t r, recovery_cp, curr_resync_completed; + sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - recovery_cp = mddev->recovery_cp; *array_in_sync = false; if (rs_is_raid0(rs)) { @@ -3330,9 +3329,11 @@ static sector_t rs_get_progress(struct raid_set *rs, } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) r = curr_resync_completed; else - r = recovery_cp; + r = mddev->recovery_cp; - if (r == MaxSector) { + if ((r == MaxSector) || + (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ @@ -3892,7 +3893,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 12, 1}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, From 44f209807ee87a5eddf6c0432f3fb63cec27bad8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:50 -0400 Subject: [PATCH 1305/1322] VSOCK: export socket tables for sock_diag interface The socket table symbols need to be exported from vsock.ko so that the vsock_diag.ko module will be able to traverse sockets. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 5 +++++ net/vmw_vsock/af_vsock.c | 10 ++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f9fb566e75cf..30cba806e344 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -27,6 +27,11 @@ #define LAST_RESERVED_PORT 1023 +#define VSOCK_HASH_SIZE 251 +extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +extern spinlock_t vsock_table_lock; + #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index dfc8c51e4d74..9afe4da8c67d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -153,7 +153,6 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. 
*/ -#define VSOCK_HASH_SIZE 251 #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) @@ -168,9 +167,12 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) -static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; -static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; -static DEFINE_SPINLOCK(vsock_table_lock); +struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +EXPORT_SYMBOL_GPL(vsock_bind_table); +struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +EXPORT_SYMBOL_GPL(vsock_connected_table); +DEFINE_SPINLOCK(vsock_table_lock); +EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) From bf359b8127719535f88494adb3c2b73c06667dcd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:51 -0400 Subject: [PATCH 1306/1322] VSOCK: move __vsock_in_bound/connected_table() to af_vsock.h The vsock_diag.ko module will need to check socket table membership. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 12 ++++++++++++ net/vmw_vsock/af_vsock.c | 10 ---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 30cba806e344..3dd217718a2f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -180,6 +180,18 @@ const struct vsock_transport *vsock_core_get_transport(void); /**** UTILS ****/ +/* vsock_table_lock must be held */ +static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->bound_table); +} + +/* vsock_table_lock must be held */ +static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->connected_table); +} + void vsock_release_pending(struct sock *pending); void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9afe4da8c67d..9b179a0081b3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -250,16 +250,6 @@ static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, return NULL; } -static bool __vsock_in_bound_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->bound_table); -} - -static bool __vsock_in_connected_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->connected_table); -} - static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); From 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:52 -0400 Subject: [PATCH 1307/1322] VSOCK: use TCP state constants for sk_state There are two state fields: socket->state and sock->sk_state. The socket->state field uses SS_UNCONNECTED, SS_CONNECTED, etc while the sock->sk_state typically uses values that match TCP state constants (TCP_CLOSE, TCP_ESTABLISHED). AF_VSOCK does not follow this convention and instead uses SS_* constants for both fields. The sk_state field will be exposed to userspace through the vsock_diag interface for ss(8), netstat(8), and other programs. This patch switches sk_state to TCP state constants so that the meaning of this field is consistent with other address families. Not just AF_INET and AF_INET6 use the TCP constants, AF_UNIX and others do too. 
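For illustration only, the practical payoff is that a generic diag consumer such as ss(8) can render an AF_VSOCK sk_state with the same table it already uses for every other family. A minimal userspace sketch (hypothetical helper, not part of this patch; it only assumes the TCP_* state values exposed by <netinet/tcp.h>) looks like the following, and the test suite added later in this series carries an equivalent sock_state_str() helper:

  #include <netinet/tcp.h>	/* TCP_ESTABLISHED, TCP_SYN_SENT, TCP_LISTEN, ... */

  /* Hypothetical helper: render an AF_VSOCK sk_state exactly like any
   * other address family's state would be rendered. */
  static const char *tcp_state_name(int state)
  {
  	switch (state) {
  	case TCP_LISTEN:	return "LISTEN";
  	case TCP_SYN_SENT:	return "SYN-SENT";
  	case TCP_ESTABLISHED:	return "ESTAB";
  	case TCP_CLOSING:	return "CLOSING";
  	case TCP_CLOSE:		return "CLOSE";
  	default:		return "UNKNOWN";
  	}
  }
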
The following mapping was used to convert the code: SS_FREE -> TCP_CLOSE SS_UNCONNECTED -> TCP_CLOSE SS_CONNECTING -> TCP_SYN_SENT SS_CONNECTED -> TCP_ESTABLISHED SS_DISCONNECTING -> TCP_CLOSING VSOCK_SS_LISTEN -> TCP_LISTEN In __vsock_create() the sk_state initialization was dropped because sock_init_data() already initializes sk_state to TCP_CLOSE. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 3 -- net/vmw_vsock/af_vsock.c | 46 ++++++++++++-------- net/vmw_vsock/hyperv_transport.c | 12 ++--- net/vmw_vsock/virtio_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 22 +++++----- net/vmw_vsock/vmci_transport.c | 34 +++++++-------- net/vmw_vsock/vmci_transport_notify.c | 2 +- net/vmw_vsock/vmci_transport_notify_qstate.c | 2 +- 8 files changed, 64 insertions(+), 59 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3dd217718a2f..9324ac2d9ff2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -22,9 +22,6 @@ #include "vsock_addr.h" -/* vsock-specific sock->sk_state constants */ -#define VSOCK_SS_LISTEN 255 - #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9b179a0081b3..98359c19522f 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -36,7 +36,7 @@ * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this - * implementation because they are in the VSOCK_SS_LISTEN state. When a + * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. @@ -82,6 +82,15 @@ * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. + * + * - sk->sk_state uses the TCP state constants because they are widely used by + * other address families and exposed to userspace tools like ss(8): + * + * TCP_CLOSE - unconnected + * TCP_SYN_SENT - connecting + * TCP_ESTABLISHED - connected + * TCP_CLOSING - disconnecting + * TCP_LISTEN - listening */ #include @@ -477,7 +486,7 @@ void vsock_pending_work(struct work_struct *work) if (vsock_in_connected_table(vsk)) vsock_remove_connected(vsk); - sk->sk_state = SS_FREE; + sk->sk_state = TCP_CLOSE; out: release_sock(sk); @@ -617,7 +626,6 @@ struct sock *__vsock_create(struct net *net, sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; - sk->sk_state = 0; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); @@ -891,7 +899,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, /* Listening sockets that have connections in their accept * queue can be read. */ - if (sk->sk_state == VSOCK_SS_LISTEN + if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= POLLIN | POLLRDNORM; @@ -920,7 +928,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. 
*/ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( @@ -942,7 +950,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == SS_UNCONNECTED) { + if (sk->sk_state == TCP_CLOSE) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; @@ -1112,9 +1120,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk = sk_vsock(vsk); lock_sock(sk); - if (sk->sk_state == SS_CONNECTING && + if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); cancel = 1; @@ -1160,7 +1168,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = -EALREADY; break; default: - if ((sk->sk_state == VSOCK_SS_LISTEN) || + if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; @@ -1183,7 +1191,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - sk->sk_state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) @@ -1203,7 +1211,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { + while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection @@ -1226,13 +1234,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; @@ -1243,7 +1251,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; @@ -1276,7 +1284,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, goto out; } - if (listener->sk_state != VSOCK_SS_LISTEN) { + if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } @@ -1366,7 +1374,7 @@ static int vsock_listen(struct socket *sock, int backlog) } sk->sk_max_ack_backlog = backlog; - sk->sk_state = VSOCK_SS_LISTEN; + sk->sk_state = TCP_LISTEN; err = 0; @@ -1546,7 +1554,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { - err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; + err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; goto out; } @@ -1557,7 +1565,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != SS_CONNECTED || + if (sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1681,7 +1689,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, lock_sock(sk); - if (sk->sk_state != SS_CONNECTED) { + if (sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occured with the diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 14ed5a344cdf..bbac023e70d1 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -310,7 +310,7 @@ static void hvs_close_connection(struct vmbus_channel *chan) struct sock *sk = get_per_channel_state(chan); struct vsock_sock *vsk = vsock_sk(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; @@ -344,8 +344,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!sk) return; - if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || - (!conn_from_host && sk->sk_state != SS_CONNECTING)) + if ((conn_from_host && sk->sk_state != TCP_LISTEN) || + (!conn_from_host && sk->sk_state != TCP_SYN_SENT)) goto out; if (conn_from_host) { @@ -357,7 +357,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!new) goto out; - new->sk_state = SS_CONNECTING; + new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); hvs_new = vnew->trans; hvs_new->chan = chan; @@ -384,7 +384,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vmbus_set_chn_rescind_callback(chan, hvs_close_connection); if (conn_from_host) { - new->sk_state = SS_CONNECTED; + new->sk_state = TCP_ESTABLISHED; sk->sk_ack_backlog++; hvs_addr_init(&vnew->local_addr, if_type); @@ -399,7 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vsock_enqueue_accept(sk, new); release_sock(sk); } else { - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsock_sk(sk)); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 403d86e80162..8e03bd3f3668 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -414,7 +414,7 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { lock_sock(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); release_sock(sk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index edba7ab97563..3ae3a33da70b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -708,7 +708,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); if (vsk->close_work_scheduled && @@ -748,8 +748,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; - if (!(sk->sk_state == SS_CONNECTED || - sk->sk_state == SS_DISCONNECTING)) + if (!(sk->sk_state == 
TCP_ESTABLISHED || + sk->sk_state == TCP_CLOSING)) return true; /* Already received SHUTDOWN from peer, reply with RST */ @@ -801,7 +801,7 @@ virtio_transport_recv_connecting(struct sock *sk, switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RESPONSE: - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -821,7 +821,7 @@ virtio_transport_recv_connecting(struct sock *sk, destroy: virtio_transport_reset(vsk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -857,7 +857,7 @@ virtio_transport_recv_connected(struct sock *sk, vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); break; @@ -928,7 +928,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) lock_sock_nested(child, SINGLE_DEPTH_NESTING); - child->sk_state = SS_CONNECTED; + child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), @@ -1016,18 +1016,18 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) sk->sk_write_space(sk); switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: virtio_transport_recv_listen(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: virtio_transport_recv_connecting(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: virtio_transport_recv_connected(sk, pkt); break; - case SS_DISCONNECTING: + case TCP_CLOSING: virtio_transport_recv_disconnecting(sk, pkt); virtio_transport_free_pkt(pkt); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 0206155bff53..391775e3575c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -742,7 +742,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) /* The local context ID may be out of date, update it. */ vsk->local_addr.svm_cid = dst.svm_cid; - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vmci_trans(vsk)->notify_ops->handle_notify_pkt( sk, pkt, true, &dst, &src, &bh_process_pkt); @@ -800,7 +800,9 @@ static void vmci_transport_handle_detach(struct sock *sk) * left in our consume queue. */ if (vsock_stream_has_data(vsk) <= 0) { - if (sk->sk_state == SS_CONNECTING) { + sk->sk_state = TCP_CLOSE; + + if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., * if the peer VM is killed after attaching to @@ -809,12 +811,10 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ - sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } - sk->sk_state = SS_UNCONNECTED; } sk->sk_state_change(sk); } @@ -882,17 +882,17 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work) vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: vmci_transport_recv_listen(sk, pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: /* Processing of pending connections for servers goes through * the listening socket, so see vmci_transport_recv_listen() * for that path. 
*/ vmci_transport_recv_connecting_client(sk, pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: vmci_transport_recv_connected(sk, pkt); break; default: @@ -941,7 +941,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context; switch (pending->sk_state) { - case SS_CONNECTING: + case TCP_SYN_SENT: err = vmci_transport_recv_connecting_server(sk, pending, pkt); @@ -1071,7 +1071,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_add_pending(sk, pending); sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; + pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; vmci_trans(vpending)->queue_pair_size = qp_size; @@ -1196,11 +1196,11 @@ vmci_transport_recv_connecting_server(struct sock *listener, * the socket will be valid until it is removed from the queue. * * If we fail sending the attach below, we remove the socket from the - * connected list and move the socket to SS_UNCONNECTED before + * connected list and move the socket to TCP_CLOSE before * releasing the lock, so a pending slow path processing of an incoming * packet will not see the socket in the connected state in that case. */ - pending->sk_state = SS_CONNECTED; + pending->sk_state = TCP_ESTABLISHED; vsock_insert_connected(vpending); @@ -1231,7 +1231,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, destroy: pending->sk_err = skerr; - pending->sk_state = SS_UNCONNECTED; + pending->sk_state = TCP_CLOSE; /* As long as we drop our reference, all necessary cleanup will handle * when the cleanup function drops its reference and our destruct * implementation is called. Note that since the listen handler will @@ -1269,7 +1269,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, * accounting (it can already be found since it's in the bound * table). */ - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -1337,7 +1337,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, destroy: vmci_transport_send_reset(sk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -1525,7 +1525,7 @@ static int vmci_transport_recv_connected(struct sock *sk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); break; @@ -1789,7 +1789,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) err = vmci_transport_send_conn_request( sk, vmci_trans(vsk)->queue_pair_size); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } } else { @@ -1799,7 +1799,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) sk, vmci_trans(vsk)->queue_pair_size, supported_proto_versions); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 1406db4d97d1..41fb427f150a 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -355,7 +355,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. 
*/ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!send_waiting_read(sk, 1)) return -1; diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index f3a0afc46208..0cc84f2bb05e 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -176,7 +176,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vsock_block_update_write_window(sk); *data_ready_now = false; } From 413a4317aca7d6367d57a5971b0c461f03851207 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:53 -0400 Subject: [PATCH 1308/1322] VSOCK: add sock_diag interface This patch adds the sock_diag interface for querying sockets from userspace. Tools like ss(8) and netstat(8) can use this interface to list open sockets. The userspace ABI is defined in and includes netlink request and response structs. The request can query sockets based on their sk_state (e.g. listening sockets only) and the response contains socket information fields including the local/remote addresses, inode number, etc. This patch does not dump VMCI pending sockets because I have only tested the virtio transport, which does not use pending sockets. Support can be added later by extending vsock_diag_dump() if needed by VMCI users. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- MAINTAINERS | 2 + include/uapi/linux/vm_sockets_diag.h | 33 +++++ net/vmw_vsock/Kconfig | 10 ++ net/vmw_vsock/Makefile | 3 + net/vmw_vsock/diag.c | 186 +++++++++++++++++++++++++++ 5 files changed, 234 insertions(+) create mode 100644 include/uapi/linux/vm_sockets_diag.h create mode 100644 net/vmw_vsock/diag.c diff --git a/MAINTAINERS b/MAINTAINERS index d0cbb3d7a0ca..0fd9121953bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14286,6 +14286,8 @@ S: Maintained F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h F: include/uapi/linux/vsockmon.h +F: include/uapi/linux/vm_sockets_diag.h +F: net/vmw_vsock/diag.c F: net/vmw_vsock/af_vsock_tap.c F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c diff --git a/include/uapi/linux/vm_sockets_diag.h b/include/uapi/linux/vm_sockets_diag.h new file mode 100644 index 000000000000..14cd7dc5a187 --- /dev/null +++ b/include/uapi/linux/vm_sockets_diag.h @@ -0,0 +1,33 @@ +/* AF_VSOCK sock_diag(7) interface for querying open sockets */ + +#ifndef _UAPI__VM_SOCKETS_DIAG_H__ +#define _UAPI__VM_SOCKETS_DIAG_H__ + +#include + +/* Request */ +struct vsock_diag_req { + __u8 sdiag_family; /* must be AF_VSOCK */ + __u8 sdiag_protocol; /* must be 0 */ + __u16 pad; /* must be 0 */ + __u32 vdiag_states; /* query bitmap (e.g. 1 << TCP_LISTEN) */ + __u32 vdiag_ino; /* must be 0 (reserved) */ + __u32 vdiag_show; /* must be 0 (reserved) */ + __u32 vdiag_cookie[2]; +}; + +/* Response */ +struct vsock_diag_msg { + __u8 vdiag_family; /* AF_VSOCK */ + __u8 vdiag_type; /* SOCK_STREAM or SOCK_DGRAM */ + __u8 vdiag_state; /* sk_state (e.g. 
TCP_LISTEN) */ + __u8 vdiag_shutdown; /* local RCV_SHUTDOWN | SEND_SHUTDOWN */ + __u32 vdiag_src_cid; + __u32 vdiag_src_port; + __u32 vdiag_dst_cid; + __u32 vdiag_dst_port; + __u32 vdiag_ino; + __u32 vdiag_cookie[2]; +}; + +#endif /* _UAPI__VM_SOCKETS_DIAG_H__ */ diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index a24369d175fd..970f96489fe7 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -15,6 +15,16 @@ config VSOCKETS To compile this driver as a module, choose M here: the module will be called vsock. If unsure, say N. +config VSOCKETS_DIAG + tristate "Virtual Sockets monitoring interface" + depends on VSOCKETS + default y + help + Support for PF_VSOCK sockets monitoring interface used by the ss tool. + If unsure, say Y. + + Enable this module so userspace applications can query open sockets. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index e63d574234a9..64afc06805da 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o +obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o @@ -6,6 +7,8 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o +vsock_diag-y += diag.o + vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c new file mode 100644 index 000000000000..31b567652250 --- /dev/null +++ b/net/vmw_vsock/diag.c @@ -0,0 +1,186 @@ +/* + * vsock sock_diag(7) module + * + * Copyright (C) 2017 Red Hat, Inc. + * Author: Stefan Hajnoczi + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_diag_msg *rep; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->vdiag_family = AF_VSOCK; + + /* Lock order dictates that sk_lock is acquired before + * vsock_table_lock, so we cannot lock here. Simply don't take + * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is + * held. 
+ */ + rep->vdiag_type = sk->sk_type; + rep->vdiag_state = sk->sk_state; + rep->vdiag_shutdown = sk->sk_shutdown; + rep->vdiag_src_cid = vsk->local_addr.svm_cid; + rep->vdiag_src_port = vsk->local_addr.svm_port; + rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; + rep->vdiag_dst_port = vsk->remote_addr.svm_port; + rep->vdiag_ino = sock_i_ino(sk); + + sock_diag_save_cookie(sk, rep->vdiag_cookie); + + return 0; +} + +static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct vsock_diag_req *req; + struct vsock_sock *vsk; + unsigned int bucket; + unsigned int last_i; + unsigned int table; + struct net *net; + unsigned int i; + + req = nlmsg_data(cb->nlh); + net = sock_net(skb->sk); + + /* State saved between calls: */ + table = cb->args[0]; + bucket = cb->args[1]; + i = last_i = cb->args[2]; + + /* TODO VMCI pending sockets? */ + + spin_lock_bh(&vsock_table_lock); + + /* Bind table (locally created sockets) */ + if (table == 0) { + while (bucket < ARRAY_SIZE(vsock_bind_table)) { + struct list_head *head = &vsock_bind_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, bound_table) { + struct sock *sk = sk_vsock(vsk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_bind; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_bind; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_bind: + i++; + } + last_i = 0; + bucket++; + } + + table++; + bucket = 0; + } + + /* Connected table (accepted connections) */ + while (bucket < ARRAY_SIZE(vsock_connected_table)) { + struct list_head *head = &vsock_connected_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, connected_table) { + struct sock *sk = sk_vsock(vsk); + + /* Skip sockets we've already seen above */ + if (__vsock_in_bound_table(vsk)) + continue; + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_connected; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_connected; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_connected: + i++; + } + last_i = 0; + bucket++; + } + +done: + spin_unlock_bh(&vsock_table_lock); + + cb->args[0] = table; + cb->args[1] = bucket; + cb->args[2] = i; + + return skb->len; +} + +static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct vsock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = vsock_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler vsock_diag_handler = { + .family = AF_VSOCK, + .dump = vsock_diag_handler_dump, +}; + +static int __init vsock_diag_init(void) +{ + return sock_diag_register(&vsock_diag_handler); +} + +static void __exit vsock_diag_exit(void) +{ + sock_diag_unregister(&vsock_diag_handler); +} + +module_init(vsock_diag_init); +module_exit(vsock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, + 40 /* AF_VSOCK */); From 0b02503384301e4c2db13f1045a1ddbaca2ba0ce Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:54 -0400 Subject: [PATCH 1309/1322] VSOCK: add tools/testing/vsock/vsock_diag_test This patch adds tests for the vsock_diag.ko module. 
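For illustration, a sketch of how a future AF_VSOCK test could reuse the helpers introduced here (hypothetical test body, not part of this patch; control_expectln(), control_writeln(), timeout_begin(), timeout_check(), timeout_end() and TIMEOUT are the interfaces defined by the files added below):

  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/socket.h>

  #include "control.h"
  #include "timeout.h"

  /* Hypothetical client-side step: wait for the peer, run one blocking
   * syscall under a timeout, then synchronize shutdown with the peer. */
  static void example_client_step(int fd, const struct sockaddr *addr,
  				 socklen_t addrlen)
  {
  	int ret;

  	control_expectln("LISTENING");	/* server announces readiness */

  	timeout_begin(TIMEOUT);
  	do {
  		ret = connect(fd, addr, addrlen);
  		timeout_check("connect");
  	} while (ret < 0 && errno == EINTR);
  	timeout_end();

  	if (ret < 0) {
  		perror("connect");
  		exit(EXIT_FAILURE);
  	}

  	control_writeln("DONE");	/* let the server tear down */
  }
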
These tests are not self-tests because they require manual set up of a KVM or VMware guest. Please see tools/testing/vsock/README for instructions. The control.h and timeout.h infrastructure can be used for additional AF_VSOCK tests in the future. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- MAINTAINERS | 1 + tools/testing/vsock/.gitignore | 2 + tools/testing/vsock/Makefile | 9 + tools/testing/vsock/README | 36 ++ tools/testing/vsock/control.c | 219 +++++++++ tools/testing/vsock/control.h | 13 + tools/testing/vsock/timeout.c | 64 +++ tools/testing/vsock/timeout.h | 14 + tools/testing/vsock/vsock_diag_test.c | 681 ++++++++++++++++++++++++++ 9 files changed, 1039 insertions(+) create mode 100644 tools/testing/vsock/.gitignore create mode 100644 tools/testing/vsock/Makefile create mode 100644 tools/testing/vsock/README create mode 100644 tools/testing/vsock/control.c create mode 100644 tools/testing/vsock/control.h create mode 100644 tools/testing/vsock/timeout.c create mode 100644 tools/testing/vsock/timeout.h create mode 100644 tools/testing/vsock/vsock_diag_test.c diff --git a/MAINTAINERS b/MAINTAINERS index 0fd9121953bb..f0c37be4e04a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14294,6 +14294,7 @@ F: net/vmw_vsock/virtio_transport.c F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c F: drivers/vhost/vsock.h +F: tools/testing/vsock/ VIRTIO CONSOLE DRIVER M: Amit Shah diff --git a/tools/testing/vsock/.gitignore b/tools/testing/vsock/.gitignore new file mode 100644 index 000000000000..dc5f11faf530 --- /dev/null +++ b/tools/testing/vsock/.gitignore @@ -0,0 +1,2 @@ +*.d +vsock_diag_test diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile new file mode 100644 index 000000000000..66ba0924194d --- /dev/null +++ b/tools/testing/vsock/Makefile @@ -0,0 +1,9 @@ +all: test +test: vsock_diag_test +vsock_diag_test: vsock_diag_test.o timeout.o control.o + +CFLAGS += -g -O2 -Werror -Wall -I. -I../../include/uapi -I../../include -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -D_GNU_SOURCE +.PHONY: all test clean +clean: + ${RM} *.o *.d vsock_diag_test +-include *.d diff --git a/tools/testing/vsock/README b/tools/testing/vsock/README new file mode 100644 index 000000000000..2cc6d7302db6 --- /dev/null +++ b/tools/testing/vsock/README @@ -0,0 +1,36 @@ +AF_VSOCK test suite +------------------- +These tests exercise net/vmw_vsock/ host<->guest sockets for VMware, KVM, and +Hyper-V. + +The following tests are available: + + * vsock_diag_test - vsock_diag.ko module for listing open sockets + +The following prerequisite steps are not automated and must be performed prior +to running tests: + +1. Build the kernel and these tests. +2. Install the kernel and tests on the host. +3. Install the kernel and tests inside the guest. +4. Boot the guest and ensure that the AF_VSOCK transport is enabled. 
+ +Invoke test binaries in both directions as follows: + + # host=server, guest=client + (host)# $TEST_BINARY --mode=server \ + --control-port=1234 \ + --peer-cid=3 + (guest)# $TEST_BINARY --mode=client \ + --control-host=$HOST_IP \ + --control-port=1234 \ + --peer-cid=2 + + # host=client, guest=server + (guest)# $TEST_BINARY --mode=server \ + --control-port=1234 \ + --peer-cid=2 + (host)# $TEST_BINARY --mode=client \ + --control-port=$GUEST_IP \ + --control-port=1234 \ + --peer-cid=3 diff --git a/tools/testing/vsock/control.c b/tools/testing/vsock/control.c new file mode 100644 index 000000000000..90fd47f0e422 --- /dev/null +++ b/tools/testing/vsock/control.c @@ -0,0 +1,219 @@ +/* Control socket for client/server test execution + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Stefan Hajnoczi + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +/* The client and server may need to coordinate to avoid race conditions like + * the client attempting to connect to a socket that the server is not + * listening on yet. The control socket offers a communications channel for + * such coordination tasks. + * + * If the client calls control_expectln("LISTENING"), then it will block until + * the server calls control_writeln("LISTENING"). This provides a simple + * mechanism for coordinating between the client and the server. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "timeout.h" +#include "control.h" + +static int control_fd = -1; + +/* Open the control socket, either in server or client mode */ +void control_init(const char *control_host, + const char *control_port, + bool server) +{ + struct addrinfo hints = { + .ai_socktype = SOCK_STREAM, + }; + struct addrinfo *result = NULL; + struct addrinfo *ai; + int ret; + + ret = getaddrinfo(control_host, control_port, &hints, &result); + if (ret != 0) { + fprintf(stderr, "%s\n", gai_strerror(ret)); + exit(EXIT_FAILURE); + } + + for (ai = result; ai; ai = ai->ai_next) { + int fd; + int val = 1; + + fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + if (fd < 0) + continue; + + if (!server) { + if (connect(fd, ai->ai_addr, ai->ai_addrlen) < 0) + goto next; + control_fd = fd; + printf("Control socket connected to %s:%s.\n", + control_host, control_port); + break; + } + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &val, sizeof(val)) < 0) { + perror("setsockopt"); + exit(EXIT_FAILURE); + } + + if (bind(fd, ai->ai_addr, ai->ai_addrlen) < 0) + goto next; + if (listen(fd, 1) < 0) + goto next; + + printf("Control socket listening on %s:%s\n", + control_host, control_port); + fflush(stdout); + + control_fd = accept(fd, NULL, 0); + close(fd); + + if (control_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + printf("Control socket connection accepted...\n"); + break; + +next: + close(fd); + } + + if (control_fd < 0) { + fprintf(stderr, "Control socket initialization failed. 
Invalid address %s:%s?\n", + control_host, control_port); + exit(EXIT_FAILURE); + } + + freeaddrinfo(result); +} + +/* Free resources */ +void control_cleanup(void) +{ + close(control_fd); + control_fd = -1; +} + +/* Write a line to the control socket */ +void control_writeln(const char *str) +{ + ssize_t len = strlen(str); + ssize_t ret; + + timeout_begin(TIMEOUT); + + do { + ret = send(control_fd, str, len, MSG_MORE); + timeout_check("send"); + } while (ret < 0 && errno == EINTR); + + if (ret != len) { + perror("send"); + exit(EXIT_FAILURE); + } + + do { + ret = send(control_fd, "\n", 1, 0); + timeout_check("send"); + } while (ret < 0 && errno == EINTR); + + if (ret != 1) { + perror("send"); + exit(EXIT_FAILURE); + } + + timeout_end(); +} + +/* Return the next line from the control socket (without the trailing newline). + * + * The program terminates if a timeout occurs. + * + * The caller must free() the returned string. + */ +char *control_readln(void) +{ + char *buf = NULL; + size_t idx = 0; + size_t buflen = 0; + + timeout_begin(TIMEOUT); + + for (;;) { + ssize_t ret; + + if (idx >= buflen) { + char *new_buf; + + new_buf = realloc(buf, buflen + 80); + if (!new_buf) { + perror("realloc"); + exit(EXIT_FAILURE); + } + + buf = new_buf; + buflen += 80; + } + + do { + ret = recv(control_fd, &buf[idx], 1, 0); + timeout_check("recv"); + } while (ret < 0 && errno == EINTR); + + if (ret == 0) { + fprintf(stderr, "unexpected EOF on control socket\n"); + exit(EXIT_FAILURE); + } + + if (ret != 1) { + perror("recv"); + exit(EXIT_FAILURE); + } + + if (buf[idx] == '\n') { + buf[idx] = '\0'; + break; + } + + idx++; + } + + timeout_end(); + + return buf; +} + +/* Wait until a given line is received or a timeout occurs */ +void control_expectln(const char *str) +{ + char *line; + + line = control_readln(); + if (strcmp(str, line) != 0) { + fprintf(stderr, "expected \"%s\" on control socket, got \"%s\"\n", + str, line); + exit(EXIT_FAILURE); + } + + free(line); +} diff --git a/tools/testing/vsock/control.h b/tools/testing/vsock/control.h new file mode 100644 index 000000000000..54a07efd267c --- /dev/null +++ b/tools/testing/vsock/control.h @@ -0,0 +1,13 @@ +#ifndef CONTROL_H +#define CONTROL_H + +#include + +void control_init(const char *control_host, const char *control_port, + bool server); +void control_cleanup(void); +void control_writeln(const char *str); +char *control_readln(void); +void control_expectln(const char *str); + +#endif /* CONTROL_H */ diff --git a/tools/testing/vsock/timeout.c b/tools/testing/vsock/timeout.c new file mode 100644 index 000000000000..c49b3003b2db --- /dev/null +++ b/tools/testing/vsock/timeout.c @@ -0,0 +1,64 @@ +/* Timeout API for single-threaded programs that use blocking + * syscalls (read/write/send/recv/connect/accept). + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Stefan Hajnoczi + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +/* Use the following pattern: + * + * timeout_begin(TIMEOUT); + * do { + * ret = accept(...); + * timeout_check("accept"); + * } while (ret < 0 && ret == EINTR); + * timeout_end(); + */ + +#include +#include +#include +#include +#include "timeout.h" + +static volatile bool timeout; + +/* SIGALRM handler function. Do not use sleep(2), alarm(2), or + * setitimer(2) while using this API - they may interfere with each + * other. 
+ */ +void sigalrm(int signo) +{ + timeout = true; +} + +/* Start a timeout. Call timeout_check() to verify that the timeout hasn't + * expired. timeout_end() must be called to stop the timeout. Timeouts cannot + * be nested. + */ +void timeout_begin(unsigned int seconds) +{ + alarm(seconds); +} + +/* Exit with an error message if the timeout has expired */ +void timeout_check(const char *operation) +{ + if (timeout) { + fprintf(stderr, "%s timed out\n", operation); + exit(EXIT_FAILURE); + } +} + +/* Stop a timeout */ +void timeout_end(void) +{ + alarm(0); + timeout = false; +} diff --git a/tools/testing/vsock/timeout.h b/tools/testing/vsock/timeout.h new file mode 100644 index 000000000000..77db9ce9860a --- /dev/null +++ b/tools/testing/vsock/timeout.h @@ -0,0 +1,14 @@ +#ifndef TIMEOUT_H +#define TIMEOUT_H + +enum { + /* Default timeout */ + TIMEOUT = 10 /* seconds */ +}; + +void sigalrm(int signo); +void timeout_begin(unsigned int seconds); +void timeout_check(const char *operation); +void timeout_end(void); + +#endif /* TIMEOUT_H */ diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c new file mode 100644 index 000000000000..e896a4af52f4 --- /dev/null +++ b/tools/testing/vsock/vsock_diag_test.c @@ -0,0 +1,681 @@ +/* + * vsock_diag_test - vsock_diag.ko test suite + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Stefan Hajnoczi + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../include/uapi/linux/vm_sockets.h" +#include "../../../include/uapi/linux/vm_sockets_diag.h" + +#include "timeout.h" +#include "control.h" + +enum test_mode { + TEST_MODE_UNSET, + TEST_MODE_CLIENT, + TEST_MODE_SERVER +}; + +/* Per-socket status */ +struct vsock_stat { + struct list_head list; + struct vsock_diag_msg msg; +}; + +static const char *sock_type_str(int type) +{ + switch (type) { + case SOCK_DGRAM: + return "DGRAM"; + case SOCK_STREAM: + return "STREAM"; + default: + return "INVALID TYPE"; + } +} + +static const char *sock_state_str(int state) +{ + switch (state) { + case TCP_CLOSE: + return "UNCONNECTED"; + case TCP_SYN_SENT: + return "CONNECTING"; + case TCP_ESTABLISHED: + return "CONNECTED"; + case TCP_CLOSING: + return "DISCONNECTING"; + case TCP_LISTEN: + return "LISTEN"; + default: + return "INVALID STATE"; + } +} + +static const char *sock_shutdown_str(int shutdown) +{ + switch (shutdown) { + case 1: + return "RCV_SHUTDOWN"; + case 2: + return "SEND_SHUTDOWN"; + case 3: + return "RCV_SHUTDOWN | SEND_SHUTDOWN"; + default: + return "0"; + } +} + +static void print_vsock_addr(FILE *fp, unsigned int cid, unsigned int port) +{ + if (cid == VMADDR_CID_ANY) + fprintf(fp, "*:"); + else + fprintf(fp, "%u:", cid); + + if (port == VMADDR_PORT_ANY) + fprintf(fp, "*"); + else + fprintf(fp, "%u", port); +} + +static void print_vsock_stat(FILE *fp, struct vsock_stat *st) +{ + print_vsock_addr(fp, st->msg.vdiag_src_cid, st->msg.vdiag_src_port); + fprintf(fp, " "); + print_vsock_addr(fp, st->msg.vdiag_dst_cid, st->msg.vdiag_dst_port); + fprintf(fp, " %s %s %s %u\n", + sock_type_str(st->msg.vdiag_type), + sock_state_str(st->msg.vdiag_state), + sock_shutdown_str(st->msg.vdiag_shutdown), + st->msg.vdiag_ino); +} + +static void 
print_vsock_stats(FILE *fp, struct list_head *head) +{ + struct vsock_stat *st; + + list_for_each_entry(st, head, list) + print_vsock_stat(fp, st); +} + +static struct vsock_stat *find_vsock_stat(struct list_head *head, int fd) +{ + struct vsock_stat *st; + struct stat stat; + + if (fstat(fd, &stat) < 0) { + perror("fstat"); + exit(EXIT_FAILURE); + } + + list_for_each_entry(st, head, list) + if (st->msg.vdiag_ino == stat.st_ino) + return st; + + fprintf(stderr, "cannot find fd %d\n", fd); + exit(EXIT_FAILURE); +} + +static void check_no_sockets(struct list_head *head) +{ + if (!list_empty(head)) { + fprintf(stderr, "expected no sockets\n"); + print_vsock_stats(stderr, head); + exit(1); + } +} + +static void check_num_sockets(struct list_head *head, int expected) +{ + struct list_head *node; + int n = 0; + + list_for_each(node, head) + n++; + + if (n != expected) { + fprintf(stderr, "expected %d sockets, found %d\n", + expected, n); + print_vsock_stats(stderr, head); + exit(EXIT_FAILURE); + } +} + +static void check_socket_state(struct vsock_stat *st, __u8 state) +{ + if (st->msg.vdiag_state != state) { + fprintf(stderr, "expected socket state %#x, got %#x\n", + state, st->msg.vdiag_state); + exit(EXIT_FAILURE); + } +} + +static void send_req(int fd) +{ + struct sockaddr_nl nladdr = { + .nl_family = AF_NETLINK, + }; + struct { + struct nlmsghdr nlh; + struct vsock_diag_req vreq; + } req = { + .nlh = { + .nlmsg_len = sizeof(req), + .nlmsg_type = SOCK_DIAG_BY_FAMILY, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + }, + .vreq = { + .sdiag_family = AF_VSOCK, + .vdiag_states = ~(__u32)0, + }, + }; + struct iovec iov = { + .iov_base = &req, + .iov_len = sizeof(req), + }; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + for (;;) { + if (sendmsg(fd, &msg, 0) < 0) { + if (errno == EINTR) + continue; + + perror("sendmsg"); + exit(EXIT_FAILURE); + } + + return; + } +} + +static ssize_t recv_resp(int fd, void *buf, size_t len) +{ + struct sockaddr_nl nladdr = { + .nl_family = AF_NETLINK, + }; + struct iovec iov = { + .iov_base = buf, + .iov_len = len, + }; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + ssize_t ret; + + do { + ret = recvmsg(fd, &msg, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + perror("recvmsg"); + exit(EXIT_FAILURE); + } + + return ret; +} + +static void add_vsock_stat(struct list_head *sockets, + const struct vsock_diag_msg *resp) +{ + struct vsock_stat *st; + + st = malloc(sizeof(*st)); + if (!st) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + st->msg = *resp; + list_add_tail(&st->list, sockets); +} + +/* + * Read vsock stats into a list. 
+ */ +static void read_vsock_stat(struct list_head *sockets) +{ + long buf[8192 / sizeof(long)]; + int fd; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); + if (fd < 0) { + perror("socket"); + exit(EXIT_FAILURE); + } + + send_req(fd); + + for (;;) { + const struct nlmsghdr *h; + ssize_t ret; + + ret = recv_resp(fd, buf, sizeof(buf)); + if (ret == 0) + goto done; + if (ret < sizeof(*h)) { + fprintf(stderr, "short read of %zd bytes\n", ret); + exit(EXIT_FAILURE); + } + + h = (struct nlmsghdr *)buf; + + while (NLMSG_OK(h, ret)) { + if (h->nlmsg_type == NLMSG_DONE) + goto done; + + if (h->nlmsg_type == NLMSG_ERROR) { + const struct nlmsgerr *err = NLMSG_DATA(h); + + if (h->nlmsg_len < NLMSG_LENGTH(sizeof(*err))) + fprintf(stderr, "NLMSG_ERROR\n"); + else { + errno = -err->error; + perror("NLMSG_ERROR"); + } + + exit(EXIT_FAILURE); + } + + if (h->nlmsg_type != SOCK_DIAG_BY_FAMILY) { + fprintf(stderr, "unexpected nlmsg_type %#x\n", + h->nlmsg_type); + exit(EXIT_FAILURE); + } + if (h->nlmsg_len < + NLMSG_LENGTH(sizeof(struct vsock_diag_msg))) { + fprintf(stderr, "short vsock_diag_msg\n"); + exit(EXIT_FAILURE); + } + + add_vsock_stat(sockets, NLMSG_DATA(h)); + + h = NLMSG_NEXT(h, ret); + } + } + +done: + close(fd); +} + +static void free_sock_stat(struct list_head *sockets) +{ + struct vsock_stat *st; + struct vsock_stat *next; + + list_for_each_entry_safe(st, next, sockets, list) + free(st); +} + +static void test_no_sockets(unsigned int peer_cid) +{ + LIST_HEAD(sockets); + + read_vsock_stat(&sockets); + + check_no_sockets(&sockets); + + free_sock_stat(&sockets); +} + +static void test_listen_socket_server(unsigned int peer_cid) +{ + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } addr = { + .svm = { + .svm_family = AF_VSOCK, + .svm_port = 1234, + .svm_cid = VMADDR_CID_ANY, + }, + }; + LIST_HEAD(sockets); + struct vsock_stat *st; + int fd; + + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + + if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) { + perror("bind"); + exit(EXIT_FAILURE); + } + + if (listen(fd, 1) < 0) { + perror("listen"); + exit(EXIT_FAILURE); + } + + read_vsock_stat(&sockets); + + check_num_sockets(&sockets, 1); + st = find_vsock_stat(&sockets, fd); + check_socket_state(st, TCP_LISTEN); + + close(fd); + free_sock_stat(&sockets); +} + +static void test_connect_client(unsigned int peer_cid) +{ + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } addr = { + .svm = { + .svm_family = AF_VSOCK, + .svm_port = 1234, + .svm_cid = peer_cid, + }, + }; + int fd; + int ret; + LIST_HEAD(sockets); + struct vsock_stat *st; + + control_expectln("LISTENING"); + + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + + timeout_begin(TIMEOUT); + do { + ret = connect(fd, &addr.sa, sizeof(addr.svm)); + timeout_check("connect"); + } while (ret < 0 && errno == EINTR); + timeout_end(); + + if (ret < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + read_vsock_stat(&sockets); + + check_num_sockets(&sockets, 1); + st = find_vsock_stat(&sockets, fd); + check_socket_state(st, TCP_ESTABLISHED); + + control_expectln("DONE"); + control_writeln("DONE"); + + close(fd); + free_sock_stat(&sockets); +} + +static void test_connect_server(unsigned int peer_cid) +{ + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } addr = { + .svm = { + .svm_family = AF_VSOCK, + .svm_port = 1234, + .svm_cid = VMADDR_CID_ANY, + }, + }; + union { + struct sockaddr sa; + struct sockaddr_vm svm; + } clientaddr; + socklen_t clientaddr_len = sizeof(clientaddr.svm); + LIST_HEAD(sockets); + struct vsock_stat *st; + int fd; 
+ int client_fd; + + fd = socket(AF_VSOCK, SOCK_STREAM, 0); + + if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) { + perror("bind"); + exit(EXIT_FAILURE); + } + + if (listen(fd, 1) < 0) { + perror("listen"); + exit(EXIT_FAILURE); + } + + control_writeln("LISTENING"); + + timeout_begin(TIMEOUT); + do { + client_fd = accept(fd, &clientaddr.sa, &clientaddr_len); + timeout_check("accept"); + } while (client_fd < 0 && errno == EINTR); + timeout_end(); + + if (client_fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + if (clientaddr.sa.sa_family != AF_VSOCK) { + fprintf(stderr, "expected AF_VSOCK from accept(2), got %d\n", + clientaddr.sa.sa_family); + exit(EXIT_FAILURE); + } + if (clientaddr.svm.svm_cid != peer_cid) { + fprintf(stderr, "expected peer CID %u from accept(2), got %u\n", + peer_cid, clientaddr.svm.svm_cid); + exit(EXIT_FAILURE); + } + + read_vsock_stat(&sockets); + + check_num_sockets(&sockets, 2); + find_vsock_stat(&sockets, fd); + st = find_vsock_stat(&sockets, client_fd); + check_socket_state(st, TCP_ESTABLISHED); + + control_writeln("DONE"); + control_expectln("DONE"); + + close(client_fd); + close(fd); + free_sock_stat(&sockets); +} + +static struct { + const char *name; + void (*run_client)(unsigned int peer_cid); + void (*run_server)(unsigned int peer_cid); +} test_cases[] = { + { + .name = "No sockets", + .run_server = test_no_sockets, + }, + { + .name = "Listen socket", + .run_server = test_listen_socket_server, + }, + { + .name = "Connect", + .run_client = test_connect_client, + .run_server = test_connect_server, + }, + {}, +}; + +static void init_signals(void) +{ + struct sigaction act = { + .sa_handler = sigalrm, + }; + + sigaction(SIGALRM, &act, NULL); + signal(SIGPIPE, SIG_IGN); +} + +static unsigned int parse_cid(const char *str) +{ + char *endptr = NULL; + unsigned long int n; + + errno = 0; + n = strtoul(str, &endptr, 10); + if (errno || *endptr != '\0') { + fprintf(stderr, "malformed CID \"%s\"\n", str); + exit(EXIT_FAILURE); + } + return n; +} + +static const char optstring[] = ""; +static const struct option longopts[] = { + { + .name = "control-host", + .has_arg = required_argument, + .val = 'H', + }, + { + .name = "control-port", + .has_arg = required_argument, + .val = 'P', + }, + { + .name = "mode", + .has_arg = required_argument, + .val = 'm', + }, + { + .name = "peer-cid", + .has_arg = required_argument, + .val = 'p', + }, + { + .name = "help", + .has_arg = no_argument, + .val = '?', + }, + {}, +}; + +static void usage(void) +{ + fprintf(stderr, "Usage: vsock_diag_test [--help] [--control-host=] --control-port= --mode=client|server --peer-cid=\n" + "\n" + " Server: vsock_diag_test --control-port=1234 --mode=server --peer-cid=3\n" + " Client: vsock_diag_test --control-host=192.168.0.1 --control-port=1234 --mode=client --peer-cid=2\n" + "\n" + "Run vsock_diag.ko tests. Must be launched in both\n" + "guest and host. One side must use --mode=client and\n" + "the other side must use --mode=server.\n" + "\n" + "A TCP control socket connection is used to coordinate tests\n" + "between the client and the server. 
The server requires a\n" + "listen address and the client requires an address to\n" + "connect to.\n" + "\n" + "The CID of the other side must be given with --peer-cid=.\n"); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + const char *control_host = NULL; + const char *control_port = NULL; + int mode = TEST_MODE_UNSET; + unsigned int peer_cid = VMADDR_CID_ANY; + int i; + + init_signals(); + + for (;;) { + int opt = getopt_long(argc, argv, optstring, longopts, NULL); + + if (opt == -1) + break; + + switch (opt) { + case 'H': + control_host = optarg; + break; + case 'm': + if (strcmp(optarg, "client") == 0) + mode = TEST_MODE_CLIENT; + else if (strcmp(optarg, "server") == 0) + mode = TEST_MODE_SERVER; + else { + fprintf(stderr, "--mode must be \"client\" or \"server\"\n"); + return EXIT_FAILURE; + } + break; + case 'p': + peer_cid = parse_cid(optarg); + break; + case 'P': + control_port = optarg; + break; + case '?': + default: + usage(); + } + } + + if (!control_port) + usage(); + if (mode == TEST_MODE_UNSET) + usage(); + if (peer_cid == VMADDR_CID_ANY) + usage(); + + if (!control_host) { + if (mode != TEST_MODE_SERVER) + usage(); + control_host = "0.0.0.0"; + } + + control_init(control_host, control_port, mode == TEST_MODE_SERVER); + + for (i = 0; test_cases[i].name; i++) { + void (*run)(unsigned int peer_cid); + + printf("%s...", test_cases[i].name); + fflush(stdout); + + if (mode == TEST_MODE_CLIENT) + run = test_cases[i].run_client; + else + run = test_cases[i].run_server; + + if (run) + run(peer_cid); + + printf("ok\n"); + } + + control_cleanup(); + return EXIT_SUCCESS; +} From 27204aaa9dc67b833b77179fdac556288ec3a4bf Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:03:44 -0700 Subject: [PATCH 1310/1322] tcp: uniform the set up of sockets after successful connection Currently in the TCP code, the initialization sequence for cached metrics, congestion control, BPF, etc, after successful connection is very inconsistent. This introduces inconsistent bevhavior and is prone to bugs. The current call sequence is as follows: (1) for active case (tcp_finish_connect() case): tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); (2) for passive case (tcp_rcv_state_process() TCP_SYN_RECV case): icsk->icsk_af_ops->rebuild_header(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); tcp_init_buffer_space(sk); tcp_init_metrics(sk); (3) for TFO passive case (tcp_fastopen_create_child()): inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); This commit uniforms the above functions to have the following sequence: tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE/PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); This sequence is the same as the (1) active case. We pick this sequence because this order correctly allows BPF to override the settings including congestion control module and initial cwnd, etc from the route, and then allows the CC module to see those settings. 
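To make the ordering benefit concrete, here is an illustrative sockops BPF program (a sketch only, not part of this patch; it assumes a samples/bpf-style "bpf_helpers.h" providing SEC() and the bpf_setsockopt() helper, and the usual in-tree include paths). Because tcp_call_bpf() now runs before tcp_init_congestion_control() in the unified sequence, a congestion control chosen in the *_ESTABLISHED_CB callback is the one the stack then initializes:

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h"

  #ifndef SOL_TCP
  #define SOL_TCP		6
  #endif
  #ifndef TCP_CONGESTION
  #define TCP_CONGESTION	13
  #endif

  SEC("sockops")
  int select_cc(struct bpf_sock_ops *skops)
  {
  	char cc[] = "cubic";	/* any registered CC module name */

  	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
  	    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
  		bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
  			       cc, sizeof(cc));
  	return 1;
  }

  char _license[] SEC("license") = "GPL";
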
Suggested-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ net/ipv4/tcp_fastopen.c | 7 +------ net/ipv4/tcp_input.c | 21 +++------------------ 4 files changed, 17 insertions(+), 24 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7a3a8af56fd6..426c2e986016 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -416,6 +416,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); +void tcp_init_transfer(struct sock *sk, int bpf_op); unsigned int tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 23225c98d287..c115e37ca608 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -456,6 +456,18 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { if (tsflags && skb) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index de470e7e586f..29fff14d5a53 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -236,12 +236,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index db9bb46b5776..bd3a35f5dbf2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5513,20 +5513,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5693,7 +5686,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp); - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5920,14 +5912,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); } else { - /* Make sure socket is routed, for correct metrics. 
*/ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5957,8 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * are sent out. */ tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); + } if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); From 6d05081e55a5a55b32ae6609f1902eca8277541d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:04:04 -0700 Subject: [PATCH 1311/1322] tcp: clean up TFO server's initial tcp_rearm_rto() call This commit does a cleanup and moves the tcp_rearm_rto() call in the TFO server case to an earlier spot in tcp_rcv_state_process() to make the code more compact. This is only a cosmetic change. Suggested-by: Yuchung Cheng Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bd3a35f5dbf2..c5b8d61846c2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5911,6 +5911,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; @@ -5933,18 +5942,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); - } - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); From 1bcdca3ffbbbed0d09b46cf4263ed2593a418375 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Wed, 4 Oct 2017 15:59:49 -0400 Subject: [PATCH 1312/1322] net/ipv4: Remove unused variable in route.c int rc is unmodified after initialization in net/ipv4/route.c, so this patch simply cleans up that variable and returns 0. This was found with coccicheck M=net/ipv4/ on Linus' tree. Signed-off-by: Tim Hansen Signed-off-by: David S.
Miller --- net/ipv4/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac6fde5d45f1..1c7ed77968c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3038,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3095,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL From 9dff9936f0ccfffba5106ee4217c71c7bcf95143 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Wed, 4 Oct 2017 12:10:43 -0700 Subject: [PATCH 1313/1322] RDS: IB: Limit the scope of has_fr/has_fmr variables This patch fixes the scope of has_fr and has_fmr variables as they are needed only in rds_ib_add_one(). Signed-off-by: Avinash Repaka Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 11 ++++++----- net/rds/ib.h | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index a0954ace3774..36dd2099048a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -126,6 +126,7 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; + bool has_fr, has_fmr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) @@ -143,11 +144,11 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index bf4822407567..6ea6a27891b0 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -215,8 +215,6 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - bool has_fmr; - bool has_fr; bool use_fastreg; unsigned int max_mrs; From b1fb67fa501c4787035317f84db6caf013385581 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Wed, 4 Oct 2017 12:11:29 -0700 Subject: [PATCH 1314/1322] RDS: IB: Initialize max_items based on underlying device attributes Use max_1m_mrs/max_8k_mrs while setting max_items, as the former variables are set based on the underlying device attributes. Signed-off-by: Avinash Repaka Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 9a3c54e659e9..e678699268a2 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -601,11 +601,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; - pool->max_items = RDS_MR_1M_POOL_SIZE; + pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; - pool->max_items = RDS_MR_8K_POOL_SIZE; + pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 12:59:58 -0700 Subject: [PATCH 1315/1322] tcp: new list for sent but unacked skbs for RACK recovery This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb). This list will be used to optimize TCP RACK recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since the TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and iterates through SACKed skbs repeatedly. Special care for rare events: 1. TCP repair fakes skb transmission, so the send queue needs to be adjusted. 2. SACK reneging would require re-inserting SACKed skbs into the send queue. For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here. 3. Fast Open: currently for non-TFO, the send queue correctly queues the pure SYN packet. For TFO, which queues a pure SYN and then a data packet, the send queue only queues the data packet but not the pure SYN due to the structure of the TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK). In order not to grow sk_buff, we use a union for the new list and the _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before the skb is cloned/copied at transmit, and before being freed. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 11 +++++++++-- include/linux/tcp.h | 1 + include/net/tcp.h | 24 ++++++++++++++++++++++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 9 +++++++-- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 42 +++++++++++++++++++++++++++++----------- 7 files changed, 74 insertions(+), 16 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ada821466e88..01a985937867 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t; * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function + * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on @@ -686,8 +687,14 @@ struct sk_buff { */ char cb[48] __aligned(8); - unsigned long _skb_refdst; - void (*destructor)(struct sk_buff *skb); + union { + struct { + unsigned long _skb_refdst; + void (*destructor)(struct sk_buff *skb); + }; + struct list_head tcp_tsorted_anchor; + }; + #ifdef CONFIG_XFRM struct sec_path *sp; #endif diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4aa40ef02d32..1d2c44e09e31 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -191,6 +191,7 @@ struct tcp_sock { u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ + struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 426c2e986016..3b16f353b539 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1589,14 +1589,34 @@ enum tcp_chrono { void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); +/* This helper is needed, because skb->tcp_tsorted_anchor uses + * the same memory storage than skb->destructor/_skb_refdst + */ +static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) +{ + skb->destructor = NULL; + skb->_skb_refdst = 0UL; +} + +#define tcp_skb_tsorted_save(skb) { \ + unsigned long _save = skb->_skb_refdst; \ + skb->_skb_refdst = 0UL; + +#define tcp_skb_tsorted_restore(skb) \ + skb->_skb_refdst = _save; \ +} + /* write queue abstraction */ static inline void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); + } + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); } @@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + list_del(&skb->tcp_tsorted_anchor); + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c115e37ca608..8cf742fd4f99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = 
TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -881,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c5b8d61846c2..fb0d7ed84b94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..2341b9f857b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..8162e2880178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. 
*/ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -3023,6 +3037,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk) } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_unlink_write_queue(skb, sk); __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); From 043b87d7599ed8e86a33f4cbc3f062d57e263711 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 4 Oct 2017 12:59:59 -0700 Subject: [PATCH 1316/1322] tcp: more efficient RACK loss detection Use the new time-ordered list to speed up RACK. The detection logic is identical. But since the list is chronologically ordered by skb_mstamp and contains only skbs not yet acked or sacked, RACK can abort the loop upon hitting skbs that were sent more recently. On YouTube servers this patch reduces the iterations on write queue by 40x. The improvement is even bigger with large BDP networks. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_recovery.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 449cd914d58e..8aa56caefde8 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -58,17 +58,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) - continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, tp->rack.end_seq, scb->end_seq)) { /* Step 3 in draft-cheng-tcpm-rack-00.txt: @@ -81,6 +74,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if (remaining < 0) { tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); continue; } @@ -91,11 +85,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ + } else { break; } } From bef06223083b81d2064824afe2bc85be416ab73a Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 4 Oct 2017 13:00:00 -0700 Subject: [PATCH 1317/1322] tcp: a small refactor of RACK loss detection Refactor the RACK loop to improve readability and speed up the checks. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 8aa56caefde8..cda6074a429a 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -61,32 +61,28 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. 
- */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - list_del_init(&skb->tcp_tsorted_anchor); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. + */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - } else { - break; } } } From d009313c99ba575b65a944fe2c683c6346ea1721 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 5 Oct 2017 10:10:23 +0100 Subject: [PATCH 1318/1322] net: qcom/emac: make function emac_isr static The function emac_isr is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warnings: symbol 'emac_isr' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index 759543512117..f477ba29c569 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -130,7 +130,7 @@ static int emac_start_xmit(struct sk_buff *skb, struct net_device *netdev) return emac_mac_tx_buf_send(adpt, &adpt->tx_q, skb); } -irqreturn_t emac_isr(int _irq, void *data) +static irqreturn_t emac_isr(int _irq, void *data) { struct emac_irq *irq = data; struct emac_adapter *adpt = From b13c5c14dbfd5923d773de9661404ed9600c53ef Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 5 Oct 2017 10:41:57 -0400 Subject: [PATCH 1319/1322] libbpf: parse maps sections of varying size This library previously assumed a fixed-size map options structure. Any new options were ignored. In order to allow the options structure to grow and to support parsing older programs, this patch updates the maps section parsing to handle varying sizes. Object files with maps sections smaller than expected will have the new fields initialized to zero. Object files which have larger than expected maps sections will be rejected unless all of the unrecognized data is zero. This change still assumes that each map definition in the maps section is the same size. Signed-off-by: Craig Gallek Acked-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- tools/lib/bpf/libbpf.c | 70 +++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4f402dcdf372..23152890ec60 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -579,31 +579,6 @@ bpf_object__init_kversion(struct bpf_object *obj, return 0; } -static int -bpf_object__validate_maps(struct bpf_object *obj) -{ - int i; - - /* - * If there's only 1 map, the only error case should have been - * catched in bpf_object__init_maps(). - */ - if (!obj->maps || !obj->nr_maps || (obj->nr_maps == 1)) - return 0; - - for (i = 1; i < obj->nr_maps; i++) { - const struct bpf_map *a = &obj->maps[i - 1]; - const struct bpf_map *b = &obj->maps[i]; - - if (b->offset - a->offset < sizeof(struct bpf_map_def)) { - pr_warning("corrupted map section in %s: map \"%s\" too small\n", - obj->path, a->name); - return -EINVAL; - } - } - return 0; -} - static int compare_bpf_map(const void *_a, const void *_b) { const struct bpf_map *a = _a; @@ -615,7 +590,7 @@ static int compare_bpf_map(const void *_a, const void *_b) static int bpf_object__init_maps(struct bpf_object *obj) { - int i, map_idx, nr_maps = 0; + int i, map_idx, map_def_sz, nr_maps = 0; Elf_Scn *scn; Elf_Data *data; Elf_Data *symbols = obj->efile.symbols; @@ -658,6 +633,15 @@ bpf_object__init_maps(struct bpf_object *obj) if (!nr_maps) return 0; + /* Assume equally sized map definitions */ + map_def_sz = data->d_size / nr_maps; + if (!data->d_size || (data->d_size % nr_maps) != 0) { + pr_warning("unable to determine map definition size " + "section %s, %d maps in %zd bytes\n", + obj->path, nr_maps, data->d_size); + return -EINVAL; + } + obj->maps = calloc(nr_maps, sizeof(obj->maps[0])); if (!obj->maps) { pr_warning("alloc maps for object failed\n"); @@ -690,7 +674,7 @@ bpf_object__init_maps(struct bpf_object *obj) obj->efile.strtabidx, sym.st_name); obj->maps[map_idx].offset = sym.st_value; - if (sym.st_value + sizeof(struct bpf_map_def) > data->d_size) { + if (sym.st_value + map_def_sz > data->d_size) { pr_warning("corrupted maps section in %s: last map \"%s\" too small\n", obj->path, map_name); return -EINVAL; @@ -704,12 +688,40 @@ bpf_object__init_maps(struct bpf_object *obj) pr_debug("map %d is \"%s\"\n", map_idx, obj->maps[map_idx].name); def = (struct bpf_map_def *)(data->d_buf + sym.st_value); - obj->maps[map_idx].def = *def; + /* + * If the definition of the map in the object file fits in + * bpf_map_def, copy it. Any extra fields in our version + * of bpf_map_def will default to zero as a result of the + * calloc above. + */ + if (map_def_sz <= sizeof(struct bpf_map_def)) { + memcpy(&obj->maps[map_idx].def, def, map_def_sz); + } else { + /* + * Here the map structure being read is bigger than what + * we expect, truncate if the excess bits are all zero. + * If they are not zero, reject this map as + * incompatible. 
+ */ + char *b; + for (b = ((char *)def) + sizeof(struct bpf_map_def); + b < ((char *)def) + map_def_sz; b++) { + if (*b != 0) { + pr_warning("maps section in %s: \"%s\" " + "has unrecognized, non-zero " + "options\n", + obj->path, map_name); + return -EINVAL; + } + } + memcpy(&obj->maps[map_idx].def, def, + sizeof(struct bpf_map_def)); + } map_idx++; } qsort(obj->maps, obj->nr_maps, sizeof(obj->maps[0]), compare_bpf_map); - return bpf_object__validate_maps(obj); + return 0; } static int bpf_object__elf_collect(struct bpf_object *obj) From fe9b5f774b28143dcc2957eadffefdca6171a26a Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 5 Oct 2017 10:41:58 -0400 Subject: [PATCH 1320/1322] libbpf: use map_flags when creating maps This is required to use BPF_MAP_TYPE_LPM_TRIE or any other map type which requires flags. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/libbpf.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 23152890ec60..5aa45f89da93 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -942,7 +942,7 @@ bpf_object__create_maps(struct bpf_object *obj) def->key_size, def->value_size, def->max_entries, - 0); + def->map_flags); if (*pfd < 0) { size_t j; int err = *pfd; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 7959086eb9c9..6e20003109e0 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -207,6 +207,7 @@ struct bpf_map_def { unsigned int key_size; unsigned int value_size; unsigned int max_entries; + unsigned int map_flags; }; /* From 380537b4f7f2e706e499695b38eabc90a5f72fa8 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 02:07:08 +0800 Subject: [PATCH 1321/1322] net: ipv6: remove unused code in ipv6_find_hdr() Storing the remaining length of the skb into 'len' has no effect, so we can remove it. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 115d60919f72..11025f8d124b 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -187,7 +187,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len; bool found; if (fragoff) @@ -204,7 +203,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } - len = skb->len - start; do { struct ipv6_opt_hdr _hdr, *hp; @@ -273,7 +271,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, if (!found) { nexthdr = hp->nexthdr; - len -= hdrlen; start += hdrlen; } } while (!found); From cc71b7b071192ac1c288e272fdc3f3877eb96663 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 15:45:32 -0400 Subject: [PATCH 1322/1322] net/ipv6: remove unused err variable on icmpv6_push_pending_frames int err is unused by icmpv6_push_pending_frames(), so this patch removes the variable and has the function return 0 directly. git bisect shows this variable has been around since Linux has been in git, in commit 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2. This was found by running make coccicheck M=net/ipv6/ on Linus' tree at commit 77ede3a014a32746002f7889211f0cecf4803163 (current HEAD as of this patch). Signed-off-by: Tim Hansen Signed-off-by: David S.
Miller --- net/ipv6/icmp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5acb54405b10..aeb49b4d8c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -255,7 +255,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, { struct sk_buff *skb; struct icmp6hdr *icmp6h; - int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) @@ -288,7 +287,7 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } ip6_push_pending_frames(sk); out: - return err; + return 0; } struct icmpv6_msg {