From 3073774e638ef18d222465fe92bfc8fccb90d288 Mon Sep 17 00:00:00 2001 From: Serhii Popovych Date: Mon, 4 Dec 2017 09:36:41 -0500 Subject: [PATCH 001/236] KVM: PPC: Book3S HV: Drop prepare_done from struct kvm_resize_hpt Currently the kvm_resize_hpt structure has two fields relevant to the state of an ongoing resize: 'prepare_done', which indicates whether the worker thread has completed or not, and 'error' which indicates whether it was successful or not. Since the success/failure isn't known until completion, this is confusingly redundant. This patch consolidates the information into just the 'error' value: -EBUSY indicates the worker is still in progress, other negative values indicate (completed) failure, 0 indicates successful completion. As a bonus this reduces the size of struct kvm_resize_hpt by __alignof__(struct kvm_hpt_info) and saves a few bytes of code. While there, correct the comment in struct kvm_resize_hpt which references a non-existent semaphore (leftover from an early draft). Assert with WARN_ON() in case the HPT allocation thread work runs more than once for a resize request, or resize_hpt_allocate() returns -EBUSY, which is treated specially. Change comparison against zero to make checkpatch.pl happy. 
Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Serhii Popovych [dwg: Changed BUG_ON()s to WARN_ON()s and altered commit message for clarity] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 46 ++++++++++++++++++----------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 966097232d21..f5f2c6bf5856 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -65,11 +65,17 @@ struct kvm_resize_hpt { u32 order; /* These fields protected by kvm->lock */ - int error; - bool prepare_done; - /* Private to the work thread, until prepare_done is true, - * then protected by kvm->resize_hpt_sem */ + /* Possible values and their usage: + * <0 an error occurred during allocation, + * -EBUSY allocation is in the progress, + * 0 allocation made successfuly. + */ + int error; + + /* Private to the work thread, until error != -EBUSY, + * then protected by kvm->lock. + */ struct kvm_hpt_info hpt; }; @@ -1433,15 +1439,23 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm *kvm = resize->kvm; int err; + if (WARN_ON(resize->error != -EBUSY)) + return; + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", resize->order); err = resize_hpt_allocate(resize); + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + mutex_lock(&kvm->lock); resize->error = err; - resize->prepare_done = true; mutex_unlock(&kvm->lock); } @@ -1466,14 +1480,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, if (resize) { if (resize->order == shift) { - /* Suitable resize in progress */ - if (resize->prepare_done) { - ret = resize->error; - if (ret != 0) - resize_hpt_release(kvm, resize); - } else { + /* Suitable resize in progress? 
*/ + ret = resize->error; + if (ret == -EBUSY) ret = 100; /* estimated time in ms */ - } + else if (ret) + resize_hpt_release(kvm, resize); goto out; } @@ -1493,6 +1505,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, ret = -ENOMEM; goto out; } + + resize->error = -EBUSY; resize->order = shift; resize->kvm = kvm; INIT_WORK(&resize->work, resize_hpt_prepare_work); @@ -1547,16 +1561,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, if (!resize || (resize->order != shift)) goto out; - ret = -EBUSY; - if (!resize->prepare_done) - goto out; - ret = resize->error; - if (ret != 0) + if (ret) goto out; ret = resize_hpt_rehash(resize); - if (ret != 0) + if (ret) goto out; resize_hpt_pivot(resize); From 4ed11aeefda439c76ddae3ceebcfa4fad111f149 Mon Sep 17 00:00:00 2001 From: Serhii Popovych Date: Mon, 4 Dec 2017 09:36:42 -0500 Subject: [PATCH 002/236] KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests When serving multiple resize requests following could happen: CPU0 CPU1 ---- ---- kvm_vm_ioctl_resize_hpt_prepare(1); -> schedule_work() /* system_rq might be busy: delay */ kvm_vm_ioctl_resize_hpt_prepare(2); mutex_lock(); if (resize) { ... release_hpt_resize(); } ... resize_hpt_prepare_work() -> schedule_work() { mutex_unlock() /* resize->kvm could be wrong */ struct kvm *kvm = resize->kvm; mutex_lock(&kvm->lock); <<<< UAF ... } i.e. a second resize request with different order could be started by kvm_vm_ioctl_resize_hpt_prepare(), causing the previous request to be free()d when there's still an active worker thread which will try to access it. This leads to a use after free in point marked with UAF on the diagram above. To prevent this from happening, instead of unconditionally releasing a pre-existing resize structure from the prepare ioctl(), we check if the existing structure has an in-progress worker. 
We do that by checking if the resize->error == -EBUSY, which is safe because the resize->error field is protected by the kvm->lock. If there is an active worker, instead of releasing, we mark the structure as stale by unlinking it from kvm_struct. In the worker thread we check for a stale structure (with kvm->lock held), and in that case abort, releasing the stale structure ourselves. We make the check both before and after the actual allocation. Strictly, only the check afterwards is needed; the check before is an optimization: if the structure happens to become stale before the worker thread is dispatched, rather than during the allocation, it means we can avoid allocating then immediately freeing a potentially substantial amount of memory. This fixes the following or similar host kernel crash message: [ 635.277361] Unable to handle kernel paging request for data at address 0x00000000 [ 635.277438] Faulting instruction address: 0xc00000000052f568 [ 635.277446] Oops: Kernel access of bad area, sig: 11 [#1] [ 635.277451] SMP NR_CPUS=2048 NUMA PowerNV [ 635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T) ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T) tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt pps_core crct10dif_common dm_mirror 
dm_region_hash dm_log dm_mod [ 635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G ------------ T 3.10.0.bz1510771+ #1 [ 635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv] [ 635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000 [ 635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0 [ 635.278988] REGS: c0000007e91837f0 TRAP: 0300 Tainted: G ------------ T (3.10.0.bz1510771+) [ 635.279077] MSR: 9000000100009033 CR: 24002022 XER: 00000000 [ 635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1 GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10 GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000 GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0 GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40 GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000 GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7 GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000 GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10 [ 635.280149] NIP [c00000000052f568] __list_add+0x38/0x110 [ 635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 [ 635.280253] Call Trace: [ 635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 [ 635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70 [ 635.280426] [c0000007e9183ba0] [d00000000d24da04] resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv] [ 635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680 [ 635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520 [ 635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100 [ 635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4 [ 635.280814] Instruction dump: [ 635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 
fbe1fff8 7c9e2378 7c7f1b78 f8010010 [ 635.281099] f821ff81 e8a50008 7fa52040 40de00b8 7fbd2840 40de008c 7fbff040 [ 635.281324] ---[ end trace b628b73449719b9d ]--- Cc: stable@vger.kernel.org # v4.10+ Fixes: b5baa6877315 ("KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation") Signed-off-by: Serhii Popovych [dwg: Replaced BUG_ON()s with WARN_ONs() and reworded commit message for clarity] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 54 ++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f5f2c6bf5856..8355398f0bb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1419,16 +1419,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { - BUG_ON(kvm->arch.resize_hpt != resize); + if (WARN_ON(!mutex_is_locked(&kvm->lock))) + return; if (!resize) return; - if (resize->hpt.virt) - kvmppc_free_hpt(&resize->hpt); + if (resize->error != -EBUSY) { + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + kfree(resize); + } - kvm->arch.resize_hpt = NULL; - kfree(resize); + if (kvm->arch.resize_hpt == resize) + kvm->arch.resize_hpt = NULL; } static void resize_hpt_prepare_work(struct work_struct *work) @@ -1437,26 +1441,42 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm_resize_hpt, work); struct kvm *kvm = resize->kvm; - int err; + int err = 0; if (WARN_ON(resize->error != -EBUSY)) return; - resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", - resize->order); - - err = resize_hpt_allocate(resize); - - /* We have strict assumption about -EBUSY - * when preparing for HPT resize. - */ - if (WARN_ON(err == -EBUSY)) - err = -EINPROGRESS; - mutex_lock(&kvm->lock); + /* Request is still current? 
*/ + if (kvm->arch.resize_hpt == resize) { + /* We may request large allocations here: + * do not sleep with kvm->lock held for a while. + */ + mutex_unlock(&kvm->lock); + + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); + + err = resize_hpt_allocate(resize); + + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + + mutex_lock(&kvm->lock); + /* It is possible that kvm->arch.resize_hpt != resize + * after we grab kvm->lock again. + */ + } + resize->error = err; + if (kvm->arch.resize_hpt != resize) + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); } From cfe17c9bbe6a673fdafdab179c32b355ed447f66 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Nov 2017 21:15:13 +0900 Subject: [PATCH 003/236] kbuild: move cc-option and cc-disable-warning after incl. arch Makefile Geert reported commit ae6b289a3789 ("kbuild: Set KBUILD_CFLAGS before incl. arch Makefile") broke cross-compilation using a cross-compiler that supports less compiler options than the host compiler. For example, cc1: error: unrecognized command line option "-Wno-unused-but-set-variable" This problem happens on architectures that setup CROSS_COMPILE in their arch/*/Makefile. Move the cc-option and cc-disable-warning back to the original position, but keep the Clang target options untouched. Fixes: ae6b289a3789 ("kbuild: Set KBUILD_CFLAGS before incl. 
arch Makefile") Reported-by: Geert Uytterhoeven Signed-off-by: Masahiro Yamada Tested-by: Geert Uytterhoeven --- Makefile | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index c988e46a53cd..477c4cf01cae 100644 --- a/Makefile +++ b/Makefile @@ -484,26 +484,6 @@ CLANG_GCC_TC := --gcc-toolchain=$(GCC_TOOLCHAIN) endif KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC) -KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) -KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) -KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) -KBUILD_CFLAGS += $(call cc-disable-warning, gnu) -KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) -# Quiet clang warning: comparison of unsigned expression < 0 is always false -KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) -# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the -# source of a reference will be _MergedGlobals and not on of the whitelisted names. -# See modpost pattern 2 -KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,) -KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior) -KBUILD_CFLAGS += $(call cc-option, -no-integrated-as) -KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) -else - -# These warnings generated too much noise in a regular build. 
-# Use make W=1 to enable them (see scripts/Makefile.extrawarn) -KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) -KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) endif ifeq ($(config-targets),1) @@ -716,6 +696,29 @@ ifdef CONFIG_CC_STACKPROTECTOR endif KBUILD_CFLAGS += $(stackp-flag) +ifeq ($(cc-name),clang) +KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable) +KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) +KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +# Quiet clang warning: comparison of unsigned expression < 0 is always false +KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare) +# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the +# source of a reference will be _MergedGlobals and not on of the whitelisted names. +# See modpost pattern 2 +KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,) +KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior) +KBUILD_CFLAGS += $(call cc-option, -no-integrated-as) +KBUILD_AFLAGS += $(call cc-option, -no-integrated-as) +else + +# These warnings generated too much noise in a regular build. +# Use make W=1 to enable them (see scripts/Makefile.extrawarn) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) +endif + ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else From e7fd37ba12170cc414be8b639dfc2c5f7172fac2 Mon Sep 17 00:00:00 2001 From: Ma Shimiao Date: Tue, 12 Dec 2017 09:43:49 +0800 Subject: [PATCH 004/236] cgroup: avoid copying strings longer than the buffers cgroup root name and file name have max length limit, we should avoid copying longer name than that to the name. tj: minor update to $SUBJ. 
Signed-off-by: Ma Shimiao Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe147f24..18d71fbd3923 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } From 17278a91e04f858155d54bee5528ba4fbcec6f87 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Nov 2017 12:01:20 +0000 Subject: [PATCH 005/236] MIPS: CPS: Fix r1 .set mt assembler warning MIPS CPS has a build warning on kernels configured for MIPS32R1 or MIPS64R1, due to the use of .set mt without a prior .set mips{32,64}r2: arch/mips/kernel/cps-vec.S Assembler messages: arch/mips/kernel/cps-vec.S:238: Warning: the `mt' extension requires MIPS32 revision 2 or greater Add .set MIPS_ISA_LEVEL_RAW before .set mt to silence the warning. 
Fixes: 245a7868d2f2 ("MIPS: smp-cps: rework core/VPE initialisation") Signed-off-by: James Hogan Cc: Paul Burton Cc: James Hogan Cc: James Hogan Cc: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17699/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cps-vec.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S index c7ed26029cbb..e68e6e04063a 100644 --- a/arch/mips/kernel/cps-vec.S +++ b/arch/mips/kernel/cps-vec.S @@ -235,6 +235,7 @@ LEAF(mips_cps_core_init) has_mt t0, 3f .set push + .set MIPS_ISA_LEVEL_RAW .set mt /* Only allow 1 TC per VPE to execute... */ @@ -388,6 +389,7 @@ LEAF(mips_cps_boot_vpes) #elif defined(CONFIG_MIPS_MT) .set push + .set MIPS_ISA_LEVEL_RAW .set mt /* If the core doesn't support MT then return */ From a03fe72572c12e98f4173f8a535f32468e48b6ec Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:51:35 +0000 Subject: [PATCH 006/236] MIPS: Factor out NT_PRFPREG regset access helpers In preparation to fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") FCSR access regression factor out NT_PRFPREG regset access helpers for the non-MSA and the MSA variants respectively, to avoid having to deal with excessive indentation in the actual fix. No functional change, however use `target->thread.fpu.fpr[0]' rather than `target->thread.fpu.fpr[i]' for FGR holding type size determination as there's no `i' variable to refer to anymore, and for the factored out `i' variable declaration use `unsigned int' rather than `unsigned' as its type, following the common style. Signed-off-by: Maciej W. 
Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17925/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 108 +++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index efbd8df8b665..62e8ffd9370a 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -419,25 +419,36 @@ static int gpr64_set(struct task_struct *target, #endif /* CONFIG_64BIT */ -static int fpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots + * correspond 1:1 to buffer slots. + */ +static int fpr_get_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) { - unsigned i; - int err; + return user_regset_copyout(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} + +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's + * general register slots are copied to buffer slots. 
+ */ +static int fpr_get_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) +{ + unsigned int i; u64 fpr_val; - - /* XXX fcr31 */ - - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + int err; for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); - err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + err = user_regset_copyout(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -447,27 +458,54 @@ static int fpr_get(struct task_struct *target, return 0; } -static int fpr_set(struct task_struct *target, +/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + void *kbuf, void __user *ubuf) { - unsigned i; int err; - u64 fpr_val; /* XXX fcr31 */ - init_fp_ctx(target); + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + return err; +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP + * context's general register slots. 
+ */ +static int fpr_set_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + return user_regset_copyin(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 + * bits only of FP context's general register slots. + */ +static int fpr_set_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { - err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -478,6 +516,26 @@ static int fpr_set(struct task_struct *target, return 0; } +/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +static int fpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int err; + + /* XXX fcr31 */ + + init_fp_ctx(target); + + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} + enum mips_regset { REGSET_GPR, REGSET_FPR, From dc24d0edf33c3e15099688b6bbdf7bdc24bf6e91 Mon Sep 17 00:00:00 2001 From: "Maciej W. 
Rozycki" Date: Mon, 11 Dec 2017 22:52:15 +0000 Subject: [PATCH 007/236] MIPS: Guard against any partial write attempt with PTRACE_SETREGSET Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and ensure that no partial register write attempt is made with PTRACE_SETREGSET, as we do not preinitialize any temporaries used to hold incoming register data and consequently random data could be written. It is the responsibility of the caller, such as `ptrace_regset', to arrange for writes to span whole registers only, so here we only assert that it has indeed happened. Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17926/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 62e8ffd9370a..7fcadaaf330f 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -516,7 +516,15 @@ static int fpr_set_msa(struct task_struct *target, return 0; } -/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * + * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', + * which is supposed to have been guaranteed by the kernel before + * calling us, e.g. in `ptrace_regset'. We enforce that requirement, + * so that we can safely avoid preinitializing temporaries for + * partial register writes. 
+ */ static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, @@ -524,6 +532,8 @@ static int fpr_set(struct task_struct *target, { int err; + BUG_ON(count % sizeof(elf_fpreg_t)); + /* XXX fcr31 */ init_fp_ctx(target); From 80b3ffce0196ea50068885d085ff981e4b8396f4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:53:14 +0000 Subject: [PATCH 008/236] MIPS: Consistently handle buffer counter with PTRACE_SETREGSET Update commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") bug and consistently consume all data supplied to `fpr_set_msa' with the ptrace(2) PTRACE_SETREGSET request, such that a zero data buffer counter is returned where insufficient data has been given to fill a whole number of FP general registers. In reality this is not going to happen, as the caller is supposed to only supply data covering a whole number of registers and it is verified in `ptrace_regset' and again asserted in `fpr_set', however structuring code such that the presence of trailing partial FP general register data causes `fpr_set_msa' to return with a non-zero data buffer counter makes it appear that this trailing data will be used if there are subsequent writes made to FP registers, which is going to be the case with the FCSR once the missing write to that register has been fixed. Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Signed-off-by: Maciej W. 
Rozycki Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.11+ Patchwork: https://patchwork.linux-mips.org/patch/17927/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 7fcadaaf330f..47a01d5f26ea 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -504,7 +504,7 @@ static int fpr_set_msa(struct task_struct *target, int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) { err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); From be07a6a1188372b6d19a3307ec33211fc9c9439d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:54:33 +0000 Subject: [PATCH 009/236] MIPS: Fix an FCSR access API regression with NT_PRFPREG and MSA Fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") public API regression, then activated by commit 1db1af84d6df ("MIPS: Basic MSA context switching support"), that caused the FCSR register not to be read or written for CONFIG_CPU_HAS_MSA kernel configurations (regardless of actual presence or absence of the MSA feature in a given processor) with ptrace(2) PTRACE_GETREGSET and PTRACE_SETREGSET requests nor recorded in core dumps. This is because with !CONFIG_CPU_HAS_MSA configurations the whole of `elf_fpregset_t' array is bulk-copied as it is, which includes the FCSR in one half of the last, 33rd slot, whereas with CONFIG_CPU_HAS_MSA configurations array elements are copied individually, and then only the leading 32 FGR slots while the remaining slot is ignored. 
Correct the code then such that only FGR slots are copied in the respective !MSA and MSA helpers and then the FCSR slot is handled separately in common code. Use `ptrace_setfcr31' to update the FCSR too, so that the read-only mask is respected. Retrieving a correct value of FCSR is important in debugging not only for the human to be able to get the right interpretation of the situation, but for correct operation of GDB as well. This is because the condition code bits in FCSR are used by GDB to determine the location to place a breakpoint at when single-stepping through an FPU branch instruction. If such a breakpoint is placed incorrectly (i.e. with the condition reversed), then it will be missed, likely causing the debuggee to run away from the control of GDB and consequently breaking the process of investigation. Fortunately GDB continues using the older PTRACE_GETFPREGS ptrace(2) request which is unaffected, so the regression only really hits with post-mortem debug sessions using a core dump file, in which case execution, and consequently single-stepping through branches is not possible. Of course core files created by buggy kernels out there will have the value of FCSR recorded clobbered, but such core files cannot be corrected and the person using them simply will have to be aware that the value of FCSR retrieved is not reliable. Which also means we can likely get away without defining a replacement API which would ensure a correct value of FCSR to be retrieved, or none at all. This is based on previous work by Alex Smith, extensively rewritten. Signed-off-by: Alex Smith Signed-off-by: James Hogan Signed-off-by: Maciej W. 
Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: Paul Burton Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17928/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 47a01d5f26ea..0a939593ccb7 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -422,7 +422,7 @@ static int gpr64_set(struct task_struct *target, /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots - * correspond 1:1 to buffer slots. + * correspond 1:1 to buffer slots. Only general registers are copied. */ static int fpr_get_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -430,13 +430,14 @@ static int fpr_get_fpa(struct task_struct *target, { return user_regset_copyout(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's - * general register slots are copied to buffer slots. + * general register slots are copied to buffer slots. Only general + * registers are copied. */ static int fpr_get_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -458,20 +459,29 @@ static int fpr_get_msa(struct task_struct *target, return 0; } -/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. 
+ */ static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); int err; - /* XXX fcr31 */ - if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); return err; } @@ -479,7 +489,7 @@ static int fpr_get(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP - * context's general register slots. + * context's general register slots. Only general registers are copied. */ static int fpr_set_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -487,13 +497,14 @@ static int fpr_set_fpa(struct task_struct *target, { return user_regset_copyin(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 - * bits only of FP context's general register slots. + * bits only of FP context's general register slots. Only general + * registers are copied. */ static int fpr_set_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -518,6 +529,8 @@ static int fpr_set_msa(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. 
* * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', * which is supposed to have been guaranteed by the kernel before @@ -530,18 +543,30 @@ static int fpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); + u32 fcr31; int err; BUG_ON(count % sizeof(elf_fpreg_t)); - /* XXX fcr31 */ - init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + if (count > 0) { + err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); + if (err) + return err; + + ptrace_setfcr31(target, fcr31); + } return err; } From 006501e039eec411842bb3150c41358867d320c2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:55:40 +0000 Subject: [PATCH 010/236] MIPS: Also verify sizeof `elf_fpreg_t' with PTRACE_SETREGSET Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and like with the PTRACE_GETREGSET ptrace(2) request also apply a BUILD_BUG_ON check for the size of the `elf_fpreg_t' type in the PTRACE_SETREGSET request handler. Signed-off-by: Maciej W. 
Rozycki Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.11+ Patchwork: https://patchwork.linux-mips.org/patch/17929/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0a939593ccb7..256908951a7c 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -447,6 +447,7 @@ static int fpr_get_msa(struct task_struct *target, u64 fpr_val; int err; + BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); err = user_regset_copyout(pos, count, kbuf, ubuf, From c8c5a3a24d395b14447a9a89d61586a913840a3b Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:56:54 +0000 Subject: [PATCH 011/236] MIPS: Disallow outsized PTRACE_SETREGSET NT_PRFPREG regset accesses Complement commit c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") and also reject outsized PTRACE_SETREGSET requests to the NT_PRFPREG regset, like with the NT_PRSTATUS regset. Signed-off-by: Maciej W. 
Rozycki Fixes: c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.17+ Patchwork: https://patchwork.linux-mips.org/patch/17930/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 256908951a7c..0b23b1ad99e6 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -550,6 +550,9 @@ static int fpr_set(struct task_struct *target, BUG_ON(count % sizeof(elf_fpreg_t)); + if (pos + count > sizeof(elf_fpregset_t)) + return -EIO; + init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) From 967a6a07e95c58eb9c1581d22a1d9c2d1929843f Mon Sep 17 00:00:00 2001 From: Masaharu Hayakawa Date: Wed, 13 Dec 2017 11:33:00 +0900 Subject: [PATCH 012/236] mmc: renesas_sdhi: Add MODULE_LICENSE The following error occurs when loading renesas_sdhi_core.c module, so add MODULE_LICENSE("GPL v2"). renesas_sdhi_core: module license 'unspecified' taints kernel. 
Signed-off-by: Masaharu Hayakawa Fixes: 9d08428afb72 ("mmc: renesas-sdhi: make renesas_sdhi_sys_dmac main module file") Cc: # v4.13+ [Shimoda: Added Fixes tag and Cc to the stable ML] Signed-off-by: Yoshihiro Shimoda Reviewed-by: Simon Horman Acked-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index fcf7235d5742..157e1d9e7725 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -667,3 +668,5 @@ int renesas_sdhi_remove(struct platform_device *pdev) return 0; } EXPORT_SYMBOL_GPL(renesas_sdhi_remove); + +MODULE_LICENSE("GPL v2"); From 50034ed49645463a16327cad05694e201e6b4126 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 Dec 2017 05:09:47 -0800 Subject: [PATCH 013/236] cgroup: use strlcpy() instead of strscpy() to avoid spurious warning As long as cft->name is guaranteed to be NUL-terminated, using strlcpy() would work just as well and avoid that warning, so the change below could be folded into that commit. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 18d71fbd3923..f4c2f8cb5748 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? 
ss->name : ss->legacy_name, cft->name); else - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } From 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 Mon Sep 17 00:00:00 2001 From: Prateek Sood Date: Tue, 19 Dec 2017 12:56:57 +0530 Subject: [PATCH 014/236] cgroup: Fix deadlock in cpu hotplug path Deadlock during cgroup migration from cpu hotplug path when a task T is being moved from source to destination cgroup. kworker/0:0 cpuset_hotplug_workfn() cpuset_hotplug_update_tasks() hotplug_update_tasks_legacy() remove_tasks_in_empty_cpuset() cgroup_transfer_tasks() // stuck in iterator loop cgroup_migrate() cgroup_migrate_add_task() In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. Task T will not migrate to destination cgroup. css_task_iter_start() will keep pointing to task T in loop waiting for task T cg_list node to be removed. Task T do_exit() exit_signals() // sets PF_EXITING exit_task_namespaces() switch_task_namespaces() free_nsproxy() put_mnt_ns() drop_collected_mounts() namespace_unlock() synchronize_rcu() _synchronize_rcu_expedited() schedule_work() // on cpu0 low priority worker pool wait_event() // waiting for work item to execute Task T inserted a work item in the worklist of cpu0 low priority worker pool. It is waiting for expedited grace period work item to execute. This work item will only be executed once kworker/0:0 complete execution of cpuset_hotplug_workfn(). 
kworker/0:0 ==> Task T ==>kworker/0:0 In case of PF_EXITING task being migrated from source to destination cgroup, migrate next available task in source cgroup. Signed-off-by: Prateek Sood Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085daab1a..a2c05d2476ac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); From b67336eee3fcb8ecedc6c13e2bf88aacfa3151e2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Nov 2017 09:33:03 +0000 Subject: [PATCH 015/236] MIPS: Validate PR_SET_FP_MODE prctl(2) requests against the ABI of the task Fix an API loophole introduced with commit 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS"), where the caller of prctl(2) is incorrectly allowed to make a change to CP0.Status.FR or CP0.Config5.FRE register bits even if CONFIG_MIPS_O32_FP64_SUPPORT has not been enabled, despite that an executable requesting the mode requested via ELF file annotation would not be allowed to run in the first place, or for n32 and n64 ABI tasks which do not have non-default modes defined at all. Add suitable checks to `mips_set_process_fp_mode' and bail out if an invalid mode change has been requested for the ABI in effect, even if the FPU hardware or emulation would otherwise allow it. Always succeed however without taking any further action if the mode requested is the same as one already in effect, regardless of whether any mode change, should it be requested, would actually be allowed for the task concerned. Signed-off-by: Maciej W. 
Rozycki Fixes: 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS") Reviewed-by: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # 4.0+ Patchwork: https://patchwork.linux-mips.org/patch/17800/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 45d0b6b037ee..57028d49c202 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -705,6 +705,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) struct task_struct *t; int max_users; + /* If nothing to change, return right away, successfully. */ + if (value == mips_get_process_fp_mode(task)) + return 0; + + /* Only accept a mode change if 64-bit FP enabled for o32. */ + if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) + return -EOPNOTSUPP; + + /* And only for o32 tasks. */ + if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS)) + return -EOPNOTSUPP; + /* Check the value is valid */ if (value & ~known_bits) return -EOPNOTSUPP; From 74d0833c659a8a54735e5efdd44f4b225af68586 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 20 Dec 2017 07:09:19 -0800 Subject: [PATCH 016/236] cgroup: fix css_task_iter crash on CSS_TASK_ITER_PROC While teaching css_task_iter to handle skipping over tasks which aren't group leaders, bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS") introduced a silly bug. CSS_TASK_ITER_PROCS is implemented by repeating css_task_iter_advance() while the advanced cursor is pointing to a non-leader thread. However, the cursor variable, @l, wasn't updated when the iteration has to advance to the next css_set and the following repetition would operate on the terminal @l from the previous iteration which isn't pointing to a valid task leading to oopses like the following or infinite looping. 
BUG: unable to handle kernel NULL pointer dereference at 0000000000000254 IP: __task_pid_nr_ns+0xc7/0xf0 PGD 0 P4D 0 Oops: 0000 [#1] SMP ... CPU: 2 PID: 1 Comm: systemd Not tainted 4.14.4-200.fc26.x86_64 #1 Hardware name: System manufacturer System Product Name/PRIME B350M-A, BIOS 3203 11/09/2017 task: ffff88c4baee8000 task.stack: ffff96d5c3158000 RIP: 0010:__task_pid_nr_ns+0xc7/0xf0 RSP: 0018:ffff96d5c315bd50 EFLAGS: 00010206 RAX: 0000000000000000 RBX: ffff88c4b68c6000 RCX: 0000000000000250 RDX: ffffffffa5e47960 RSI: 0000000000000000 RDI: ffff88c490f6ab00 RBP: ffff96d5c315bd50 R08: 0000000000001000 R09: 0000000000000005 R10: ffff88c4be006b80 R11: ffff88c42f1b8004 R12: ffff96d5c315bf18 R13: ffff88c42d7dd200 R14: ffff88c490f6a510 R15: ffff88c4b68c6000 FS: 00007f9446f8ea00(0000) GS:ffff88c4be680000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000254 CR3: 00000007f956f000 CR4: 00000000003406e0 Call Trace: cgroup_procs_show+0x19/0x30 cgroup_seqfile_show+0x4c/0xb0 kernfs_seq_show+0x21/0x30 seq_read+0x2ec/0x3f0 kernfs_fop_read+0x134/0x180 __vfs_read+0x37/0x160 ? security_file_permission+0x9b/0xc0 vfs_read+0x8e/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xa5 RIP: 0033:0x7f94455f942d RSP: 002b:00007ffe81ba2d00 EFLAGS: 00000293 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00005574e2233f00 RCX: 00007f94455f942d RDX: 0000000000001000 RSI: 00005574e2321a90 RDI: 000000000000002b RBP: 0000000000000000 R08: 00005574e2321a90 R09: 00005574e231de60 R10: 00007f94458c8b38 R11: 0000000000000293 R12: 00007f94458c8ae0 R13: 00007ffe81ba3800 R14: 0000000000000000 R15: 00005574e2116560 Code: 04 74 0e 89 f6 48 8d 04 76 48 8d 04 c5 f0 05 00 00 48 8b bf b8 05 00 00 48 01 c7 31 c0 48 8b 0f 48 85 c9 74 18 8b b2 30 08 00 00 <3b> 71 04 77 0d 48 c1 e6 05 48 01 f1 48 3b 51 38 74 09 5d c3 8b RIP: __task_pid_nr_ns+0xc7/0xf0 RSP: ffff96d5c315bd50 Fix it by moving the initialization of the cursor below the repeat label. 
While at it, rename it to @next for readability. Signed-off-by: Tejun Heo Fixes: bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS") Cc: stable@vger.kernel.org # v4.14+ Reported-by: Laura Abbott Reported-by: Bronek Kozicki Reported-by: George Amanakis Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f4c2f8cb5748..2cf06c274e4c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && From 24c0df82ef7919e4d10cf2e4e65d368eb2e8ea21 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2017 12:01:21 +0100 Subject: [PATCH 017/236] netfilter: nf_tables: fix chain filter in nf_tables_dump_rules() ctx->chain may be null now that we have very large object names, so we cannot check for ctx->chain[0] here. 
Fixes: b7263e071aba7 ("netfilter: nf_tables: Allow table names of up to 255 chars") Signed-off-by: Pablo Neira Ayuso Acked-by: Phil Sutter --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 10798b357481..8d4526651661 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2072,7 +2072,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(chain, &table->chains, list) { - if (ctx && ctx->chain[0] && + if (ctx && ctx->chain && strcmp(ctx->chain, chain->name) != 0) continue; From 4c82fd0abb87e20d0d68ef5237e74732352806c8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Dec 2017 12:08:33 +0100 Subject: [PATCH 018/236] netfilter: uapi: correct UNTRACKED conntrack state bit number nft_ct exposes this bit to userspace. This used to be #define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_NUMBER + 1)) (IP_CT_NUMBER is 5, so this was 0x40) .. but this got changed to 8 (0x100) when the untracked object got removed. Replace this with a literal 6 to prevent further incompatible changes in case IP_CT_NUMBER ever increases. 
Fixes: cc41c84b7e7f2 ("netfilter: kill the fake untracked conntrack objects") Reported-by: Li Shuang Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 3fea7709a441..57ccfb32e87f 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -36,7 +36,7 @@ enum ip_conntrack_info { #define NF_CT_STATE_INVALID_BIT (1 << 0) #define NF_CT_STATE_BIT(ctinfo) (1 << ((ctinfo) % IP_CT_IS_REPLY + 1)) -#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_UNTRACKED + 1)) +#define NF_CT_STATE_UNTRACKED_BIT (1 << 6) /* Bitset representing status of connection. */ enum ip_conntrack_status { From 8bea728dce8972e534e6b99fd550f7b5cc3864e8 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 25 Dec 2017 11:34:54 +0800 Subject: [PATCH 019/236] netfilter: nf_tables: fix potential NULL-ptr deref in nf_tables_dump_obj_done() If there is no NFTA_OBJ_TABLE and NFTA_OBJ_TYPE, the c.data will be NULL in nf_tables_getobj(). So before free filter->table in nf_tables_dump_obj_done(), we need to check if filter is NULL first. 
Fixes: e46abbcc05aa ("netfilter: nf_tables: Allow table names of up to 255 chars") Signed-off-by: Hangbin Liu Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d4526651661..07bd4138c84e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4665,8 +4665,10 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) { struct nft_obj_filter *filter = cb->data; - kfree(filter->table); - kfree(filter); + if (filter) { + kfree(filter->table); + kfree(filter); + } return 0; } From 955b1b5a00ba694159a7d3763412597f707c294d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 20 Dec 2017 16:30:50 +0900 Subject: [PATCH 020/236] nvme-pci: move use_sgl initialization to nvme_init_iod() A flag "use_sgl" of "struct nvme_iod" has been used in nvme_init_iod() without being set to any value. It seems like "use_sgl" has been set in either nvme_pci_setup_prps() or nvme_pci_setup_sgls() which occur later than nvme_init_iod(). Make "iod->use_sgl" being set in a proper place, nvme_init_iod(). Also move nvme_pci_use_sgls() up above nvme_init_iod() to make it possible to be called by nvme_init_iod(). 
Signed-off-by: Minwoo Im Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 42 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5800c3c9082..d53550e612bc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -448,12 +448,31 @@ static void **nvme_pci_iod_list(struct request *req) return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); } +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int avg_seg_size; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), + blk_rq_nr_phys_segments(req)); + + if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) + return false; + if (!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); unsigned int size = blk_rq_payload_bytes(rq); + iod->use_sgl = nvme_pci_use_sgls(dev, rq); + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, iod->use_sgl); @@ -604,8 +623,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, dma_addr_t prp_dma; int nprps, i; - iod->use_sgl = false; - length -= (page_size - offset); if (length <= 0) { iod->first_dma = 0; @@ -715,8 +732,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, int entries = iod->nents, i = 0; dma_addr_t sgl_dma; - iod->use_sgl = true; - /* setting the transfer type as SGL */ cmd->flags = NVME_CMD_SGL_METABUF; @@ -770,23 +785,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, return BLK_STS_OK; } -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) -{ - struct 
nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int avg_seg_size; - - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), - blk_rq_nr_phys_segments(req)); - - if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) - return false; - if (!iod->nvmeq->qid) - return false; - if (!sgl_threshold || avg_seg_size < sgl_threshold) - return false; - return true; -} - static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { @@ -806,7 +804,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - if (nvme_pci_use_sgls(dev, req)) + if (iod->use_sgl) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); From cee160fd34b459ace029653436319557a643795a Mon Sep 17 00:00:00 2001 From: Jeff Lien Date: Tue, 19 Dec 2017 13:24:15 -0600 Subject: [PATCH 021/236] nvme: fix sector units when going between formats If you format a device with a 4k sector size back to 512 bytes, the queue limit values for physical block size and minimum IO size were not getting updated; only the logical block size was being updated. This patch adds code to update the physical block and IO minimum sizes. Signed-off-by: Jeff Lien Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1e46e60b8f10..961d6a4af19c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1335,6 +1335,7 @@ static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); + unsigned short bs = 1 << ns->lba_shift; unsigned stream_alignment = 0; if (ns->ctrl->nr_streams && ns->sws && ns->sgs) @@ -1343,7 +1344,10 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); - blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift); + blk_queue_logical_block_size(disk->queue, bs); + blk_queue_physical_block_size(disk->queue, bs); + blk_queue_io_min(disk->queue, bs); + if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); From d5bf4b7f437c250821d40c3e32158729e6b484ce Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Dec 2017 14:54:15 +0200 Subject: [PATCH 022/236] nvme-rdma: fix concurrent reset and reconnect Now ctrl state machine allows to transition from RESETTING to RECONNECTING. In nvme-rdma when we receive a rdma cm DISCONNECTED event, we trigger nvme_rdma_error_recovery. This happens also when we execute a controller reset, issue a cm disconnect request and receive a cm disconnect reply, as a result, the reset work and the error recovery work can run concurrently. Until now the state machine prevented the error recovery work from running as a result of a controller reset (RESETTING -> RECONNECTING was not allowed). To fix this, we adopt the FC state machine approach, we always transition from LIVE to RESETTING and only then to RECONNECTING. 
We do this both for the error recovery work and the controller reset work: 1. transition to RESETTING 2. teardown the controller association 3. transition to RECONNECTING This will restore the protection against reset work and error recovery work from concurrently running together. Fixes: 3cec7f9de448 ("nvme: allow controller RESETTING to RECONNECTING transition") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 37af56596be6..2a0bba7f50cf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -974,12 +974,18 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + nvme_rdma_reconnect_or_remove(ctrl); } static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) { - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; queue_work(nvme_wq, &ctrl->err_work); @@ -1753,6 +1759,12 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto out_fail; From 479a322fb729d657d34706ccf8dd12916f36628f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Dec 2017 15:07:27 +0200 Subject: [PATCH 023/236] nvme-mpath: fix last path removal during traffic In case our last path is removed during traffic, we can end up requeueing the bio(s) but never schedule the 
actual requeue work as upper layers still have open handles on the mpath device node. Fix this by scheduling requeue work if the namespace being removed is the last path in the ns_head path list. Fixes: 32acab3181c7 ("nvme: implement multipath access to nvme subsystems") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 961d6a4af19c..839650e0926a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2991,6 +2991,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_unlock(&ns->ctrl->namespaces_mutex); synchronize_srcu(&ns->head->srcu); + nvme_mpath_check_last_path(ns); nvme_put_ns(ns); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ea1aa5283e8e..a00eabd06427 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -417,6 +417,15 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) rcu_assign_pointer(head->current_path, NULL); } struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); + +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head->disk && list_empty(&head->list)) + kblockd_schedule_work(&head->requeue_work); +} + #else static inline void nvme_failover_req(struct request *req) { @@ -448,6 +457,9 @@ static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns) static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { } +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM From 254beb84faccbe2f4eda0b51924857bdfb679969 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 21 Dec 2017 14:15:47 -0800 Subject: [PATCH 024/236] nvme-fcloop: avoid possible uninitialized variable warning The kbuild test robot send mail of a potential 
use of an uninitialized variable - "tport" in fcloop_delete_targetport() which then calls __targetport_unreg() which uses the variable. It will never be the case it is uninitialized as the call to __targetport_unreg() only occurs if there is a valid nport pointer. And at the time the nport pointer is assigned, the tport variable is set. Remove the warning by assigning a NULL value initially. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 7b75d9de55ab..6a018a0bd6ce 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1085,7 +1085,7 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct fcloop_nport *nport = NULL, *tmpport; - struct fcloop_tport *tport; + struct fcloop_tport *tport = NULL; u64 nodename, portname; unsigned long flags; int ret; From 4307413256ac1e09b8f53e8715af3df9e49beec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20Petten=C3=B2?= Date: Fri, 29 Dec 2017 09:54:25 +0000 Subject: [PATCH 025/236] USB: serial: cp210x: add IDs for LifeScan OneTouch Verio IQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add IDs for the OneTouch Verio IQ that comes with an embedded USB-to-serial converter. 
Signed-off-by: Diego Elio Pettenò Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 7c6273bf5beb..38814225a816 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -124,6 +124,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x10C4, 0x8470) }, /* Juniper Networks BX Series System Console */ { USB_DEVICE(0x10C4, 0x8477) }, /* Balluff RFID */ { USB_DEVICE(0x10C4, 0x84B6) }, /* Starizona Hyperion */ + { USB_DEVICE(0x10C4, 0x85A7) }, /* LifeScan OneTouch Verio IQ */ { USB_DEVICE(0x10C4, 0x85EA) }, /* AC-Services IBUS-IF */ { USB_DEVICE(0x10C4, 0x85EB) }, /* AC-Services CIS-IBUS */ { USB_DEVICE(0x10C4, 0x85F8) }, /* Virtenio Preon32 */ From 55a5ec9b77106ffc05e8c40d7568432bf4696d7b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 2 Jan 2018 11:45:07 -0500 Subject: [PATCH 026/236] Revert "net: core: dev_get_valid_name is now the same as dev_alloc_name_ns" This reverts commit 87c320e51519a83c496ab7bfb4e96c8f9c001e89. Changing the error return code in some situations turns out to be harmful in practice. In particular Michael Ellerman reports that DHCP fails on his powerpc machines, and this revert gets things working again. Johannes Berg agrees that this revert is the best course of action for now. Fixes: 029b6d140550 ("Revert "net: core: maybe return -EEXIST in __dev_alloc_name"") Reported-by: Michael Ellerman Signed-off-by: David S. 
Miller --- net/core/dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 01ee854454a8..0e0ba36eeac9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1146,7 +1146,19 @@ EXPORT_SYMBOL(dev_alloc_name); int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { - return dev_alloc_name_ns(net, dev, name); + BUG_ON(!net); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (strchr(name, '%')) + return dev_alloc_name_ns(net, dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; } EXPORT_SYMBOL(dev_get_valid_name); From 23263ec86a5f44312d2899323872468752324107 Mon Sep 17 00:00:00 2001 From: Eli Cooper Date: Mon, 25 Dec 2017 10:43:49 +0800 Subject: [PATCH 027/236] ip6_tunnel: disable dst caching if tunnel is dual-stack When an ip6_tunnel is in mode 'any', where the transport layer protocol can be either 4 or 41, dst_cache must be disabled. This is because xfrm policies might apply to only one of the two protocols. Caching dst would cause xfrm policies for one protocol incorrectly used for the other. Signed-off-by: Eli Cooper Signed-off-by: David S. 
Miller --- net/ipv6/ip6_tunnel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 931c38f6ff4a..b263c809d8d4 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1074,10 +1074,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } - } else if (!(t->parms.flags & - (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { - /* enable the cache only only if the routing decision does - * not depend on the current inner header value + } else if (t->parms.proto != 0 && !(t->parms.flags & + (IP6_TNL_F_USE_ORIG_TCLASS | + IP6_TNL_F_USE_ORIG_FWMARK))) { + /* enable the cache only if neither the outer protocol nor the + * routing decision depends on the current inner header value */ use_cache = true; } From 52a589d51f1008f62569bf89e95b26221ee76690 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 25 Dec 2017 14:43:58 +0800 Subject: [PATCH 028/236] geneve: update skb dst pmtu on tx path Commit a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") has fixed a performance issue caused by the change of lower dev's mtu for vxlan. The same thing needs to be done for geneve as well. Note that geneve cannot adjust its mtu according to lower dev's mtu when creating it. The performance is very low later when netperfing over it without fixing the mtu manually. This patch could also avoid this issue. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- drivers/net/geneve.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index b718a02a6bb6..0a48b3073d3d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -825,6 +825,13 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(rt)) return PTR_ERR(rt); + if (skb_dst(skb)) { + int mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr) - + GENEVE_BASE_HLEN - info->options_len - 14; + + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + } + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); if (geneve->collect_md) { tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); @@ -864,6 +871,13 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) return PTR_ERR(dst); + if (skb_dst(skb)) { + int mtu = dst_mtu(dst) - sizeof(struct ipv6hdr) - + GENEVE_BASE_HLEN - info->options_len - 14; + + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + } + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); if (geneve->collect_md) { prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); From 2fa771be953a17f8e0a9c39103464c2574444c62 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 25 Dec 2017 14:45:12 +0800 Subject: [PATCH 029/236] ip6_tunnel: allow ip6gre dev mtu to be set below 1280 Commit 582442d6d5bc ("ipv6: Allow the MTU of ipip6 tunnel to be set below 1280") fixed a mtu setting issue. It works for ipip6 tunnel. But ip6gre dev updates the mtu also with ip6_tnl_change_mtu. Since the inner packet over ip6gre can be ipv4 and it's mtu should also be allowed to set below 1280, the same issue also exists on ip6gre. This patch is to fix it by simply changing to check if parms.proto is IPPROTO_IPV6 in ip6_tnl_change_mtu instead, to make ip6gre to go to 'else' branch. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- net/ipv6/ip6_tunnel.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b263c809d8d4..9a7cf355bc8c 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1677,11 +1677,11 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); - if (tnl->parms.proto == IPPROTO_IPIP) { - if (new_mtu < ETH_MIN_MTU) + if (tnl->parms.proto == IPPROTO_IPV6) { + if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { - if (new_mtu < IPV6_MIN_MTU) + if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (new_mtu > 0xFFF8 - dev->hard_header_len) From 8764a8267b128405cf383157d5e9a4a3735d2409 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 25 Dec 2017 08:57:35 +0100 Subject: [PATCH 030/236] mlxsw: spectrum_router: Fix NULL pointer deref When we remove the neighbour associated with a nexthop we should always refuse to write the nexthop to the adjacency table. Regardless if it is already present in the table or not. Otherwise, we risk dereferencing the NULL pointer that was set instead of the neighbour. Fixes: a7ff87acd995 ("mlxsw: spectrum_router: Implement next-hop routing") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index be657b8533f0..434b3922b34f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3228,7 +3228,7 @@ static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh, { if (!removing) nh->should_offload = 1; - else if (nh->offloaded) + else nh->should_offload = 0; nh->update = 1; } From 90045fc9c78855bdc625a0ab185d97b72a937613 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 25 Dec 2017 09:05:33 +0100 Subject: [PATCH 031/236] mlxsw: spectrum: Relax sanity checks during enslavement Since commit 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have uppers") the driver forbids enslavement to netdevs that already have uppers of their own, as this can result in various ordering problems. This requirement proved to be too strict for some users who need to be able to enslave ports to a bridge that already has uppers. In this case, we can allow the enslavement if the bridge is already known to us, as any configuration performed on top of the bridge was already reflected to the device. Fixes: 25cc72a33835 ("mlxsw: spectrum: Forbid linking to devices that have uppers") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Tested-by: Alexander Petrovskiy Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 11 +++++++++-- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 ++ .../net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 6 ++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 9bd8d28de152..c3837ca7a705 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4376,7 +4376,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, } if (!info->linking) break; - if (netdev_has_any_upper_dev(upper_dev)) { + if (netdev_has_any_upper_dev(upper_dev) && + (!netif_is_bridge_master(upper_dev) || + !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp, + upper_dev))) { NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported"); return -EINVAL; @@ -4504,6 +4507,7 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, u16 vid) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct netdev_notifier_changeupper_info *info = ptr; struct netlink_ext_ack *extack; struct net_device *upper_dev; @@ -4520,7 +4524,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, } if (!info->linking) break; - if (netdev_has_any_upper_dev(upper_dev)) { + if (netdev_has_any_upper_dev(upper_dev) && + (!netif_is_bridge_master(upper_dev) || + !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp, + upper_dev))) { NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported"); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 432ab9b12b7f..05ce1befd9b3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -365,6 +365,8 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *brport_dev, struct net_device *br_dev); +bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *br_dev); /* spectrum.c */ int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 7b8548e25ae7..593ad31be749 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -152,6 +152,12 @@ mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge, return NULL; } +bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, + const struct net_device *br_dev) +{ + return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); +} + static struct mlxsw_sp_bridge_device * mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge, struct net_device *br_dev) From 5a371cf87e145b86efd32007e46146e78c1eff6d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 31 Dec 2017 15:33:14 +0200 Subject: [PATCH 032/236] IB/mlx4: Fix mlx4_ib_alloc_mr error flow ibmr.device is being set only after ib_alloc_mr() completes successfully. Therefore, in case mlx4_mr_enable() returns an error, the error flow unwinder calls mlx4_free_priv_pages(), which uses ibmr.device. Such usage causes a NULL dereference oops; to fix it, the IB device should be set in the mr struct at an earlier stage (e.g. prior to calling mlx4_free_priv_pages()). 
Fixes: 1b2cd0fc673c ("IB/mlx4: Support the new memory registration API") Signed-off-by: Nitzan Carmi Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 313bfb9ccb71..4975f3e6596e 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -642,7 +642,6 @@ struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, goto err_free_mr; mr->max_pages = max_num_sg; - err = mlx4_mr_enable(dev->dev, &mr->mmr); if (err) goto err_free_pl; @@ -653,6 +652,7 @@ struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, return &mr->ibmr; err_free_pl: + mr->ibmr.device = pd->device; mlx4_free_priv_pages(mr); err_free_mr: (void) mlx4_mr_free(dev->dev, &mr->mmr); From 16ba3defb8bd01a9464ba4820a487f5b196b455b Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Sun, 31 Dec 2017 15:33:15 +0200 Subject: [PATCH 033/236] IB/ipoib: Fix race condition in neigh creation When using enhanced mode for IPoIB, two threads may execute xmit in parallel to two different TX queues while the target is the same. In this case, both of them will add the same neighbor to the path's neigh link list and we might see the following message: list_add double add: new=ffff88024767a348, prev=ffff88024767a348... WARNING: lib/list_debug.c:31__list_add_valid+0x4e/0x70 ipoib_start_xmit+0x477/0x680 [ib_ipoib] dev_hard_start_xmit+0xb9/0x3e0 sch_direct_xmit+0xf9/0x250 __qdisc_run+0x176/0x5d0 __dev_queue_xmit+0x1f5/0xb10 __dev_queue_xmit+0x55/0xb10 Analysis: Two SKB are scheduled to be transmitted from two cores. In ipoib_start_xmit, both gets NULL when calling ipoib_neigh_get. Two calls to neigh_add_path are made. One thread takes the spin-lock and calls ipoib_neigh_alloc which creates the neigh structure, then (after the __path_find) the neigh is added to the path's neigh link list. 
When the second thread enters the critical section it also calls ipoib_neigh_alloc but in this case it gets the already allocated ipoib_neigh structure, which is already linked to the path's neigh link list and adds it again to the list. Which beside of triggering the list, it creates a loop in the linked list. This loop leads to endless loop inside path_rec_completion. Solution: Check list_empty(&neigh->list) before adding to the list. Add a similar fix in "ipoib_multicast.c::ipoib_mcast_send" Fixes: b63b70d87741 ('IPoIB: Use a private hash table for path lookup in xmit path') Signed-off-by: Erez Shitrit Reviewed-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 25 +++++++++++++------ .../infiniband/ulp/ipoib/ipoib_multicast.c | 5 +++- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 12b7f911f0e5..8880351df179 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -902,8 +902,8 @@ static int path_rec_start(struct net_device *dev, return 0; } -static void neigh_add_path(struct sk_buff *skb, u8 *daddr, - struct net_device *dev) +static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); @@ -917,7 +917,15 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); - return; + return NULL; + } + + /* To avoid race condition, make sure that the + * neigh will be added only once. 
+ */ + if (unlikely(!list_empty(&neigh->list))) { + spin_unlock_irqrestore(&priv->lock, flags); + return neigh; } path = __path_find(dev, daddr + 4); @@ -956,7 +964,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(daddr)); ipoib_neigh_put(neigh); - return; + return NULL; } } else { neigh->ah = NULL; @@ -973,7 +981,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); - return; + return NULL; err_path: ipoib_neigh_free(neigh); @@ -983,6 +991,8 @@ err_drop: spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); + + return NULL; } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, @@ -1091,8 +1101,9 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) case htons(ETH_P_TIPC): neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (unlikely(!neigh)) { - neigh_add_path(skb, phdr->hwaddr, dev); - return NETDEV_TX_OK; + neigh = neigh_add_path(skb, phdr->hwaddr, dev); + if (likely(!neigh)) + return NETDEV_TX_OK; } break; case htons(ETH_P_ARP): diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 93e149efc1f5..9b3f47ae2016 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -816,7 +816,10 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) spin_lock_irqsave(&priv->lock, flags); if (!neigh) { neigh = ipoib_neigh_alloc(daddr, dev); - if (neigh) { + /* Make sure that the neigh will be added only + * once to mcast list. 
+ */ if (neigh && list_empty(&neigh->list)) { kref_get(&mcast->ah->ref); neigh->ah = mcast->ah; list_add_tail(&neigh->list, &mcast->neigh_list); From 3bb23421a504f01551b7cb9dff0e41dbf16656b0 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 26 Dec 2017 07:48:51 +0200 Subject: [PATCH 034/236] net/sched: Fix update of lastuse in act modules implementing stats_update We need to update lastuse to the most recent value between what is already set and the new value. If HW matching fails, i.e. because of an issue, the stats are not updated but it could be that software did match and updated lastuse. Fixes: 5712bf9c5c30 ("net/sched: act_mirred: Use passed lastuse argument") Fixes: 9fea47d93bcc ("net/sched: act_gact: Update statistics when offloaded to hardware") Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_gact.c | 2 +- net/sched/act_mirred.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e29a48ef7fc3..a0ac42b3ed06 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -159,7 +159,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8b3e59388480..08b61849c2a2 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -239,7 +239,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, struct tcf_t *tm = &m->tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); - tm->lastuse = lastuse; + tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, From 
d02fd6e7d2933ede6478a15f9e4ce8a93845824e Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 26 Dec 2017 21:44:32 +0800 Subject: [PATCH 035/236] macvlan: Fix one possible double free Because the macvlan_uninit would free the macvlan port, so there is one double free case in macvlan_common_newlink. When the macvlan port is just created, then register_netdevice or netdev_upper_dev_link failed and they would invoke macvlan_uninit. Then it would reach the macvlan_port_destroy which triggers the double free. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index a178c5efd33e..a0f2be81d52e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1444,9 +1444,14 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, return 0; unregister_netdev: + /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); + return err; destroy_macvlan_port: - if (create) + /* the macvlan port may be freed by macvlan_uninit when fail to register. + * so we destroy the macvlan port only when it's valid. + */ + if (create && macvlan_port_get_rtnl(dev)) macvlan_port_destroy(port->dev); return err; } From ac817f5ad066697e4d4d35ec68c974eba2c5f17a Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 26 Dec 2017 23:15:12 +0000 Subject: [PATCH 036/236] phylink: ensure we report link down when LOS asserted Although we disable the netdev carrier, we fail to report in the kernel log that the link went down. Fix this. Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: Russell King Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phylink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 827f3f92560e..150cd95a6e1e 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1429,9 +1429,8 @@ static void phylink_sfp_link_down(void *upstream) WARN_ON(!lockdep_rtnl_is_held()); set_bit(PHYLINK_DISABLE_LINK, &pl->phylink_disable_state); + queue_work(system_power_efficient_wq, &pl->resolve); flush_work(&pl->resolve); - - netif_carrier_off(pl->netdev); } static void phylink_sfp_link_up(void *upstream) From 0b2122e4934c7783d336397864e34ee53aad0965 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 26 Dec 2017 23:15:17 +0000 Subject: [PATCH 037/236] sfp: fix sfp-bus oops when removing socket/upstream When we remove a socket or upstream, and the other side isn't registered, we dereference a NULL pointer, causing a kernel oops. Fix this. Fixes: ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages") Signed-off-by: Russell King Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/sfp-bus.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 8a1b1f4c1b7c..ab64a142b832 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -356,7 +356,8 @@ EXPORT_SYMBOL_GPL(sfp_register_upstream); void sfp_unregister_upstream(struct sfp_bus *bus) { rtnl_lock(); - sfp_unregister_bus(bus); + if (bus->sfp) + sfp_unregister_bus(bus); bus->upstream = NULL; bus->netdev = NULL; rtnl_unlock(); @@ -459,7 +460,8 @@ EXPORT_SYMBOL_GPL(sfp_register_socket); void sfp_unregister_socket(struct sfp_bus *bus) { rtnl_lock(); - sfp_unregister_bus(bus); + if (bus->netdev) + sfp_unregister_bus(bus); bus->sfp_dev = NULL; bus->sfp = NULL; bus->socket_ops = NULL; From 0b76aae741abb9d16d2c0e67f8b1e766576f897d Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Wed, 6 Dec 2017 02:26:29 +0530 Subject: [PATCH 038/236] e1000: fix disabling already-disabled warning This patch adds check so that driver does not disable already disabled device. [ 44.637743] advantechwdt: Unexpected close, not stopping watchdog! 
[ 44.997548] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input6 [ 45.013419] e1000 0000:00:03.0: disabling already-disabled device [ 45.013447] ------------[ cut here ]------------ [ 45.014868] WARNING: CPU: 1 PID: 71 at drivers/pci/pci.c:1641 pci_disable_device+0xa1/0x105: pci_disable_device at drivers/pci/pci.c:1640 [ 45.016171] CPU: 1 PID: 71 Comm: rcu_perf_shutdo Not tainted 4.14.0-01330-g3c07399 #1 [ 45.017197] task: ffff88011bee9e40 task.stack: ffffc90000860000 [ 45.017987] RIP: 0010:pci_disable_device+0xa1/0x105: pci_disable_device at drivers/pci/pci.c:1640 [ 45.018603] RSP: 0000:ffffc90000863e30 EFLAGS: 00010286 [ 45.019282] RAX: 0000000000000035 RBX: ffff88013a230008 RCX: 0000000000000000 [ 45.020182] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000203 [ 45.021084] RBP: ffff88013a3f31e8 R08: 0000000000000001 R09: 0000000000000000 [ 45.021986] R10: ffffffff827ec29c R11: 0000000000000002 R12: 0000000000000001 [ 45.022946] R13: ffff88013a230008 R14: ffff880117802b20 R15: ffffc90000863e8f [ 45.023842] FS: 0000000000000000(0000) GS:ffff88013fd00000(0000) knlGS:0000000000000000 [ 45.024863] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 45.025583] CR2: ffffc900006d4000 CR3: 000000000220f000 CR4: 00000000000006a0 [ 45.026478] Call Trace: [ 45.026811] __e1000_shutdown+0x1d4/0x1e2: __e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5162 [ 45.027344] ? rcu_perf_cleanup+0x2a1/0x2a1: rcu_perf_shutdown at kernel/rcu/rcuperf.c:627 [ 45.027883] e1000_shutdown+0x14/0x3a: e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5235 [ 45.028351] device_shutdown+0x110/0x1aa: device_shutdown at drivers/base/core.c:2807 [ 45.028858] kernel_power_off+0x31/0x64: kernel_power_off at kernel/reboot.c:260 [ 45.029343] rcu_perf_shutdown+0x9b/0xa7: rcu_perf_shutdown at kernel/rcu/rcuperf.c:637 [ 45.029852] ? 
__wake_up_common_lock+0xa2/0xa2: autoremove_wake_function at kernel/sched/wait.c:376 [ 45.030414] kthread+0x126/0x12e: kthread at kernel/kthread.c:233 [ 45.030834] ? __kthread_bind_mask+0x8e/0x8e: kthread at kernel/kthread.c:190 [ 45.031399] ? ret_from_fork+0x1f/0x30: ret_from_fork at arch/x86/entry/entry_64.S:443 [ 45.031883] ? kernel_init+0xa/0xf5: kernel_init at init/main.c:997 [ 45.032325] ret_from_fork+0x1f/0x30: ret_from_fork at arch/x86/entry/entry_64.S:443 [ 45.032777] Code: 00 48 85 ed 75 07 48 8b ab a8 00 00 00 48 8d bb 98 00 00 00 e8 aa d1 11 00 48 89 ea 48 89 c6 48 c7 c7 d8 e4 0b 82 e8 55 7d da ff <0f> ff b9 01 00 00 00 31 d2 be 01 00 00 00 48 c7 c7 f0 b1 61 82 [ 45.035222] ---[ end trace c257137b1b1976ef ]--- [ 45.037838] ACPI: Preparing to enter system sleep state S5 Signed-off-by: Tushar Dave Tested-by: Fengguang Wu Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/e1000/e1000.h | 3 ++- drivers/net/ethernet/intel/e1000/e1000_main.c | 27 +++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h index d7bdea79e9fa..8fd2458060a0 100644 --- a/drivers/net/ethernet/intel/e1000/e1000.h +++ b/drivers/net/ethernet/intel/e1000/e1000.h @@ -331,7 +331,8 @@ struct e1000_adapter { enum e1000_state_t { __E1000_TESTING, __E1000_RESETTING, - __E1000_DOWN + __E1000_DOWN, + __E1000_DISABLED }; #undef pr_fmt diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 1982f7917a8d..3dd4aeb2706d 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -945,7 +945,7 @@ static int e1000_init_hw_struct(struct e1000_adapter *adapter, static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct net_device *netdev; - struct e1000_adapter *adapter; + struct e1000_adapter *adapter = NULL; struct e1000_hw *hw; static int 
cards_found; @@ -955,6 +955,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) u16 tmp = 0; u16 eeprom_apme_mask = E1000_EEPROM_APME; int bars, need_ioport; + bool disable_dev = false; /* do not allocate ioport bars when not needed */ need_ioport = e1000_is_need_ioport(pdev); @@ -1259,11 +1260,13 @@ err_mdio_ioremap: iounmap(hw->ce4100_gbe_mdio_base_virt); iounmap(hw->hw_addr); err_ioremap: + disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags); free_netdev(netdev); err_alloc_etherdev: pci_release_selected_regions(pdev, bars); err_pci_reg: - pci_disable_device(pdev); + if (!adapter || disable_dev) + pci_disable_device(pdev); return err; } @@ -1281,6 +1284,7 @@ static void e1000_remove(struct pci_dev *pdev) struct net_device *netdev = pci_get_drvdata(pdev); struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; + bool disable_dev; e1000_down_and_stop(adapter); e1000_release_manageability(adapter); @@ -1299,9 +1303,11 @@ static void e1000_remove(struct pci_dev *pdev) iounmap(hw->flash_address); pci_release_selected_regions(pdev, adapter->bars); + disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags); free_netdev(netdev); - pci_disable_device(pdev); + if (disable_dev) + pci_disable_device(pdev); } /** @@ -5156,7 +5162,8 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake) if (netif_running(netdev)) e1000_free_irq(adapter); - pci_disable_device(pdev); + if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags)) + pci_disable_device(pdev); return 0; } @@ -5200,6 +5207,10 @@ static int e1000_resume(struct pci_dev *pdev) pr_err("Cannot enable PCI device from suspend\n"); return err; } + + /* flush memory to make sure state is correct */ + smp_mb__before_atomic(); + clear_bit(__E1000_DISABLED, &adapter->flags); pci_set_master(pdev); pci_enable_wake(pdev, PCI_D3hot, 0); @@ -5274,7 +5285,9 @@ static pci_ers_result_t e1000_io_error_detected(struct pci_dev *pdev, if 
(netif_running(netdev)) e1000_down(adapter); - pci_disable_device(pdev); + + if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags)) + pci_disable_device(pdev); /* Request a slot slot reset. */ return PCI_ERS_RESULT_NEED_RESET; @@ -5302,6 +5315,10 @@ static pci_ers_result_t e1000_io_slot_reset(struct pci_dev *pdev) pr_err("Cannot re-enable PCI device after reset.\n"); return PCI_ERS_RESULT_DISCONNECT; } + + /* flush memory to make sure state is correct */ + smp_mb__before_atomic(); + clear_bit(__E1000_DISABLED, &adapter->flags); pci_set_master(pdev); pci_enable_wake(pdev, PCI_D3hot, 0); From 4110e02eb45ea447ec6f5459c9934de0a273fb91 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 11 Dec 2017 16:26:40 +0900 Subject: [PATCH 039/236] e1000e: Fix e1000_check_for_copper_link_ich8lan return value. e1000e_check_for_copper_link() and e1000_check_for_copper_link_ich8lan() are the two functions that may be assigned to mac.ops.check_for_link when phy.media_type == e1000_media_type_copper. Commit 19110cfbb34d ("e1000e: Separate signaling for link check/link up") changed the meaning of the return value of check_for_link for copper media but only adjusted the first function. This patch adjusts the second function likewise. Reported-by: Christian Hesse Reported-by: Gabriel C Link: https://bugzilla.kernel.org/show_bug.cgi?id=198047 Fixes: 19110cfbb34d ("e1000e: Separate signaling for link check/link up") Signed-off-by: Benjamin Poirier Tested-by: Aaron Brown Tested-by: Christian Hesse Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index d6d4ed7acf03..31277d3bb7dc 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1367,6 +1367,9 @@ out: * Checks to see of the link status of the hardware has changed. 
If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. + * + * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link + * up). **/ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) { @@ -1382,7 +1385,7 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. */ if (!mac->get_link_status) - return 0; + return 1; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -1613,10 +1616,12 @@ static s32 e1000_check_for_copper_link_ich8lan(struct e1000_hw *hw) * different link partner. */ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) + if (ret_val) { e_dbg("Error configuring flow control\n"); + return ret_val; + } - return ret_val; + return 1; } static s32 e1000_get_variants_ich8lan(struct e1000_adapter *adapter) From bd30ffc414e55194ed6149fad69a145550cb7c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SZ=20Lin=20=28=E6=9E=97=E4=B8=8A=E6=99=BA=29?= Date: Fri, 29 Dec 2017 17:02:17 +0800 Subject: [PATCH 040/236] NET: usb: qmi_wwan: add support for YUGA CLM920-NC5 PID 0x9625 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for PID 0x9625 of YUGA CLM920-NC5. YUGA CLM920-NC5 needs to enable QMI_WWAN_QUIRK_DTR before QMI operation. qmicli -d /dev/cdc-wdm0 -p --dms-get-revision [/dev/cdc-wdm0] Device revision retrieved: Revision: 'CLM920_NC5-V1 1 [Oct 23 2016 19:00:00]' Signed-off-by: SZ Lin (林上智) Acked-by: Bjørn Mork Signed-off-by: David S. 
Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 3000ddd1c7e2..728819feab44 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1100,6 +1100,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x05c6, 0x9084, 4)}, {QMI_FIXED_INTF(0x05c6, 0x920d, 0)}, {QMI_FIXED_INTF(0x05c6, 0x920d, 5)}, + {QMI_QUIRK_SET_DTR(0x05c6, 0x9625, 4)}, /* YUGA CLM920-NC5 */ {QMI_FIXED_INTF(0x0846, 0x68a2, 8)}, {QMI_FIXED_INTF(0x12d1, 0x140c, 1)}, /* Huawei E173 */ {QMI_FIXED_INTF(0x12d1, 0x14ac, 1)}, /* Huawei E1820 */ From f8978bd95cf92f869f3d9b34c1b699f49253b8c6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jan 2018 13:07:15 +0200 Subject: [PATCH 041/236] RDMA/netlink: Fix locking around __ib_get_device_by_index Holding locks is mandatory when calling __ib_device_get_by_index, otherwise there are races during the list iteration with device removal. Since the locks are static to device.c, __ib_device_get_by_index can never be called correctly by any user out side the file. Make the function static and provide a safe function that gets the correct locks and returns a kref'd pointer. Fix all callers. 
Fixes: e5c9469efcb1 ("RDMA/netlink: Add nldev device doit implementation") Fixes: c3f66f7b0052 ("RDMA/netlink: Implement nldev port doit callback") Fixes: 7d02f605f0dc ("RDMA/netlink: Add nldev port dumpit implementation") Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 2 +- drivers/infiniband/core/device.c | 18 +++++++++- drivers/infiniband/core/nldev.c | 54 +++++++++++++++++++---------- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a1d687a664f8..66f0268f37a6 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -314,7 +314,7 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, } #endif -struct ib_device *__ib_device_get_by_index(u32 ifindex); +struct ib_device *ib_device_get_by_index(u32 ifindex); /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 30914f3baa5f..465520627e4b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,7 +134,7 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } -struct ib_device *__ib_device_get_by_index(u32 index) +static struct ib_device *__ib_device_get_by_index(u32 index) { struct ib_device *device; @@ -145,6 +145,22 @@ struct ib_device *__ib_device_get_by_index(u32 index) return NULL; } +/* + * Caller is responsible to return refrerence count by calling put_device() + */ +struct ib_device *ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + down_read(&lists_rwsem); + device = __ib_device_get_by_index(index); + if (device) + get_device(&device->dev); + + up_read(&lists_rwsem); + return device; +} + static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; diff --git 
a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 9a05245a1acf..0dcd1aa6f683 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -142,27 +142,34 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = __ib_device_get_by_index(index); + device = ib_device_get_by_index(index); if (!device) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; + if (!msg) { + err = -ENOMEM; + goto err; + } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); err = fill_dev_info(msg, device); - if (err) { - nlmsg_free(msg); - return err; - } + if (err) + goto err_free; nlmsg_end(msg, nlh); + put_device(&device->dev); return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return err; } static int _nldev_get_dumpit(struct ib_device *device, @@ -220,31 +227,40 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = __ib_device_get_by_index(index); + device = ib_device_get_by_index(index); if (!device) return -EINVAL; port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); - if (!rdma_is_port_valid(device, port)) - return -EINVAL; + if (!rdma_is_port_valid(device, port)) { + err = -EINVAL; + goto err; + } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; + if (!msg) { + err = -ENOMEM; + goto err; + } nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), 0, 0); err = fill_port_info(msg, device, port); - if (err) { - nlmsg_free(msg); - return err; - } + if (err) + goto err_free; nlmsg_end(msg, nlh); + put_device(&device->dev); return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + 
nlmsg_free(msg); +err: + put_device(&device->dev); + return err; } static int nldev_port_get_dumpit(struct sk_buff *skb, @@ -265,7 +281,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, return -EINVAL; ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = __ib_device_get_by_index(ifindex); + device = ib_device_get_by_index(ifindex); if (!device) return -EINVAL; @@ -299,7 +315,9 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, nlmsg_end(skb, nlh); } -out: cb->args[0] = idx; +out: + put_device(&device->dev); + cb->args[0] = idx; return skb->len; } From 71891e2dab6b55a870f8f7735e44a2963860b5c6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 29 Dec 2017 10:02:52 -0800 Subject: [PATCH 042/236] ethtool: do not print warning for applications using legacy API In kernel log this message appears on every boot: "warning: `NetworkChangeNo' uses legacy ethtool link settings API, link modes are only partially reported" When ethtool link settings API changed, it started complaining about usages of old API. Ironically, the original patch was from google but the application using the legacy API is chrome. Linux ABI is fixed as much as possible. The kernel must not break it and should not complain about applications using legacy APIs. This patch just removes the warning since using legacy APIs in Linux is perfectly acceptable. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Stephen Hemminger Signed-off-by: David Decotigny Signed-off-by: David S.
Miller --- net/core/ethtool.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f8fcf450a36e..8225416911ae 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -770,15 +770,6 @@ static int ethtool_set_link_ksettings(struct net_device *dev, return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } -static void -warn_incomplete_ethtool_legacy_settings_conversion(const char *details) -{ - char name[sizeof(current->comm)]; - - pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n", - get_task_comm(name, current), details); -} - /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, @@ -805,10 +796,8 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) &link_ksettings); if (err < 0) return err; - if (!convert_link_ksettings_to_legacy_settings(&cmd, - &link_ksettings)) - warn_incomplete_ethtool_legacy_settings_conversion( - "link modes are only partially reported"); + convert_link_ksettings_to_legacy_settings(&cmd, + &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; From f9c935db8086231a35b7f5c2a53e3f1e10f388ee Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 29 Dec 2017 19:48:02 +0100 Subject: [PATCH 043/236] tipc: fix problems with multipoint-to-point flow control In commit 04d7b574b245 ("tipc: add multipoint-to-point flow control") we introduced a protocol for preventing buffer overflow when many group members try to simultaneously send messages to the same receiving member. Stress test of this mechanism has revealed a couple of related bugs: - When the receiving member receives an advertisement REMIT message from one of the senders, it will sometimes prematurely activate a pending member and send it the remitted advertisement, although the upper limit for active senders has been reached. 
This leads to accumulation of illegal advertisements, and eventually to messages being dropped because of receive buffer overflow. - When the receiving member leaves REMITTED state while a received message is being read, we miss to look at the pending queue, to activate the oldest pending peer. This leads to some pending senders being starved out, and never getting the opportunity to profit from the remitted advertisement. We fix the former in the function tipc_group_proto_rcv() by returning directly from the function once it becomes clear that the remitting peer cannot leave REMITTED state at that point. We fix the latter in the function tipc_group_update_rcv_win() by looking up and activate the longest pending peer when it becomes clear that the remitting peer now can leave REMITTED state. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/net/tipc/group.c b/net/tipc/group.c index 8e12ab55346b..5f4ffae807ee 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -109,7 +109,8 @@ static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { - if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING) + if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || + m->state == MBR_REMITTED) grp->active_cnt--; } @@ -562,7 +563,7 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; - struct tipc_member *m, *rm; + struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) @@ -605,6 +606,17 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } + grp->active_cnt--; + 
list_del_init(&m->list); + if (list_empty(&grp->pending)) + return; + + /* Set oldest pending member to active and advertise */ + pm = list_first_entry(&grp->pending, struct tipc_member, list); + pm->state = MBR_ACTIVE; + list_move_tail(&pm->list, &grp->active); + grp->active_cnt++; + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_DISCOVERED: @@ -742,14 +754,14 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, if (!m || m->state != MBR_RECLAIMING) return; - list_del_init(&m->list); - grp->active_cnt--; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; + m->advertised = ADV_IDLE + in_flight; + return; } /* All messages preceding the REMIT have been read */ if (m->advertised <= remitted) { @@ -761,6 +773,8 @@ void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); m->advertised = ADV_IDLE + in_flight; + grp->active_cnt--; + list_del_init(&m->list); /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) From c0bace798436bca0fdc221ff61143f1376a9c3de Mon Sep 17 00:00:00 2001 From: Felix Janda Date: Mon, 1 Jan 2018 19:33:20 +0100 Subject: [PATCH 044/236] uapi libc compat: add fallback for unsupported libcs libc-compat.h aims to prevent symbol collisions between uapi and libc headers for each supported libc. This requires continuous coordination between them. The goal of this commit is to improve the situation for libcs (such as musl) which are not yet supported and/or do not wish to be explicitly supported, while not affecting supported libcs. More precisely, with this commit, unsupported libcs can request the suppression of any specific uapi definition by defining the correspondings _UAPI_DEF_* macro as 0. 
This can fix symbol collisions for them, as long as the libc headers are included before the uapi headers. Inclusion in the other order is outside the scope of this commit. All infrastructure in order to enable this fallback for unsupported libcs is already in place, except that libc-compat.h unconditionally defines all __UAPI_DEF_* macros to 1 for all unsupported libcs so that any previous definitions are ignored. In order to fix this, this commit merely makes these definitions conditional. This commit together with the musl libc commit http://git.musl-libc.org/cgit/musl/commit/?id=04983f2272382af92eb8f8838964ff944fbb8258 fixes for example the following compiler errors when <linux/in6.h> is included after musl's <netinet/in.h>: ./linux/in6.h:32:8: error: redefinition of 'struct in6_addr' ./linux/in6.h:49:8: error: redefinition of 'struct sockaddr_in6' ./linux/in6.h:59:8: error: redefinition of 'struct ipv6_mreq' The comments referencing glibc are still correct, but this file is not only used for glibc any more. Signed-off-by: Felix Janda Reviewed-by: Hauke Mehrtens Signed-off-by: David S. Miller --- include/uapi/linux/libc-compat.h | 55 +++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index 282875cf8056..8254c937c9f4 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -168,46 +168,99 @@ /* If we did not see any headers from any supported C libraries, * or we are being included in the kernel, then define everything - * that we need. */ + * that we need. Check for previous __UAPI_* definitions to give + * unsupported C libraries a way to opt out of any kernel definition.
*/ #else /* !defined(__GLIBC__) */ /* Definitions for if.h */ +#ifndef __UAPI_DEF_IF_IFCONF #define __UAPI_DEF_IF_IFCONF 1 +#endif +#ifndef __UAPI_DEF_IF_IFMAP #define __UAPI_DEF_IF_IFMAP 1 +#endif +#ifndef __UAPI_DEF_IF_IFNAMSIZ #define __UAPI_DEF_IF_IFNAMSIZ 1 +#endif +#ifndef __UAPI_DEF_IF_IFREQ #define __UAPI_DEF_IF_IFREQ 1 +#endif /* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS #define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1 +#endif /* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO #define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 +#endif /* Definitions for in.h */ +#ifndef __UAPI_DEF_IN_ADDR #define __UAPI_DEF_IN_ADDR 1 +#endif +#ifndef __UAPI_DEF_IN_IPPROTO #define __UAPI_DEF_IN_IPPROTO 1 +#endif +#ifndef __UAPI_DEF_IN_PKTINFO #define __UAPI_DEF_IN_PKTINFO 1 +#endif +#ifndef __UAPI_DEF_IP_MREQ #define __UAPI_DEF_IP_MREQ 1 +#endif +#ifndef __UAPI_DEF_SOCKADDR_IN #define __UAPI_DEF_SOCKADDR_IN 1 +#endif +#ifndef __UAPI_DEF_IN_CLASS #define __UAPI_DEF_IN_CLASS 1 +#endif /* Definitions for in6.h */ +#ifndef __UAPI_DEF_IN6_ADDR #define __UAPI_DEF_IN6_ADDR 1 +#endif +#ifndef __UAPI_DEF_IN6_ADDR_ALT #define __UAPI_DEF_IN6_ADDR_ALT 1 +#endif +#ifndef __UAPI_DEF_SOCKADDR_IN6 #define __UAPI_DEF_SOCKADDR_IN6 1 +#endif +#ifndef __UAPI_DEF_IPV6_MREQ #define __UAPI_DEF_IPV6_MREQ 1 +#endif +#ifndef __UAPI_DEF_IPPROTO_V6 #define __UAPI_DEF_IPPROTO_V6 1 +#endif +#ifndef __UAPI_DEF_IPV6_OPTIONS #define __UAPI_DEF_IPV6_OPTIONS 1 +#endif +#ifndef __UAPI_DEF_IN6_PKTINFO #define __UAPI_DEF_IN6_PKTINFO 1 +#endif +#ifndef __UAPI_DEF_IP6_MTUINFO #define __UAPI_DEF_IP6_MTUINFO 1 +#endif /* Definitions for ipx.h */ +#ifndef __UAPI_DEF_SOCKADDR_IPX #define __UAPI_DEF_SOCKADDR_IPX 1 +#endif +#ifndef __UAPI_DEF_IPX_ROUTE_DEFINITION #define __UAPI_DEF_IPX_ROUTE_DEFINITION 1 +#endif +#ifndef __UAPI_DEF_IPX_INTERFACE_DEFINITION #define 
__UAPI_DEF_IPX_INTERFACE_DEFINITION 1 +#endif +#ifndef __UAPI_DEF_IPX_CONFIG_DATA #define __UAPI_DEF_IPX_CONFIG_DATA 1 +#endif +#ifndef __UAPI_DEF_IPX_ROUTE_DEF #define __UAPI_DEF_IPX_ROUTE_DEF 1 +#endif /* Definitions for xattr.h */ +#ifndef __UAPI_DEF_XATTR #define __UAPI_DEF_XATTR 1 +#endif #endif /* __GLIBC__ */ From c095508770aebf1b9218e77026e48345d719b17c Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Tue, 2 Jan 2018 19:44:34 +0000 Subject: [PATCH 045/236] RDS: Heap OOB write in rds_message_alloc_sgs() When args->nr_local is 0, nr_pages also becomes 0 due to some size calculation via rds_rm_size(), which is later used to allocate pages for DMA; this bug produces a heap out-of-bounds write access to a specific memory region. Signed-off-by: Mohamed Ghannam Signed-off-by: David S. Miller --- net/rds/rdma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index bc2f1e0977d6..94729d9da437 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -525,6 +525,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + if (args->nr_local == 0) + return -EINVAL; + /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++) { if (copy_from_user(&vec, &local_vec[i], From 79d0895140e937ba111e6420b4cd83ee75efa788 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 2 Jan 2018 19:44:37 -0200 Subject: [PATCH 046/236] sctp: fix error path in sctp_stream_init syzbot noticed a NULL pointer dereference panic in sctp_stream_free() which was caused by incomplete error handling in sctp_stream_init(). By not clearing stream->outcnt, it made a for() in sctp_stream_free() think that it had elements to free when it did not, leading to the panic. As suggested by Xin Long, this patch also simplifies the error path by moving it to the only if() that uses it.
See-also: https://www.spinics.net/lists/netdev/msg473756.html See-also: https://www.spinics.net/lists/netdev/msg465024.html Reported-by: syzbot Fixes: f952be79cebd ("sctp: introduce struct sctp_stream_out_ext") Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 76ea66be0bbe..524dfeb94c41 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -156,9 +156,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, sctp_stream_outq_migrate(stream, NULL, outcnt); sched->sched_all(stream); - i = sctp_stream_alloc_out(stream, outcnt, gfp); - if (i) - return i; + ret = sctp_stream_alloc_out(stream, outcnt, gfp); + if (ret) + goto out; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) @@ -170,19 +170,17 @@ in: if (!incnt) goto out; - i = sctp_stream_alloc_in(stream, incnt, gfp); - if (i) { - ret = -ENOMEM; - goto free; + ret = sctp_stream_alloc_in(stream, incnt, gfp); + if (ret) { + sched->free(stream); + kfree(stream->out); + stream->out = NULL; + stream->outcnt = 0; + goto out; } stream->incnt = incnt; - goto out; -free: - sched->free(stream); - kfree(stream->out); - stream->out = NULL; out: return ret; } From f1c8d3720f2e6c8c2b209120678236debd0360e5 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 2 Jan 2018 14:05:19 -0800 Subject: [PATCH 047/236] vxlan: trivial indenting fix. Fix indentation of reserved_flags2 field in vxlanhdr_gpe. Fixes: e1e5314de08b ("vxlan: implement GPE") Signed-off-by: William Tu Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 13223396dc64..f96391e84a8a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -146,7 +146,7 @@ struct vxlanhdr_gpe { np_applied:1, instance_applied:1, version:2, -reserved_flags2:2; + reserved_flags2:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags2:2, version:2, From 64e711ca59ef9b7873d77ef06bc174aa01af9115 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 17 Nov 2017 15:51:47 -0800 Subject: [PATCH 048/236] i40e: Remove UDP support for big buffer Since UDP based filters are not supported via big buffer cloud filters, remove UDP support. Also change a few return types to indicate unsupported vs invalid configuration. Signed-off-by: Amritha Nambiar Acked-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 321d8be80871..fffd4868defb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6038,8 +6038,8 @@ static int i40e_validate_and_set_switch_mode(struct i40e_vsi *vsi) /* Set Bit 7 to be valid */ mode = I40E_AQ_SET_SWITCH_BIT7_VALID; - /* Set L4type to both TCP and UDP support */ - mode |= I40E_AQ_SET_SWITCH_L4_TYPE_BOTH; + /* Set L4type for TCP support */ + mode |= I40E_AQ_SET_SWITCH_L4_TYPE_TCP; /* Set cloud filter mode */ mode |= I40E_AQ_SET_SWITCH_MODE_NON_TUNNEL; @@ -6969,18 +6969,18 @@ static int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, is_valid_ether_addr(filter->src_mac)) || (is_multicast_ether_addr(filter->dst_mac) && is_multicast_ether_addr(filter->src_mac))) - return -EINVAL; + return -EOPNOTSUPP; - /* Make sure port is specified, otherwise bail out, for channel - * 
specific cloud filter needs 'L4 port' to be non-zero + /* Big buffer cloud filter needs 'L4 port' to be non-zero. Also, UDP + * ports are not supported via big buffer now. */ - if (!filter->dst_port) - return -EINVAL; + if (!filter->dst_port || filter->ip_proto == IPPROTO_UDP) + return -EOPNOTSUPP; /* adding filter using src_port/src_ip is not supported at this stage */ if (filter->src_port || filter->src_ipv4 || !ipv6_addr_any(&filter->ip.v6.src_ip6)) - return -EINVAL; + return -EOPNOTSUPP; /* copy element needed to add cloud filter from filter */ i40e_set_cld_element(filter, &cld_filter.element); @@ -6991,7 +6991,7 @@ static int i40e_add_del_cloud_filter_big_buf(struct i40e_vsi *vsi, is_multicast_ether_addr(filter->src_mac)) { /* MAC + IP : unsupported mode */ if (filter->dst_ipv4) - return -EINVAL; + return -EOPNOTSUPP; /* since we validated that L4 port must be valid before * we get here, start with respective "flags" value From e90f686b4358d7d7e5dbaa48b8e78c9a4e41826e Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Wed, 3 Jan 2018 10:39:29 +0800 Subject: [PATCH 049/236] net: fec: restore dev_id in the cases of probe error The static variable dev_id is always incremented by one before the netdev is registered. The dev_id value should be restored in the case of a probe error. Signed-off-by: Fugang Duan Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/fec_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 8184d2fca9be..6a4fc2b35488 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3556,6 +3556,7 @@ failed_phy: of_node_put(phy_node); failed_ioremap: free_netdev(ndev); + dev_id--; return ret; } From 3f38c683033a9a0a2738e7067f449deefabfa3ef Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Wed, 3 Jan 2018 10:39:30 +0800 Subject: [PATCH 050/236] net: fec: defer probe if regulator is not ready Defer probe if regulator is not ready. E.g. some regulator is fixed regulator controlled by i2c expander gpio, the i2c device may be probed after the driver, then it should handle the case of defer probe error. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 6a4fc2b35488..19f198e22e15 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3469,6 +3469,10 @@ fec_probe(struct platform_device *pdev) goto failed_regulator; } } else { + if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) { + ret = -EPROBE_DEFER; + goto failed_regulator; + } fep->reg_phy = NULL; } From 248de22e638f10bd5bfc7624a357f940f66ba137 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 8 Dec 2017 10:55:04 -0800 Subject: [PATCH 051/236] i40e/i40evf: Account for frags split over multiple descriptors in check linearize The original code for __i40e_chk_linearize didn't take into account the fact that if a fragment is 16K in size or larger it has to be split over 2 descriptors and the smaller of those 2 descriptors will be on the trailing edge of the transmit. 
As a result we can get into situations where we didn't catch requests that could result in a Tx hang. This patch takes care of that by subtracting the length of all but the trailing edge of the stale fragment before we test for sum. By doing this we can guarantee that we have all cases covered, including the case of a fragment that spans multiple descriptors. We don't need to worry about checking the inner portions of this since 12K is the maximum aligned DMA size and that is larger than any MSS will ever be since the MTU limit for jumbos is something on the order of 9K. Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 26 ++++++++++++++++--- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 26 ++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 4566d66ffc7c..5bc2748ac468 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3047,10 +3047,30 @@ bool __i40e_chk_linearize(struct sk_buff *skb) /* Walk through fragments adding latest fragment, testing it, and * then removing stale fragments from the sum. */ - stale = &skb_shinfo(skb)->frags[0]; - for (;;) { + for (stale = &skb_shinfo(skb)->frags[0];; stale++) { + int stale_size = skb_frag_size(stale); + sum += skb_frag_size(frag++); + /* The stale fragment may present us with a smaller + * descriptor than the actual fragment size. To account + * for that we need to remove all the data on the front and + * figure out what the remainder would be in the last + * descriptor associated with the fragment. 
+ */ + if (stale_size > I40E_MAX_DATA_PER_TXD) { + int align_pad = -(stale->page_offset) & + (I40E_MAX_READ_REQ_SIZE - 1); + + sum -= align_pad; + stale_size -= align_pad; + + do { + sum -= I40E_MAX_DATA_PER_TXD_ALIGNED; + stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED; + } while (stale_size > I40E_MAX_DATA_PER_TXD); + } + /* if sum is negative we failed to make sufficient progress */ if (sum < 0) return true; @@ -3058,7 +3078,7 @@ bool __i40e_chk_linearize(struct sk_buff *skb) if (!nr_frags--) break; - sum -= skb_frag_size(stale++); + sum -= stale_size; } return false; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 50864f99446d..1ba29bb85b67 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -2012,10 +2012,30 @@ bool __i40evf_chk_linearize(struct sk_buff *skb) /* Walk through fragments adding latest fragment, testing it, and * then removing stale fragments from the sum. */ - stale = &skb_shinfo(skb)->frags[0]; - for (;;) { + for (stale = &skb_shinfo(skb)->frags[0];; stale++) { + int stale_size = skb_frag_size(stale); + sum += skb_frag_size(frag++); + /* The stale fragment may present us with a smaller + * descriptor than the actual fragment size. To account + * for that we need to remove all the data on the front and + * figure out what the remainder would be in the last + * descriptor associated with the fragment. 
+ */ + if (stale_size > I40E_MAX_DATA_PER_TXD) { + int align_pad = -(stale->page_offset) & + (I40E_MAX_READ_REQ_SIZE - 1); + + sum -= align_pad; + stale_size -= align_pad; + + do { + sum -= I40E_MAX_DATA_PER_TXD_ALIGNED; + stale_size -= I40E_MAX_DATA_PER_TXD_ALIGNED; + } while (stale_size > I40E_MAX_DATA_PER_TXD); + } + /* if sum is negative we failed to make sufficient progress */ if (sum < 0) return true; @@ -2023,7 +2043,7 @@ bool __i40evf_chk_linearize(struct sk_buff *skb) if (!nr_frags--) break; - sum -= skb_frag_size(stale++); + sum -= stale_size; } return false; From 458867b2ca0c987445c5d9adccd1642970e1ba07 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 20 Dec 2017 11:04:36 -0500 Subject: [PATCH 052/236] i40e: don't remove netdev->dev_addr when syncing uc list In some circumstances, such as with bridging, it is possible that the stack will add a devices own MAC address to its unicast address list. If, later, the stack deletes this address, then the i40e driver will receive a request to remove this address. The driver stores its current MAC address as part of the MAC/VLAN hash array, since it is convenient and matches exactly how the hardware expects to be told which traffic to receive. This causes a problem, since for more devices, the MAC address is stored separately, and requests to delete a unicast address should not have the ability to remove the filter for the MAC address. Fix this by forcing a check on every address sync to ensure we do not remove the device address. There is a very narrow possibility of a race between .set_mac and .set_rx_mode, if we don't change netdev->dev_addr before updating our internal MAC list in .set_mac. This might be possible if .set_rx_mode is going to remove MAC "XYZ" from the list, at the same time as .set_mac changes our dev_addr to MAC "XYZ", we might possibly queue a delete, then an add in .set_mac, then queue a delete in .set_rx_mode's dev_uc_sync and then update netdev->dev_addr. 
We can avoid this by moving the copy into dev_addr prior to the changes to the MAC filter list. A similar race on the other side does not cause problems, as if we're changing our MAC from A to B, and we race with .set_rx_mode, it could queue a delete from A, we'd update our address, and allow the delete. This seems like a race, but in reality we're about to queue a delete of A anyways, so it would not cause any issues. A race in the initialization code is unlikely because the netdevice has not yet been fully initialized and the stack should not be adding or removing addresses yet. Note that we don't (yet) need similar code for the VF driver because it does not make use of __dev_uc_sync and __dev_mc_sync, but instead rolls its own method for handling updates to the MAC/VLAN list, which already has code to protect against removal of the hardware address. Signed-off-by: Jacob Keller Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index fffd4868defb..9e4b78e447f8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1573,11 +1573,18 @@ static int i40e_set_mac(struct net_device *netdev, void *p) else netdev_info(netdev, "set new mac address %pM\n", addr->sa_data); + /* Copy the address first, so that we avoid a possible race with + * .set_rx_mode(). If we copy after changing the address in the filter + * list, we might open ourselves to a narrow race window where + * .set_rx_mode could delete our dev_addr filter and prevent traffic + * from passing.
+ */ + ether_addr_copy(netdev->dev_addr, addr->sa_data); + spin_lock_bh(&vsi->mac_filter_hash_lock); i40e_del_mac_filter(vsi, netdev->dev_addr); i40e_add_mac_filter(vsi, addr->sa_data); spin_unlock_bh(&vsi->mac_filter_hash_lock); - ether_addr_copy(netdev->dev_addr, addr->sa_data); if (vsi->type == I40E_VSI_MAIN) { i40e_status ret; @@ -1923,6 +1930,14 @@ static int i40e_addr_unsync(struct net_device *netdev, const u8 *addr) struct i40e_netdev_priv *np = netdev_priv(netdev); struct i40e_vsi *vsi = np->vsi; + /* Under some circumstances, we might receive a request to delete + * our own device address from our uc list. Because we store the + * device address in the VSI's MAC/VLAN filter list, we need to ignore + * such requests and not delete our device address from this list. + */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + i40e_del_mac_filter(vsi, addr); return 0; From bc4244c6e33f96b48c4986ce4653df4673c6a08e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 22 Dec 2017 12:45:16 +0100 Subject: [PATCH 053/236] i40e: flower: Fix return value for unsupported offload When filter configuration is not supported, drivers should return -EOPNOTSUPP so the core can react correctly. 
Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Signed-off-by: Jiri Pirko Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 9e4b78e447f8..42dcaefc4c19 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7371,7 +7371,7 @@ static int i40e_configure_clsflower(struct i40e_vsi *vsi, if (tc < 0) { dev_err(&vsi->back->pdev->dev, "Invalid traffic class\n"); - return -EINVAL; + return -EOPNOTSUPP; } if (test_bit(__I40E_RESET_RECOVERY_PENDING, pf->state) || From 15962a18284552b5ec58982ff60a5e92e0c5c92b Mon Sep 17 00:00:00 2001 From: Arjun Vynipadath Date: Wed, 3 Jan 2018 11:44:07 +0530 Subject: [PATCH 054/236] cxgb4: Fix FW flash errors commit 96ac18f14a5a ("cxgb4: Add support for new flash parts") removed initialization of adapter->params.sf_fw_start causing issues while flashing firmware to card. We no longer need sf_fw_start in adapter->params as we already have macros defined for FW flash addresses. Fixes: 96ac18f14a5a ("cxgb4: Add support for new flash parts") Signed-off-by: Arjun Vynipadath Signed-off-by: Casey Leedom Signed-off-by: Ganesh Goudar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 - drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 17 ++++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 6f9fa6e3c42a..d8424ed16c33 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -344,7 +344,6 @@ struct adapter_params { unsigned int sf_size; /* serial flash size in bytes */ unsigned int sf_nsec; /* # of flash sectors */ - unsigned int sf_fw_start; /* start of FW image in flash */ unsigned int fw_vers; /* firmware version */ unsigned int bs_vers; /* bootstrap version */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index f63210f15579..375ef86a84da 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -2844,8 +2844,6 @@ enum { SF_RD_DATA_FAST = 0xb, /* read flash */ SF_RD_ID = 0x9f, /* read ID */ SF_ERASE_SECTOR = 0xd8, /* erase sector */ - - FW_MAX_SIZE = 16 * SF_SEC_SIZE, }; /** @@ -3558,8 +3556,9 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) const __be32 *p = (const __be32 *)fw_data; const struct fw_hdr *hdr = (const struct fw_hdr *)fw_data; unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec; - unsigned int fw_img_start = adap->params.sf_fw_start; - unsigned int fw_start_sec = fw_img_start / sf_sec_size; + unsigned int fw_start_sec = FLASH_FW_START_SEC; + unsigned int fw_size = FLASH_FW_MAX_SIZE; + unsigned int fw_start = FLASH_FW_START; if (!size) { dev_err(adap->pdev_dev, "FW image has no data\n"); @@ -3575,9 +3574,9 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) "FW image size differs from size in FW header\n"); return -EINVAL; } - if (size > FW_MAX_SIZE) { + if (size > fw_size) { dev_err(adap->pdev_dev, "FW image too large, max is %u 
bytes\n", - FW_MAX_SIZE); + fw_size); return -EFBIG; } if (!t4_fw_matches_chip(adap, hdr)) @@ -3604,11 +3603,11 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) */ memcpy(first_page, fw_data, SF_PAGE_SIZE); ((struct fw_hdr *)first_page)->fw_ver = cpu_to_be32(0xffffffff); - ret = t4_write_flash(adap, fw_img_start, SF_PAGE_SIZE, first_page); + ret = t4_write_flash(adap, fw_start, SF_PAGE_SIZE, first_page); if (ret) goto out; - addr = fw_img_start; + addr = fw_start; for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) { addr += SF_PAGE_SIZE; fw_data += SF_PAGE_SIZE; @@ -3618,7 +3617,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) } ret = t4_write_flash(adap, - fw_img_start + offsetof(struct fw_hdr, fw_ver), + fw_start + offsetof(struct fw_hdr, fw_ver), sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver); out: if (ret) From 7853b49ce8e0ef6364d24512b287463841d71bd3 Mon Sep 17 00:00:00 2001 From: Netanel Belgazal Date: Wed, 3 Jan 2018 06:17:29 +0000 Subject: [PATCH 055/236] net: ena: unmask MSI-X only after device initialization is completed Under certain conditions MSI-X interrupt might arrive right after it was unmasked in ena_up(). There is a chance it would be processed by the driver before device ENA_FLAG_DEV_UP flag is set. In such a case the interrupt is ignored. ENA device operates in auto-masked mode, therefore ignoring interrupt leaves it masked for good. Moving unmask of interrupt to be the last step in ena_up(). Signed-off-by: Netanel Belgazal Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 26 ++++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 97c5a89a9cf7..6fb28fd43eb3 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1565,7 +1565,7 @@ static int ena_rss_configure(struct ena_adapter *adapter) static int ena_up_complete(struct ena_adapter *adapter) { - int rc, i; + int rc; rc = ena_rss_configure(adapter); if (rc) @@ -1584,17 +1584,6 @@ static int ena_up_complete(struct ena_adapter *adapter) ena_napi_enable_all(adapter); - /* Enable completion queues interrupt */ - for (i = 0; i < adapter->num_queues; i++) - ena_unmask_interrupt(&adapter->tx_ring[i], - &adapter->rx_ring[i]); - - /* schedule napi in case we had pending packets - * from the last time we disable napi - */ - for (i = 0; i < adapter->num_queues; i++) - napi_schedule(&adapter->ena_napi[i].napi); - return 0; } @@ -1731,7 +1720,7 @@ create_err: static int ena_up(struct ena_adapter *adapter) { - int rc; + int rc, i; netdev_dbg(adapter->netdev, "%s\n", __func__); @@ -1774,6 +1763,17 @@ static int ena_up(struct ena_adapter *adapter) set_bit(ENA_FLAG_DEV_UP, &adapter->flags); + /* Enable completion queues interrupt */ + for (i = 0; i < adapter->num_queues; i++) + ena_unmask_interrupt(&adapter->tx_ring[i], + &adapter->rx_ring[i]); + + /* schedule napi in case we had pending packets + * from the last time we disable napi + */ + for (i = 0; i < adapter->num_queues; i++) + napi_schedule(&adapter->ena_napi[i].napi); + return rc; err_up: From ee4552aaf3fef5345199b8a82e40be7245b289fb Mon Sep 17 00:00:00 2001 From: Netanel Belgazal Date: Wed, 3 Jan 2018 06:17:30 +0000 Subject: [PATCH 056/236] net: ena: fix error handling in ena_down() sequence ENA admin command queue errors are not handled as part of ena_down(). 
As a result, in case of error admin queue transitions to non-running state and aborts all subsequent commands including those coming from ena_up(). Reset scheduled by the driver from the timer service context would not proceed due to sharing rtnl with ena_up()/ena_down() Signed-off-by: Netanel Belgazal Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 6fb28fd43eb3..fbe21a817bd8 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -75,6 +75,9 @@ static struct workqueue_struct *ena_wq; MODULE_DEVICE_TABLE(pci, ena_pci_tbl); static int ena_rss_init_default(struct ena_adapter *adapter); +static void check_for_admin_com_state(struct ena_adapter *adapter); +static void ena_destroy_device(struct ena_adapter *adapter); +static int ena_restore_device(struct ena_adapter *adapter); static void ena_tx_timeout(struct net_device *dev) { @@ -1884,6 +1887,17 @@ static int ena_close(struct net_device *netdev) if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) ena_down(adapter); + /* Check for device status and issue reset if needed*/ + check_for_admin_com_state(adapter); + if (unlikely(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags))) { + netif_err(adapter, ifdown, adapter->netdev, + "Destroy failure, restarting device\n"); + ena_dump_stats_to_dmesg(adapter); + /* rtnl lock already obtained in dev_ioctl() layer */ + ena_destroy_device(adapter); + ena_restore_device(adapter); + } + return 0; } @@ -2544,11 +2558,12 @@ static void ena_destroy_device(struct ena_adapter *adapter) ena_com_set_admin_running_state(ena_dev, false); - ena_close(netdev); + if (test_bit(ENA_FLAG_DEV_UP, &adapter->flags)) + ena_down(adapter); /* Before releasing the ENA resources, a device reset is required. 
* (to prevent the device from accessing them). - * In case the reset flag is set and the device is up, ena_close + * In case the reset flag is set and the device is up, ena_down() * already perform the reset, so it can be skipped. */ if (!(test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags) && dev_up)) From ee4aa8df70fa6d76bd776c025dc0d8d746c18317 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 3 Jan 2018 13:09:23 -0500 Subject: [PATCH 057/236] 3c59x: fix missing dma_mapping_error check and bad ring refill logic A few spots in 3c59x missed calls to dma_mapping_error checks, causing WARN_ONS to trigger. Clean those up. While we're at it, refactor the refill code a bit so that if skb allocation or dma mapping fails, we recycle the existing buffer. This prevents holes in the rx ring, and makes for much simpler logic. Note: This is compile only tested. Ted, if you could run this and confirm that it continues to work properly, I would appreciate it, as I currently don't have access to this hardware. Signed-off-by: Neil Horman CC: Steffen Klassert CC: "David S. Miller" Reported-by: tedheadster@gmail.com Signed-off-by: David S. Miller --- drivers/net/ethernet/3com/3c59x.c | 90 +++++++++++++------------------ 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index f4e13a7014bd..36c8950dbd2d 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -602,7 +602,7 @@ struct vortex_private { struct sk_buff* rx_skbuff[RX_RING_SIZE]; struct sk_buff* tx_skbuff[TX_RING_SIZE]; unsigned int cur_rx, cur_tx; /* The next free ring entry */ - unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ + unsigned int dirty_tx; /* The ring entries to be free()ed. */ struct vortex_extra_stats xstats; /* NIC-specific extra stats */ struct sk_buff *tx_skb; /* Packet being eaten by bus master ctrl.
*/ dma_addr_t tx_skb_dma; /* Allocated DMA address for bus master ctrl DMA. */ @@ -618,7 +618,6 @@ struct vortex_private { /* The remainder are related to chip state, mostly media selection. */ struct timer_list timer; /* Media selection timer. */ - struct timer_list rx_oom_timer; /* Rx skb allocation retry timer */ int options; /* User-settable misc. driver options. */ unsigned int media_override:4, /* Passed-in media type. */ default_media:4, /* Read from the EEPROM/Wn3_Config. */ @@ -760,7 +759,6 @@ static void mdio_sync(struct vortex_private *vp, int bits); static int mdio_read(struct net_device *dev, int phy_id, int location); static void mdio_write(struct net_device *vp, int phy_id, int location, int value); static void vortex_timer(struct timer_list *t); -static void rx_oom_timer(struct timer_list *t); static netdev_tx_t vortex_start_xmit(struct sk_buff *skb, struct net_device *dev); static netdev_tx_t boomerang_start_xmit(struct sk_buff *skb, @@ -1601,7 +1599,6 @@ vortex_up(struct net_device *dev) timer_setup(&vp->timer, vortex_timer, 0); mod_timer(&vp->timer, RUN_AT(media_tbl[dev->if_port].wait)); - timer_setup(&vp->rx_oom_timer, rx_oom_timer, 0); if (vortex_debug > 1) pr_debug("%s: Initial media type %s.\n", @@ -1676,7 +1673,7 @@ vortex_up(struct net_device *dev) window_write16(vp, 0x0040, 4, Wn4_NetDiag); if (vp->full_bus_master_rx) { /* Boomerang bus master. */ - vp->cur_rx = vp->dirty_rx = 0; + vp->cur_rx = 0; /* Initialize the RxEarly register as recommended. */ iowrite16(SetRxThreshold + (1536>>2), ioaddr + EL3_CMD); iowrite32(0x0020, ioaddr + PktStatus); @@ -1729,6 +1726,7 @@ vortex_open(struct net_device *dev) struct vortex_private *vp = netdev_priv(dev); int i; int retval; + dma_addr_t dma; /* Use the now-standard shared IRQ implementation. */ if ((retval = request_irq(dev->irq, vp->full_bus_master_rx ? @@ -1753,7 +1751,11 @@ vortex_open(struct net_device *dev) break; /* Bad news! 
*/ skb_reserve(skb, NET_IP_ALIGN); /* Align IP on 16 byte boundaries */ - vp->rx_ring[i].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE)); + dma = pci_map_single(VORTEX_PCI(vp), skb->data, + PKT_BUF_SZ, PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&VORTEX_PCI(vp)->dev, dma)) + break; + vp->rx_ring[i].addr = cpu_to_le32(dma); } if (i != RX_RING_SIZE) { pr_emerg("%s: no memory for rx ring\n", dev->name); @@ -2067,6 +2069,12 @@ vortex_start_xmit(struct sk_buff *skb, struct net_device *dev) int len = (skb->len + 3) & ~3; vp->tx_skb_dma = pci_map_single(VORTEX_PCI(vp), skb->data, len, PCI_DMA_TODEVICE); + if (dma_mapping_error(&VORTEX_PCI(vp)->dev, vp->tx_skb_dma)) { + dev_kfree_skb_any(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } + spin_lock_irq(&vp->window_lock); window_set(vp, 7); iowrite32(vp->tx_skb_dma, ioaddr + Wn7_MasterAddr); @@ -2593,7 +2601,7 @@ boomerang_rx(struct net_device *dev) int entry = vp->cur_rx % RX_RING_SIZE; void __iomem *ioaddr = vp->ioaddr; int rx_status; - int rx_work_limit = vp->dirty_rx + RX_RING_SIZE - vp->cur_rx; + int rx_work_limit = RX_RING_SIZE; if (vortex_debug > 5) pr_debug("boomerang_rx(): status %4.4x\n", ioread16(ioaddr+EL3_STATUS)); @@ -2614,7 +2622,8 @@ boomerang_rx(struct net_device *dev) } else { /* The packet length: up to 4.5K!. */ int pkt_len = rx_status & 0x1fff; - struct sk_buff *skb; + struct sk_buff *skb, *newskb; + dma_addr_t newdma; dma_addr_t dma = le32_to_cpu(vp->rx_ring[entry].addr); if (vortex_debug > 4) @@ -2633,9 +2642,27 @@ boomerang_rx(struct net_device *dev) pci_dma_sync_single_for_device(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE); vp->rx_copy++; } else { + /* Pre-allocate the replacement skb. 
If it or its + * mapping fails then recycle the buffer thats already + * in place + */ + newskb = netdev_alloc_skb_ip_align(dev, PKT_BUF_SZ); + if (!newskb) { + dev->stats.rx_dropped++; + goto clear_complete; + } + newdma = pci_map_single(VORTEX_PCI(vp), newskb->data, + PKT_BUF_SZ, PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&VORTEX_PCI(vp)->dev, newdma)) { + dev->stats.rx_dropped++; + consume_skb(newskb); + goto clear_complete; + } + /* Pass up the skbuff already on the Rx ring. */ skb = vp->rx_skbuff[entry]; - vp->rx_skbuff[entry] = NULL; + vp->rx_skbuff[entry] = newskb; + vp->rx_ring[entry].addr = cpu_to_le32(newdma); skb_put(skb, pkt_len); pci_unmap_single(VORTEX_PCI(vp), dma, PKT_BUF_SZ, PCI_DMA_FROMDEVICE); vp->rx_nocopy++; @@ -2653,55 +2680,15 @@ boomerang_rx(struct net_device *dev) netif_rx(skb); dev->stats.rx_packets++; } - entry = (++vp->cur_rx) % RX_RING_SIZE; - } - /* Refill the Rx ring buffers. */ - for (; vp->cur_rx - vp->dirty_rx > 0; vp->dirty_rx++) { - struct sk_buff *skb; - entry = vp->dirty_rx % RX_RING_SIZE; - if (vp->rx_skbuff[entry] == NULL) { - skb = netdev_alloc_skb_ip_align(dev, PKT_BUF_SZ); - if (skb == NULL) { - static unsigned long last_jif; - if (time_after(jiffies, last_jif + 10 * HZ)) { - pr_warn("%s: memory shortage\n", - dev->name); - last_jif = jiffies; - } - if ((vp->cur_rx - vp->dirty_rx) == RX_RING_SIZE) - mod_timer(&vp->rx_oom_timer, RUN_AT(HZ * 1)); - break; /* Bad news! */ - } - vp->rx_ring[entry].addr = cpu_to_le32(pci_map_single(VORTEX_PCI(vp), skb->data, PKT_BUF_SZ, PCI_DMA_FROMDEVICE)); - vp->rx_skbuff[entry] = skb; - } +clear_complete: vp->rx_ring[entry].status = 0; /* Clear complete bit. */ iowrite16(UpUnstall, ioaddr + EL3_CMD); + entry = (++vp->cur_rx) % RX_RING_SIZE; } return 0; } -/* - * If we've hit a total OOM refilling the Rx ring we poll once a second - * for some memory. Otherwise there is no way to restart the rx process. 
- */ -static void -rx_oom_timer(struct timer_list *t) -{ - struct vortex_private *vp = from_timer(vp, t, rx_oom_timer); - struct net_device *dev = vp->mii.dev; - - spin_lock_irq(&vp->lock); - if ((vp->cur_rx - vp->dirty_rx) == RX_RING_SIZE) /* This test is redundant, but makes me feel good */ - boomerang_rx(dev); - if (vortex_debug > 1) { - pr_debug("%s: rx_oom_timer %s\n", dev->name, - ((vp->cur_rx - vp->dirty_rx) != RX_RING_SIZE) ? "succeeded" : "retrying"); - } - spin_unlock_irq(&vp->lock); -} - static void vortex_down(struct net_device *dev, int final_down) { @@ -2711,7 +2698,6 @@ vortex_down(struct net_device *dev, int final_down) netdev_reset_queue(dev); netif_stop_queue(dev); - del_timer_sync(&vp->rx_oom_timer); del_timer_sync(&vp->timer); /* Turn off statistics ASAP. We update dev->stats below. */ From ce9caf2f79a5aa170a4b6456a03db639eed9c988 Mon Sep 17 00:00:00 2001 From: Stefan Schake Date: Fri, 29 Dec 2017 17:05:43 +0100 Subject: [PATCH 058/236] drm/vc4: Move IRQ enable to PM path We were calling enable_irq on bind, where it was already enabled previously by the IRQ helper. Additionally, dev->irq is not set correctly until after postinstall and so was always zero here, triggering a warning in 4.15. Fix both by moving the enable to the power management resume path, where we know there was a previous disable invocation during suspend. 
Fixes: 253696ccd613 ("drm/vc4: Account for interrupts in flight") Signed-off-by: Stefan Schake Signed-off-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/1514563543-32511-1-git-send-email-stschake@gmail.com Tested-by: Stefan Wahren Reviewed-by: Eric Anholt --- drivers/gpu/drm/vc4/vc4_irq.c | 3 --- drivers/gpu/drm/vc4/vc4_v3d.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index 26eddbb62893..3dd62d75f531 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -209,9 +209,6 @@ vc4_irq_postinstall(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - /* Undo the effects of a previous vc4_irq_uninstall. */ - enable_irq(dev->irq); - /* Enable both the render done and out of memory interrupts. */ V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS); diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c index 622cd43840b8..493f392b3a0a 100644 --- a/drivers/gpu/drm/vc4/vc4_v3d.c +++ b/drivers/gpu/drm/vc4/vc4_v3d.c @@ -327,6 +327,9 @@ static int vc4_v3d_runtime_resume(struct device *dev) return ret; vc4_v3d_init_hw(vc4->dev); + + /* We disabled the IRQ as part of vc4_irq_uninstall in suspend. */ + enable_irq(vc4->dev->irq); vc4_irq_postinstall(vc4->dev); return 0; From bec40c26041de61162f7be9d2ce548c756ce0f65 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 3 Jan 2018 13:39:15 -0800 Subject: [PATCH 059/236] IB/srpt: Disable RDMA access by the initiator With the SRP protocol all RDMA operations are initiated by the target. Since no RDMA operations are initiated by the initiator, do not grant the initiator permission to submit RDMA reads or writes to the target. 
Signed-off-by: Bart Van Assche Cc: Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 8a1bd354b1cc..7c4249038004 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1013,8 +1013,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) return -ENOMEM; attr->qp_state = IB_QPS_INIT; - attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE; + attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE; attr->port_num = ch->sport->port; attr->pkey_index = 0; From a1ffa4670cb97ae3a4b3e8535d88be5f643f7c3b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 3 Jan 2018 13:39:16 -0800 Subject: [PATCH 060/236] IB/srpt: Fix ACL lookup during login Make sure that the initiator port GUID is stored in ch->ini_guid. Note: when initiating a connection sgid and dgid members in struct sa_path_rec represent the source and destination GIDs. When accepting a connection however sgid represents the destination GID and dgid the source GID. 
Fixes: commit 2bce1a6d2209 ("IB/srpt: Accept GUIDs as port names") Signed-off-by: Bart Van Assche Cc: Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 7c4249038004..bfa576aa9f03 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2077,7 +2077,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - guid = (__be16 *)&param->primary_path->sgid.global.interface_id; + guid = (__be16 *)&param->primary_path->dgid.global.interface_id; snprintf(ch->ini_guid, sizeof(ch->ini_guid), "%04x:%04x:%04x:%04x", be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); From 121d760d0788f95619049c63449d977065cab69d Mon Sep 17 00:00:00 2001 From: Zhi Wang Date: Fri, 29 Dec 2017 02:50:08 +0800 Subject: [PATCH 061/236] drm/i915/gvt: Clear the shadow page table entry after post-sync A shadow page table entry needs to be cleared after being set as post-sync. This patch fixes the recent error reported in Win7-32 test.
Fixes: 2707e4446688 ("drm/i915/gvt: vGPU graphics memory virtualization") Signed-off-by: Zhi Wang CC: Stable Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/gtt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 8e331142badb..64d67ff9bf08 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1359,12 +1359,15 @@ static int ppgtt_handle_guest_write_page_table_bytes(void *gp, return ret; } else { if (!test_bit(index, spt->post_shadow_bitmap)) { + int type = spt->shadow_page.type; + ppgtt_get_shadow_entry(spt, &se, index); ret = ppgtt_handle_guest_entry_removal(gpt, &se, index); if (ret) return ret; + ops->set_pfn(&se, vgpu->gtt.scratch_pt[type].page_mfn); + ppgtt_set_shadow_entry(spt, &se, index); } - ppgtt_set_post_shadow(spt, index); } From 2bd7b4aacdb6efa5ccd4749c365c171b884791d2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2018 23:49:18 +0100 Subject: [PATCH 062/236] mmc: s3mci: mark debug_regs[] as static The global array clashes with a newly added symbol of the same name: drivers/staging/ccree/cc_debugfs.o:(.data+0x0): multiple definition of `debug_regs' drivers/mmc/host/s3cmci.o:(.data+0x70): first defined here We should fix both, this one addresses the s3cmci driver by removing the symbol from the global namespace. While at it, this separates the declaration from the type definition and makes the variable const. 
Fixes: 9bdd203b4dc8 ("s3cmci: add debugfs support for examining driver and hardware state") Fixes: b3ec9a6736f2 ("staging: ccree: staging: ccree: replace sysfs by debugfs interface") Signed-off-by: Arnd Bergmann Signed-off-by: Ulf Hansson --- drivers/mmc/host/s3cmci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index f7f157a62a4a..555c7f133eb8 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -1424,7 +1424,9 @@ static const struct file_operations s3cmci_fops_state = { struct s3cmci_reg { unsigned short addr; unsigned char *name; -} debug_regs[] = { +}; + +static const struct s3cmci_reg debug_regs[] = { DBG_REG(CON), DBG_REG(PRE), DBG_REG(CMDARG), @@ -1446,7 +1448,7 @@ struct s3cmci_reg { static int s3cmci_regs_show(struct seq_file *seq, void *v) { struct s3cmci_host *host = seq->private; - struct s3cmci_reg *rptr = debug_regs; + const struct s3cmci_reg *rptr = debug_regs; for (; rptr->name; rptr++) seq_printf(seq, "SDI%s\t=0x%08x\n", rptr->name, From 3ea15452ee85754f70f3b9fa1f23165ef2e77ba7 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Wed, 3 Jan 2018 11:00:31 +0800 Subject: [PATCH 063/236] nl80211: Check for the required netlink attribute presence nl80211_nan_add_func() does not check if the required attribute NL80211_NAN_FUNC_FOLLOW_UP_DEST is present when processing NL80211_CMD_ADD_NAN_FUNCTION request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attribute presence. 
Signed-off-by: Hao Chen Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 213d0c498c97..2b3dbcd40e46 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11361,7 +11361,8 @@ static int nl80211_nan_add_func(struct sk_buff *skb, break; case NL80211_NAN_FUNC_FOLLOW_UP: if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] || - !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) { + !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] || + !tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]) { err = -EINVAL; goto out; } From 736a80bbfda709fb3631f5f62056f250a38e5804 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Jan 2018 15:51:53 +0100 Subject: [PATCH 064/236] mac80211: mesh: drop frames appearing to be from us If there are multiple mesh stations with the same MAC address, they will both get confused and start throwing warnings. Obviously in this case nothing can actually work anyway, so just drop frames that look like they're from ourselves early on. 
Reported-by: Gui Iribarren Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 70e9d2ca8bbe..4daafb07602f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3632,6 +3632,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) } return true; case NL80211_IFTYPE_MESH_POINT: + if (ether_addr_equal(sdata->vif.addr, hdr->addr2)) + return false; if (multicast) return true; return ether_addr_equal(sdata->vif.addr, hdr->addr1); From d14ac576d10f865970bb1324d337e5e24d79aaf4 Mon Sep 17 00:00:00 2001 From: Christian Holl Date: Wed, 3 Jan 2018 19:53:02 +0100 Subject: [PATCH 065/236] USB: serial: cp210x: add new device ID ELV ALC 8xxx This adds the ELV ALC 8xxx Battery Charging device to the list of USB IDs of drivers/usb/serial/cp210x.c Signed-off-by: Christian Holl Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 38814225a816..06d502b3e913 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -175,6 +175,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */ { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */ { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */ + { USB_DEVICE(0x18EF, 0xE030) }, /* ELV ALC 8xxx Battery Charger */ { USB_DEVICE(0x18EF, 0xE032) }, /* ELV TFD500 Data Logger */ { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */ { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */ From 54e98b5d663fcd8e3279c2391537b1a1f7bfe344 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 3 Jan 2018 22:02:29 -0800 Subject: [PATCH 066/236] net: dsa: b53: Turn off Broadcom tags for more switches Models such as BCM5395/97/98 and BCM53125/24/53115 and compatible require that we turn on 
managed mode to actually act on Broadcom tags, otherwise they just pass them through on ingress (host -> switch) and don't insert them in egress (switch -> host). Turning on managed mode is simple, but requires us to properly support ARL misses on multicast addresses which is a much more involved set of changes not suitable for a bug fix for this release. Reported-by: Jochen Friedrich Fixes: 7edc58d614d4 ("net: dsa: b53: Turn on Broadcom tags") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index f5a8dd96fd75..4498ab897d94 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1500,10 +1500,13 @@ static enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, { struct b53_device *dev = ds->priv; - /* Older models support a different tag format that we do not - * support in net/dsa/tag_brcm.c yet. + /* Older models (5325, 5365) support a different tag format that we do + * not support in net/dsa/tag_brcm.c yet. 539x and 531x5 require managed + * mode to be turned on which means we need to specifically manage ARL + * misses on multicast addresses (TBD). */ - if (is5325(dev) || is5365(dev) || !b53_can_enable_brcm_tags(ds, port)) + if (is5325(dev) || is5365(dev) || is539x(dev) || is531x5(dev) || + !b53_can_enable_brcm_tags(ds, port)) return DSA_TAG_PROTO_NONE; /* Broadcom BCM58xx chips have a flow accelerator on Port 8 From b4c2951a4833e66f1bbfe65ddcd4fdcdfafe5e8f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 2 Dec 2017 18:48:52 +0100 Subject: [PATCH 067/236] can: vxcan: improve handling of missing peer name attribute Picking up the patch from Serhey Popovych (commit 191cdb3822e5df6b3c8, "veth: Be more robust on network device creation when no attributes"). 
When the peer name attribute is not provided the former implementation tries to register the given device name twice ... which leads to -EEXIST. If only one device name is given, apply an automatically generated, valid name for the peer. Cc: Serhey Popovych Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/vxcan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 8404e8852a0f..b4c4a2c76437 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -194,7 +194,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, tbp = peer_tb; } - if (tbp[IFLA_IFNAME]) { + if (ifmp && tbp[IFLA_IFNAME]) { nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { From d5b42e6607661b198d8b26a0c30969605b1bf5c7 Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Wed, 13 Dec 2017 19:52:23 +0100 Subject: [PATCH 068/236] can: gs_usb: fix return value of the "set_bittiming" callback The "set_bittiming" callback treats a positive return value as error! For that reason "can_changelink()" will quit silently after setting the bittiming values without processing ctrlmode, restart-ms, etc. Signed-off-by: Wolfgang Grandegger Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 68ac3e88a8ce..8bf80ad9dc44 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -449,7 +449,7 @@ static int gs_usb_set_bittiming(struct net_device *netdev) dev_err(netdev->dev.parent, "Couldn't set bittimings (err=%d)", rc); - return rc; + return (rc > 0) ? 
0 : rc; } static void gs_usb_xmit_callback(struct urb *urb) From 13454c14550065fcc1705d6bd4ee6d40e057099f Mon Sep 17 00:00:00 2001 From: Luu An Phu Date: Tue, 2 Jan 2018 10:44:18 +0700 Subject: [PATCH 069/236] can: flex_can: Correct the checking for frame length in flexcan_start_xmit() The flexcan_start_xmit() function compares the frame length with data register length to write frame content into data[0] and data[1] register. Data register length is 4 bytes and frame maximum length is 8 bytes. Fix the check that compares the frame length with 3: because the data register length is 4 bytes, the frame length must be compared with 4. Signed-off-by: Luu An Phu Reviewed-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- drivers/net/can/flexcan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 0626dcfd1f3d..760d2c07e3a2 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -526,7 +526,7 @@ static int flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev) data = be32_to_cpup((__be32 *)&cf->data[0]); flexcan_write(data, &priv->tx_mb->data[0]); } - if (cf->can_dlc > 3) { + if (cf->can_dlc > 4) { data = be32_to_cpup((__be32 *)&cf->data[4]); flexcan_write(data, &priv->tx_mb->data[1]); } From 6ebc5e8fe85286c7392f1777a3dba9e1fd6d0253 Mon Sep 17 00:00:00 2001 From: Martin Lederhilger Date: Thu, 21 Dec 2017 14:42:44 +0000 Subject: [PATCH 070/236] can: ems_usb: improve error reporting for error warning and error passive This patch adds the missing CAN_ERR_CRTL to cf->can_id in case of CAN_STATE_ERROR_WARNING or CAN_STATE_ERROR_PASSIVE Signed-off-by: Martin Lederhilger Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/ems_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index b00358297424..12ff0020ecd6 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -395,6 +395,7 @@ static void ems_usb_rx_err(struct ems_usb *dev, 
struct ems_cpc_msg *msg) if (dev->can.state == CAN_STATE_ERROR_WARNING || dev->can.state == CAN_STATE_ERROR_PASSIVE) { + cf->can_id |= CAN_ERR_CRTL; cf->data[1] = (txerr > rxerr) ? CAN_ERR_CRTL_TX_PASSIVE : CAN_ERR_CRTL_RX_PASSIVE; } From 06e7e776ca4d36547e503279aeff996cbb292c16 Mon Sep 17 00:00:00 2001 From: Ben Seri Date: Fri, 8 Dec 2017 15:14:47 +0100 Subject: [PATCH 071/236] Bluetooth: Prevent stack info leak from the EFS element. In the function l2cap_parse_conf_rsp and in the function l2cap_parse_conf_req the following variable is declared without initialization: struct l2cap_conf_efs efs; In addition, when parsing input configuration parameters in both of these functions, the switch case for handling EFS elements may skip the memcpy call that will write to the efs variable: ... case L2CAP_CONF_EFS: if (olen == sizeof(efs)) memcpy(&efs, (void *)val, olen); ... The olen in the above if is attacker controlled, and regardless of that if, in both of these functions the efs variable would eventually be added to the outgoing configuration request that is being built: l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), (unsigned long) &efs); So by sending a configuration request, or response, that contains an L2CAP_CONF_EFS element, but with an element length that is not sizeof(efs) - the memcpy to the uninitialized efs variable can be avoided, and the uninitialized variable would be returned to the attacker (16 bytes). 
This issue has been assigned CVE-2017-1000410 Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Cc: stable Signed-off-by: Ben Seri Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/l2cap_core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 43ba91c440bc..fc6615d59165 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3363,9 +3363,10 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data break; case L2CAP_CONF_EFS: - remote_efs = 1; - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { + remote_efs = 1; memcpy(&efs, (void *) val, olen); + } break; case L2CAP_CONF_EWS: @@ -3584,16 +3585,17 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, break; case L2CAP_CONF_EFS: - if (olen == sizeof(efs)) + if (olen == sizeof(efs)) { memcpy(&efs, (void *)val, olen); - if (chan->local_stype != L2CAP_SERV_NOTRAFIC && - efs.stype != L2CAP_SERV_NOTRAFIC && - efs.stype != chan->local_stype) - return -ECONNREFUSED; + if (chan->local_stype != L2CAP_SERV_NOTRAFIC && + efs.stype != L2CAP_SERV_NOTRAFIC && + efs.stype != chan->local_stype) + return -ECONNREFUSED; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs, endptr - ptr); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), + (unsigned long) &efs, endptr - ptr); + } break; case L2CAP_CONF_FCS: From b78d830f0049ef1966dc1e0ebd1ec2a594e2cf25 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 22 Dec 2017 19:23:46 -0700 Subject: [PATCH 072/236] usbip: fix vudc_rx: harden CMD_SUBMIT path to handle malicious input Harden CMD_SUBMIT path to handle malicious input that could trigger large memory allocations. Add checks to validate transfer_buffer_length and number_of_packets to protect against bad input requesting for unbounded memory allocations. 
Signed-off-by: Shuah Khan Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vudc_rx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/usb/usbip/vudc_rx.c b/drivers/usb/usbip/vudc_rx.c index df1e30989148..1e8a23d92cb4 100644 --- a/drivers/usb/usbip/vudc_rx.c +++ b/drivers/usb/usbip/vudc_rx.c @@ -120,6 +120,25 @@ static int v_recv_cmd_submit(struct vudc *udc, urb_p->new = 1; urb_p->seqnum = pdu->base.seqnum; + if (urb_p->ep->type == USB_ENDPOINT_XFER_ISOC) { + /* validate packet size and number of packets */ + unsigned int maxp, packets, bytes; + + maxp = usb_endpoint_maxp(urb_p->ep->desc); + maxp *= usb_endpoint_maxp_mult(urb_p->ep->desc); + bytes = pdu->u.cmd_submit.transfer_buffer_length; + packets = DIV_ROUND_UP(bytes, maxp); + + if (pdu->u.cmd_submit.number_of_packets < 0 || + pdu->u.cmd_submit.number_of_packets > packets) { + dev_err(&udc->gadget.dev, + "CMD_SUBMIT: isoc invalid num packets %d\n", + pdu->u.cmd_submit.number_of_packets); + ret = -EMSGSIZE; + goto free_urbp; + } + } + ret = alloc_urb_from_cmd(&urb_p->urb, pdu, urb_p->ep->type); if (ret) { usbip_event_add(&udc->ud, VUDC_EVENT_ERROR_MALLOC); From e1346fd87c71a1f61de1fe476ec8df1425ac931c Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 22 Dec 2017 17:00:06 -0700 Subject: [PATCH 073/236] usbip: remove kernel addresses from usb device and urb debug msgs usbip_dump_usb_device() and usbip_dump_urb() print kernel addresses. Remove kernel addresses from usb device and urb debug msgs and improve the message content. Instead of printing parent device and bus addresses, print parent device and bus names. 
Signed-off-by: Shuah Khan Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_common.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 7b219d9109b4..ee2bbce24584 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -91,7 +91,7 @@ static void usbip_dump_usb_device(struct usb_device *udev) dev_dbg(dev, " devnum(%d) devpath(%s) usb speed(%s)", udev->devnum, udev->devpath, usb_speed_string(udev->speed)); - pr_debug("tt %p, ttport %d\n", udev->tt, udev->ttport); + pr_debug("tt hub ttport %d\n", udev->ttport); dev_dbg(dev, " "); for (i = 0; i < 16; i++) @@ -124,12 +124,8 @@ static void usbip_dump_usb_device(struct usb_device *udev) } pr_debug("\n"); - dev_dbg(dev, "parent %p, bus %p\n", udev->parent, udev->bus); - - dev_dbg(dev, - "descriptor %p, config %p, actconfig %p, rawdescriptors %p\n", - &udev->descriptor, udev->config, - udev->actconfig, udev->rawdescriptors); + dev_dbg(dev, "parent %s, bus %s\n", dev_name(&udev->parent->dev), + udev->bus->bus_name); dev_dbg(dev, "have_langid %d, string_langid %d\n", udev->have_langid, udev->string_langid); @@ -237,9 +233,6 @@ void usbip_dump_urb(struct urb *urb) dev = &urb->dev->dev; - dev_dbg(dev, " urb :%p\n", urb); - dev_dbg(dev, " dev :%p\n", urb->dev); - usbip_dump_usb_device(urb->dev); dev_dbg(dev, " pipe :%08x ", urb->pipe); @@ -248,11 +241,9 @@ void usbip_dump_urb(struct urb *urb) dev_dbg(dev, " status :%d\n", urb->status); dev_dbg(dev, " transfer_flags :%08X\n", urb->transfer_flags); - dev_dbg(dev, " transfer_buffer :%p\n", urb->transfer_buffer); dev_dbg(dev, " transfer_buffer_length:%d\n", urb->transfer_buffer_length); dev_dbg(dev, " actual_length :%d\n", urb->actual_length); - dev_dbg(dev, " setup_packet :%p\n", urb->setup_packet); if (urb->setup_packet && usb_pipetype(urb->pipe) == PIPE_CONTROL) usbip_dump_usb_ctrlrequest( @@ -262,8 +253,6 @@ void 
usbip_dump_urb(struct urb *urb) dev_dbg(dev, " number_of_packets :%d\n", urb->number_of_packets); dev_dbg(dev, " interval :%d\n", urb->interval); dev_dbg(dev, " error_count :%d\n", urb->error_count); - dev_dbg(dev, " context :%p\n", urb->context); - dev_dbg(dev, " complete :%p\n", urb->complete); } EXPORT_SYMBOL_GPL(usbip_dump_urb); From 5fd77a3a0e408c23ab4002a57db980e46bc16e72 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 22 Dec 2017 19:23:47 -0700 Subject: [PATCH 074/236] usbip: vudc_tx: fix v_send_ret_submit() vulnerability to null xfer buffer v_send_ret_submit() handles urb with a null transfer_buffer, when it replays a packet with potential malicious data that could contain a null buffer. Add a check for the condition when actual_length > 0 and transfer_buffer is null. Signed-off-by: Shuah Khan Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vudc_tx.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/usb/usbip/vudc_tx.c b/drivers/usb/usbip/vudc_tx.c index 1440ae0919ec..3ccb17c3e840 100644 --- a/drivers/usb/usbip/vudc_tx.c +++ b/drivers/usb/usbip/vudc_tx.c @@ -85,6 +85,13 @@ static int v_send_ret_submit(struct vudc *udc, struct urbp *urb_p) memset(&pdu_header, 0, sizeof(pdu_header)); memset(&msg, 0, sizeof(msg)); + if (urb->actual_length > 0 && !urb->transfer_buffer) { + dev_err(&udc->gadget.dev, + "urb: actual_length %d transfer_buffer null\n", + urb->actual_length); + return -1; + } + if (urb_p->type == USB_ENDPOINT_XFER_ISOC) iovnum = 2 + urb->number_of_packets; else @@ -100,8 +107,8 @@ static int v_send_ret_submit(struct vudc *udc, struct urbp *urb_p) /* 1. 
setup usbip_header */ setup_ret_submit_pdu(&pdu_header, urb_p); - usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n", - pdu_header.base.seqnum, urb); + usbip_dbg_stub_tx("setup txdata seqnum: %d\n", + pdu_header.base.seqnum); usbip_header_correct_endian(&pdu_header, 1); iov[iovnum].iov_base = &pdu_header; From 0856655a25476d4431005e39d606e349050066b0 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Mon, 11 Dec 2017 09:52:22 +0100 Subject: [PATCH 075/236] wcn36xx: Fix dynamic power saving Since driver does not report hardware dynamic power saving cap, this is up to the mac80211 to manage power saving timeout and state machine, using the ieee80211 config callback to report PS changes. This patch enables/disables PS mode according to the new configuration. Remove old behaviour enabling PS mode in a static way, this make the device unusable when power save is enabled since device is forced to PS regardless RX/TX traffic. Acked-by: Bjorn Andersson Signed-off-by: Loic Poulain Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/wcn36xx/main.c | 23 ++++++++++++----------- drivers/net/wireless/ath/wcn36xx/pmc.c | 6 ++++-- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index f7d228b5ba93..987f1252a3cf 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -384,6 +384,18 @@ static int wcn36xx_config(struct ieee80211_hw *hw, u32 changed) } } + if (changed & IEEE80211_CONF_CHANGE_PS) { + list_for_each_entry(tmp, &wcn->vif_list, list) { + vif = wcn36xx_priv_to_vif(tmp); + if (hw->conf.flags & IEEE80211_CONF_PS) { + if (vif->bss_conf.ps) /* ps allowed ? 
*/ + wcn36xx_pmc_enter_bmps_state(wcn, vif); + } else { + wcn36xx_pmc_exit_bmps_state(wcn, vif); + } + } + } + mutex_unlock(&wcn->conf_mutex); return 0; @@ -747,17 +759,6 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw, vif_priv->dtim_period = bss_conf->dtim_period; } - if (changed & BSS_CHANGED_PS) { - wcn36xx_dbg(WCN36XX_DBG_MAC, - "mac bss PS set %d\n", - bss_conf->ps); - if (bss_conf->ps) { - wcn36xx_pmc_enter_bmps_state(wcn, vif); - } else { - wcn36xx_pmc_exit_bmps_state(wcn, vif); - } - } - if (changed & BSS_CHANGED_BSSID) { wcn36xx_dbg(WCN36XX_DBG_MAC, "mac bss changed_bssid %pM\n", bss_conf->bssid); diff --git a/drivers/net/wireless/ath/wcn36xx/pmc.c b/drivers/net/wireless/ath/wcn36xx/pmc.c index 589fe5f70971..1976b80c235f 100644 --- a/drivers/net/wireless/ath/wcn36xx/pmc.c +++ b/drivers/net/wireless/ath/wcn36xx/pmc.c @@ -45,8 +45,10 @@ int wcn36xx_pmc_exit_bmps_state(struct wcn36xx *wcn, struct wcn36xx_vif *vif_priv = wcn36xx_vif_to_priv(vif); if (WCN36XX_BMPS != vif_priv->pw_state) { - wcn36xx_err("Not in BMPS mode, no need to exit from BMPS mode!\n"); - return -EINVAL; + /* Unbalanced call or last BMPS enter failed */ + wcn36xx_dbg(WCN36XX_DBG_PMC, + "Not in BMPS mode, no need to exit\n"); + return -EALREADY; } wcn36xx_smd_exit_bmps(wcn, vif); vif_priv->pw_state = WCN36XX_FULL_POWER; From fb32dd3abf7a8fc13271d0d1c45ffc66df28dd15 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 2 Jan 2018 20:14:42 -0800 Subject: [PATCH 076/236] MAINTAINERS: Update my email address. Signed-off-by: Pravin Shelar Signed-off-by: David S. 
Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a6e86e20761e..1e6872b4c6e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10137,7 +10137,7 @@ F: drivers/irqchip/irq-ompic.c F: drivers/irqchip/irq-or1k-* OPENVSWITCH -M: Pravin Shelar +M: Pravin B Shelar L: netdev@vger.kernel.org L: dev@openvswitch.org W: http://openvswitch.org From f428fe4a04cc339166c8bbd489789760de3a0cee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 2 Jan 2018 23:27:33 -0800 Subject: [PATCH 077/236] rtnetlink: give a user socket to get_target_net() This function is used from two places: rtnl_dump_ifinfo and rtnl_getlink. In rtnl_getlink(), we give a request skb into get_target_net(), but in rtnl_dump_ifinfo, we give a response skb into get_target_net(). The problem here is that NETLINK_CB() isn't initialized for the response skb. In both cases we can get a user socket and give it instead of skb into get_target_net(). This bug was found by syzkaller with this call-trace: kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 1 PID: 3149 Comm: syzkaller140561 Not tainted 4.15.0-rc4-mm1+ #47 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__netlink_ns_capable+0x8b/0x120 net/netlink/af_netlink.c:868 RSP: 0018:ffff8801c880f348 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8443f900 RDX: 000000000000007b RSI: ffffffff86510f40 RDI: 00000000000003d8 RBP: ffff8801c880f360 R08: 0000000000000000 R09: 1ffff10039101e4f R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff86510f40 R13: 000000000000000c R14: 0000000000000004 R15: 0000000000000011 FS: 0000000001a1a880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020151000 CR3: 00000001c9511005 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 
0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: netlink_ns_capable+0x26/0x30 net/netlink/af_netlink.c:886 get_target_net+0x9d/0x120 net/core/rtnetlink.c:1765 rtnl_dump_ifinfo+0x2e5/0xee0 net/core/rtnetlink.c:1806 netlink_dump+0x48c/0xce0 net/netlink/af_netlink.c:2222 __netlink_dump_start+0x4f0/0x6d0 net/netlink/af_netlink.c:2319 netlink_dump_start include/linux/netlink.h:214 [inline] rtnetlink_rcv_msg+0x7f0/0xb10 net/core/rtnetlink.c:4485 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2441 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4540 netlink_unicast_kernel net/netlink/af_netlink.c:1308 [inline] netlink_unicast+0x4be/0x6a0 net/netlink/af_netlink.c:1334 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1897 Cc: Jiri Benc Fixes: 79e1ad148c84 ("rtnetlink: use netnsid to query interface") Signed-off-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dabba2a91fc8..778d7f03404a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1681,18 +1681,18 @@ static bool link_dump_filtered(struct net_device *dev, return false; } -static struct net *get_target_net(struct sk_buff *skb, int netnsid) +static struct net *get_target_net(struct sock *sk, int netnsid) { struct net *net; - net = get_net_ns_by_id(sock_net(skb->sk), netnsid); + net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. 
*/ - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } @@ -1733,7 +1733,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) ifla_policy, NULL) >= 0) { if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb, netnsid); + tgt_net = get_target_net(skb->sk, netnsid); if (IS_ERR(tgt_net)) { tgt_net = net; netnsid = -1; @@ -2883,7 +2883,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); - tgt_net = get_target_net(skb, netnsid); + tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } From 879626e3a52630316d817cbda7cec9a5446d1d82 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 3 Jan 2018 16:46:29 +0100 Subject: [PATCH 078/236] net: stmmac: enable EEE in MII, GMII or RGMII only Note in the databook - Section 4.4 - EEE : " The EEE feature is not supported when the MAC is configured to use the TBI, RTBI, SMII, RMII or SGMII single PHY interface. Even if the MAC supports multiple PHY interfaces, you should activate the EEE mode only when the MAC is operating with GMII, MII, or RGMII interface." Applying this restriction solves a stability issue observed on Amlogic gxl platforms operating with RMII interface and the internal PHY. Fixes: 83bf79b6bb64 ("stmmac: disable at run-time the EEE if not supported") Signed-off-by: Jerome Brunet Tested-by: Arnaud Patard Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 337d53d12e94..c0af0bc4e714 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -364,9 +364,15 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t) bool stmmac_eee_init(struct stmmac_priv *priv) { struct net_device *ndev = priv->dev; + int interface = priv->plat->interface; unsigned long flags; bool ret = false; + if ((interface != PHY_INTERFACE_MODE_MII) && + (interface != PHY_INTERFACE_MODE_GMII) && + !phy_interface_mode_is_rgmii(interface)) + goto out; + /* Using PCS we cannot dial with the phy registers at this stage * so we do not support extra feature like EEE. */ From dfe8266b8dd10e12a731c985b725fcf7f0e537f0 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 3 Jan 2018 20:09:49 +0300 Subject: [PATCH 079/236] sh_eth: fix TSU resource handling When switching the driver to the managed device API, I managed to break the case of dual Ether devices sharing a single TSU: the 2nd Ether port wouldn't probe. Iwamatsu-san has tried to fix this but his patch was buggy and he then dropped the ball... The solution is to limit calling devm_request_mem_region() to the first of the two ports sharing the same TSU, so devm_ioremap_resource() can't be used anymore for the TSU resource... Fixes: d5e07e69218f ("sh_eth: use managed device API") Reported-by: Nobuhiro Iwamatsu Signed-off-by: Sergei Shtylyov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/renesas/sh_eth.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 75323000c364..1bdd67a8a869 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3225,10 +3225,29 @@ static int sh_eth_drv_probe(struct platform_device *pdev) /* ioremap the TSU registers */ if (mdp->cd->tsu) { struct resource *rtsu; + rtsu = platform_get_resource(pdev, IORESOURCE_MEM, 1); - mdp->tsu_addr = devm_ioremap_resource(&pdev->dev, rtsu); - if (IS_ERR(mdp->tsu_addr)) { - ret = PTR_ERR(mdp->tsu_addr); + if (!rtsu) { + dev_err(&pdev->dev, "no TSU resource\n"); + ret = -ENODEV; + goto out_release; + } + /* We can only request the TSU region for the first port + * of the two sharing this TSU for the probe to succeed... + */ + if (devno % 2 == 0 && + !devm_request_mem_region(&pdev->dev, rtsu->start, + resource_size(rtsu), + dev_name(&pdev->dev))) { + dev_err(&pdev->dev, "can't request TSU resource.\n"); + ret = -EBUSY; + goto out_release; + } + mdp->tsu_addr = devm_ioremap(&pdev->dev, rtsu->start, + resource_size(rtsu)); + if (!mdp->tsu_addr) { + dev_err(&pdev->dev, "TSU region ioremap() failed.\n"); + ret = -ENOMEM; goto out_release; } mdp->port = devno % 2; From 7d11f77f84b27cef452cee332f4e469503084737 Mon Sep 17 00:00:00 2001 From: Mohamed Ghannam Date: Wed, 3 Jan 2018 21:06:06 +0000 Subject: [PATCH 080/236] RDS: null pointer dereference in rds_atomic_free_op set rm->atomic.op_active to 0 when rds_pin_pages() fails or the user supplied address is invalid, this prevents a NULL pointer usage in rds_atomic_free_op() Signed-off-by: Mohamed Ghannam Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 94729d9da437..634cfcb7bba6 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -877,6 +877,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, err: if (page) put_page(page); + rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; From 7bbfe00e025240505db3e04c3b296d7c023b2a26 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 3 Jan 2018 14:11:59 -0800 Subject: [PATCH 081/236] ipv6: fix general protection fault in fib6_add() In fib6_add(), pn could be NULL if fib6_add_1() failed to return a fib6 node. Checking pn != fn before accessing pn->leaf makes sure pn is not NULL. This fixes the following GPF reported by syzkaller: general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 3201 Comm: syzkaller001778 Not tainted 4.15.0-rc5+ #151 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:fib6_add+0x736/0x15a0 net/ipv6/ip6_fib.c:1244 RSP: 0018:ffff8801c7626a70 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000020 RCX: ffffffff84794465 RDX: 0000000000000004 RSI: ffff8801d38935f0 RDI: 0000000000000282 RBP: ffff8801c7626da0 R08: 1ffff10038ec4c35 R09: 0000000000000000 R10: ffff8801c7626c68 R11: 0000000000000000 R12: 00000000fffffffe R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000009 FS: 0000000000000000(0000) GS:ffff8801db200000(0063) knlGS:0000000009b70840 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 0000000020be1000 CR3: 00000001d585a006 CR4: 00000000001606f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1006 ip6_route_multipath_add+0xd14/0x16c0 net/ipv6/route.c:3833 inet6_rtm_newroute+0xdc/0x160 net/ipv6/route.c:3957 rtnetlink_rcv_msg+0x733/0x1020 
net/core/rtnetlink.c:4411 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2408 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4423 netlink_unicast_kernel net/netlink/af_netlink.c:1275 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1301 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1864 sock_sendmsg_nosec net/socket.c:636 [inline] sock_sendmsg+0xca/0x110 net/socket.c:646 sock_write_iter+0x31a/0x5d0 net/socket.c:915 call_write_iter include/linux/fs.h:1772 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 compat_writev+0x225/0x420 fs/read_write.c:1246 do_compat_writev+0x115/0x220 fs/read_write.c:1267 C_SYSC_writev fs/read_write.c:1278 [inline] compat_SyS_writev+0x26/0x30 fs/read_write.c:1274 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:125 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f5285f4e1d08..d11a5578e4f8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1241,23 +1241,28 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. 
*/ - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, - lockdep_is_held(&table->tb6_lock)); - if (pn != fn && pn_leaf == rt) { - pn_leaf = NULL; - RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn_leaf = fib6_find_prefix(info->nl_net, table, pn); -#if RT6_DEBUG >= 2 - if (!pn_leaf) { - WARN_ON(!pn_leaf); - pn_leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct rt6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + atomic_dec(&rt->rt6i_ref); } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); +#if RT6_DEBUG >= 2 + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = + info->nl_net->ipv6.ip6_null_entry; + } #endif - atomic_inc(&pn_leaf->rt6i_ref); - rcu_assign_pointer(pn->leaf, pn_leaf); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); + } } #endif goto failure; From 6926e041a8920c8ec27e4e155efa760aa01551fd Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Wed, 3 Jan 2018 23:14:21 +0100 Subject: [PATCH 082/236] uapi/if_ether.h: prevent redefinition of struct ethhdr Musl provides its own ethhdr struct definition. Add a guard to prevent its definition if the appropriate musl header has already been included. glibc does not implement this header, but when glibc will implement this they can just define __UAPI_DEF_ETHHDR 0 to make it work with the kernel. Signed-off-by: Hauke Mehrtens Signed-off-by: David S. 
Miller --- include/uapi/linux/if_ether.h | 3 +++ include/uapi/linux/libc-compat.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 3ee3bf7c8526..144de4d2f385 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -23,6 +23,7 @@ #define _UAPI_LINUX_IF_ETHER_H #include +#include /* * IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble @@ -149,11 +150,13 @@ * This is an Ethernet frame header. */ +#if __UAPI_DEF_ETHHDR struct ethhdr { unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ unsigned char h_source[ETH_ALEN]; /* source ether addr */ __be16 h_proto; /* packet type ID field */ } __attribute__((packed)); +#endif #endif /* _UAPI_LINUX_IF_ETHER_H */ diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index 8254c937c9f4..fc29efaa918c 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -264,4 +264,10 @@ #endif /* __GLIBC__ */ +/* Definitions for if_ether.h */ +/* allow libcs like musl to deactivate this, glibc does not implement this. */ +#ifndef __UAPI_DEF_ETHHDR +#define __UAPI_DEF_ETHHDR 1 +#endif + #endif /* _UAPI_LIBC_COMPAT_H */ From 9a00674213a3f00394f4e3221b88f2d21fc05789 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 14:30:19 -0600 Subject: [PATCH 083/236] crypto: algapi - fix NULL dereference in crypto_remove_spawns() syzkaller triggered a NULL pointer dereference in crypto_remove_spawns() via a program that repeatedly and concurrently requests AEADs "authenc(cmac(des3_ede-asm),pcbc-aes-aesni)" and hashes "cmac(des3_ede)" through AF_ALG, where the hashes are requested as "untested" (CRYPTO_ALG_TESTED is set in ->salg_mask but clear in ->salg_feat; this causes the template to be instantiated for every request). 
Although AF_ALG users really shouldn't be able to request an "untested" algorithm, the NULL pointer dereference is actually caused by a longstanding race condition where crypto_remove_spawns() can encounter an instance which has had spawn(s) "grabbed" but hasn't yet been registered, resulting in ->cra_users still being NULL. We probably should properly initialize ->cra_users earlier, but that would require updating many templates individually. For now just fix the bug in a simple way that can easily be backported: make crypto_remove_spawns() treat a NULL ->cra_users list as empty. Reported-by: syzbot Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/algapi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/crypto/algapi.c b/crypto/algapi.c index 60d7366ed343..9a636f961572 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -167,6 +167,18 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, spawn->alg = NULL; spawns = &inst->alg.cra_users; + + /* + * We may encounter an unregistered instance here, since + * an instance's spawns are set up prior to the instance + * being registered. An unregistered instance will have + * NULL ->cra_users.next, since ->cra_users isn't + * properly initialized until registration. But an + * unregistered instance cannot have any users, so treat + * it the same as ->cra_users being empty. + */ + if (spawns->next == NULL) + break; } } while ((spawns = crypto_more_spawns(alg, &stack, &top, &secondary_spawns))); From 943309d4aad6732b905f3f500e6e17e33c211494 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 4 Jan 2018 09:19:13 +0200 Subject: [PATCH 084/236] iwlwifi: pcie: fix DMA memory mapping / unmapping 22000 devices (previously referenced as A000) can support short transmit queues. This means that we have less DMA descriptors (TFD) for those shorter queues. 
Previous devices must still have 256 TFDs for each queue even if those 256 TFDs point to fewer buffers. When I introduced support for the short queues for 22000 I broke older devices by assuming that they can also have less TFDs in their queues. This led to several problems: 1) the payload of the commands weren't unmapped properly which caused the SWIOTLB to complain at some point. 2) the hardware could get confused and we get hardware crashes. The corresponding bugzilla entries are: https://bugzilla.kernel.org/show_bug.cgi?id=198201 https://bugzilla.kernel.org/show_bug.cgi?id=198265 Cc: stable@vger.kernel.org # 4.14+ Fixes: 4ecab5616023 ("iwlwifi: pcie: support short Tx queues for A000 device family") Reviewed-by: Sharon, Sara Signed-off-by: Emmanuel Grumbach Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 10 +++++++--- drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 11 +++-------- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 8 ++++---- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index d749abeca3ae..403e65c309d0 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -670,11 +670,15 @@ static inline u8 iwl_pcie_get_cmd_index(struct iwl_txq *q, u32 index) return index & (q->n_window - 1); } -static inline void *iwl_pcie_get_tfd(struct iwl_trans_pcie *trans_pcie, +static inline void *iwl_pcie_get_tfd(struct iwl_trans *trans, struct iwl_txq *txq, int idx) { - return txq->tfds + trans_pcie->tfd_size * iwl_pcie_get_cmd_index(txq, - idx); + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); + + if (trans->cfg->use_tfh) + idx = iwl_pcie_get_cmd_index(txq, idx); + + return txq->tfds + trans_pcie->tfd_size * idx; } static inline void iwl_enable_rfkill_int(struct iwl_trans *trans) diff --git 
a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 16b345f54ff0..6d0a907d5ba5 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -171,8 +171,6 @@ static void iwl_pcie_gen2_tfd_unmap(struct iwl_trans *trans, static void iwl_pcie_gen2_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - /* rd_ptr is bounded by TFD_QUEUE_SIZE_MAX and * idx is bounded by n_window */ @@ -181,7 +179,7 @@ static void iwl_pcie_gen2_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq) lockdep_assert_held(&txq->lock); iwl_pcie_gen2_tfd_unmap(trans, &txq->entries[idx].meta, - iwl_pcie_get_tfd(trans_pcie, txq, idx)); + iwl_pcie_get_tfd(trans, txq, idx)); /* free SKB */ if (txq->entries) { @@ -364,11 +362,9 @@ struct iwl_tfh_tfd *iwl_pcie_gen2_build_tfd(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *out_meta) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; int idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr); - struct iwl_tfh_tfd *tfd = - iwl_pcie_get_tfd(trans_pcie, txq, idx); + struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, idx); dma_addr_t tb_phys; bool amsdu; int i, len, tb1_len, tb2_len, hdr_len; @@ -565,8 +561,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, u8 group_id = iwl_cmd_groupid(cmd->id); const u8 *cmddata[IWL_MAX_CMD_TBS_PER_TFD]; u16 cmdlen[IWL_MAX_CMD_TBS_PER_TFD]; - struct iwl_tfh_tfd *tfd = - iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr); + struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr); memset(tfd, 0, sizeof(*tfd)); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index fed6d842a5e1..3f85713c41dc 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c 
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -373,7 +373,7 @@ static void iwl_pcie_tfd_unmap(struct iwl_trans *trans, { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int i, num_tbs; - void *tfd = iwl_pcie_get_tfd(trans_pcie, txq, index); + void *tfd = iwl_pcie_get_tfd(trans, txq, index); /* Sanity check on number of chunks */ num_tbs = iwl_pcie_tfd_get_num_tbs(trans, tfd); @@ -2018,7 +2018,7 @@ static int iwl_fill_data_tbs(struct iwl_trans *trans, struct sk_buff *skb, } trace_iwlwifi_dev_tx(trans->dev, skb, - iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr), + iwl_pcie_get_tfd(trans, txq, txq->write_ptr), trans_pcie->tfd_size, &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, hdr_len); @@ -2092,7 +2092,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, IEEE80211_CCMP_HDR_LEN : 0; trace_iwlwifi_dev_tx(trans->dev, skb, - iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr), + iwl_pcie_get_tfd(trans, txq, txq->write_ptr), trans_pcie->tfd_size, &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, 0); @@ -2425,7 +2425,7 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, memcpy(&txq->first_tb_bufs[txq->write_ptr], &dev_cmd->hdr, IWL_FIRST_TB_SIZE); - tfd = iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr); + tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr); /* Set up entry for this TFD in Tx byte-count array */ iwl_pcie_txq_update_byte_cnt_tbl(trans, txq, le16_to_cpu(tx_cmd->len), iwl_pcie_tfd_get_num_tbs(trans, tfd)); From 454be724f6f99cc7e7bbf15067128be9868186c6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Nov 2017 07:56:35 +0800 Subject: [PATCH 085/236] block: drain queue before waiting for q_usage_counter becoming zero Now we track legacy requests with .q_usage_counter in commit 055f6e18e08f ("block: Make q_usage_counter also track legacy requests"), but that commit never runs and drains legacy queue before waiting for this counter becoming zero, then IO hang is caused in the test of 
pulling disk during IO. This patch fixes the issue by draining requests before waiting for q_usage_counter becoming zero, both Mauricio and chenxiang reported this issue, and observed that it can be fixed by this patch. Link: https://marc.info/?l=linux-block&m=151192424731797&w=2 Fixes: 055f6e18e08f("block: Make q_usage_counter also track legacy requests") Cc: Wen Xiong Tested-by: "chenxiang (M)" Tested-by: Mauricio Faria de Oliveira Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++++-- block/blk-mq.c | 2 ++ block/blk.h | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b8881750a3ac..3ba4326a63b5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -562,6 +562,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) } } +void blk_drain_queue(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + __blk_drain_queue(q, true); + spin_unlock_irq(q->queue_lock); +} + /** * blk_queue_bypass_start - enter queue bypass mode * @q: queue of interest @@ -689,8 +696,6 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); spin_lock_irq(lock); - if (!q->mq_ops) - __blk_drain_queue(q, true); queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index 11097477eeab..3d3797327491 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -161,6 +161,8 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. 
*/ blk_freeze_queue_start(q); + if (!q->mq_ops) + blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } diff --git a/block/blk.h b/block/blk.h index 3f1446937aec..442098aa9463 100644 --- a/block/blk.h +++ b/block/blk.h @@ -330,4 +330,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) } #endif /* CONFIG_BOUNCE */ +extern void blk_drain_queue(struct request_queue *q); + #endif /* BLK_INTERNAL_H */ From d1616f07e8f1a4a490d1791316d4a68906b284aa Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Thu, 4 Jan 2018 10:47:20 +0800 Subject: [PATCH 086/236] net: fec: free/restore resource in related probe error pathes Fixes in probe error path: - Restore dev_id before failed_ioremap path. Fixes: ("net: fec: restore dev_id in the cases of probe error") - Call of_node_put(phy_node) before failed_phy path. Fixes: ("net: fec: Support phys probed from devicetree and fixed-link") Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 19f198e22e15..a74300a4459c 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3556,11 +3556,11 @@ failed_clk_ipg: failed_clk: if (of_phy_is_fixed_link(np)) of_phy_deregister_fixed_link(np); -failed_phy: of_node_put(phy_node); +failed_phy: + dev_id--; failed_ioremap: free_netdev(ndev); - dev_id--; return ret; } From 9059a3493efea6492451430c7e2fa0af799a2abb Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 16 Nov 2017 20:06:39 -0500 Subject: [PATCH 087/236] kconfig: fix relational operators for bool and tristate symbols Since commit 31847b67bec0 ("kconfig: allow use of relations other than (in)equality") it is possible to use relational operators in Kconfig statements. 
However, those operators give unexpected results when applied to bool/tristate values: (n < y) = y (correct) (m < y) = y (correct) (n < m) = n (wrong) This happens because relational operators process bool and tristate symbols as strings and m sorts before n. It makes little sense to do a lexicographical compare on bool and tristate values though. Documentation/kbuild/kconfig-language.txt states that expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 respectively for calculations). Let's make it so for relational comparisons with bool/tristate expressions as well and document them. If at least one symbol is an actual string then the lexicographical compare works just as before. Signed-off-by: Nicolas Pitre Acked-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- Documentation/kbuild/kconfig-language.txt | 23 +++++++++++++++-------- scripts/kconfig/expr.c | 5 ++++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index 262722d8867b..c4a293a03c33 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -200,10 +200,14 @@ module state. Dependency expressions have the following syntax: <expr> ::= <symbol> (1) <symbol> '=' <symbol> (2) <symbol> '!=' <symbol> (3) - '(' <expr> ')' (4) - '!' <expr> (5) - <expr> '&&' <expr> (6) - <expr> '||' <expr> (7) + <symbol1> '<' <symbol2> (4) + <symbol1> '>' <symbol2> (4) + <symbol1> '<=' <symbol2> (4) + <symbol1> '>=' <symbol2> (4) + '(' <expr> ')' (5) + '!' <expr> (6) + <expr> '&&' <expr> (7) + <expr> '||' <expr> (8) Expressions are listed in decreasing order of precedence. @@ -214,10 +218,13 @@ Expressions are listed in decreasing order of precedence. otherwise 'n'. (3) If the values of both symbols are equal, it returns 'n', otherwise 'y'. -(4) Returns the value of the expression. Used to override precedence. -(5) Returns the result of (2-/expr/). -(6) Returns the result of min(/expr/, /expr/). -(7) Returns the result of max(/expr/, /expr/).
+(4) If value of <symbol1> is respectively lower, greater, lower-or-equal, + or greater-or-equal than value of <symbol2>, it returns 'y', + otherwise 'n'. +(5) Returns the value of the expression. Used to override precedence. +(6) Returns the result of (2-/expr/). +(7) Returns the result of min(/expr/, /expr/). +(8) Returns the result of max(/expr/, /expr/). An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2 respectively for calculations). A menu entry becomes visible when its diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c index cbf4996dd9c1..8cee597d33a5 100644 --- a/scripts/kconfig/expr.c +++ b/scripts/kconfig/expr.c @@ -893,7 +893,10 @@ static enum string_value_kind expr_parse_string(const char *str, switch (type) { case S_BOOLEAN: case S_TRISTATE: - return k_string; + val->s = !strcmp(str, "n") ? 0 : + !strcmp(str, "m") ? 1 : + !strcmp(str, "y") ? 2 : -1; + return k_signed; case S_INT: val->s = strtoll(str, &tail, 10); kind = k_signed; From 5133550296d43236439494aa955bfb765a89f615 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 4 Jan 2018 21:06:49 +0300 Subject: [PATCH 088/236] sh_eth: fix SH7757 GEther initialization Renesas SH7757 has 2 Fast and 2 Gigabit Ether controllers, while the 'sh_eth' driver can only reset and initialize TSU of the first controller pair. Shimoda-san tried to solve that adding the 'needs_init' member to the 'struct sh_eth_plat_data', however the platform code still never sets this flag. I think that we can infer this information from the 'devno' variable (set to 'platform_device::id') and reset/init the Ether controller pair only for an even 'devno'; therefore 'sh_eth_plat_data::needs_init' can be removed... Fixes: 150647fb2c31 ("net: sh_eth: change the condition of initialization") Signed-off-by: Sergei Shtylyov Signed-off-by: David S.
Miller --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++-- include/linux/sh_eth.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 1bdd67a8a869..f21c1db91c3f 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3254,8 +3254,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev) ndev->features = NETIF_F_HW_VLAN_CTAG_FILTER; } - /* initialize first or needed device */ - if (!devno || pd->needs_init) { + /* Need to init only the first port of the two sharing a TSU */ + if (devno % 2 == 0) { if (mdp->cd->chip_reset) mdp->cd->chip_reset(ndev); diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index ff3642d267f7..94081e9a5010 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -17,7 +17,6 @@ struct sh_eth_plat_data { unsigned char mac_addr[ETH_ALEN]; unsigned no_ether_link:1; unsigned ether_link_active_low:1; - unsigned needs_init:1; }; #endif From 7729bebc619307a0233c86f8585a4bf3eadc7ce4 Mon Sep 17 00:00:00 2001 From: Valentin Ilie Date: Fri, 5 Jan 2018 23:12:59 +0000 Subject: [PATCH 089/236] ia64, sched/cputime: Fix build error if CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y Remove the extra parenthesis. 
This bug was introduced by: e2339a4caa5e: ("ia64: Convert vtime to use nsec units directly") Signed-off-by: Valentin Ilie Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: linux-ia64@vger.kernel.org Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1515193979-24873-1-git-send-email-valentin.ilie@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index c6ecb97151a2..9025699049ca 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk) } if (ti->softirq_time) { - delta = cycle_to_nsec(ti->softirq_time)); + delta = cycle_to_nsec(ti->softirq_time); account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); } From b94b7373317164402ff7728d10f7023127a02b60 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Mon, 1 Jan 2018 10:04:47 +0800 Subject: [PATCH 090/236] x86/microcode/intel: Extend BDW late-loading with a revision check Instead of blacklisting all model 79 CPUs when attempting a late microcode loading, limit that only to CPUs with microcode revisions < 0x0b000021 because only on those late loading may cause a system hang. For such processors either: a) a BIOS update which might contain a newer microcode revision or b) the early microcode loading method should be considered. Processors with revisions 0x0b000021 or higher will not experience such hangs. For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. [ bp: Heavily massage commit message and pr_* statements. 
] Fixes: 723f2828a98c ("x86/microcode/intel: Disable late loading on model 79") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Cc: x86-ml Cc: # v4.14 Link: http://lkml.kernel.org/r/1514772287-92959-1-git-send-email-qianyue.zj@alibaba-inc.com --- arch/x86/kernel/cpu/microcode/intel.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8ccdca6d3f9e..d9e460fc7a3b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -910,8 +910,17 @@ static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { - pr_err_once("late loading on model 79 is disabled.\n"); + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * may result in a system hang. This behavior is documented in item + * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_mask == 0x01 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jan 2018 16:26:00 -0800 Subject: [PATCH 091/236] loop: fix concurrent lo_open/lo_release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 范龙飞 reports that KASAN can report a use-after-free in __lock_acquire. The reason is due to insufficient serialization in lo_release(), which will continue to use the loop device even after it has decremented the lo_refcnt to zero. 
In the meantime, another process can come in, open the loop device again as it is being shut down. Confusion ensues. Reported-by: 范龙飞 Signed-off-by: Linus Torvalds Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bc8e61506968..d5fe720cf149 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1581,9 +1581,8 @@ out: return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void __lo_release(struct loop_device *lo) { - struct loop_device *lo = disk->private_data; int err; if (atomic_dec_return(&lo->lo_refcnt)) @@ -1610,6 +1609,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&lo->lo_ctl_mutex); } +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + mutex_lock(&loop_index_mutex); + __lo_release(disk->private_data); + mutex_unlock(&loop_index_mutex); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, From de53c3786a3ce162a1c815d0c04c766c23ec9c0a Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 5 Jan 2018 22:35:41 +0100 Subject: [PATCH 092/236] x86/pti: Unbreak EFI old_memmap EFI_OLD_MEMMAP's efi_call_phys_prolog() calls set_pgd() with swapper PGD that has PAGE_USER set, which makes PTI set NX on it, and therefore EFI can't execute its code. Fix that by forcefully clearing _PAGE_NX from the PGD (this can't be done by the pgprot API). _PAGE_NX will be automatically reintroduced in efi_call_phys_epilog(), as _set_pgd() will again notice that this is _PAGE_USER, and set _PAGE_NX on it.
Tested-by: Dimitri Sivanich Signed-off-by: Jiri Kosina Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Cc: Andrea Arcangeli Cc: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1801052215460.11852@cbobk.fhfr.pm --- arch/x86/platform/efi/efi_64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 39c4b35ac7a4..61975b6bcb1a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -134,7 +134,9 @@ pgd_t * __init efi_call_phys_prolog(void) pud[j] = *pud_offset(p4d_k, vaddr); } } + pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX; } + out: __flush_tlb_all(); From 01c9b17bf673b05bb401b76ec763e9730ccf1376 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 5 Jan 2018 09:44:36 -0800 Subject: [PATCH 093/236] x86/Documentation: Add PTI description Add some details about how PTI works, what some of the downsides are, and how to debug it when things go wrong. Also document the kernel parameter: 'pti/nopti'. 
Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Randy Dunlap Reviewed-by: Kees Cook Cc: Moritz Lipp Cc: Daniel Gruss Cc: Michael Schwarz Cc: Richard Fellner Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Hugh Dickins Cc: Andi Lutomirsky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180105174436.1BC6FA2B@viggo.jf.intel.com --- .../admin-guide/kernel-parameters.txt | 21 +- Documentation/x86/pti.txt | 186 ++++++++++++++++++ 2 files changed, 200 insertions(+), 7 deletions(-) create mode 100644 Documentation/x86/pti.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 520fdec15bbb..905991745d26 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2685,8 +2685,6 @@ steal time is computed, but won't influence scheduler behaviour - nopti [X86-64] Disable kernel page table isolation - nolapic [X86-32,APIC] Do not enable or use the local APIC. nolapic_timer [X86-32,APIC] Do not use the local APIC timer. @@ -3255,11 +3253,20 @@ pt. [PARIDE] See Documentation/blockdev/paride.txt. - pti= [X86_64] - Control user/kernel address space isolation: - on - enable - off - disable - auto - default setting + pti= [X86_64] Control Page Table Isolation of user and + kernel address spaces. Disabling this feature + removes hardening, but improves performance of + system calls and interrupts. + + on - unconditionally enable + off - unconditionally disable + auto - kernel detects whether your CPU model is + vulnerable to issues that PTI mitigates + + Not specifying this option is equivalent to pti=auto. + + nopti [X86_64] + Equivalent to pti=off pty.legacy_count= [KNL] Number of legacy pty's. 
Overwrites compiled-in diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt new file mode 100644 index 000000000000..d11eff61fc9a --- /dev/null +++ b/Documentation/x86/pti.txt @@ -0,0 +1,186 @@ +Overview +======== + +Page Table Isolation (pti, previously known as KAISER[1]) is a +countermeasure against attacks on the shared user/kernel address +space such as the "Meltdown" approach[2]. + +To mitigate this class of attacks, we create an independent set of +page tables for use only when running userspace applications. When +the kernel is entered via syscalls, interrupts or exceptions, the +page tables are switched to the full "kernel" copy. When the system +switches back to user mode, the user copy is used again. + +The userspace page tables contain only a minimal amount of kernel +data: only what is needed to enter/exit the kernel such as the +entry/exit functions themselves and the interrupt descriptor table +(IDT). There are a few strictly unnecessary things that get mapped +such as the first C function when entering an interrupt (see +comments in pti.c). + +This approach helps to ensure that side-channel attacks leveraging +the paging structures do not function when PTI is enabled. It can be +enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. +Once enabled at compile-time, it can be disabled at boot with the +'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). + +Page Table Management +===================== + +When PTI is enabled, the kernel manages two sets of page tables. +The first set is very similar to the single set which is present in +kernels without PTI. This includes a complete mapping of userspace +that the kernel can use for things like copy_to_user(). + +Although _complete_, the user portion of the kernel page tables is +crippled by setting the NX bit in the top level. This ensures +that any missed kernel->user CR3 switch will immediately crash +userspace upon executing its first instruction. 
+ +The userspace page tables map only the kernel data needed to enter +and exit the kernel. This data is entirely contained in the 'struct +cpu_entry_area' structure which is placed in the fixmap which gives +each CPU's copy of the area a compile-time-fixed virtual address. + +For new userspace mappings, the kernel makes the entries in its +page tables like normal. The only difference is when the kernel +makes entries in the top (PGD) level. In addition to setting the +entry in the main kernel PGD, a copy of the entry is made in the +userspace page tables' PGD. + +This sharing at the PGD level also inherently shares all the lower +layers of the page tables. This leaves a single, shared set of +userspace page tables to manage. One PTE to lock, one set of +accessed bits, dirty bits, etc... + +Overhead +======== + +Protection against side-channel attacks is important. But, +this protection comes at a cost: + +1. Increased Memory Use + a. Each process now needs an order-1 PGD instead of order-0. + (Consumes an additional 4k per process). + b. The 'cpu_entry_area' structure must be 2MB in size and 2MB + aligned so that it can be mapped by setting a single PMD + entry. This consumes nearly 2MB of RAM once the kernel + is decompressed, but no space in the kernel image itself. + +2. Runtime Cost + a. CR3 manipulation to switch between the page table copies + must be done at interrupt, syscall, and exception entry + and exit (it can be skipped when the kernel is interrupted, + though.) Moves to CR3 are on the order of a hundred + cycles, and are required at every entry and exit. + b. A "trampoline" must be used for SYSCALL entry. This + trampoline depends on a smaller set of resources than the + non-PTI SYSCALL entry code, so requires mapping fewer + things into the userspace page tables. The downside is + that stacks must be switched at entry time. + c. Global pages are disabled for all kernel structures not + mapped into both kernel and userspace page tables.
This + feature of the MMU allows different processes to share TLB + entries mapping the kernel. Losing the feature means more + TLB misses after a context switch. The actual loss of + performance is very small, however, never exceeding 1%. + d. Process Context IDentifiers (PCID) is a CPU feature that + allows us to skip flushing the entire TLB when switching page + tables by setting a special bit in CR3 when the page tables + are changed. This makes switching the page tables (at context + switch, or kernel entry/exit) cheaper. But, on systems with + PCID support, the context switch code must flush both the user + and kernel entries out of the TLB. The user PCID TLB flush is + deferred until the exit to userspace, minimizing the cost. + See intel.com/sdm for the gory PCID/INVPCID details. + e. The userspace page tables must be populated for each new + process. Even without PTI, the shared kernel mappings + are created by copying top-level (PGD) entries into each + new process. But, with PTI, there are now *two* kernel + mappings: one in the kernel page tables that maps everything + and one for the entry/exit structures. At fork(), we need to + copy both. + f. In addition to the fork()-time copying, there must also + be an update to the userspace PGD any time a set_pgd() is done + on a PGD used to map userspace. This ensures that the kernel + and userspace copies always map the same userspace + memory. + g. On systems without PCID support, each CR3 write flushes + the entire TLB. That means that each syscall, interrupt + or exception flushes the TLB. + h. INVPCID is a TLB-flushing instruction which allows flushing + of TLB entries for non-current PCIDs. Some systems support + PCIDs, but do not support INVPCID. On these systems, addresses + can only be flushed from the TLB for the current PCID. When + flushing a kernel address, we need to flush all PCIDs, so a + single kernel address flush will require a TLB-flushing CR3 + write upon the next use of every PCID. 
+ +Possible Future Work +==================== +1. We can be more careful about not actually writing to CR3 + unless its value is actually changed. +2. Allow PTI to be enabled/disabled at runtime in addition to the + boot-time switching. + +Testing +======== + +To test stability of PTI, the following test procedure is recommended, +ideally doing all of these in parallel: + +1. Set CONFIG_DEBUG_ENTRY=y +2. Run several copies of all of the tools/testing/selftests/x86/ tests + (excluding MPX and protection_keys) in a loop on multiple CPUs for + several minutes. These tests frequently uncover corner cases in the + kernel entry code. In general, old kernels might cause these tests + themselves to crash, but they should never crash the kernel. +3. Run the 'perf' tool in a mode (top or record) that generates many + frequent performance monitoring non-maskable interrupts (see "NMI" + in /proc/interrupts). This exercises the NMI entry/exit code which + is known to trigger bugs in code paths that did not expect to be + interrupted, including nested NMIs. Using "-c" boosts the rate of + NMIs, and using two -c with separate counters encourages nested NMIs + and less deterministic behavior. + + while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done + +4. Launch a KVM virtual machine. +5. Run 32-bit binaries on systems supporting the SYSCALL instruction. + This has been a lightly-tested code path and needs extra scrutiny. + +Debugging +========= + +Bugs in PTI cause a few different signatures of crashes +that are worth noting here. + + * Failures of the selftests/x86 code. Usually a bug in one of the + more obscure corners of entry_64.S + * Crashes in early boot, especially around CPU bringup. Bugs + in the trampoline code or mappings cause these. + * Crashes at the first interrupt. Caused by bugs in entry_64.S, + like screwing up a page table switch. Also caused by + incorrectly mapping the IRQ handler entry code. + * Crashes at the first NMI. 
The NMI code is separate from main + interrupt handlers and can have bugs that do not affect + normal interrupts. Also caused by incorrectly mapping NMI + code. NMIs that interrupt the entry code must be very + careful and can be the cause of crashes that show up when + running perf. + * Kernel crashes at the first exit to userspace. entry_64.S + bugs, or failing to map some of the exit code. + * Crashes at first interrupt that interrupts userspace. The paths + in entry_64.S that return to userspace are sometimes separate + from the ones that return to the kernel. + * Double faults: overflowing the kernel stack because of page + faults upon page faults. Caused by touching non-pti-mapped + data in the entry code, or forgetting to switch to kernel + CR3 before calling into C functions which are not pti-mapped. + * Userspace segfaults early in boot, sometimes manifesting + as mount(8) failing to mount the rootfs. These have + tended to be TLB invalidation issues. Usually invalidating + the wrong PCID, or otherwise missing an invalidation. + +1. https://gruss.cc/files/kaiser.pdf +2. https://meltdownattack.com/meltdown.pdf From 99c6fa2511d8a683e61468be91b83f85452115fa Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 6 Jan 2018 11:49:23 +0000 Subject: [PATCH 094/236] x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] Add the bug bits for spectre v1/2 and force them unconditionally for all cpus. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 21ac898df2d8..1641c2f96363 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -342,5 +342,7 @@ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2d3bd2215e5b..372ba3fb400f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -902,6 +902,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_AMD) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + fpu__init_system(c); #ifdef CONFIG_X86_32 From 5731a879d03bdaa00265f8ebc32dfd0e65d25276 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 4 Jan 2018 20:02:09 -0800 Subject: [PATCH 095/236] bpf: sockmap missing NULL psock check Add psock NULL check to handle a racing sock event 
that can get the sk_callback_lock before this case but after xchg happens causing the refcnt to hit zero and sock user data (psock) to be null and queued for garbage collection. Also add a comment in the code because this is a bit subtle and not obvious in my opinion. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..1712d319c2d8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); From 2b36047e7889b7efee22c11e17f035f721855731 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Jan 2018 15:02:00 -0800 Subject: [PATCH 096/236] selftests/bpf: fix test_align since commit 82abbf8d2fc4 the verifier rejects the bit-wise arithmetic on pointers earlier. The test 'dubious pointer arithmetic' now has less output to match on. Adjust it. 
Fixes: 82abbf8d2fc4 ("bpf: do not allow root to mangle valid pointers") Reported-by: kernel test robot Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_align.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 8591c89c0828..471bbbdb94db 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -474,27 +474,7 @@ static struct bpf_align_test tests[] = { .result = REJECT, .matches = { {4, "R5=pkt(id=0,off=0,r=0,imm=0)"}, - /* ptr & 0x40 == either 0 or 0x40 */ - {5, "R5=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"}, - /* ptr << 2 == unknown, (4n) */ - {7, "R5=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, - /* (4n) + 14 == (4n+2). We blow our bounds, because - * the add could overflow. - */ - {8, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, - /* Checked s>=0 */ - {10, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - /* packet pointer + nonnegative (4n+2) */ - {12, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {14, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. - * We checked the bounds, but it might have been able - * to overflow if the packet pointer started in the - * upper half of the address space. - * So we did not get a 'range' on R6, and the access - * attempt will fail. 
- */ - {16, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* R5 bitwise operator &= on pointer prohibited */ } }, { From 33c57c0d3c67f51f491a9d27108f7e97adc03d96 Mon Sep 17 00:00:00 2001 From: Karsten Merker Date: Thu, 4 Jan 2018 23:37:02 +0100 Subject: [PATCH 097/236] RISC-V: Add a basic defconfig This patch provides a basic defconfig for the RISC-V architecture that enables enough kernel features to run a basic Linux distribution on qemu's "virt" board for native software development. Features include: - serial console - virtio block and network device support - VFAT and ext2/3/4 filesystem support - NFS client and NFS rootfs support - an assortment of other kernel features required for running systemd It also enables a number of drivers for physical hardware that target the "SiFive U500" SoC and the corresponding development platform. These include: - PCIe host controller support for the FPGA-based U500 development platform (PCIE_XILINX) - USB host controller support (OHCI/EHCI/XHCI) - USB HID (keyboard/mouse) support - USB mass storage support (bulk and UAS) - SATA support (AHCI) - ethernet drivers (MACB for a SoC-internal MAC block, microsemi ethernet phy, E1000E and R8169 for PCIe-connected external devices) - DRM and framebuffer console support for PCIe-connected Radeon graphics chips Signed-off-by: Karsten Merker Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 75 ++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index e69de29bb2d1..47dacf06c679 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -0,0 +1,75 @@ +CONFIG_SMP=y +CONFIG_PCI=y +CONFIG_PCIE_XILINX=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y 
+CONFIG_USER_NS=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BPF_SYSCALL=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_RAS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +# CONFIG_RCU_TRACE is not set +CONFIG_CRYPTO_USER_API_HASH=y From 9e49a4ed072ab67b17238c5a45d7cba7f848659e Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 26 Dec 2017 19:11:22 -0800 Subject: [PATCH 098/236] RISC-V: Make __NR_riscv_flush_icache visible to userspace We were hoping to avoid making this visible to userspace, but it looks like we're going to have to because QEMU's user-mode emulation doesn't want to emulate a vDSO. Having vDSO-only system calls was a bit unorthodox anyway, so I think in this case it's OK to just make the actual system call number public.
This patch simply makes the definition of __NR_riscv_flush_icache available to userspace, which results in the deletion of the now empty vdso-syscalls.h. Changes since v1: * I've moved the definition into uapi/asm/syscalls.h rather than uapi/asm/unistd.h. This allows me to keep asm/unistd.h, so we can keep the syscall table macros sane. * As a side effect of the above, this no longer disables all system calls on RISC-V. Whoops! Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/unistd.h | 1 + arch/riscv/include/asm/vdso-syscalls.h | 28 -------------------------- arch/riscv/include/uapi/asm/syscalls.h | 26 ++++++++++++++++++++++++ arch/riscv/kernel/syscall_table.c | 1 - arch/riscv/kernel/vdso/flush_icache.S | 1 - 5 files changed, 27 insertions(+), 30 deletions(-) delete mode 100644 arch/riscv/include/asm/vdso-syscalls.h create mode 100644 arch/riscv/include/uapi/asm/syscalls.h diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 9f250ed007cd..2f704a5c4196 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -14,3 +14,4 @@ #define __ARCH_HAVE_MMU #define __ARCH_WANT_SYS_CLONE #include +#include diff --git a/arch/riscv/include/asm/vdso-syscalls.h b/arch/riscv/include/asm/vdso-syscalls.h deleted file mode 100644 index a2ccf1894929..000000000000 --- a/arch/riscv/include/asm/vdso-syscalls.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2017 SiFive - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.
If not, see . - */ - -#ifndef _ASM_RISCV_VDSO_SYSCALLS_H -#define _ASM_RISCV_VDSO_SYSCALLS_H - -#ifdef CONFIG_SMP - -/* These syscalls are only used by the vDSO and are not in the uapi. */ -#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) -__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) - -#endif - -#endif /* _ASM_RISCV_VDSO_H */ diff --git a/arch/riscv/include/uapi/asm/syscalls.h b/arch/riscv/include/uapi/asm/syscalls.h new file mode 100644 index 000000000000..818655b0d535 --- /dev/null +++ b/arch/riscv/include/uapi/asm/syscalls.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 SiFive + */ + +#ifndef _ASM__UAPI__SYSCALLS_H +#define _ASM__UAPI__SYSCALLS_H + +/* + * Allows the instruction cache to be flushed from userspace. Despite RISC-V + * having a direct 'fence.i' instruction available to userspace (which we + * can't trap!), that's not actually viable when running on Linux because the + * kernel might schedule a process on another hart. There is no way for + * userspace to handle this without invoking the kernel (as it doesn't know the + * thread->hart mappings), so we've defined a RISC-V specific system call to + * flush the instruction cache. + * + * __NR_riscv_flush_icache is defined to flush the instruction cache over an + * address range, with the flush applying to either all threads or just the + * caller. We don't currently do anything with the address range, that's just + * in there for forwards compatibility. + */ +#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) +__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) + +#endif diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index a5bd6401f95e..ade52b903a43 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -23,5 +23,4 @@ void *sys_call_table[__NR_syscalls] = { [0 ... 
__NR_syscalls - 1] = sys_ni_syscall, #include -#include }; diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S index b0fbad74e873..023e4d4aef58 100644 --- a/arch/riscv/kernel/vdso/flush_icache.S +++ b/arch/riscv/kernel/vdso/flush_icache.S @@ -13,7 +13,6 @@ #include #include -#include .text /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */ From c163fb38ca34694b0cce99bb5604257bc29bf200 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jan 2018 18:35:02 +0100 Subject: [PATCH 099/236] riscv: remove CONFIG_MMU ifdefs The RISC-V port doesn't support a nommu mode, so there is no reason to provide some code only under a CONFIG_MMU ifdef. Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/io.h | 4 ---- arch/riscv/include/asm/pgtable.h | 4 ---- arch/riscv/include/asm/tlbflush.h | 4 ---- arch/riscv/include/asm/uaccess.h | 12 ------------ 4 files changed, 24 deletions(-) diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index a82ce599b639..b269451e7e85 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -21,8 +21,6 @@ #include -#ifdef CONFIG_MMU - extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); /* @@ -36,8 +34,6 @@ extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); extern void iounmap(volatile void __iomem *addr); -#endif /* CONFIG_MMU */ - /* Generic IO read/write. These perform native-endian accesses.
*/ #define __raw_writeb __raw_writeb static inline void __raw_writeb(u8 val, volatile void __iomem *addr) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2cbd92ed1629..16301966d65b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -20,8 +20,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_MMU - /* Page Upper Directory not used in RISC-V */ #include #include @@ -413,8 +411,6 @@ static inline void pgtable_cache_init(void) /* No page table caches to initialize */ } -#endif /* CONFIG_MMU */ - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 715b0f10af58..7b9c24ebdf52 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -15,8 +15,6 @@ #ifndef _ASM_RISCV_TLBFLUSH_H #define _ASM_RISCV_TLBFLUSH_H -#ifdef CONFIG_MMU - #include /* @@ -64,6 +62,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -#endif /* CONFIG_MMU */ - #endif /* _ASM_RISCV_TLBFLUSH_H */ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 27b90d64814b..14b0b22fb578 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -127,7 +127,6 @@ extern int fixup_exception(struct pt_regs *state); * call. 
*/ -#ifdef CONFIG_MMU #define __get_user_asm(insn, x, ptr, err) \ do { \ uintptr_t __tmp; \ @@ -153,13 +152,11 @@ do { \ __disable_user_access(); \ (x) = __x; \ } while (0) -#endif /* CONFIG_MMU */ #ifdef CONFIG_64BIT #define __get_user_8(x, ptr, err) \ __get_user_asm("ld", x, ptr, err) #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU #define __get_user_8(x, ptr, err) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ @@ -193,7 +190,6 @@ do { \ (x) = (__typeof__(x))((__typeof__((x)-(x)))( \ (((u64)__hi << 32) | __lo))); \ } while (0) -#endif /* CONFIG_MMU */ #endif /* CONFIG_64BIT */ @@ -267,8 +263,6 @@ do { \ ((x) = 0, -EFAULT); \ }) - -#ifdef CONFIG_MMU #define __put_user_asm(insn, x, ptr, err) \ do { \ uintptr_t __tmp; \ @@ -292,14 +286,11 @@ do { \ : "rJ" (__x), "i" (-EFAULT)); \ __disable_user_access(); \ } while (0) -#endif /* CONFIG_MMU */ - #ifdef CONFIG_64BIT #define __put_user_8(x, ptr, err) \ __put_user_asm("sd", x, ptr, err) #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU #define __put_user_8(x, ptr, err) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ @@ -329,7 +320,6 @@ do { \ : "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT)); \ __disable_user_access(); \ } while (0) -#endif /* CONFIG_MMU */ #endif /* CONFIG_64BIT */ @@ -438,7 +428,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) * will set "err" to -EFAULT, while successful accesses return the previous * value. 
*/ -#ifdef CONFIG_MMU #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ @@ -508,6 +497,5 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) (err) = __err; \ __ret; \ }) -#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_UACCESS_H */ From 1125203c13b9da32125e171b4bd75e93d4918ddd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jan 2018 18:35:03 +0100 Subject: [PATCH 100/236] riscv: rename SR_* constants to match the spec Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/csr.h | 8 ++++---- arch/riscv/include/asm/irqflags.h | 10 +++++----- arch/riscv/include/asm/ptrace.h | 2 +- arch/riscv/kernel/entry.S | 8 ++++---- arch/riscv/kernel/process.c | 4 ++-- arch/riscv/mm/fault.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0d64bc9f4f91..3c7a2c97e377 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -17,10 +17,10 @@ #include /* Status register flags */ -#define SR_IE _AC(0x00000002, UL) /* Interrupt Enable */ -#define SR_PIE _AC(0x00000020, UL) /* Previous IE */ -#define SR_PS _AC(0x00000100, UL) /* Previously Supervisor */ -#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ +#define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ +#define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ +#define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ +#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ #define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ #define SR_FS_OFF _AC(0x00000000, UL) diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h index 6fdc860d7f84..07a3c6d5706f 100644 --- a/arch/riscv/include/asm/irqflags.h +++ b/arch/riscv/include/asm/irqflags.h @@ -27,25 +27,25 @@ static inline unsigned long 
arch_local_save_flags(void) /* unconditionally enable interrupts */ static inline void arch_local_irq_enable(void) { - csr_set(sstatus, SR_IE); + csr_set(sstatus, SR_SIE); } /* unconditionally disable interrupts */ static inline void arch_local_irq_disable(void) { - csr_clear(sstatus, SR_IE); + csr_clear(sstatus, SR_SIE); } /* get status and disable interrupts */ static inline unsigned long arch_local_irq_save(void) { - return csr_read_clear(sstatus, SR_IE); + return csr_read_clear(sstatus, SR_SIE); } /* test flags */ static inline int arch_irqs_disabled_flags(unsigned long flags) { - return !(flags & SR_IE); + return !(flags & SR_SIE); } /* test hardware interrupt enable bit */ @@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void) /* set interrupt enabled status */ static inline void arch_local_irq_restore(unsigned long flags) { - csr_set(sstatus, flags & SR_IE); + csr_set(sstatus, flags & SR_SIE); } #endif /* _ASM_RISCV_IRQFLAGS_H */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 93b8956e25e4..2c5df945d43c 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -66,7 +66,7 @@ struct pt_regs { #define REG_FMT "%08lx" #endif -#define user_mode(regs) (((regs)->sstatus & SR_PS) == 0) +#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0) /* Helpers for working with the instruction pointer */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 20ee86f782a9..7404ec222406 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -196,7 +196,7 @@ handle_syscall: addi s2, s2, 0x4 REG_S s2, PT_SEPC(sp) /* System calls run with interrupts enabled */ - csrs sstatus, SR_IE + csrs sstatus, SR_SIE /* Trace syscalls, but only if requested by the user. 
*/ REG_L t0, TASK_TI_FLAGS(tp) andi t0, t0, _TIF_SYSCALL_TRACE @@ -224,8 +224,8 @@ ret_from_syscall: ret_from_exception: REG_L s0, PT_SSTATUS(sp) - csrc sstatus, SR_IE - andi s0, s0, SR_PS + csrc sstatus, SR_SIE + andi s0, s0, SR_SPP bnez s0, restore_all resume_userspace: @@ -255,7 +255,7 @@ work_pending: bnez s1, work_resched work_notifysig: /* Handle pending signals and notify-resume requests */ - csrs sstatus, SR_IE /* Enable interrupts for do_notify_resume() */ + csrs sstatus, SR_SIE /* Enable interrupts for do_notify_resume() */ move a0, sp /* pt_regs */ move a1, s0 /* current_thread_info->flags */ tail do_notify_resume diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 0d90dcc1fbd3..d74d4adf2d54 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -76,7 +76,7 @@ void show_regs(struct pt_regs *regs) void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - regs->sstatus = SR_PIE /* User mode, irqs on */ | SR_FS_INITIAL; + regs->sstatus = SR_SPIE /* User mode, irqs on */ | SR_FS_INITIAL; regs->sepc = pc; regs->sp = sp; set_fs(USER_DS); @@ -110,7 +110,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, const register unsigned long gp __asm__ ("gp"); memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp; - childregs->sstatus = SR_PS | SR_PIE; /* Supervisor, irqs on */ + childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */ p->thread.ra = (unsigned long)ret_from_kernel_thread; p->thread.s[0] = usp; /* fn */ diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index df2ca3c65048..0713f3c67ab4 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -63,7 +63,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto vmalloc_fault; /* Enable interrupts if they were enabled in the parent context. 
*/ - if (likely(regs->sstatus & SR_PIE)) + if (likely(regs->sstatus & SR_SPIE)) local_irq_enable(); /* From e2d5915293ffdff977ddcfc12b817b08c53ffa7a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 8 Jan 2018 14:54:32 +1100 Subject: [PATCH 101/236] powerpc/pseries: Make RAS IRQ explicitly dependent on DLPAR WQ The hotplug code uses its own workqueue to handle IRQ requests (pseries_hp_wq), however that workqueue is initialized after init_ras_IRQ(). That can lead to a kernel panic if any hotplug interrupts fire after init_ras_IRQ() but before pseries_hp_wq is initialised. eg: UDP-Lite hash table entries: 2048 (order: 0, 65536 bytes) NET: Registered protocol family 1 Unpacking initramfs... (qemu) object_add memory-backend-ram,id=mem1,size=10G (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Unable to handle kernel paging request for data at address 0xf94d03007c421378 Faulting instruction address: 0xc00000000012d744 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc2-ziviani+ #26 task: (ptrval) task.stack: (ptrval) NIP: c00000000012d744 LR: c00000000012d744 CTR: 0000000000000000 REGS: (ptrval) TRAP: 0380 Not tainted (4.15.0-rc2-ziviani+) MSR: 8000000000009033 CR: 28088042 XER: 20040000 CFAR: c00000000012d3c4 SOFTE: 0 ... NIP [c00000000012d744] __queue_work+0xd4/0x5c0 LR [c00000000012d744] __queue_work+0xd4/0x5c0 Call Trace: [c0000000fffefb90] [c00000000012d744] __queue_work+0xd4/0x5c0 (unreliable) [c0000000fffefc70] [c00000000012dce4] queue_work_on+0xb4/0xf0 This commit makes the RAS IRQ registration explicitly dependent on the creation of the pseries_hp_wq. 
Reported-by: Min Deng Reported-by: Daniel Henrique Barboza Tested-by: Jose Ricardo Ziviani Signed-off-by: Michael Ellerman Reviewed-by: David Gibson --- arch/powerpc/platforms/pseries/dlpar.c | 21 ++++++++++++++++++--- arch/powerpc/platforms/pseries/pseries.h | 2 ++ arch/powerpc/platforms/pseries/ras.c | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 6e35780c5962..a0b20c03f078 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -574,11 +574,26 @@ static ssize_t dlpar_show(struct class *class, struct class_attribute *attr, static CLASS_ATTR_RW(dlpar); -static int __init pseries_dlpar_init(void) +int __init dlpar_workqueue_init(void) { + if (pseries_hp_wq) + return 0; + pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue", - WQ_UNBOUND, 1); + WQ_UNBOUND, 1); + + return pseries_hp_wq ? 0 : -ENOMEM; +} + +static int __init dlpar_sysfs_init(void) +{ + int rc; + + rc = dlpar_workqueue_init(); + if (rc) + return rc; + return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr); } -machine_device_initcall(pseries, pseries_dlpar_init); +machine_device_initcall(pseries, dlpar_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 4470a3194311..1ae1d9f4dbe9 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -98,4 +98,6 @@ static inline unsigned long cmo_get_page_size(void) return CMO_PageSize; } +int dlpar_workqueue_init(void); + #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 4923ffe230cf..81d8614e7379 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -69,7 +69,8 @@ static int __init init_ras_IRQ(void) /* Hotplug Events */ np = 
of_find_node_by_path("/event-sources/hot-plug-events"); if (np != NULL) { - request_event_sources_irqs(np, ras_hotplug_interrupt, + if (dlpar_workqueue_init() == 0) + request_event_sources_irqs(np, ras_hotplug_interrupt, "RAS_HOTPLUG"); of_node_put(np); } From 65e7439204b57b7a7f6e4694f9e2a9adde5e77ed Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 21 Dec 2017 10:29:32 +0800 Subject: [PATCH 102/236] drm/i915/gvt: Fix stack-out-of-bounds bug in cmd parser for_each_set_bit() only accepts variable of type unsigned long, and we can not cast it from smaller types. [ 16.499365] ================================================================== [ 16.506655] BUG: KASAN: stack-out-of-bounds in find_first_bit+0x1d/0x70 [ 16.513313] Read of size 8 at addr ffff8803616cf510 by task systemd-udevd/180 [ 16.521998] CPU: 0 PID: 180 Comm: systemd-udevd Tainted: G U O 4.15.0-rc3+ #14 [ 16.530317] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.2.8 01/26/2016 [ 16.537760] Call Trace: [ 16.540230] dump_stack+0x7c/0xbb [ 16.543569] print_address_description+0x6b/0x290 [ 16.548306] kasan_report+0x28a/0x370 [ 16.551993] ? find_first_bit+0x1d/0x70 [ 16.555858] find_first_bit+0x1d/0x70 [ 16.559625] intel_gvt_init_cmd_parser+0x127/0x3c0 [i915] [ 16.565060] ? __lock_is_held+0x8f/0xf0 [ 16.568990] ? intel_gvt_clean_cmd_parser+0x10/0x10 [i915] [ 16.574514] ? __hrtimer_init+0x5d/0xb0 [ 16.578445] intel_gvt_init_device+0x2c3/0x690 [i915] [ 16.583537] ? unregister_module_notifier+0x20/0x20 [ 16.588515] intel_gvt_init+0x89/0x100 [i915] [ 16.592962] i915_driver_load+0x1992/0x1c70 [i915] [ 16.597846] ? __i915_printk+0x210/0x210 [i915] [ 16.602410] ? wait_for_completion+0x280/0x280 [ 16.606883] ? lock_downgrade+0x2c0/0x2c0 [ 16.610923] ? __pm_runtime_resume+0x46/0x90 [ 16.615238] ? acpi_dev_found+0x76/0x80 [ 16.619162] ? i915_pci_remove+0x30/0x30 [i915] [ 16.623733] local_pci_probe+0x74/0xe0 [ 16.627518] pci_device_probe+0x208/0x310 [ 16.631561] ? 
pci_device_remove+0x100/0x100 [ 16.635871] ? __list_add_valid+0x29/0xa0 [ 16.639919] driver_probe_device+0x40b/0x6b0 [ 16.644223] ? driver_probe_device+0x6b0/0x6b0 [ 16.648696] __driver_attach+0x11d/0x130 [ 16.652649] bus_for_each_dev+0xe7/0x160 [ 16.656600] ? subsys_dev_iter_exit+0x10/0x10 [ 16.660987] ? __list_add_valid+0x29/0xa0 [ 16.665028] bus_add_driver+0x31d/0x3a0 [ 16.668893] driver_register+0xc6/0x170 [ 16.672758] ? 0xffffffffc0ad8000 [ 16.676108] do_one_initcall+0x9c/0x206 [ 16.679984] ? initcall_blacklisted+0x150/0x150 [ 16.684545] ? do_init_module+0x35/0x33b [ 16.688494] ? kasan_unpoison_shadow+0x31/0x40 [ 16.692968] ? kasan_kmalloc+0xa6/0xd0 [ 16.696743] ? do_init_module+0x35/0x33b [ 16.700694] ? kasan_unpoison_shadow+0x31/0x40 [ 16.705168] ? __asan_register_globals+0x82/0xa0 [ 16.709819] do_init_module+0xe7/0x33b [ 16.713597] load_module+0x4481/0x4ce0 [ 16.717397] ? module_frob_arch_sections+0x20/0x20 [ 16.722228] ? vfs_read+0x13b/0x190 [ 16.725742] ? kernel_read+0x74/0xa0 [ 16.729351] ? get_user_arg_ptr.isra.17+0x70/0x70 [ 16.734099] ? SYSC_finit_module+0x175/0x1b0 [ 16.738399] SYSC_finit_module+0x175/0x1b0 [ 16.742524] ? SYSC_init_module+0x1e0/0x1e0 [ 16.746741] ? __fget+0x157/0x240 [ 16.750090] ? 
trace_hardirqs_on_thunk+0x1a/0x1c [ 16.754747] entry_SYSCALL_64_fastpath+0x23/0x9a [ 16.759397] RIP: 0033:0x7f8fbc837499 [ 16.762996] RSP: 002b:00007ffead76c138 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 16.770618] RAX: ffffffffffffffda RBX: 0000000000000012 RCX: 00007f8fbc837499 [ 16.777800] RDX: 0000000000000000 RSI: 000056484e67b080 RDI: 0000000000000012 [ 16.784979] RBP: 00007ffead76b140 R08: 0000000000000000 R09: 0000000000000021 [ 16.792164] R10: 0000000000000012 R11: 0000000000000246 R12: 000056484e67b460 [ 16.799345] R13: 00007ffead76b120 R14: 0000000000000005 R15: 0000000000000000 [ 16.808052] The buggy address belongs to the page: [ 16.812876] page:00000000dc4b8c1e count:0 mapcount:0 mapping: (null) index:0x0 [ 16.820934] flags: 0x17ffffc0000000() [ 16.824621] raw: 0017ffffc0000000 0000000000000000 0000000000000000 00000000ffffffff [ 16.832416] raw: ffffea000d85b3e0 ffffea000d85b3e0 0000000000000000 0000000000000000 [ 16.840208] page dumped because: kasan: bad access detected [ 16.847318] Memory state around the buggy address: [ 16.852143] ffff8803616cf400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 16.859427] ffff8803616cf480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 [ 16.866708] >ffff8803616cf500: f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00 00 00 [ 16.873988] ^ [ 16.877770] ffff8803616cf580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 16.885042] ffff8803616cf600: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 [ 16.892312] ================================================================== Signed-off-by: Changbin Du Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/cmd_parser.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 701a3c6f1669..9d12090939e3 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -2777,12 +2777,12 @@ int intel_gvt_scan_and_shadow_wa_ctx(struct 
intel_shadow_wa_ctx *wa_ctx) } static struct cmd_info *find_cmd_entry_any_ring(struct intel_gvt *gvt, - unsigned int opcode, int rings) + unsigned int opcode, unsigned long rings) { struct cmd_info *info = NULL; unsigned int ring; - for_each_set_bit(ring, (unsigned long *)&rings, I915_NUM_ENGINES) { + for_each_set_bit(ring, &rings, I915_NUM_ENGINES) { info = find_cmd_entry(gvt, opcode, ring); if (info) break; From 6b018235b4daabae96d855219fae59c3fb8be417 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Fri, 5 Jan 2018 12:44:06 -0500 Subject: [PATCH 103/236] nvme-fabrics: initialize default host->id in nvmf_host_default() The field was uninitialized before use. Signed-off-by: Ewan D. Milne Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 76b4fe6816a0..894c2ccb3891 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -74,6 +74,7 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref); + uuid_gen(&host->id); snprintf(host->nqn, NVMF_NQN_SIZE, "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); From 87590ce6e373d1a5401f6539f0c59ef92dd924a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 7 Jan 2018 22:48:00 +0100 Subject: [PATCH 104/236] sysfs/cpu: Add vulnerability folder As the meltdown/spectre problem affects several CPU architectures, it makes sense to have common way to express whether a system is affected by a particular vulnerability or not. If affected the way to express the mitigation should be common as well. Create /sys/devices/system/cpu/vulnerabilities folder and files for meltdown, spectre_v1 and spectre_v2. Allow architectures to override the show function. 
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.096657732@linutronix.de --- .../ABI/testing/sysfs-devices-system-cpu | 16 +++++++ drivers/base/Kconfig | 3 ++ drivers/base/cpu.c | 48 +++++++++++++++++++ include/linux/cpu.h | 7 +++ 4 files changed, 74 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index f3d5817c4ef0..bd3a88e16d8b 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -373,3 +373,19 @@ Contact: Linux kernel mailing list Description: information about CPUs heterogeneity. cpu_capacity: capacity of cpu#. + +What: /sys/devices/system/cpu/vulnerabilities + /sys/devices/system/cpu/vulnerabilities/meltdown + /sys/devices/system/cpu/vulnerabilities/spectre_v1 + /sys/devices/system/cpu/vulnerabilities/spectre_v2 +Date: January 2018 +Contact: Linux kernel mailing list +Description: Information about CPU vulnerabilities + + The files are named after the code names of CPU + vulnerabilities. The output of those files reflects the + state of the CPUs in the system.
Possible output values: + + "Not affected" CPU is not affected by the vulnerability + "Vulnerable" CPU is affected and no mitigation in effect + "Mitigation: $M" CPU is affected and mitigation $M is in effect diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 2f6614c9a229..37a71fd9043f 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -235,6 +235,9 @@ config GENERIC_CPU_DEVICES config GENERIC_CPU_AUTOPROBE bool +config GENERIC_CPU_VULNERABILITIES + bool + config SOC_BUS bool select GLOB diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 321cd7b4d817..825964efda1d 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -501,10 +501,58 @@ static void __init cpu_dev_register_generic(void) #endif } +#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES + +ssize_t __weak cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +ssize_t __weak cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + +static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); +static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); +static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); + +static struct attribute *cpu_root_vulnerabilities_attrs[] = { + &dev_attr_meltdown.attr, + &dev_attr_spectre_v1.attr, + &dev_attr_spectre_v2.attr, + NULL +}; + +static const struct attribute_group cpu_root_vulnerabilities_group = { + .name = "vulnerabilities", + .attrs = cpu_root_vulnerabilities_attrs, +}; + +static void __init cpu_register_vulnerabilities(void) +{ + if (sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_vulnerabilities_group)) + pr_err("Unable to register CPU vulnerabilities\n"); +} + +#else +static inline void
cpu_register_vulnerabilities(void) { } +#endif + void __init cpu_dev_init(void) { if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups)) panic("Failed to register CPU subsystem"); cpu_dev_register_generic(); + cpu_register_vulnerabilities(); } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 938ea8ae0ba4..c816e6f2730c 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -47,6 +47,13 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr); extern int cpu_add_dev_attr_group(struct attribute_group *attrs); extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); +extern ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf); + extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, const struct attribute_group **groups, From 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 7 Jan 2018 22:48:01 +0100 Subject: [PATCH 105/236] x86/cpu: Implement CPU vulnerabilities sysfs functions Implement the CPU vulnerability show functions for meltdown, spectre_v1 and spectre_v2.
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.177414879@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd5199de231e..e23d21ac745a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,6 +89,7 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b2424c9b0..76ad6cb44b40 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -60,3 +61,31 @@ void __init check_bugs(void) set_memory_4k((unsigned long)__va(0), 1); #endif } + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} +#endif From 0dd6d272d39c7c1fe2f4253197b505f2b66538ee Mon Sep 17 00:00:00 2001 From: Nick 
Desaulniers Date: Sat, 23 Dec 2017 21:50:13 -0500 Subject: [PATCH 106/236] x86/xen/time: fix section mismatch for xen_init_time_ops() The header declares this function as __init but is defined in __ref section. Signed-off-by: Nick Desaulniers Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/xen-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f96dbedb33d4..1a7a9469e5a7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -71,7 +71,7 @@ u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void xen_save_time_memory_area(void); void xen_restore_time_memory_area(void); -void __init xen_init_time_ops(void); +void __ref xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); From 66a640e7823da803fdb68d5d88f7a8fbd11c29e6 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 6 Jan 2018 13:39:48 -0800 Subject: [PATCH 107/236] x86: xen: remove the use of VLAIS Variable Length Arrays In Structs (VLAIS) is not supported by Clang, and frowned upon by others. https://lkml.org/lkml/2013/9/23/500 Here, the VLAIS was used because the size of the bitmap returned from xen_mc_entry() depended on possibly (based on kernel configuration) runtime sized data. Rather than declaring args as a VLAIS then calling sizeof on *args, we calculate the appropriate sizeof args manually. Further, we can get rid of the #ifdef's and rely on num_possible_cpus() (thanks to a helpful checkpatch warning from an earlier version of this patch). 
Suggested-by: Juergen Gross Signed-off-by: Nick Desaulniers Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 7118f776cd49..aa701d2a5023 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1339,20 +1339,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, { struct { struct mmuext_op op; -#ifdef CONFIG_SMP - DECLARE_BITMAP(mask, num_processors); -#else DECLARE_BITMAP(mask, NR_CPUS); -#endif } *args; struct multicall_space mcs; + const size_t mc_entry_size = sizeof(args->op) + + sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ - mcs = xen_mc_entry(sizeof(*args)); + mcs = xen_mc_entry(mc_entry_size); args = mcs.args; args->op.arg2.vcpumask = to_cpumask(args->mask); From dba04eb76df982703fefc021a4d278347b6176a9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 8 Jan 2018 16:27:31 +0100 Subject: [PATCH 108/236] locking/Documentation: Remove stale crossrelease_fullstack parameter The cross-release lockdep functionality has been removed in: e966eaeeb623: ("locking/lockdep: Remove the cross-release locking checks") ... leaving the kernel parameter docs behind. The code handling the parameter does not exist so this is a plain documentation change. 
Signed-off-by: David Sterba Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20180108152731.27613-1-dsterba@suse.com Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index af7104aaffd9..a626465dd877 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -713,9 +713,6 @@ It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. - crossrelease_fullstack - [KNL] Allow to record full stack trace in cross-release - cryptomgr.notests [KNL] Disable crypto self-tests From 262b6b30087246abf09d6275eb0c0dc421bcbe38 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 6 Jan 2018 18:41:14 +0100 Subject: [PATCH 109/236] x86/tboot: Unbreak tboot with PTI enabled This is another case similar to what EFI does: create a new set of page tables, map some code at a low address, and jump to it. PTI mistakes this low address for userspace and mistakenly marks it non-executable in an effort to make it unusable for userspace. Undo the poison to allow execution. 
Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") Signed-off-by: Dave Hansen Signed-off-by: Andrea Arcangeli Signed-off-by: Thomas Gleixner Cc: Alan Cox Cc: Tim Chen Cc: Jon Masters Cc: Dave Hansen Cc: Andi Kleen Cc: Jeff Law Cc: Paolo Bonzini Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David" Cc: Nick Clifton Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a4eb27918ceb..75869a4b6c41 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, p4d = p4d_alloc(&tboot_mm, pgd, vaddr); if (!p4d) return -1; + pgd->pgd &= ~_PAGE_NX; pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; From 527187d28569e39c5d489d6306d3b79605cf85a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jan 2018 17:27:19 +0100 Subject: [PATCH 110/236] locking/lockdep: Remove cross-release leftovers There's two cross-release leftover facilities: - the crossrelease_hist_*() irq-tracing callbacks (NOPs currently) - the complete_release_commit() callback (NOP as well) Remove them. 
Cc: David Sterba Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 1 - include/linux/irqflags.h | 4 ---- include/linux/lockdep.h | 2 -- kernel/sched/completion.c | 5 ----- 4 files changed, 12 deletions(-) diff --git a/include/linux/completion.h b/include/linux/completion.h index 94a59ba7d422..519e94915d18 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -32,7 +32,6 @@ struct completion { #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} -static inline void complete_release_commit(struct completion *x) {} #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 46cb57d5eb13..1b3996ff3f16 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -27,22 +27,18 @@ # define trace_hardirq_enter() \ do { \ current->hardirq_context++; \ - crossrelease_hist_start(XHLOCK_HARD); \ } while (0) # define trace_hardirq_exit() \ do { \ current->hardirq_context--; \ - crossrelease_hist_end(XHLOCK_HARD); \ } while (0) # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ - crossrelease_hist_start(XHLOCK_SOFT); \ } while (0) # define lockdep_softirq_exit() \ do { \ current->softirq_context--; \ - crossrelease_hist_end(XHLOCK_SOFT); \ } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 2e75dc34bff5..3251d9c0d313 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -475,8 +475,6 @@ enum xhlock_context_t { #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } -static inline void crossrelease_hist_start(enum xhlock_context_t c) {} -static inline void 
crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); From 8d56eff266f3e41a6c39926269c4c3f58f881a8e Mon Sep 17 00:00:00 2001 From: Jike Song Date: Tue, 9 Jan 2018 00:03:41 +0800 Subject: [PATCH 111/236] x86/mm/pti: Remove dead logic in pti_user_pagetable_walk*() The following code contains dead logic: 162 if (pgd_none(*pgd)) { 163 unsigned long new_p4d_page = __get_free_page(gfp); 164 if (!new_p4d_page) 165 return NULL; 166 167 if (pgd_none(*pgd)) { 168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); 169 new_p4d_page = 0; 170 } 171 if (new_p4d_page) 172 free_page(new_p4d_page); 173 } There can't be any difference between two pgd_none(*pgd) at L162 and L167, so it's always false at L171. Dave Hansen explained: Yes, the double-test was part of an optimization where we attempted to avoid using a global spinlock in the fork() path. We would check for unallocated mid-level page tables without the lock. The lock was only taken when we needed to *make* an entry to avoid collisions. Now that it is all single-threaded, there is no chance of a collision, no need for a lock, and no need for the re-check. As all these functions are only called during init, mark them __init as well. 
Fixes: 03f4424f348e ("x86/mm/pti: Add functions to clone kernel PMDs") Signed-off-by: Jike Song Signed-off-by: Thomas Gleixner Cc: Alan Cox Cc: Andi Kleen Cc: Tom Lendacky Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Koshina Cc: Dave Hansen Cc: Borislav Petkov Cc: Kees Cook Cc: Andi Lutomirski Cc: Linus Torvalds Cc: Greg KH Cc: David Woodhouse Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180108160341.3461-1-albcamus@gmail.com --- arch/x86/mm/pti.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 43d4a4a29037..ce38f165489b 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) * * Returns a pointer to a P4D on success, or NULL on failure. */ -static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); @@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) if (!new_p4d_page) return NULL; - if (pgd_none(*pgd)) { - set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); - new_p4d_page = 0; - } - if (new_p4d_page) - free_page(new_p4d_page); + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_large(*pgd) != 0); @@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) * * Returns a pointer to a PMD on success, or NULL on failure. 
*/ -static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d = pti_user_pagetable_walk_p4d(address); @@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!new_pud_page) return NULL; - if (p4d_none(*p4d)) { - set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); - new_pud_page = 0; - } - if (new_pud_page) - free_page(new_pud_page); + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); @@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!new_pmd_page) return NULL; - if (pud_none(*pud)) { - set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - new_pmd_page = 0; - } - if (new_pmd_page) - free_page(new_pmd_page); + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); @@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) if (!new_pte_page) return NULL; - if (pmd_none(*pmd)) { - set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - new_pte_page = 0; - } - if (new_pte_page) - free_page(new_pte_page); + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); From 98b8e4e5c17bf87c1b18ed929472051dab39878c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 3 Jan 2018 12:49:29 +0100 Subject: [PATCH 112/236] platform/x86: wmi: Call acpi_wmi_init() later Calling acpi_wmi_init() at the subsys_initcall() level causes ordering issues to appear on some systems and they are difficult to reproduce, because there is no guaranteed ordering between subsys_initcall() calls, so they may occur in different orders on different systems. 
In particular, commit 86d9f48534e8 (mm/slab: fix kmemcg cache creation delayed issue) exposed one of these issues where genl_init() and acpi_wmi_init() are both called at the same initcall level, but the former must run before the latter so as to avoid a NULL pointer dereference. For this reason, move the acpi_wmi_init() invocation to the initcall_sync level which should still be early enough for things to work correctly in the WMI land. Link: https://marc.info/?t=151274596700002&r=1&w=2 Reported-by: Jonathan McDowell Reported-by: Joonsoo Kim Tested-by: Jonathan McDowell Signed-off-by: Rafael J. Wysocki Signed-off-by: Darren Hart (VMware) --- drivers/platform/x86/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 791449a2370f..daa68acbc900 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -1458,5 +1458,5 @@ static void __exit acpi_wmi_exit(void) class_unregister(&wmi_bus_class); } -subsys_initcall(acpi_wmi_init); +subsys_initcall_sync(acpi_wmi_init); module_exit(acpi_wmi_exit); From 9d0513d82f1a8fe17b41f113ac5922fa57dbaf5c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 28 Dec 2017 14:25:23 +0200 Subject: [PATCH 113/236] x86/platform/intel-mid: Revert "Make 'bt_sfi_data' const" So one of the constification patches unearthed a type casting fragility of the underlying code: 276c87054751 ("x86/platform/intel-mid: Make 'bt_sfi_data' const") converted the struct to be const while it is also used as a temporary container for important data that is used to fill 'parent' and 'name' fields in struct platform_device_info. The compiler doesn't notice this due to an explicit type cast that loses the const - which fragility will be fixed separately. This type cast turned a seemingly trivial const propagation patch into a hard to debug data corruptor and crasher bug. 
Signed-off-by: Andy Shevchenko Cc: Bhumika Goyal Cc: Darren Hart Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: julia.lawall@lip6.fr Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20171228122523.21802-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/platform_bt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index dc036e511f48..5a0483e7bf66 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static const struct bt_sfi_data tng_bt_sfi_data __initdata = { +static struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; From 414a2dc138838642d28938506e31ad461648b898 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 2 Jan 2018 12:13:10 +0100 Subject: [PATCH 114/236] sched/isolation: Make CONFIG_CPU_ISOLATION=y depend on SMP or COMPILE_TEST On uniprocessor systems, critical and non-critical tasks cannot be isolated, as there is only a single CPU core. Hence enabling CPU isolation by default on such systems does not make much sense. Instead of changing the default for !SMP, fix this by making the feature depend on SMP, with an override for compile-testing. Note that its sole selector (NO_HZ_FULL) already depends on SMP. This decreases kernel size for a default uniprocessor kernel by ca. 1 KiB. 
Signed-off-by: Geert Uytterhoeven Acked-by: Nicolas Pitre Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2c43838c99d9d23f ("sched/isolation: Enable CONFIG_CPU_ISOLATION=y by default") Link: http://lkml.kernel.org/r/1514891590-20782-1-git-send-email-geert@linux-m68k.org Signed-off-by: Ingo Molnar --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index 690a381adee0..c1221332e128 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -461,6 +461,7 @@ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION bool "CPU isolation" + depends on SMP || COMPILE_TEST default y help Make sure that CPUs running critical tasks are not disturbed by From f328299e54a94998b31baf788d2b33d8122a4acb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 13:53:03 -0600 Subject: [PATCH 115/236] locking/refcounts: Remove stale comment from the ARCH_HAS_REFCOUNT Kconfig entry ARCH_HAS_REFCOUNT is no longer marked as broken ('if BROKEN'), so remove the stale comment regarding it being broken. Signed-off-by: Eric Biggers Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171229195303.17781-1-ebiggers3@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d4fc98c50378..ff4e9cd99854 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,7 +55,6 @@ config X86 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 - # Causing hangs/crashes, see the commit that added this change for details. 
select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY From 7deea450eb912f269d999de62c8ab922d1461748 Mon Sep 17 00:00:00 2001 From: Sunil Challa Date: Thu, 4 Jan 2018 18:46:54 -0500 Subject: [PATCH 116/236] bnxt_en: Fix population of flow_type in bnxt_hwrm_cfa_flow_alloc() flow_type in HWRM_FLOW_ALLOC is not being populated correctly due to incorrect passing of pointer and size of l3_mask argument of is_wildcard(). Fixed this. Fixes: db1d36a27324 ("bnxt_en: add TC flower offload flow_alloc/free FW cmds") Signed-off-by: Sunil Challa Reviewed-by: Sathya Perla Reviewed-by: Venkat Duvvuru Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 3d201d7324bd..d8fee26cd45e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -421,7 +421,7 @@ static int bnxt_hwrm_cfa_flow_alloc(struct bnxt *bp, struct bnxt_tc_flow *flow, } /* If all IP and L4 fields are wildcarded then this is an L2 flow */ - if (is_wildcard(&l3_mask, sizeof(l3_mask)) && + if (is_wildcard(l3_mask, sizeof(*l3_mask)) && is_wildcard(&flow->l4_mask, sizeof(flow->l4_mask))) { flow_flags |= CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2; } else { From 78f300049335ae81a5cc6b4b232481dc5e1f9d41 Mon Sep 17 00:00:00 2001 From: Venkat Duvvuru Date: Thu, 4 Jan 2018 18:46:55 -0500 Subject: [PATCH 117/236] bnxt_en: Fix the 'Invalid VF' id check in bnxt_vf_ndo_prep routine. In bnxt_vf_ndo_prep (which is called by bnxt_get_vf_config ndo), there is a check for "Invalid VF id". Currently, the check is done against max_vfs. However, the user doesn't always create max_vfs. So, the check should be against the created number of VFs. 
The number of bnxt_vf_info structures that are allocated in bnxt_alloc_vf_resources routine is the "number of requested VFs". So, if an "invalid VF id" falls between the requested number of VFs and the max_vfs, the driver will be dereferencing an invalid pointer. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Signed-off-by: Venkat Devvuru Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 5ee18660bc33..c9617675f934 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -70,7 +70,7 @@ static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id) netdev_err(bp->dev, "vf ndo called though sriov is disabled\n"); return -EINVAL; } - if (vf_id >= bp->pf.max_vfs) { + if (vf_id >= bp->pf.active_vfs) { netdev_err(bp->dev, "Invalid VF id %d\n", vf_id); return -EINVAL; } From b707fda2df4070785d0fa8a278aa13944c5f51f8 Mon Sep 17 00:00:00 2001 From: Eduardo Otubo Date: Fri, 5 Jan 2018 09:42:16 +0100 Subject: [PATCH 118/236] xen-netfront: enable device after manual module load When loading the module after unloading it, the network interface would not be enabled and thus wouldn't have a backend counterpart and unable to be used by the guest. The guest would face errors like: [root@guest ~]# ethtool -i eth0 Cannot get driver information: No such device [root@guest ~]# ifconfig eth0 eth0: error fetching interface information: Device not found This patch initializes the state of the netfront device whenever it is loaded manually, this state would communicate the netback to create its device and establish the connection between them. Signed-off-by: Eduardo Otubo Reviewed-by: Boris Ostrovsky Signed-off-by: David S. 
Miller --- drivers/net/xen-netfront.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index c5a34671abda..9bd7ddeeb6a5 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1326,6 +1326,7 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) netif_carrier_off(netdev); + xenbus_switch_state(dev, XenbusStateInitialising); return netdev; exit: From cc35c3d1edf7a8373a1a5daa80a912dec96a9cd5 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 5 Jan 2018 11:17:17 -0200 Subject: [PATCH 119/236] sctp: do not retransmit upon FragNeeded if PMTU discovery is disabled Currently, if PMTU discovery is disabled on a given transport, but the configured value is higher than the actual PMTU, it is likely that we will get some icmp Frag Needed. The issue is, if PMTU discovery is disabled, we won't update the information and will issue a retransmission immediately, which may very well trigger another ICMP, and another retransmission, leading to a loop. The fix is to simply not trigger immediate retransmissions if PMTU discovery is disabled on the given transport. Changes from v2: - updated stale comment, noticed by Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 621b5ca3fd1c..9320661cc41d 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -399,20 +399,20 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, return; } - if (t->param_flags & SPP_PMTUD_ENABLE) { - /* Update transports view of the MTU */ - sctp_transport_update_pmtu(t, pmtu); + if (!(t->param_flags & SPP_PMTUD_ENABLE)) + /* We can't allow retransmitting in such case, as the + * retransmission would be sized just as before, and thus we + * would get another icmp, and retransmit again. 
+ */ + return; - /* Update association pmtu. */ - sctp_assoc_sync_pmtu(asoc); - } + /* Update transports view of the MTU */ + sctp_transport_update_pmtu(t, pmtu); - /* Retransmit with the new pmtu setting. - * Normally, if PMTU discovery is disabled, an ICMP Fragmentation - * Needed will never be sent, but if a message was sent before - * PMTU discovery was disabled that was larger than the PMTU, it - * would not be fragmented, so it must be re-transmitted fragmented. - */ + /* Update association pmtu. */ + sctp_assoc_sync_pmtu(asoc); + + /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } From b6c5734db07079c9410147b32407f2366d584e6c Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 5 Jan 2018 11:17:18 -0200 Subject: [PATCH 120/236] sctp: fix the handling of ICMP Frag Needed for too small MTUs syzbot reported a hang involving SCTP, on which it kept flooding dmesg with the message: [ 246.742374] sctp: sctp_transport_update_pmtu: Reported pmtu 508 too low, using default minimum of 512 That happened because whenever SCTP hits an ICMP Frag Needed, it tries to adjust to the new MTU and triggers an immediate retransmission. But it didn't consider the fact that MTUs smaller than the SCTP minimum MTU allowed (512) would not cause the PMTU to change, and issued the retransmission anyway (thus leading to another ICMP Frag Needed, and so on). As IPv4 (ip_rt_min_pmtu=556) and IPv6 (IPV6_MIN_MTU=1280) minimum MTU are higher than that, sctp_transport_update_pmtu() is changed to re-fetch the PMTU that got set after our request, and with that, detect if there was an actual change or not. The fix, thus, skips the immediate retransmission if the received ICMP resulted in no change, in the hope that SCTP will select another path. Note: The value being used for the minimum MTU (512, SCTP_DEFAULT_MINSEGMENT) is not right and instead it should be (576, SCTP_MIN_PMTU), but such change belongs to another patch. 
Changes from v1: - do not disable PMTU discovery, in the light of commit 06ad391919b2 ("[SCTP] Don't disable PMTU discovery when mtu is small") and as suggested by Xin Long. - changed the way to break the rtx loop by detecting if the icmp resulted in a change or not Changes from v2: none See-also: https://lkml.org/lkml/2017/12/22/811 Reported-by: syzbot Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- net/sctp/input.c | 8 ++++++-- net/sctp/transport.c | 29 +++++++++++++++++++---------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 2f8f93da5dc2..9a5ccf03a59b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -966,7 +966,7 @@ void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); void sctp_transport_reset(struct sctp_transport *t); -void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); +bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); diff --git a/net/sctp/input.c b/net/sctp/input.c index 9320661cc41d..141c9c466ec1 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -406,8 +406,12 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, */ return; - /* Update transports view of the MTU */ - sctp_transport_update_pmtu(t, pmtu); + /* Update transports view of the MTU. Return if no update was needed. + * If an update wasn't needed/possible, it also doesn't make sense to + * try to retransmit now. + */ + if (!sctp_transport_update_pmtu(t, pmtu)) + return; /* Update association pmtu. 
*/ sctp_assoc_sync_pmtu(asoc); diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 1e5a22430cf5..47f82bd794d9 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -248,28 +248,37 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } -void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) +bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); + bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", - __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); - /* Use default minimum segment size and disable - * pmtu discovery on this transport. - */ - t->pathmtu = SCTP_DEFAULT_MINSEGMENT; - } else { - t->pathmtu = pmtu; + pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment instead */ + pmtu = SCTP_DEFAULT_MINSEGMENT; } + pmtu = SCTP_TRUNC4(pmtu); if (dst) { dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); dst = sctp_transport_dst_check(t); } - if (!dst) + if (!dst) { t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); + dst = t->dst; + } + + if (dst) { + /* Re-fetch, as under layers may have a higher minimum size */ + pmtu = SCTP_TRUNC4(dst_mtu(dst)); + change = t->pathmtu != pmtu; + } + t->pathmtu = pmtu; + + return change; } /* Caches the dst entry and source address for a transport's destination From 46cd75036415d94e9cf451e6606a099945d54cc6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 5 Jan 2018 11:23:45 -0600 Subject: [PATCH 121/236] phylink: mark expected switch fall-throughs in phylink_mii_ioctl In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. 
Addresses-Coverity-ID: 1463447 ("Missing break in switch") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 150cd95a6e1e..249ce5cbea22 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1296,6 +1296,7 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGMIIPHY: mii->phy_id = pl->phydev->mdio.addr; + /* fall through */ case SIOCGMIIREG: ret = phylink_phy_read(pl, mii->phy_id, mii->reg_num); @@ -1318,6 +1319,7 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGMIIPHY: mii->phy_id = 0; + /* fall through */ case SIOCGMIIREG: ret = phylink_mii_read(pl, mii->phy_id, mii->reg_num); From 56c0290202ab94a2f2780c449395d4ae8495fab4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 6 Jan 2018 09:00:09 +0100 Subject: [PATCH 122/236] mdio-sun4i: Fix a memory leak If the probing of the regulator is deferred, the memory allocated by 'mdiobus_alloc_size()' will be leaking. It should be freed before the next call to 'sun4i_mdio_probe()' which will reallocate it. Fixes: 4bdcb1dd9feb ("net: Add MDIO bus driver for the Allwinner EMAC") Signed-off-by: Christophe JAILLET Signed-off-by: David S. 
Miller --- drivers/net/phy/mdio-sun4i.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/mdio-sun4i.c b/drivers/net/phy/mdio-sun4i.c index 135296508a7e..6425ce04d3f9 100644 --- a/drivers/net/phy/mdio-sun4i.c +++ b/drivers/net/phy/mdio-sun4i.c @@ -118,8 +118,10 @@ static int sun4i_mdio_probe(struct platform_device *pdev) data->regulator = devm_regulator_get(&pdev->dev, "phy"); if (IS_ERR(data->regulator)) { - if (PTR_ERR(data->regulator) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(data->regulator) == -EPROBE_DEFER) { + ret = -EPROBE_DEFER; + goto err_out_free_mdiobus; + } dev_info(&pdev->dev, "no regulator found\n"); data->regulator = NULL; From 50f3d740d376f664f6accc7e86c9afd8f1c7e1e4 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 7 Jan 2018 00:26:47 +0300 Subject: [PATCH 123/236] sh_eth: fix TXALCR1 offsets The TXALCR1 offsets are incorrect in the register offset tables, most probably due to copy&paste error. Luckily, the driver never uses this register. :-) Fixes: 4a55530f38e4 ("net: sh_eth: modify the definitions of register") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index f21c1db91c3f..b9e2846589f8 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -147,7 +147,7 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = { [FWNLCR0] = 0x0090, [FWALCR0] = 0x0094, [TXNLCR1] = 0x00a0, - [TXALCR1] = 0x00a0, + [TXALCR1] = 0x00a4, [RXNLCR1] = 0x00a8, [RXALCR1] = 0x00ac, [FWNLCR1] = 0x00b0, @@ -399,7 +399,7 @@ static const u16 sh_eth_offset_fast_sh3_sh2[SH_ETH_MAX_REGISTER_OFFSET] = { [FWNLCR0] = 0x0090, [FWALCR0] = 0x0094, [TXNLCR1] = 0x00a0, - [TXALCR1] = 0x00a0, + [TXALCR1] = 0x00a4, [RXNLCR1] = 0x00a8, [RXALCR1] = 0x00ac, [FWNLCR1] = 0x00b0, From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Jan 2018 17:33:02 -0800 Subject: [PATCH 124/236] bpf: prevent out-of-bounds speculation Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Unconditionally mask index for all array types even when max_entries are not rounded to power of 2 for root user. When map is created by unpriv user generate a sequence of bpf insns that includes AND operation to make sure that JITed code includes the same 'index & index_mask' operation. If prog_array map is created by unpriv user replace bpf_tail_call(ctx, map, index); with if (index < max_entries) { index &= map->index_mask; bpf_tail_call(ctx, map, index); } (along with roundup to power 2) to prevent out-of-bounds speculation. 
There is secondary redundant 'if (index >= max_entries)' in the interpreter and in all JITs, but they can be optimized later if necessary. Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array) cannot be used by unpriv, so no changes there. That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on all architectures with and without JIT. v2->v3: Daniel noticed that attack potentially can be crafted via syscall commands without loading the program, so add masking to those paths as well. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 47 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..1b985ca4ffbe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,6 +52,7 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + bool unpriv_array; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -221,6 +222,7 @@ struct bpf_prog_aux { struct bpf_array { struct bpf_map map; u32 elem_size; + u32 index_mask; /* 'ownership' of prog_array is claimed by the first program that * is going to use this map or by the first program which FD is stored * in the map to make sure that all callers and callees have the same diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..aaa319848e7d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; u64 array_size; - u32 elem_size; /* check sanity of attributes */ if 
(attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + index_mask = roundup_pow_of_two(max_entries) - 1; + + if (unpriv) + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +133,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if 
(map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct 
bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b24876cd23..b414d6b2d470 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call obusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + 
struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } From e4d0e84e490790798691aaa0f2e598637f1867ec Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:21 -0600 Subject: [PATCH 125/236] x86/cpu/AMD: Make LFENCE a serializing instruction To aid in speculation control, make LFENCE a serializing instruction since it has less overhead than MFENCE. This is done by setting bit 1 of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not have this MSR. For these families, the LFENCE instruction is already serializing. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ab022618a50a..1e7d710fef43 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -352,6 +352,8 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bcb75dc97d44..5b438d81beb2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,6 +829,16 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if 
(cpu_has(c, X86_FEATURE_XMM2)) { + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } From 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:32 -0600 Subject: [PATCH 126/236] x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference to MFENCE_RDTSC. However, since the kernel could be running under a hypervisor that does not support writing that MSR, read the MSR back and verify that the bit has been set successfully. If the MSR can be read and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the MFENCE_RDTSC feature. 
Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1e7d710fef43..fa11fb1fa570 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -354,6 +354,7 @@ #define MSR_FAM10H_NODE_ID 0xc001100c #define MSR_F10H_DECFG 0xc0011029 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5b438d81beb2..ea831c858195 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,6 +829,9 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { + unsigned long long val; + int ret; + /* * A serializing LFENCE has less overhead than MFENCE, so * use it for execution serialization. On families which @@ -839,8 +842,19 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. 
+ */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* From 1b5c7ef3d0d0610bda9b63263f7c5b7178d11015 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 6 Jan 2018 10:59:41 -0500 Subject: [PATCH 127/236] drm/nouveau/disp/gf119: add missing drive vfunc ptr Fixes broken dp on GF119: Call Trace: ? nvkm_dp_train_drive+0x183/0x2c0 [nouveau] nvkm_dp_acquire+0x4f3/0xcd0 [nouveau] nv50_disp_super_2_2+0x5d/0x470 [nouveau] ? nvkm_devinit_pll_set+0xf/0x20 [nouveau] gf119_disp_super+0x19c/0x2f0 [nouveau] process_one_work+0x193/0x3c0 worker_thread+0x35/0x3b0 kthread+0x125/0x140 ? process_one_work+0x3c0/0x3c0 ? kthread_park+0x60/0x60 ret_from_fork+0x25/0x30 Code: Bad RIP value. RIP: (null) RSP: ffffb1e243e4bc38 CR2: 0000000000000000 Fixes: af85389c614a drm/nouveau/disp: shuffle functions around Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103421 Signed-off-by: Rob Clark Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c index a2978a37b4f3..700fc754f28a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgf119.c @@ -174,6 +174,7 @@ gf119_sor = { .links = gf119_sor_dp_links, .power = g94_sor_dp_power, .pattern = gf119_sor_dp_pattern, + .drive = gf119_sor_dp_drive, .vcpi = gf119_sor_dp_vcpi, .audio = gf119_sor_dp_audio, .audio_sym = gf119_sor_dp_audio_sym, From aa1f10e85b0ab53dee85d8e293c8159d18d293a8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 29 Dec 2017 00:22:54 +0100 Subject: [PATCH 128/236] mux: core: fix double get_device() class_find_device already does a 
get_device on the returned device. So the device returned by of_find_mux_chip_by_node is already referenced and we should not reference it again (and unref it on error). Signed-off-by: Hans de Goede Signed-off-by: Peter Rosin Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/mux/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mux/core.c b/drivers/mux/core.c index 2260063b0ea8..6e5cf9d9cd99 100644 --- a/drivers/mux/core.c +++ b/drivers/mux/core.c @@ -413,6 +413,7 @@ static int of_dev_node_match(struct device *dev, const void *data) return dev->of_node == data; } +/* Note this function returns a reference to the mux_chip dev. */ static struct mux_chip *of_find_mux_chip_by_node(struct device_node *np) { struct device *dev; @@ -466,6 +467,7 @@ struct mux_control *mux_control_get(struct device *dev, const char *mux_name) (!args.args_count && (mux_chip->controllers > 1))) { dev_err(dev, "%pOF: wrong #mux-control-cells for %pOF\n", np, args.np); + put_device(&mux_chip->dev); return ERR_PTR(-EINVAL); } @@ -476,10 +478,10 @@ struct mux_control *mux_control_get(struct device *dev, const char *mux_name) if (controller >= mux_chip->controllers) { dev_err(dev, "%pOF: bad mux controller %u specified in %pOF\n", np, controller, args.np); + put_device(&mux_chip->dev); return ERR_PTR(-EINVAL); } - get_device(&mux_chip->dev); return &mux_chip->mux[controller]; } EXPORT_SYMBOL_GPL(mux_control_get); From 443064cb0b1fb4569fe0a71209da7625129fb760 Mon Sep 17 00:00:00 2001 From: Viktor Slavkovic Date: Mon, 8 Jan 2018 10:43:03 -0800 Subject: [PATCH 129/236] staging: android: ashmem: fix a race condition in ASHMEM_SET_SIZE ioctl A lock-unlock is missing in ASHMEM_SET_SIZE ioctl which can result in a race condition when mmap is called. After the !asma->file check, before setting asma->size, asma->file can be set in mmap. That would result in having different asma->size than the mapped memory size. 
Combined with ASHMEM_UNPIN ioctl and shrinker invocation, this can result in memory corruption. Signed-off-by: Viktor Slavkovic Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/ashmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 0f695df14c9d..372ce9913e6d 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -765,10 +765,12 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case ASHMEM_SET_SIZE: ret = -EINVAL; + mutex_lock(&ashmem_mutex); if (!asma->file) { ret = 0; asma->size = (size_t)arg; } + mutex_unlock(&ashmem_mutex); break; case ASHMEM_GET_SIZE: ret = asma->size; From 98648ae6ef6bdcdcb88c46cad963906ab452e96d Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Tue, 9 Jan 2018 15:33:42 +0100 Subject: [PATCH 130/236] drm/vmwgfx: Don't cache framebuffer maps Buffer objects need to be either pinned or reserved while a map is active, that's not the case here, so avoid caching the framebuffer map. This will cause increasing mapping activity mainly when we don't do page flipping. This fixes occasional garbage filled screens when the framebuffer has been evicted after the map. Since in-kernel mapping of whole buffer objects is error-prone on 32-bit architectures and also quite inefficient, we will revisit this later. 
Signed-off-by: Thomas Hellstrom Reviewed-by: Sinclair Yeh Cc: --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 6 ---- drivers/gpu/drm/vmwgfx/vmwgfx_kms.h | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c | 41 ++++++++-------------------- 3 files changed, 13 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 0545740b3724..641294aef165 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -697,7 +697,6 @@ vmw_du_plane_duplicate_state(struct drm_plane *plane) vps->pinned = 0; /* Mapping is managed by prepare_fb/cleanup_fb */ - memset(&vps->guest_map, 0, sizeof(vps->guest_map)); memset(&vps->host_map, 0, sizeof(vps->host_map)); vps->cpp = 0; @@ -760,11 +759,6 @@ vmw_du_plane_destroy_state(struct drm_plane *plane, /* Should have been freed by cleanup_fb */ - if (vps->guest_map.virtual) { - DRM_ERROR("Guest mapping not freed\n"); - ttm_bo_kunmap(&vps->guest_map); - } - if (vps->host_map.virtual) { DRM_ERROR("Host mapping not freed\n"); ttm_bo_kunmap(&vps->host_map); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h index ff9c8389ff21..cd9da2dd79af 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.h @@ -175,7 +175,7 @@ struct vmw_plane_state { int pinned; /* For CPU Blit */ - struct ttm_bo_kmap_obj host_map, guest_map; + struct ttm_bo_kmap_obj host_map; unsigned int cpp; }; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c index 90b5437fd787..b68d74888ab1 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c @@ -114,7 +114,7 @@ struct vmw_screen_target_display_unit { bool defined; /* For CPU Blit */ - struct ttm_bo_kmap_obj host_map, guest_map; + struct ttm_bo_kmap_obj host_map; unsigned int cpp; }; @@ -695,7 +695,8 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) s32 src_pitch, dst_pitch; u8 *src, 
*dst; bool not_used; - + struct ttm_bo_kmap_obj guest_map; + int ret; if (!dirty->num_hits) return; @@ -706,6 +707,13 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) if (width == 0 || height == 0) return; + ret = ttm_bo_kmap(&ddirty->buf->base, 0, ddirty->buf->base.num_pages, + &guest_map); + if (ret) { + DRM_ERROR("Failed mapping framebuffer for blit: %d\n", + ret); + goto out_cleanup; + } /* Assume we are blitting from Host (display_srf) to Guest (dmabuf) */ src_pitch = stdu->display_srf->base_size.width * stdu->cpp; @@ -713,7 +721,7 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) src += ddirty->top * src_pitch + ddirty->left * stdu->cpp; dst_pitch = ddirty->pitch; - dst = ttm_kmap_obj_virtual(&stdu->guest_map, &not_used); + dst = ttm_kmap_obj_virtual(&guest_map, &not_used); dst += ddirty->fb_top * dst_pitch + ddirty->fb_left * stdu->cpp; @@ -772,6 +780,7 @@ static void vmw_stdu_dmabuf_cpu_commit(struct vmw_kms_dirty *dirty) vmw_fifo_commit(dev_priv, sizeof(*cmd)); } + ttm_bo_kunmap(&guest_map); out_cleanup: ddirty->left = ddirty->top = ddirty->fb_left = ddirty->fb_top = S32_MAX; ddirty->right = ddirty->bottom = S32_MIN; @@ -1109,9 +1118,6 @@ vmw_stdu_primary_plane_cleanup_fb(struct drm_plane *plane, { struct vmw_plane_state *vps = vmw_plane_state_to_vps(old_state); - if (vps->guest_map.virtual) - ttm_bo_kunmap(&vps->guest_map); - if (vps->host_map.virtual) ttm_bo_kunmap(&vps->host_map); @@ -1277,33 +1283,11 @@ vmw_stdu_primary_plane_prepare_fb(struct drm_plane *plane, */ if (vps->content_fb_type == SEPARATE_DMA && !(dev_priv->capabilities & SVGA_CAP_3D)) { - - struct vmw_framebuffer_dmabuf *new_vfbd; - - new_vfbd = vmw_framebuffer_to_vfbd(new_fb); - - ret = ttm_bo_reserve(&new_vfbd->buffer->base, false, false, - NULL); - if (ret) - goto out_srf_unpin; - - ret = ttm_bo_kmap(&new_vfbd->buffer->base, 0, - new_vfbd->buffer->base.num_pages, - &vps->guest_map); - - ttm_bo_unreserve(&new_vfbd->buffer->base); - - if (ret) { - 
DRM_ERROR("Failed to map content buffer to CPU\n"); - goto out_srf_unpin; - } - ret = ttm_bo_kmap(&vps->surf->res.backup->base, 0, vps->surf->res.backup->base.num_pages, &vps->host_map); if (ret) { DRM_ERROR("Failed to map display buffer to CPU\n"); - ttm_bo_kunmap(&vps->guest_map); goto out_srf_unpin; } @@ -1350,7 +1334,6 @@ vmw_stdu_primary_plane_atomic_update(struct drm_plane *plane, stdu->display_srf = vps->surf; stdu->content_fb_type = vps->content_fb_type; stdu->cpp = vps->cpp; - memcpy(&stdu->guest_map, &vps->guest_map, sizeof(vps->guest_map)); memcpy(&stdu->host_map, &vps->host_map, sizeof(vps->host_map)); if (!stdu->defined) From 191eccb1580939fb0d47deb405b82a85b0379070 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 9 Jan 2018 03:52:05 +1100 Subject: [PATCH 131/236] powerpc/pseries: Add H_GET_CPU_CHARACTERISTICS flags & wrapper A new hypervisor call has been defined to communicate various characteristics of the CPU to guests. Add definitions for the hcall number, flags and a wrapper function. 
Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hvcall.h | 17 +++++++++++++++++ arch/powerpc/include/asm/plpar_wrappers.h | 14 ++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a409177be8bd..f0461618bf7b 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -241,6 +241,7 @@ #define H_GET_HCA_INFO 0x1B8 #define H_GET_PERF_COUNT 0x1BC #define H_MANAGE_TRACE 0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4 #define H_QUERY_INT_STATE 0x1E4 #define H_POLL_PENDING 0x1D8 @@ -330,6 +331,17 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_CPU_CHAR_SPEC_BAR_ORI31 (1ull << 63) // IBM bit 0 +#define H_CPU_CHAR_BCCTRL_SERIALISED (1ull << 62) // IBM bit 1 +#define H_CPU_CHAR_L1D_FLUSH_ORI30 (1ull << 61) // IBM bit 2 +#define H_CPU_CHAR_L1D_FLUSH_TRIG2 (1ull << 60) // IBM bit 3 +#define H_CPU_CHAR_L1D_THREAD_PRIV (1ull << 59) // IBM bit 4 + +#define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 +#define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 +#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 + /* Flag values used in H_REGISTER_PROC_TBL hcall */ #define PROC_TABLE_OP_MASK 0x18 #define PROC_TABLE_DEREG 0x10 @@ -436,6 +448,11 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc) } } +struct h_cpu_char_result { + u64 character; + u64 behaviour; +}; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 7f01b22fa6cb..55eddf50d149 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -326,4 +326,18 @@ static inline long plapr_signal_sys_reset(long cpu) return 
plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu); } +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); + if (rc == H_SUCCESS) { + p->character = retbuf[0]; + p->behaviour = retbuf[1]; + } + + return rc; +} + #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ From 46eb14a6e1585d99c1b9f58d0e7389082a5f466b Mon Sep 17 00:00:00 2001 From: Pete Zaitcev Date: Mon, 8 Jan 2018 15:46:41 -0600 Subject: [PATCH 132/236] USB: fix usbmon BUG trigger Automated tests triggered this by opening usbmon and accessing the mmap while simultaneously resizing the buffers. This bug was with us since 2006, because typically applications only size the buffers once and thus avoid racing. Reported by Kirill A. Shutemov. Reported-by: Signed-off-by: Pete Zaitcev Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mon/mon_bin.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index f6ae753ab99b..f932f40302df 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1004,7 +1004,9 @@ static long mon_bin_ioctl(struct file *file, unsigned int cmd, unsigned long arg break; case MON_IOCQ_RING_SIZE: + mutex_lock(&rp->fetch_lock); ret = rp->b_size; + mutex_unlock(&rp->fetch_lock); break; case MON_IOCT_RING_SIZE: @@ -1231,12 +1233,16 @@ static int mon_bin_vma_fault(struct vm_fault *vmf) unsigned long offset, chunk_idx; struct page *pageptr; + mutex_lock(&rp->fetch_lock); offset = vmf->pgoff << PAGE_SHIFT; - if (offset >= rp->b_size) + if (offset >= rp->b_size) { + mutex_unlock(&rp->fetch_lock); return VM_FAULT_SIGBUS; + } chunk_idx = offset / CHUNK_SIZE; pageptr = rp->b_vec[chunk_idx].pg; get_page(pageptr); + mutex_unlock(&rp->fetch_lock); vmf->page = pageptr; return 0; } From 7ae2c3c280db183ca9ada2675c34ec2f7378abfa Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 3 Jan 
2018 12:51:51 -0500 Subject: [PATCH 133/236] USB: UDC core: fix double-free in usb_add_gadget_udc_release The error-handling pathways in usb_add_gadget_udc_release() are messed up. Aside from the uninformative statement labels, they can deallocate the udc structure after calling put_device(), which is a double-free. This was observed by KASAN in automatic testing. This patch cleans up the routine. It preserves the requirement that when any failure occurs, we call put_device(&gadget->dev). Signed-off-by: Alan Stern Reported-by: Fengguang Wu CC: Reviewed-by: Peter Chen Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 93eff7dec2f5..1b3efb14aec7 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1147,11 +1147,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, udc = kzalloc(sizeof(*udc), GFP_KERNEL); if (!udc) - goto err1; - - ret = device_add(&gadget->dev); - if (ret) - goto err2; + goto err_put_gadget; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; @@ -1160,7 +1156,11 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, udc->dev.parent = parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&parent->kobj)); if (ret) - goto err3; + goto err_put_udc; + + ret = device_add(&gadget->dev); + if (ret) + goto err_put_udc; udc->gadget = gadget; gadget->udc = udc; @@ -1170,7 +1170,7 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, ret = device_add(&udc->dev); if (ret) - goto err4; + goto err_unlist_udc; usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); udc->vbus = true; @@ -1178,27 +1178,25 @@ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, /* pick up one of pending gadget drivers */ ret 
= check_pending_gadget_drivers(udc); if (ret) - goto err5; + goto err_del_udc; mutex_unlock(&udc_lock); return 0; -err5: + err_del_udc: device_del(&udc->dev); -err4: + err_unlist_udc: list_del(&udc->list); mutex_unlock(&udc_lock); -err3: - put_device(&udc->dev); device_del(&gadget->dev); -err2: - kfree(udc); + err_put_udc: + put_device(&udc->dev); -err1: + err_put_gadget: put_device(&gadget->dev); return ret; } From 9ecccfaa7cb5249bd31bdceb93fcf5bedb8a24d8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 9 Jan 2018 15:02:51 +0000 Subject: [PATCH 134/236] sysfs/cpu: Fix typos in vulnerability documentation Fixes: 87590ce6e ("sysfs/cpu: Add vulnerability folder") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner --- Documentation/ABI/testing/sysfs-devices-system-cpu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index bd3a88e16d8b..258902db14bf 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -378,7 +378,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/meltdown /sys/devices/system/cpu/vulnerabilities/spectre_v1 /sys/devices/system/cpu/vulnerabilities/spectre_v2 -Date: Januar 2018 +Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities @@ -388,4 +388,4 @@ Description: Information about CPU vulnerabilities "Not affected" CPU is not affected by the vulnerability "Vulnerable" CPU is affected and no mitigation in effect - "Mitigation: $M" CPU is affetcted and mitigation $M is in effect + "Mitigation: $M" CPU is affected and mitigation $M is in effect From 50e51c13b3822d14ff6df4279423e4b7b2269bc3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 135/236] powerpc/64: Add macros for annotating the destination of 
rfid/hrfid The rfid/hrfid ((Hypervisor) Return From Interrupt) instruction is used for switching from the kernel to userspace, and from the hypervisor to the guest kernel. However it can and is also used for other transitions, eg. from real mode kernel code to virtual mode kernel code, and it's not always clear from the code what the destination context is. To make it clearer when reading the code, add macros which encode the expected destination context. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64e.h | 6 +++++ arch/powerpc/include/asm/exception-64s.h | 29 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a703452d67b6..555e22d5e07f 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ exc_##label##_book3e: ori r3,r3,vector_offset@l; \ mtspr SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL \ + rfi + +#define RFI_TO_USER \ + rfi + #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index b27205297e1d..1af427a3c74f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,6 +74,35 @@ */ #define EX_R3 EX_DAR +/* Macros for annotating the expected destination of (h)rfid */ + +#define RFI_TO_KERNEL \ + rfid + +#define RFI_TO_USER \ + rfid + +#define RFI_TO_USER_OR_KERNEL \ + rfid + +#define RFI_TO_GUEST \ + rfid + +#define HRFI_TO_KERNEL \ + hrfid + +#define HRFI_TO_USER \ + hrfid + +#define HRFI_TO_USER_OR_KERNEL \ + hrfid + +#define HRFI_TO_GUEST \ + hrfid + +#define HRFI_TO_UNKNOWN \ + hrfid + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ From 222f20f140623ef6033491d0103ee0875fe87d35 Mon Sep 17 00:00:00 2001 From: 
Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 136/236] powerpc/64s: Simple RFI macro conversions This commit does simple conversions of rfi/rfid to the new macros that include the expected destination context. By simple we mean cases where there is a single well known destination context, and it's simply a matter of substituting the instruction for the appropriate macro. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 4 ++-- arch/powerpc/kernel/entry_64.S | 14 +++++++++----- arch/powerpc/kernel/exceptions-64s.S | 24 ++++++++++++------------ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 ++++----- arch/powerpc/kvm/book3s_rmhandlers.S | 7 +++++-- arch/powerpc/kvm/book3s_segment.S | 4 ++-- 6 files changed, 34 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 1af427a3c74f..dfc56daed98b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -247,7 +247,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ __EXCEPTION_PROLOG_PSERIES_1(label, h) @@ -261,7 +261,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3320bcac7192..e68faa4d1b13 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,11 @@ #include #include #include +#ifdef CONFIG_PPC_BOOK3S +#include +#else +#include +#endif /* * System calls. 
@@ -397,8 +402,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mtmsrd r10, 1 mtspr SPRN_SRR0, r11 mtspr SPRN_SRR1, r12 - - rfid + RFI_TO_USER b . /* prevent speculative execution */ #endif _ASM_NOKPROBE_SYMBOL(system_call_common); @@ -1073,7 +1077,7 @@ __enter_rtas: mtspr SPRN_SRR0,r5 mtspr SPRN_SRR1,r6 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ rtas_return_loc: @@ -1098,7 +1102,7 @@ rtas_return_loc: mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ _ASM_NOKPROBE_SYMBOL(__enter_rtas) _ASM_NOKPROBE_SYMBOL(rtas_return_loc) @@ -1171,7 +1175,7 @@ _GLOBAL(enter_prom) LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) andc r11,r11,r12 mtsrr1 r11 - rfid + RFI_TO_KERNEL #endif /* CONFIG_PPC_BOOK3E */ 1: /* Return from OF */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e441b469dc8f..5502b0147c4e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -256,7 +256,7 @@ BEGIN_FTR_SECTION LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r11 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ 2: /* Stack overflow. Stay on emergency stack and panic. @@ -445,7 +445,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) li r3,MSR_ME andc r10,r10,r3 /* Turn off MSR_ME */ mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 2: /* @@ -463,7 +463,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP - rfid + RFI_TO_USER_OR_KERNEL 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP @@ -651,7 +651,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 
8: std r3,PACA_EXSLB+EX_DAR(r13) @@ -662,7 +662,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . EXC_COMMON_BEGIN(unrecov_slb) @@ -901,7 +901,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) mtspr SPRN_SRR0,r10 ; \ ld r10,PACAKMSR(r13) ; \ mtspr SPRN_SRR1,r10 ; \ - rfid ; \ + RFI_TO_KERNEL ; \ b . ; /* prevent speculative execution */ #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH @@ -917,7 +917,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ mr r13,r9 ; \ - rfid ; /* return to userspace */ \ + RFI_TO_USER ; /* return to userspace */ \ b . ; /* prevent speculative execution */ #else #define SYSCALL_FASTENDIAN_TEST @@ -1063,7 +1063,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) mtcr r11 REST_GPR(11, r1) ld r1,GPR1(r1) - hrfid + HRFI_TO_USER_OR_KERNEL 1: mtcr r11 REST_GPR(11, r1) @@ -1314,7 +1314,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) ld r13,PACA_EXGEN+EX_R13(r13) - HRFID + HRFI_TO_UNKNOWN b . #endif @@ -1418,7 +1418,7 @@ masked_##_H##interrupt: \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ /* returns to kernel where r13 must be set up, so don't restore it */ \ - ##_H##rfid; \ + ##_H##RFI_TO_KERNEL; \ b .; \ MASKED_DEC_HANDLER(_H) @@ -1441,7 +1441,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_interrupt) addi r13, r13, 4 mtspr SPRN_SRR0, r13 GET_SCRATCH0(r13) - rfid + RFI_TO_KERNEL b . TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) @@ -1453,7 +1453,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) addi r13, r13, 4 mtspr SPRN_HSRR0, r13 GET_SCRATCH0(r13) - hrfid + HRFI_TO_KERNEL b . 
#endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..9c61f736c75b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -79,7 +79,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) mtmsrd r0,1 /* clear RI in MSR */ mtsrr0 r5 mtsrr1 r6 - RFI + RFI_TO_KERNEL kvmppc_call_hv_entry: BEGIN_FTR_SECTION @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - RFI + RFI_TO_KERNEL /* Virtual-mode return */ .Lvirt_return: @@ -1167,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) - - hrfid + HRFI_TO_GUEST b . secondary_too_late: @@ -3320,7 +3319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r4, PACAKMSR(r13) mtspr SPRN_SRR0, r3 mtspr SPRN_SRR1, r4 - rfid + RFI_TO_KERNEL 9: addi r3, r1, STACK_FRAME_OVERHEAD bl kvmppc_bad_interrupt b 9b diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 42a4b237df5f..34a5adeff084 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@ #define FUNC(name) name +#define RFI_TO_KERNEL RFI +#define RFI_TO_GUEST RFI + .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ kvmppc_handler_skip_ins: GET_SCRATCH0(r13) /* And get back into the code */ - RFI + RFI_TO_KERNEL #endif /* @@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline) ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 - RFI + RFI_TO_KERNEL #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 2a2b96d53999..93a180ceefad 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ no_dcbz32_on: PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) - RFI + RFI_TO_GUEST kvmppc_handler_trampoline_enter_end: @@ -407,5 +407,5 @@ 
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) cmpwi r12, BOOK3S_INTERRUPT_DOORBELL beqa BOOK3S_INTERRUPT_DOORBELL - RFI + RFI_TO_KERNEL kvmppc_handler_trampoline_exit_end: From b8e90cb7bc04a509e821e82ab6ed7a8ef11ba333 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 137/236] powerpc/64: Convert the syscall exit path to use RFI_TO_USER/KERNEL In the syscall exit path we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e68faa4d1b13..724733b74744 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -267,13 +267,23 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + ld r2,GPR2(r1) + ld r1,GPR1(r1) + mtlr r4 + mtcr r5 + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r8 + RFI_TO_USER + b . /* prevent speculative execution */ + + /* exit to kernel */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ .Lsyscall_error: From a08f828cf47e6c605af21d2cdec68f84e799c318 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 138/236] powerpc/64: Convert fast_exception_return to use RFI_TO_USER/KERNEL Similar to the syscall return path, in fast_exception_return we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. 
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 724733b74744..2748584b767d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -892,7 +892,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) -1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -905,8 +905,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r3,GPR3(r1) ld r4,GPR4(r1) ld r1,GPR1(r1) + RFI_TO_USER + b . /* prevent speculative execution */ - rfid +1: mtspr SPRN_SRR1,r3 + + ld r2,_CCR(r1) + mtcrf 0xFF,r2 + ld r2,_NIP(r1) + mtspr SPRN_SRR0,r2 + + ld r0,GPR0(r1) + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r4,GPR4(r1) + ld r1,GPR1(r1) + RFI_TO_KERNEL b . /* prevent speculative execution */ #endif /* CONFIG_PPC_BOOK3E */ From c7305645eb0c1621351cfc104038831ae87c0053 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 139/236] powerpc/64s: Convert slb_miss_common to use RFI_TO_USER/KERNEL In the SLB miss handler we may be returning to user or kernel. We need to add a check early on and save the result in the cr4 register, and then we bifurcate the return path based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 29 +++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5502b0147c4e..ed356194f09c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -598,6 +598,9 @@ EXC_COMMON_BEGIN(slb_miss_common) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + andi. 
r9,r11,MSR_PR // Check for exception from userspace + cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later + /* * Test MSR_RI before calling slb_allocate_realmode, because the * MSR in r11 gets clobbered. However we still want to allocate @@ -624,9 +627,12 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) /* All done -- return from exception. */ + bne cr4,1f /* returning to kernel */ + .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ @@ -640,8 +646,29 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) ld r11,PACA_EXSLB+EX_R11(r13) ld r12,PACA_EXSLB+EX_R12(r13) ld r13,PACA_EXSLB+EX_R13(r13) - rfid + RFI_TO_USER b . /* prevent speculative execution */ +1: +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ + mtcrf 0x02,r9 /* I/D indication is in cr6 */ + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_CTR(r9, PACA_EXSLB) + RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_KERNEL + b . 
/* prevent speculative execution */ + 2: std r3,PACA_EXSLB+EX_DAR(r13) mr r3,r12 From 928afc85270753657b5543e052cc270c279a3fe9 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sat, 6 Jan 2018 00:56:44 +0800 Subject: [PATCH 140/236] uas: ignore UAS for Norelsys NS1068(X) chips The UAS mode of Norelsys NS1068(X) is reported to fail to work on several platforms with the following error message: xhci-hcd xhci-hcd.0.auto: ERROR Transfer event for unknown stream ring slot 1 ep 8 xhci-hcd xhci-hcd.0.auto: @00000000bf04a400 00000000 00000000 1b000000 01098001 And when trying to mount a partition on the disk the disk will disconnect from the USB controller, then after re-connecting the device will be offlined and not working at all. Falling back to USB mass storage can solve this problem, so ignore UAS function of this chip. Cc: stable@vger.kernel.org Signed-off-by: Icenowy Zheng Acked-by: Hans de Goede Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index e6127fb21c12..a7d08ae0adad 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -143,6 +143,13 @@ UNUSUAL_DEV(0x2109, 0x0711, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NO_ATA_1X), +/* Reported-by: Icenowy Zheng */ +UNUSUAL_DEV(0x2537, 0x1068, 0x0000, 0x9999, + "Norelsys", + "NS1068X", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_IGNORE_UAS), + /* Reported-by: Takeo Nakayama */ UNUSUAL_DEV(0x357d, 0x7788, 0x0000, 0x9999, "JMicron", From b8fd0823e0770c2d5fdbd865bccf0d5e058e5287 Mon Sep 17 00:00:00 2001 From: Andrii Vladyka Date: Thu, 4 Jan 2018 13:09:17 +0200 Subject: [PATCH 141/236] net: core: fix module type in sock_diag_bind Use AF_INET6 instead of AF_INET in IPv6-related code path Signed-off-by: Andrii Vladyka Signed-off-by: David S. 
Miller --- net/core/sock_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 217f4e3b82f6..146b50e30659 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -288,7 +288,7 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + NETLINK_SOCK_DIAG, AF_INET6); break; } return 0; From edd8ca8015800b354453b891d38960f3a474b7e4 Mon Sep 17 00:00:00 2001 From: Florian Margaine Date: Wed, 13 Dec 2017 16:43:59 +0100 Subject: [PATCH 142/236] rbd: reacquire lock should update lock owner client id Otherwise, future operations on this RBD using exclusive-lock are going to require the lock from a non-existent client id. Cc: stable@vger.kernel.org Fixes: 14bb211d324d ("rbd: support updating the lock cookie without releasing the lock") Link: http://tracker.ceph.com/issues/19929 Signed-off-by: Florian Margaine [idryomov@gmail.com: rbd_set_owner_cid() call, __rbd_lock() helper] Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 38fc5f397fde..aacae6f7163e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3047,13 +3047,21 @@ static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) mutex_unlock(&rbd_dev->watch_mutex); } +static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) +{ + struct rbd_client_id cid = rbd_get_cid(rbd_dev); + + strcpy(rbd_dev->lock_cookie, cookie); + rbd_set_owner_cid(rbd_dev, &cid); + queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); +} + /* * lock_rwsem must be held for write */ static int rbd_lock(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct rbd_client_id cid = rbd_get_cid(rbd_dev); char 
cookie[32]; int ret; @@ -3068,9 +3076,7 @@ static int rbd_lock(struct rbd_device *rbd_dev) return ret; rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; - strcpy(rbd_dev->lock_cookie, cookie); - rbd_set_owner_cid(rbd_dev, &cid); - queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); + __rbd_lock(rbd_dev, cookie); return 0; } @@ -3856,7 +3862,7 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); } else { - strcpy(rbd_dev->lock_cookie, cookie); + __rbd_lock(rbd_dev, cookie); } } From 21acdf45f4958135940f0b4767185cf911d4b010 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Dec 2017 15:35:11 +0100 Subject: [PATCH 143/236] rbd: set max_segments to USHRT_MAX Commit d3834fefcfe5 ("rbd: bump queue_max_segments") bumped max_segments (unsigned short) to max_hw_sectors (unsigned int). max_hw_sectors is set to the number of 512-byte sectors in an object and overflows unsigned short for 32M (largest possible) objects, making the block layer resort to handing us single segment (i.e. single page or even smaller) bios in that case. 
Cc: stable@vger.kernel.org Fixes: d3834fefcfe5 ("rbd: bump queue_max_segments") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aacae6f7163e..cc93522a6d41 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4387,7 +4387,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) segment_size = rbd_obj_bytes(&rbd_dev->header); blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); q->limits.max_sectors = queue_max_hw_sectors(q); - blk_queue_max_segments(q, segment_size / SECTOR_SIZE); + blk_queue_max_segments(q, USHRT_MAX); blk_queue_max_segment_size(q, segment_size); blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); From 3dc2fa47549aca71773afdd12a78d31802bb22b4 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Mon, 8 Jan 2018 19:43:00 +0800 Subject: [PATCH 144/236] net: caif: use strlcpy() instead of strncpy() gcc-8 reports net/caif/caif_dev.c: In function 'caif_enroll_dev': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] net/caif/cfctrl.c: In function 'cfctrl_linkup_request': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] net/caif/cfcnfg.c: In function 'caif_connect_client': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] The compiler requires that the input param 'len' of strncpy() should be greater than the length of the src string, so that '\0' is copied as well. We can just use strlcpy() to avoid this warning. Signed-off-by: Xiongfeng Wang Signed-off-by: David S.
Miller --- net/caif/caif_dev.c | 5 ++--- net/caif/cfcnfg.c | 10 ++++------ net/caif/cfctrl.c | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 2d38b6e34203..e0adcd123f48 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -334,9 +334,8 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); - strncpy(caifd->layer.name, dev->name, - sizeof(caifd->layer.name) - 1); - caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; + strlcpy(caifd->layer.name, dev->name, + sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; cfcnfg_add_phy_layer(cfg, dev, diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 273cb07f57d8..8f00bea093b9 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -268,17 +268,15 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; - strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, - sizeof(l->u.rfm.volume)-1); - l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0; + strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, + sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; - strncpy(l->u.utility.name, s->sockaddr.u.util.service, - sizeof(l->u.utility.name)-1); - l->u.utility.name[sizeof(l->u.utility.name)-1] = 0; + strlcpy(l->u.utility.name, s->sockaddr.u.util.service, + sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index f5afda1abc76..655ed7032150 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -258,8 +258,8 @@ int cfctrl_linkup_request(struct cflayer *layer, tmp16 = 
cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); memset(utility_name, 0, sizeof(utility_name)); - strncpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH - 1); + strlcpy(utility_name, param->u.utility.name, + UTILITY_NAME_LENGTH); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); From 20b50d79974ea3192e8c3ab7faf4e536e5f14d8f Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 8 Jan 2018 15:54:44 +0100 Subject: [PATCH 145/236] net: ipv4: emulate READ_ONCE() on ->hdrincl bit-field in raw_sendmsg() Commit 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") fixed the issue of possibly inconsistent ->hdrincl handling due to concurrent updates by reading this bit-field member into a local variable and using the thus stabilized value in subsequent tests. However, aforementioned commit also adds the (correct) comment that /* hdrincl should be READ_ONCE(inet->hdrincl) * but READ_ONCE() doesn't work with bit fields */ because as it stands, the compiler is free to shortcut or even eliminate the local variable at its will. Note that I have not seen anything like this happening in reality and thus, the concern is a theoretical one. However, in order to be on the safe side, emulate a READ_ONCE() on the bit-field by doing it on the local 'hdrincl' variable itself: int hdrincl = inet->hdrincl; hdrincl = READ_ONCE(hdrincl); This breaks the chain in the sense that the compiler is not allowed to replace subsequent reads from hdrincl with reloads from inet->hdrincl. Fixes: 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") Signed-off-by: Nicolai Stange Reviewed-by: Stefano Brivio Signed-off-by: David S. 
Miller --- net/ipv4/raw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 125c1eab3eaa..5e570aa9e43b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -520,9 +520,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. */ hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); /* * Check the flags. */ From 2fdd18118dad86bf5e7880d8d02ea27be23e3671 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jan 2018 08:50:17 +0200 Subject: [PATCH 146/236] docs-rst: networking: wire up msg_zerocopy Fix the following 'make htmldocs' complaint: Documentation/networking/msg_zerocopy.rst:: WARNING: document isn't included in any toctree. Signed-off-by: Mike Rapoport Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 66e620866245..7d4b15977d61 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -9,6 +9,7 @@ Contents: batman-adv kapi z8530book + msg_zerocopy .. only:: subproject @@ -16,4 +17,3 @@ Contents: ======= * :ref:`genindex` - From 195e2addbce09e5afbc766efc1e6567c9ce840d3 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:26 +0300 Subject: [PATCH 147/236] SolutionEngine771x: fix Ether platform data The 'sh_eth' driver's probe() method would fail on the SolutionEngine7710 board and crash on SolutionEngine7712 board as the platform code is hopelessly behind the driver's platform data -- it passes the PHY address instead of 'struct sh_eth_plat_data *'; pass the latter to the driver in order to fix the bug... 
Fixes: 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/sh/boards/mach-se/770x/setup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index 77c35350ee77..b7fa7a87e946 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include @@ -115,6 +116,11 @@ static struct platform_device heartbeat_device = { #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\ defined(CONFIG_CPU_SUBTYPE_SH7712) /* SH771X Ethernet driver */ +static struct sh_eth_plat_data sh_eth_plat = { + .phy = PHY_ID, + .phy_interface = PHY_INTERFACE_MODE_MII, +}; + static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, @@ -132,7 +138,7 @@ static struct platform_device sh_eth0_device = { .name = "sh771x-ether", .id = 0, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth0_resources), .resource = sh_eth0_resources, @@ -155,7 +161,7 @@ static struct platform_device sh_eth1_device = { .name = "sh771x-ether", .id = 1, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth1_resources), .resource = sh_eth1_resources, From f9a531d6731d74f1e24298d9641c2dc1fef2631b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:27 +0300 Subject: [PATCH 148/236] SolutionEngine771x: add Ether TSU resource After the Ether platform data is fixed, the driver probe() method would still fail since the 'struct sh_eth_cpu_data' corresponding to SH771x indicates the presence of TSU but the memory resource for it is absent. Add the missing TSU resource to both Ether devices and fix the harmless off-by-one error in the main memory resources, while at it... 
Fixes: 4986b996882d ("net: sh_eth: remove the SH_TSU_ADDR") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/sh/boards/mach-se/770x/setup.c | 14 ++++++++++++-- arch/sh/include/mach-se/mach/se.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index b7fa7a87e946..412326d59e6f 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -124,10 +124,15 @@ static struct sh_eth_plat_data sh_eth_plat = { static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, - .end = SH_ETH0_BASE + 0x1B8, + .end = SH_ETH0_BASE + 0x1B8 - 1, .flags = IORESOURCE_MEM, }, [1] = { + .start = SH_TSU_BASE, + .end = SH_TSU_BASE + 0x200 - 1, + .flags = IORESOURCE_MEM, + }, + [2] = { .start = SH_ETH0_IRQ, .end = SH_ETH0_IRQ, .flags = IORESOURCE_IRQ, @@ -147,10 +152,15 @@ static struct platform_device sh_eth0_device = { static struct resource sh_eth1_resources[] = { [0] = { .start = SH_ETH1_BASE, - .end = SH_ETH1_BASE + 0x1B8, + .end = SH_ETH1_BASE + 0x1B8 - 1, .flags = IORESOURCE_MEM, }, [1] = { + .start = SH_TSU_BASE, + .end = SH_TSU_BASE + 0x200 - 1, + .flags = IORESOURCE_MEM, + }, + [2] = { .start = SH_ETH1_IRQ, .end = SH_ETH1_IRQ, .flags = IORESOURCE_IRQ, diff --git a/arch/sh/include/mach-se/mach/se.h b/arch/sh/include/mach-se/mach/se.h index 4246ef9b07a3..aa83fe1ff0b1 100644 --- a/arch/sh/include/mach-se/mach/se.h +++ b/arch/sh/include/mach-se/mach/se.h @@ -100,6 +100,7 @@ /* Base address */ #define SH_ETH0_BASE 0xA7000000 #define SH_ETH1_BASE 0xA7000400 +#define SH_TSU_BASE 0xA7000800 /* PHY ID */ #if defined(CONFIG_CPU_SUBTYPE_SH7710) # define PHY_ID 0x00 From 4512c43eac7e007d982e7ea45152ea6f3f4d1921 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Jan 2018 10:34:00 -0800 Subject: [PATCH 149/236] ipv6: remove null_entry before adding default route In the current code, when creating a new fib6 table, tb6_root.leaf gets 
initialized to net->ipv6.ip6_null_entry. If a default route is being added with rt->rt6i_metric = 0xffffffff, fib6_add() will add this route after net->ipv6.ip6_null_entry. As null_entry is shared, it could cause problem. In order to fix it, set fn->leaf to NULL before calling fib6_add_rt2node() when trying to add the first default route. And reset fn->leaf to null_entry when adding fails or when deleting the last default route. syzkaller reported the following issue which is fixed by this commit: WARNING: suspicious RCU usage 4.15.0-rc5+ #171 Not tainted ----------------------------- net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 4 locks held by swapper/0/0: #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline] #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310 #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline] #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007 #2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560 #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline] #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585 fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701 fib6_clean_node+0x3aa/0x4f0 
net/ipv6/ip6_fib.c:1892 fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815 fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863 fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933 __fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949 fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline] fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016 fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033 call_timer_fn+0x228/0x820 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7ee/0xb70 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d11a5578e4f8..9dcc3924a975 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -640,6 +640,11 @@ static struct fib6_node *fib6_add_1(struct net *net, if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); rt6_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.ip6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } return fn; @@ -1270,13 +1275,17 @@ out: return err; failure: - /* fn->leaf could be NULL if fn is an intermediate node and we - * failed to add the new route to it in both subtree creation - * failure and fib6_add_rt2node() failure case. 
- * In both cases, fib6_repair_tree() should be called to fix - * fn->leaf. + /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function @@ -1531,6 +1540,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + return fn; + } + for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); @@ -1685,10 +1700,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. 
+ */ if (!rcu_access_pointer(fn->leaf)) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } fn = fib6_repair_tree(net, table, fn); } From be95a845cc4402272994ce290e3ad928aff06cb9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Jan 2018 13:17:44 +0100 Subject: [PATCH 150/236] bpf: avoid false sharing of map refcount with max_entries In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds speculation") also change the layout of struct bpf_map such that false sharing of fast-path members like max_entries is avoided when the maps reference counter is altered. Therefore enforce them to be placed into separate cachelines. pahole dump after change: struct bpf_map { const struct bpf_map_ops * ops; /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ u32 pages; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ bool unpriv_array; /* 56 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct user_struct * user; /* 64 8 */ atomic_t refcnt; /* 72 4 */ atomic_t usercnt; /* 76 4 */ struct work_struct work; /* 80 32 */ char name[16]; /* 112 16 */ /* --- cacheline 2 boundary (128 bytes) --- */ /* size: 128, cachelines: 2, members: 17 */ /* sum members: 121, holes: 1, sum holes: 7 */ }; Now all entries in the first cacheline are read only throughout the life time of the map, set up once during map creation. Overall struct size and number of cachelines doesn't change from the reordering. struct bpf_map is usually first member and embedded in map structs in specific map implementations, so also avoid those members to sit at the end where it could potentially share the cacheline with first map values e.g. 
in the array since remote CPUs could trigger map updates just as well for those (easily dirtying members like max_entries intentionally as well) while having subsequent values in cache. Quoting from Google's Project Zero blog [1]: Additionally, at least on the Intel machine on which this was tested, bouncing modified cache lines between cores is slow, apparently because the MESI protocol is used for cache coherence [8]. Changing the reference counter of an eBPF array on one physical CPU core causes the cache line containing the reference counter to be bounced over to that CPU core, making reads of the reference counter on all other CPU cores slow until the changed reference counter has been written back to memory. Because the length and the reference counter of an eBPF array are stored in the same cache line, this also means that changing the reference counter on one physical CPU core causes reads of the eBPF array's length to be slow on other physical CPU cores (intentional false sharing). While this doesn't 'control' the out-of-bounds speculation through masking the index as in commit b2157399cc98, triggering a manipulation of the map's reference counter is really trivial, so lets not allow to easily affect max_entries from it. Splitting to separate cachelines also generally makes sense from a performance perspective anyway in that fast-path won't have a cache miss if the map gets pinned, reused in other progs, etc out of control path, thus also avoids unintentional false sharing. 
[1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b985ca4ffbe..fe2cb7c398e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -43,7 +43,14 @@ struct bpf_map_ops { }; struct bpf_map { - atomic_t refcnt; + /* 1st cacheline with read-mostly members of which some + * are also accessed in fast-path (e.g. ops, max_entries). + */ + const struct bpf_map_ops *ops ____cacheline_aligned; + struct bpf_map *inner_map_meta; +#ifdef CONFIG_SECURITY + void *security; +#endif enum bpf_map_type map_type; u32 key_size; u32 value_size; @@ -53,15 +60,16 @@ struct bpf_map { u32 id; int numa_node; bool unpriv_array; - struct user_struct *user; - const struct bpf_map_ops *ops; - struct work_struct work; + /* 7 bytes hole */ + + /* 2nd cacheline with misc members to avoid false sharing + * particularly with refcounting. + */ + struct user_struct *user ____cacheline_aligned; + atomic_t refcnt; atomic_t usercnt; - struct bpf_map *inner_map_meta; + struct work_struct work; char name[BPF_OBJ_NAME_LEN]; -#ifdef CONFIG_SECURITY - void *security; -#endif }; /* function argument constraints */ From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Jan 2018 10:04:29 -0800 Subject: [PATCH 151/236] bpf: introduce BPF_JIT_ALWAYS_ON config The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from Google Project Zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load.
But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- init/Kconfig | 7 +++++++ kernel/bpf/core.c | 19 +++++++++++++++++++ lib/test_bpf.c | 11 +++++++---- net/core/filter.c | 6 ++---- net/core/sysctl_net_core.c | 6 ++++++ net/socket.c | 9 +++++++++ 6 files changed, 50 insertions(+), 8 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 2934249fba46..5e2a4a391ba9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1392,6 +1392,13 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. 
+config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..51ec2dda7f08 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. 
However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9e9748089270..f369889e521d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6250,9 +6250,8 @@ static struct bpf_prog *generate_filter(int which, int *err) return NULL; } } - /* We don't expect to fail. */ if (*err) { - pr_cont("FAIL to attach err=%d len=%d\n", + pr_cont("FAIL to prog_create err=%d len=%d\n", *err, fprog.len); return NULL; } @@ -6276,6 +6275,10 @@ static struct bpf_prog *generate_filter(int which, int *err) * checks. */ fp = bpf_prog_select_runtime(fp, err); + if (*err) { + pr_cont("FAIL to select_runtime err=%d\n", *err); + return NULL; + } break; } @@ -6461,8 +6464,8 @@ static __init int test_bpf(void) pass_cnt++; continue; } - - return err; + err_cnt++; + continue; } pr_cont("jited:%u ", fp->jited); diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..d339ef170df6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1054,11 +1054,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - /* We are guaranteed to never error here with cBPF to eBPF - * transitions, since there's no issue with type compatibility - * checks on program arrays. 
- */ fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; kfree(old_prog); return fp; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cbc3dde4cfcc..a47ad6cd41c0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -325,7 +325,13 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, +#ifndef CONFIG_BPF_JIT_ALWAYS_ON .proc_handler = proc_dointvec +#else + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, +#endif }, # ifdef CONFIG_HAVE_EBPF_JIT { diff --git a/net/socket.c b/net/socket.c index 05f361faec45..78acd6ce74c7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2619,6 +2619,15 @@ out_fs: core_initcall(sock_init); /* early initcall */ +static int __init jit_init(void) +{ +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + bpf_jit_enable = 1; +#endif + return 0; +} +pure_initcall(jit_init); + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { From 67dcf8a3e06582cb6b02952335b5612beb97889f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Jan 2018 18:09:33 +0200 Subject: [PATCH 152/236] ACPI: utils: Introduce acpi_dev_get_first_match_name() Sometimes the user wants to have device name of the match rather than just checking if device present or not. To make life easier for such users introduce acpi_dev_get_first_match_name() helper based on code for acpi_dev_present(). For example, GPIO driver for Intel Merrifield needs to know the device name of pin control to be able to apply GPIO mapping table to the proper device. To be more consistent with the purpose rename struct acpi_dev_present_info -> struct acpi_dev_match_info acpi_dev_present_cb() -> acpi_dev_match_cb() in the utils.c file. Tested-by: Pierre-Louis Bossart Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/utils.c | 41 ++++++++++++++++++++++++++++++++++------- include/acpi/acpi_bus.h | 3 +++ include/linux/acpi.h | 6 ++++++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 9d49a1acebe3..78db97687f26 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -737,16 +737,17 @@ bool acpi_dev_found(const char *hid) } EXPORT_SYMBOL(acpi_dev_found); -struct acpi_dev_present_info { +struct acpi_dev_match_info { + const char *dev_name; struct acpi_device_id hid[2]; const char *uid; s64 hrv; }; -static int acpi_dev_present_cb(struct device *dev, void *data) +static int acpi_dev_match_cb(struct device *dev, void *data) { struct acpi_device *adev = to_acpi_device(dev); - struct acpi_dev_present_info *match = data; + struct acpi_dev_match_info *match = data; unsigned long long hrv; acpi_status status; @@ -757,6 +758,8 @@ static int acpi_dev_present_cb(struct device *dev, void *data) strcmp(adev->pnp.unique_id, match->uid))) return 0; + match->dev_name = acpi_dev_name(adev); + if (match->hrv == -1) return 1; @@ -789,20 +792,44 @@ static int acpi_dev_present_cb(struct device *dev, void *data) */ bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) { - struct acpi_dev_present_info match = {}; + struct acpi_dev_match_info match = {}; struct device *dev; strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id)); match.uid = uid; match.hrv = hrv; - dev = bus_find_device(&acpi_bus_type, NULL, &match, - acpi_dev_present_cb); - + dev = bus_find_device(&acpi_bus_type, NULL, &match, acpi_dev_match_cb); return !!dev; } EXPORT_SYMBOL(acpi_dev_present); +/** + * acpi_dev_get_first_match_name - Return name of first match of ACPI device + * @hid: Hardware ID of the device. 
+ * @uid: Unique ID of the device, pass NULL to not check _UID + * @hrv: Hardware Revision of the device, pass -1 to not check _HRV + * + * Return device name if a matching device was present + * at the moment of invocation, or NULL otherwise. + * + * See additional information in acpi_dev_present() as well. + */ +const char * +acpi_dev_get_first_match_name(const char *hid, const char *uid, s64 hrv) +{ + struct acpi_dev_match_info match = {}; + struct device *dev; + + strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id)); + match.uid = uid; + match.hrv = hrv; + + dev = bus_find_device(&acpi_bus_type, NULL, &match, acpi_dev_match_cb); + return dev ? match.dev_name : NULL; +} +EXPORT_SYMBOL(acpi_dev_get_first_match_name); + /* * acpi_backlight= handling, this is done here rather then in video_detect.c * because __setup cannot be used in modules. diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 79287629c888..c9608b0b80c6 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -91,6 +91,9 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); +const char * +acpi_dev_get_first_match_name(const char *hid, const char *uid, s64 hrv); + #ifdef CONFIG_ACPI #include diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dc1ebfeeb5ec..d918f1ea84e6 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -640,6 +640,12 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) return false; } +static inline const char * +acpi_dev_get_first_match_name(const char *hid, const char *uid, s64 hrv) +{ + return NULL; +} + static inline bool is_acpi_node(struct fwnode_handle *fwnode) { return false; From dd1dbf94d2826a045fbbe2649d84b27d48620d56 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 5 Jan 2018 18:09:34 +0200 Subject: [PATCH 153/236] gpio: merrifield: Add support of ACPI 
enabled platforms The driver needs the pin control device name for ACPI. We are looking through ACPI namespace and return first found device based on ACPI HID for Intel Merrifield FLIS (pin control device). Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg Acked-by: Linus Walleij Signed-off-by: Rafael J. Wysocki --- drivers/gpio/gpio-merrifield.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-merrifield.c b/drivers/gpio/gpio-merrifield.c index dd67a31ac337..c38624ea0251 100644 --- a/drivers/gpio/gpio-merrifield.c +++ b/drivers/gpio/gpio-merrifield.c @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ +#include #include #include #include @@ -380,9 +381,16 @@ static void mrfld_irq_init_hw(struct mrfld_gpio *priv) } } +static const char *mrfld_gpio_get_pinctrl_dev_name(void) +{ + const char *dev_name = acpi_dev_get_first_match_name("INTC1002", NULL, -1); + return dev_name ? dev_name : "pinctrl-merrifield"; +} + static int mrfld_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const struct mrfld_gpio_pinrange *range; + const char *pinctrl_dev_name; struct mrfld_gpio *priv; u32 gpio_base, irq_base; void __iomem *base; @@ -439,10 +447,11 @@ static int mrfld_gpio_probe(struct pci_dev *pdev, const struct pci_device_id *id return retval; } + pinctrl_dev_name = mrfld_gpio_get_pinctrl_dev_name(); for (i = 0; i < ARRAY_SIZE(mrfld_gpio_ranges); i++) { range = &mrfld_gpio_ranges[i]; retval = gpiochip_add_pin_range(&priv->chip, - "pinctrl-merrifield", + pinctrl_dev_name, range->gpio_base, range->pin_base, range->npins); From 541676078b52f365f53d46ee5517d305cd1b6350 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Dec 2017 14:23:10 -0500 Subject: [PATCH 154/236] membarrier: Disable preemption when calling smp_call_function_many() smp_call_function_many() requires disabling preemption around the call. 
Signed-off-by: Mathieu Desnoyers Cc: # v4.14+ Cc: Andrea Parri Cc: Andrew Hunter Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171215192310.25293-1-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); From 6c7d47c33ed323f14f2a3b8de925e831dbaa4e69 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Nov 2017 14:42:21 +1100 Subject: [PATCH 155/236] KVM: PPC: Book3S PR: Fix WIMG handling under pHyp Commit 96df226 ("KVM: PPC: Book3S PR: Preserve storage control bits") added code to preserve WIMG bits but it missed 2 special cases: - a magic page in kvmppc_mmu_book3s_64_xlate() and - guest real mode in kvmppc_handle_pagefault(). For these ptes, WIMG was 0 and pHyp failed on these causing a guest to stop in the very beginning at NIP=0x100 (due to bd9166ffe "KVM: PPC: Book3S PR: Exit KVM on failed mapping"). According to LoPAPR v1.1 14.5.4.1.2 H_ENTER: The hypervisor checks that the WIMG bits within the PTE are appropriate for the physical page number else H_Parameter return. (For System Memory pages WIMG=0010, or, 1110 if the SAO option is enabled, and for IO pages WIMG=01**.) This hence initializes WIMG to non-zero value HPTE_R_M (0x10), as expected by pHyp. 
[paulus@ozlabs.org - fix compile for 32-bit] Cc: stable@vger.kernel.org # v4.11+ Fixes: 96df226 "KVM: PPC: Book3S PR: Preserve storage control bits" Signed-off-by: Alexey Kardashevskiy Tested-by: Ruediger Oertel Reviewed-by: Greg Kurz Tested-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu.c | 1 + arch/powerpc/kvm/book3s_pr.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 29ebe2fd5867..a93d719edc90 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = true; gpte->may_write = true; gpte->page_size = MMU_PAGE_4K; + gpte->wimg = HPTE_R_M; return 0; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d0dc8624198f..7deaeeb14b93 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define HPTE_R_M _PAGE_COHERENT #endif static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu) @@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.eaddr = eaddr; pte.vpage = eaddr >> 12; pte.page_size = MMU_PAGE_64K; + pte.wimg = HPTE_R_M; } switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { From ecba8297aafd50db6ae867e90844eead1611ef1c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 10 Jan 2018 17:04:39 +1100 Subject: [PATCH 156/236] KVM: PPC: Book3S HV: Always flush TLB in kvmppc_alloc_reset_hpt() The KVM_PPC_ALLOCATE_HTAB ioctl(), implemented by kvmppc_alloc_reset_hpt() is supposed to completely clear and reset a guest's Hashed Page Table (HPT) allocating or re-allocating it if necessary. 
In the case where an HPT of the right size already exists and it just zeroes it, it forces a TLB flush on all guest CPUs, to remove any stale TLB entries loaded from the old HPT. However, that situation can arise when the HPT is resizing as well - or even when switching from an RPT to HPT - so those cases need a TLB flush as well. So, move the TLB flush to trigger in all cases except for errors. Cc: stable@vger.kernel.org # v4.10+ Fixes: f98a8bf9ee20 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size") Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8355398f0bb6..b73dbc9e797d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -165,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) * Reset all the reverse-mapping chains for all memslots */ kvmppc_rmap_reset(kvm); - /* Ensure that each vcpu will flush its TLB on next entry. */ - cpumask_setall(&kvm->arch.need_tlb_flush); err = 0; goto out; } @@ -182,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) kvmppc_set_hpt(kvm, &info); out: + if (err == 0) + /* Ensure that each vcpu will flush its TLB on next entry. */ + cpumask_setall(&kvm->arch.need_tlb_flush); + mutex_unlock(&kvm->lock); return err; } From aa8a5e0062ac940f7659394f4817c948dc8c0667 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 157/236] powerpc/64s: Add support for RFI flush of L1-D cache On some CPUs we can prevent the Meltdown vulnerability by flushing the L1-D cache on exit from kernel to user mode, and from hypervisor to guest. This is known to be the case on at least Power7, Power8 and Power9. 
At this time we do not know the status of the vulnerability on other CPUs such as the 970 (Apple G5), pasemi CPUs (AmigaOne X1000) or Freescale CPUs. As more information comes to light we can enable this, or other mechanisms on those CPUs. The vulnerability occurs when the load of an architecturally inaccessible memory region (eg. userspace load of kernel memory) is speculatively executed to the point where its result can influence the address of a subsequent speculatively executed load. In order for that to happen, the first load must hit in the L1, because before the load is sent to the L2 the permission check is performed. Therefore if no kernel addresses hit in the L1 the vulnerability can not occur. We can ensure that is the case by flushing the L1 whenever we return to userspace. Similarly for hypervisor vs guest. In order to flush the L1-D cache on exit, we add a section of nops at each (h)rfi location that returns to a lower privileged context, and patch that with some sequence. Newer firmwares are able to advertise to us that there is a special nop instruction that flushes the L1-D. If we do not see that advertised, we fall back to doing a displacement flush in software. For guest kernels we support migration between some CPU versions, and different CPUs may use different flush instructions. So that we are prepared to migrate to a machine with a different flush instruction activated, we may have to patch more than one flush instruction at boot if the hypervisor tells us to. In the end this patch is mostly the work of Nicholas Piggin and Michael Ellerman. However a cast of thousands contributed to analysis of the issue, earlier versions of the patch, back ports testing etc. Many thanks to all of them. 
Tested-by: Jon Masters Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 40 ++++++++--- arch/powerpc/include/asm/feature-fixups.h | 13 ++++ arch/powerpc/include/asm/paca.h | 10 +++ arch/powerpc/include/asm/setup.h | 13 ++++ arch/powerpc/kernel/asm-offsets.c | 5 ++ arch/powerpc/kernel/exceptions-64s.S | 84 +++++++++++++++++++++++ arch/powerpc/kernel/setup_64.c | 79 +++++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 9 +++ arch/powerpc/lib/feature-fixups.c | 41 +++++++++++ 9 files changed, 286 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index dfc56daed98b..7197b179c1b1 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,34 +74,58 @@ */ #define EX_R3 EX_DAR -/* Macros for annotating the expected destination of (h)rfid */ +/* + * Macros for annotating the expected destination of (h)rfid + * + * The nop instructions allow us to insert one or more instructions to flush the + * L1-D cache when returning to userspace or a guest. 
+ */ +#define RFI_FLUSH_SLOT \ + RFI_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop #define RFI_TO_KERNEL \ rfid #define RFI_TO_USER \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_USER_OR_KERNEL \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_GUEST \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define HRFI_TO_KERNEL \ hrfid #define HRFI_TO_USER \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_USER_OR_KERNEL \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_GUEST \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_UNKNOWN \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 8f88f771cc55..1e82eb3caabd 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -187,7 +187,20 @@ label##3: \ FTR_ENTRY_OFFSET label##1b-label##3b; \ .popsection; +#define RFI_FLUSH_FIXUP_SECTION \ +951: \ + .pushsection __rfi_flush_fixup,"a"; \ + .align 2; \ +952: \ + FTR_ENTRY_OFFSET 951b-952b; \ + .popsection; + + #ifndef __ASSEMBLY__ +#include + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; + void apply_feature_fixups(void); void setup_feature_keys(void); #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 3892db93b837..23ac7fc0af23 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -232,6 +232,16 @@ struct paca_struct { struct sibling_subcore_state *sibling_subcore_state; #endif #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * rfi fallback flush must be in its own cacheline to prevent + * other paca data leaking into the L1d + */ + u64 exrfi[EX_SIZE] __aligned(0x80); + void 
*rfi_flush_fallback_area; + u64 l1d_flush_congruence; + u64 l1d_flush_sets; +#endif }; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index cf00ec26303a..469b7fdc9be4 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -39,6 +39,19 @@ static inline void pseries_big_endian_exceptions(void) {} static inline void pseries_little_endian_exceptions(void) {} #endif /* CONFIG_PPC_PSERIES */ +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { + L1D_FLUSH_NONE = 0x1, + L1D_FLUSH_FALLBACK = 0x2, + L1D_FLUSH_ORI = 0x4, + L1D_FLUSH_MTTRIG = 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e0..f390d57cf2e1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -237,6 +237,11 @@ int main(void) OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp); OFFSET(PACA_IN_MCE, paca_struct, in_mce); OFFSET(PACA_IN_NMI, paca_struct, in_nmi); + OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area); + OFFSET(PACA_EXRFI, paca_struct, exrfi); + OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence); + OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets); + #endif OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ed356194f09c..2dc10bf646b8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1449,6 +1449,90 @@ masked_##_H##interrupt: \ b .; \ MASKED_DEC_HANDLER(_H) +TRAMP_REAL_BEGIN(rfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std 
r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + rfid + +TRAMP_REAL_BEGIN(hrfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). 
+ */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + hrfid + /* * Real mode exceptions actually use this too, but alternate * instruction code patches (which end up in the common .text area) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 8956a9856604..96163f4c3673 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -801,3 +801,82 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +bool rfi_flush; + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (rfi_flush == enable) + return; + + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + l1d_size = ppc64_caches.l1d.size; + limit = min(safe_stack_limit(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. 
We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); + memset(l1d_flush_fallback_area, 0, l1d_size * 2); + + for_each_possible_cpu(cpu) { + /* + * The fallback flush is currently coded for 8-way + * associativity. Different associativity is possible, but it + * will be treated as 8-way and may not evict the lines as + * effectively. + * + * 128 byte lines are mandatory. + */ + u64 c = l1d_size / 8; + + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; + paca[cpu].l1d_flush_congruence = c; + paca[cpu].l1d_flush_sets = c / 128; + } +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: Using fallback displacement flush\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: Using ori type flush\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: Using mttrig type flush\n"); + + enabled_flush_types = types; + + rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0494e1566ee2..307843d23682 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -132,6 +132,15 @@ SECTIONS /* Read-only data */ RO_DATA(PAGE_SIZE) +#ifdef CONFIG_PPC64 + . 
= ALIGN(8); + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { + __start___rfi_flush_fixup = .; + *(__rfi_flush_fixup) + __stop___rfi_flush_fixup = .; + } +#endif + EXCEPTION_TABLE(0) NOTES :kernel :notes diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 41cf5ae273cf..a95ea007d654 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -116,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[3], *dest; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup), + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = 0x60000000; /* nop */ + instrs[1] = 0x60000000; /* nop */ + instrs[2] = 0x60000000; /* nop */ + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = 0x48000010; + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction(dest, instrs[0]); + patch_instruction(dest + 1, instrs[1]); + patch_instruction(dest + 2, instrs[2]); + } + + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; From bc9c9304a45480797e13a8e1df96ffcf44fb62fe Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 158/236] powerpc/64s: Support disabling RFI flush with no_rfi_flush and nopti Because there may be some performance overhead of the RFI flush, add 
kernel command line options to disable it. We add a sensibly named 'no_rfi_flush' option, but we also hijack the x86 option 'nopti'. The RFI flush is not the same as KPTI, but if we see 'nopti' we can guess that the user is trying to avoid any overhead of Meltdown mitigations, and it means we don't have to educate every one about a different command line option. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 96163f4c3673..491be4179ddd 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -805,8 +805,29 @@ early_initcall(disable_hardlockup_detector); #ifdef CONFIG_PPC_BOOK3S_64 static enum l1d_flush_type enabled_flush_types; static void *l1d_flush_fallback_area; +static bool no_rfi_flush; bool rfi_flush; +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. 
+ */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + static void do_nothing(void *unused) { /* @@ -877,6 +898,7 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - rfi_flush_enable(enable); + if (!no_rfi_flush) + rfi_flush_enable(enable); } #endif /* CONFIG_PPC_BOOK3S_64 */ From 8989d56878a7735dfdb234707a2fee6faf631085 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 159/236] powerpc/pseries: Query hypervisor for RFI flush settings A new hypervisor call is available which tells the guest settings related to the RFI flush. Use it to query the appropriate flush instruction(s), and whether the flush is required. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/setup.c | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index a8531e012658..ae4f596273b5 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -459,6 +459,39 @@ static void __init find_and_init_phbs(void) of_pci_check_probe_only(); } +static void pseries_setup_rfi_flush(void) +{ + struct h_cpu_char_result result; + enum l1d_flush_type types; + bool enable; + long rc; + + /* Enable by default */ + enable = true; + + rc = plpar_get_cpu_characteristics(&result); + if (rc == H_SUCCESS) { + types = L1D_FLUSH_NONE; + + if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2) + types |= L1D_FLUSH_MTTRIG; + if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30) + types |= L1D_FLUSH_ORI; + + /* Use fallback if nothing set in hcall */ + if (types == L1D_FLUSH_NONE) + types = L1D_FLUSH_FALLBACK; + + if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) + enable = false; + } else { + /* 
Default to fallback if case hcall is not available */ + types = L1D_FLUSH_FALLBACK; + } + + setup_rfi_flush(types, enable); +} + static void __init pSeries_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -476,6 +509,8 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + pseries_setup_rfi_flush(); + /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); From 6e032b350cd1fdb830f18f8320ef0e13b4e24094 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 160/236] powerpc/powernv: Check device-tree for RFI flush settings New device-tree properties are available which tell the hypervisor settings related to the RFI flush. Use them to determine the appropriate flush instruction to use, and whether the flush is required. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 49 ++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1edfbc1e40f4..4fb21e17504a 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -37,13 +37,62 @@ #include #include #include +#include #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ + struct device_node *np, *fw_features; + enum l1d_flush_type type; + int enable; + + /* Default to fallback in case fw-features are not available */ + type = L1D_FLUSH_FALLBACK; + enable = 1; + + np = of_find_node_by_name(NULL, "ibm,opal"); + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + + if (fw_features) { + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_MTTRIG; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); + if (np && of_property_read_bool(np, "enabled")) + type = 
L1D_FLUSH_ORI; + + of_node_put(np); + + /* Enable unless firmware says NOT to */ + enable = 2; + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + of_node_put(fw_features); + } + + setup_rfi_flush(type, enable > 0); +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + pnv_setup_rfi_flush(); + /* Initialize SMP */ pnv_smp_init(); From d780537f9b49e9d714a454e5ed989d909beab8ec Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 10 Jan 2018 13:04:58 +0100 Subject: [PATCH 161/236] drm/tegra: sor: Fix hang on Tegra124 eDP The SOR0 found on Tegra124 and Tegra210 only supports eDP and LVDS and therefore has a slightly different clock tree than the SOR1 which does not support eDP, but HDMI and DP instead. Commit e1335e2f0cfc ("drm/tegra: sor: Reimplement pad clock") breaks setups with eDP because the sor->clk_out clock is uninitialized and therefore setting the parent clock (either the safe clock or either of the display PLLs) fails, which can cause hangs later on since there is no clock driving the module. Fix this by falling back to the module clock for sor->clk_out on those setups. This guarantees that the module will always be clocked by an enabled clock and hence prevents those hangs. 
Fixes: e1335e2f0cfc ("drm/tegra: sor: Reimplement pad clock") Reported-by: Guillaume Tucker Tested-by: Jon Hunter Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/sor.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c index b0a1dedac802..476079f1255f 100644 --- a/drivers/gpu/drm/tegra/sor.c +++ b/drivers/gpu/drm/tegra/sor.c @@ -2656,6 +2656,9 @@ static int tegra_sor_probe(struct platform_device *pdev) name, err); goto remove; } + } else { + /* fall back to the module clock on SOR0 (eDP/LVDS only) */ + sor->clk_out = sor->clk; } sor->clk_parent = devm_clk_get(&pdev->dev, "parent"); From 1e77fc82110ac36febf46c1e2782f504f7d23099 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 9 Jan 2018 19:08:21 +0100 Subject: [PATCH 162/236] gpio: Add missing open drain/source handling to gpiod_set_value_cansleep() Since commit f11a04464ae57e8d ("i2c: gpio: Enable working over slow can_sleep GPIOs"), probing the i2c RTC connected to an i2c-gpio bus on r8a7740/armadillo fails with: rtc-s35390a 0-0030: error resetting chip rtc-s35390a: probe of 0-0030 failed with error -5 More debug code reveals: i2c i2c-0: master_xfer[0] R, addr=0x30, len=1 i2c i2c-0: NAK from device addr 0x30 msg #0 s35390a_get_reg: ret = -6 Commit 02e479808b5d62f8 ("gpio: Alter semantics of *raw* operations to actually be raw") moved open drain/source handling from gpiod_set_raw_value_commit() to gpiod_set_value(), but forgot to take into account that gpiod_set_value_cansleep() also needs this handling. The i2c protocol mandates that i2c signals are open drain, hence i2c communication fails. Fix this by adding the missing handling to gpiod_set_value_cansleep(), using a new common helper gpiod_set_value_nocheck(). 
Fixes: 02e479808b5d62f8 ("gpio: Alter semantics of *raw* operations to actually be raw") Signed-off-by: Geert Uytterhoeven [removed underscore syntax, added kerneldoc] Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 44332b793718..14532d9576e4 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2892,6 +2892,27 @@ void gpiod_set_raw_value(struct gpio_desc *desc, int value) } EXPORT_SYMBOL_GPL(gpiod_set_raw_value); +/** + * gpiod_set_value_nocheck() - set a GPIO line value without checking + * @desc: the descriptor to set the value on + * @value: value to set + * + * This sets the value of a GPIO line backing a descriptor, applying + * different semantic quirks like active low and open drain/source + * handling. + */ +static void gpiod_set_value_nocheck(struct gpio_desc *desc, int value) +{ + if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + value = !value; + if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) + gpio_set_open_drain_value_commit(desc, value); + else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) + gpio_set_open_source_value_commit(desc, value); + else + gpiod_set_raw_value_commit(desc, value); +} + /** * gpiod_set_value() - assign a gpio's value * @desc: gpio whose value will be assigned @@ -2906,16 +2927,8 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_value); void gpiod_set_value(struct gpio_desc *desc, int value) { VALIDATE_DESC_VOID(desc); - /* Should be using gpiod_set_value_cansleep() */ WARN_ON(desc->gdev->chip->can_sleep); - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) - value = !value; - if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) - gpio_set_open_drain_value_commit(desc, value); - else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) - gpio_set_open_source_value_commit(desc, value); - else - gpiod_set_raw_value_commit(desc, value); + gpiod_set_value_nocheck(desc, value); } 
EXPORT_SYMBOL_GPL(gpiod_set_value); @@ -3243,9 +3256,7 @@ void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { might_sleep_if(extra_checks); VALIDATE_DESC_VOID(desc); - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) - value = !value; - gpiod_set_raw_value_commit(desc, value); + gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep); From 951a010233625b77cde3430b4b8785a9a22968d1 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 9 Jan 2018 12:10:21 +0000 Subject: [PATCH 163/236] xen/gntdev: Fix off-by-one error when unmapping with holes If the requested range has a hole, the calculation of the number of pages to unmap is off by one. Fix it. Signed-off-by: Ross Lagerwall Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/gntdev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 57efbd3b053b..d3391a1e3796 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -380,10 +380,8 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages) } range = 0; while (range < pages) { - if (map->unmap_ops[offset+range].handle == -1) { - range--; + if (map->unmap_ops[offset+range].handle == -1) break; - } range++; } err = __unmap_grant_pages(map, offset, range); From cf2acf66ad43abb39735568f55e1f85f9844e990 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 9 Jan 2018 12:10:22 +0000 Subject: [PATCH 164/236] xen/gntdev: Fix partial gntdev_mmap() cleanup When cleaning up after a partially successful gntdev_mmap(), unmap the successfully mapped grant pages otherwise Xen will kill the domain if in debug mode (Attempt to implicitly unmap a granted PTE) or Linux will kill the process and emit "BUG: Bad page map in process" if Xen is in release mode. This is only needed when use_ptemod is true because gntdev_put_map() will unmap grant pages itself when use_ptemod is false. 
Signed-off-by: Ross Lagerwall Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/gntdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index d3391a1e3796..bd56653b9bbc 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1071,8 +1071,10 @@ unlock_out: out_unlock_put: mutex_unlock(&priv->lock); out_put_map: - if (use_ptemod) + if (use_ptemod) { map->vma = NULL; + unmap_grant_pages(map, 0, map->count); + } gntdev_put_map(priv, map); return err; } From 0d9cac0ca0429830c40fe1a4e50e60f6221fd7b6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Jan 2018 12:40:04 +0300 Subject: [PATCH 165/236] drm/vmwgfx: Potential off by one in vmw_view_add() The vmw_view_cmd_to_type() function returns vmw_view_max (3) on error. It's one element beyond the end of the vmw_view_cotables[] table. My read on this is that it's possible to hit this failure. header->id comes from vmw_cmd_check() and it's a user controlled number between 1040 and 1225 so we can hit that error. But I don't have the hardware to test this code. 
Fixes: d80efd5cb3de ("drm/vmwgfx: Initial DX support") Signed-off-by: Dan Carpenter Reviewed-by: Thomas Hellstrom Cc: --- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 21c62a34e558..87e8af5776a3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -2731,6 +2731,8 @@ static int vmw_cmd_dx_view_define(struct vmw_private *dev_priv, } view_type = vmw_view_cmd_to_type(header->id); + if (view_type == vmw_view_max) + return -EINVAL; cmd = container_of(header, typeof(*cmd), header); ret = vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface, user_surface_converter, From 612e8e9350fd19cae6900cf36ea0c6892d1a0dca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Jan 2018 12:28:16 +0100 Subject: [PATCH 166/236] x86/alternatives: Fix optimize_nops() checking The alternatives code checks only the first byte whether it is a NOP, but with NOPs in front of the payload and having actual instructions after it breaks the "optimized' test. Make sure to scan all bytes before deciding to optimize the NOPs in there. 
Reported-by: David Woodhouse Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Dave Hansen Cc: Andi Kleen Cc: Andrew Lutomirski Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic --- arch/x86/kernel/alternative.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3344d3382e91..e0b97e4d1db5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -344,9 +344,12 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; + int i; - if (instr[0] != 0x90) - return; + for (i = 0; i < a->padlen; i++) { + if (instr[i] != 0x90) + return; + } local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); From 2e83acb970684008baee471427270c029a76ddbd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:27 -0200 Subject: [PATCH 167/236] sctp: GFP_ATOMIC is not needed in sctp_setsockopt_events So replace it with GFP_USER and also add __GFP_NOWARN. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b4fb6e4886d2..54c046783a89 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2277,7 +2277,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, - GFP_ATOMIC); + GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; From 5960cefab9df76600a1a7d4ff592c59e14616e88 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:28 -0200 Subject: [PATCH 168/236] sctp: add a ceiling to optlen in some sockopts Hangbin Liu reported that some sockopt calls could cause the kernel to log a warning on memory allocation failure if the user supplied a large optlen value. That is because some of them called memdup_user() without a ceiling on optlen, allowing it to try to allocate really large buffers. This patch adds a ceiling by limiting optlen to the maximum allowed that would still make sense for these sockopt. Reported-by: Hangbin Liu Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 54c046783a89..022b94f11fd8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; + optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + + SCTP_AUTH_NUM_HMACS * sizeof(u16)); hmacs = memdup_user(optval, optlen); if (IS_ERR(hmacs)) @@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk, if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; + /* authkey->sca_keylength is u16, so optlen can't be bigger than + * this. 
+ */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(struct sctp_authkey)); authkey = memdup_user(optval, optlen); if (IS_ERR(authkey)) @@ -3893,6 +3900,9 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, if (optlen < sizeof(*params)) return -EINVAL; + /* srs_number_streams is u16, so optlen can't be bigger than this. */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(__u16) * sizeof(*params)); params = memdup_user(optval, optlen); if (IS_ERR(params)) From c76f97c99ae6d26d14c7f0e50e074382bfbc9f98 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:29 -0200 Subject: [PATCH 169/236] sctp: make use of pre-calculated len Some sockopt handling functions were calculating the length of the buffer to be written to userspace and then calculating it again when actually writing the buffer, which could lead to some write not using an up-to-date length. This patch updates such places to just make use of the len variable. Also, replace some sizeof(type) to sizeof(var). Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 022b94f11fd8..9b01e994f661 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5025,7 +5025,7 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv len = sizeof(int); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int))) + if (copy_to_user(optval, &sctp_sk(sk)->autoclose, len)) return -EFAULT; return 0; } @@ -5655,6 +5655,9 @@ copy_getaddrs: err = -EFAULT; goto out; } + /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too, + * but we can't change it anymore. 
+ */ if (put_user(bytes_copied, optlen)) err = -EFAULT; out: @@ -6091,7 +6094,7 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); - if (copy_from_user(&params, optval, sizeof(params))) + if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; @@ -6261,7 +6264,9 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid))) + + len = sizeof(struct sctp_authkeyid); + if (copy_from_user(&val, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, val.scact_assoc_id); @@ -6273,7 +6278,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, else val.scact_keynumber = ep->active_key_id; - len = sizeof(struct sctp_authkeyid); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) @@ -6299,7 +6303,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; @@ -6344,7 +6348,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; From 11d827a993a969c3c6ec56758ff63a44ba19b466 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 9 Jan 2018 11:02:33 +0800 Subject: [PATCH 170/236] net: gianfar_ptp: move set_fipers() to spinlock protecting area set_fipers() calling should be protected by spinlock in case that any interrupt breaks related registers setting and the function we expect.
This patch is to move set_fipers() to spinlock protecting area in ptp_gianfar_adjtime(). Signed-off-by: Yangbo Lu Acked-by: Richard Cochran Reviewed-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar_ptp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c index 544114281ea7..9f8d4f8e57e3 100644 --- a/drivers/net/ethernet/freescale/gianfar_ptp.c +++ b/drivers/net/ethernet/freescale/gianfar_ptp.c @@ -319,11 +319,10 @@ static int ptp_gianfar_adjtime(struct ptp_clock_info *ptp, s64 delta) now = tmr_cnt_read(etsects); now += delta; tmr_cnt_write(etsects, now); + set_fipers(etsects); spin_unlock_irqrestore(&etsects->lock, flags); - set_fipers(etsects); - return 0; } From af60d61fa846725566f4a876ae04f891bdff1c7a Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 9 Jan 2018 09:52:22 +0100 Subject: [PATCH 171/236] doc: clarification about setting SO_ZEROCOPY Signed-off-by: Kornilios Kourtis Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/msg_zerocopy.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst index 77f6d7e25cfd..291a01264967 100644 --- a/Documentation/networking/msg_zerocopy.rst +++ b/Documentation/networking/msg_zerocopy.rst @@ -72,6 +72,10 @@ this flag, a process must first signal intent by setting a socket option: if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) error(1, errno, "setsockopt zerocopy"); +Setting the socket option only works when the socket is in its initial +(TCP_CLOSED) state. Trying to set the option for a socket returned by accept(), +for example, will lead to an EBUSY error. In this case, the option should be set +to the listening socket and it will be inherited by the accepted sockets. 
Transmission ------------ From b0d55b5bc77755501be9de2c935d106ff8dba9ac Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 9 Jan 2018 19:58:18 +0800 Subject: [PATCH 172/236] caif_usb: use strlcpy() instead of strncpy() gcc-8 reports net/caif/caif_usb.c: In function 'cfusbl_device_notify': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] The compiler requires that the input param 'len' of strncpy() should be greater than the length of the src string, so that '\0' is copied as well. We can just use strlcpy() to avoid this warning. Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/caif/caif_usb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 5cd44f001f64..1a082a946045 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -176,9 +176,7 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, dev_add_pack(&caif_usb_type); pack_added = true; - strncpy(layer->name, dev->name, - sizeof(layer->name) - 1); - layer->name[sizeof(layer->name) - 1] = 0; + strlcpy(layer->name, dev->name, sizeof(layer->name)); return 0; } From 95f566de0269a0c59fd6a737a147731302136429 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Tue, 9 Jan 2018 14:43:34 +0200 Subject: [PATCH 173/236] of_mdio: avoid MDIO bus removal when a PHY is missing If one of the child devices is missing the of_mdiobus_register_phy() call will return -ENODEV. When a missing device is encountered the registration of the remaining PHYs is stopped and the MDIO bus will fail to register. Propagate all errors except ENODEV to avoid it. Signed-off-by: Madalin Bucur Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- drivers/of/of_mdio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 3481e69738b5..a327be1d264b 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -231,7 +231,12 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) rc = of_mdiobus_register_phy(mdio, child, addr); else rc = of_mdiobus_register_device(mdio, child, addr); - if (rc) + + if (rc == -ENODEV) + dev_err(&mdio->dev, + "MDIO device at address %d is missing.\n", + addr); + else if (rc) goto unregister; } @@ -255,7 +260,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) if (of_mdiobus_child_is_phy(child)) { rc = of_mdiobus_register_phy(mdio, child, addr); - if (rc) + if (rc && rc != -ENODEV) goto unregister; } } From 78bbb15f2239bc8e663aa20bbe1987c91a0b75f6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 9 Jan 2018 13:40:41 -0800 Subject: [PATCH 174/236] 8021q: fix a memory leak for VLAN 0 device A vlan device with vid 0 is allowed to be created but cannot be fully cleaned up by unregister_vlan_dev(), which checks for vlan_id!=0. Also, VLAN 0 is probably not a valid number and it is kinda "reserved" for HW accelerating devices, but it is probably too late to reject it from creation even if it makes sense. Instead, just remove the check in unregister_vlan_dev(). Reported-by: Dmitry Vyukov Fixes: ad1afb003939 ("vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)") Cc: Vlad Yasevich Cc: Ben Hutchings Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- net/8021q/vlan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8dfdd94e430f..bad01b14a4ad 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -111,12 +111,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_uninit_applicant(real_dev); } - /* Take it out of our own structures, but be sure to interlock with - * HW accelerating devices or SW vlan input packet processing if - * VLAN is not 0 (leave it there for 802.1p). - */ - if (vlan_id) - vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); + vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); /* Get rid of the vlan's reference to real_dev */ dev_put(real_dev); From fc2336505fb49a8b932a0a67a9745c408b79992c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2018 18:14:28 -0800 Subject: [PATCH 175/236] nfp: always unmask aux interrupts at init The link state and exception interrupts may be masked when we probe. The firmware should in theory prevent sending (and automasking) those interrupts if the device is disabled, but if my reading of the FW code is correct there are firmwares out there with race conditions in this area. The interrupt may also be masked if previous driver which used the device was malfunctioning and we didn't load the FW (there is no other good way to comprehensively reset the PF). Note that FW unmasks the data interrupts by itself when vNIC is enabled, such helpful operation is not performed for LSC/EXN interrupts. Always unmask the auxiliary interrupts after request_irq(). On the remove path add missing PCI write flush before free_irq(). Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1a603fdd9e80..99b0487b6d82 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -568,6 +568,7 @@ nfp_net_aux_irq_request(struct nfp_net *nn, u32 ctrl_offset, return err; } nn_writeb(nn, ctrl_offset, entry->entry); + nfp_net_irq_unmask(nn, entry->entry); return 0; } @@ -582,6 +583,7 @@ static void nfp_net_aux_irq_free(struct nfp_net *nn, u32 ctrl_offset, unsigned int vector_idx) { nn_writeb(nn, ctrl_offset, 0xff); + nn_pci_flush(nn); free_irq(nn->irq_entries[vector_idx].vector, nn); } From 8e033a93b37f37aa9fca71a370a895155320af60 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 10 Jan 2018 11:42:43 +0100 Subject: [PATCH 176/236] mlxsw: pci: Wait after reset before accessing HW After performing reset driver polls on HW indication until learning that the reset is done, but immediately after reset the device becomes unresponsive which might lead to completion timeout on the first read. Wait for 100ms before starting the polling. Fixes: 233fa44bd67a ("mlxsw: pci: Implement reset done check") Signed-off-by: Yuval Mintz Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 7 ++++++- drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 23f7d828cf67..6ef20e5cc77d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1643,7 +1643,12 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci, return 0; } - wmb(); /* reset needs to be written before we read control register */ + /* Reset needs to be written before we read control register, and + * we must wait for the HW to become responsive once again + */ + wmb(); + msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS); + end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS); do { u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY); diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index a6441208e9d9..fb082ad21b00 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -59,6 +59,7 @@ #define MLXSW_PCI_SW_RESET 0xF0010 #define MLXSW_PCI_SW_RESET_RST_BIT BIT(0) #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 5000 +#define MLXSW_PCI_SW_RESET_WAIT_MSECS 100 #define MLXSW_PCI_FW_READY 0xA1844 #define MLXSW_PCI_FW_READY_MASK 0xFFFF #define MLXSW_PCI_FW_READY_MAGIC 0x5E From db84924c4fc3be1ef0c965d5ece5f6d785c77c5f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 10 Jan 2018 11:42:44 +0100 Subject: [PATCH 177/236] mlxsw: spectrum_qdisc: Don't use variable array in mlxsw_sp_tclass_congestion_enable Resolve the sparse warning: "sparse: Variable length array is used." Use 2 arrays for 2 PRM register accesses. Fixes: 96f17e0776c2 ("mlxsw: spectrum: Support RED qdisc offload") Signed-off-by: Jiri Pirko Reviewed-by: Yuval Mintz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c index c33beac5def0..b5397da94d7f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c @@ -46,7 +46,8 @@ mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port, int tclass_num, u32 min, u32 max, u32 probability, bool is_ecn) { - char cwtp_cmd[max_t(u8, MLXSW_REG_CWTP_LEN, MLXSW_REG_CWTPM_LEN)]; + char cwtpm_cmd[MLXSW_REG_CWTPM_LEN]; + char cwtp_cmd[MLXSW_REG_CWTP_LEN]; struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; int err; @@ -60,10 +61,10 @@ mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port, if (err) return err; - mlxsw_reg_cwtpm_pack(cwtp_cmd, mlxsw_sp_port->local_port, tclass_num, + mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num, MLXSW_REG_CWTP_DEFAULT_PROFILE, true, is_ecn); - return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtp_cmd); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd); } static int From 862c03ee1deb7e19e0f9931682e0294ecd1fcaf9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Jan 2018 03:45:49 -0800 Subject: [PATCH 178/236] ipv6: fix possible mem leaks in ipv6_make_skb() ip6_setup_cork() might return an error, while memory allocations have been done and must be rolled back. Fixes: 6422398c2ab0 ("ipv6: introduce ipv6_make_skb") Signed-off-by: Eric Dumazet Cc: Vlad Yasevich Reported-by: Mike Maloney Acked-by: Mike Maloney Signed-off-by: David S. 
Miller --- net/ipv6/ip6_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7dd51c42314..688ba5f7516b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1735,9 +1735,10 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.opt = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); - if (err) + if (err) { + ip6_cork_release(&cork, &v6_cork); return ERR_PTR(err); - + } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_sk(sk)->dontfrag; From ccc12b11c5332c84442ef120dcd631523be75089 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Wed, 10 Jan 2018 13:35:49 +0000 Subject: [PATCH 179/236] ipv6: sr: fix TLVs not being copied using setsockopt Function ipv6_push_rthdr4 allows to add an IPv6 Segment Routing Header to a socket through setsockopt, but the current implementation doesn't copy possible TLVs at the end of the SRH received from userspace. Therefore, the execution of the following branch if (sr_has_hmac(sr_phdr)) { ... } will never complete since the len and type fields of a possible HMAC TLV are not copied, hence seg6_get_tlv_hmac will return an error, and the HMAC will not be computed. This commit adds a memcpy in case TLVs have been appended to the SRH. Fixes: a149e7c7ce81 ("ipv6: sr: add support for SRH injection through setsockopt") Acked-by: David Lebrun Signed-off-by: Mathieu Xhonneux Signed-off-by: David S. 
Miller --- net/ipv6/exthdrs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 83bd75713535..bc68eb661970 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -925,6 +925,15 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; + if (sr_ihdr->hdrlen > hops * 2) { + int tlvs_offset, tlvs_length; + + tlvs_offset = (1 + hops * 2) << 3; + tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; + memcpy((char *)sr_phdr + tlvs_offset, + (char *)sr_ihdr + tlvs_offset, tlvs_length); + } + #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; From ce4bb04cae8924792ed92f4af2793b77fc986f0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Jan 2018 18:47:05 -0500 Subject: [PATCH 180/236] Fix a leak in socket(2) when we fail to allocate a file descriptor. Got broken by "make sock_alloc_file() do sock_release() on failures" - cleanup after sock_map_fd() failure got pulled all the way into sock_alloc_file(), but it used to serve the case when sock_map_fd() failed *before* getting to sock_alloc_file() as well, and that got lost. Trivial to fix, fortunately. 
Fixes: 8e1611e23579 (make sock_alloc_file() do sock_release() on failures) Reported-by: Dmitry Vyukov Signed-off-by: Al Viro --- net/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 42d8e9c9ccd5..82433a2200ec 100644 --- a/net/socket.c +++ b/net/socket.c @@ -432,8 +432,10 @@ static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); - if (unlikely(fd < 0)) + if (unlikely(fd < 0)) { + sock_release(sock); return fd; + } newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { From 4636bda86aa1f34f45c629477476a0dcfa04e597 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 5 Jan 2018 00:59:05 -0800 Subject: [PATCH 181/236] drm/i915: Whitelist SLICE_COMMON_ECO_CHICKEN1 on Geminilake. Geminilake requires the 3D driver to select whether barriers are intended for compute shaders, or tessellation control shaders, by whacking a "Barrier Mode" bit in SLICE_COMMON_ECO_CHICKEN1 when switching pipelines. Failure to do this properly can result in GPU hangs. Unfortunately, this means it needs to switch mid-batch, so only userspace can properly set it. To facilitate this, the kernel needs to whitelist the register. The workarounds page currently tags this as applying to Broxton only, but that doesn't make sense. The documentation for the register it references says the bit userspace is supposed to toggle only exists on Geminilake. Empirically, the Mesa patch to toggle this bit appears to fix intermittent GPU hangs in tessellation control shader barrier tests on Geminilake; we haven't seen those hangs on Broxton. v2: Mention WA #0862 in the comment (it doesn't have a name). 
Signed-off-by: Kenneth Graunke Acked-by: Rodrigo Vivi Cc: stable@vger.kernel.org Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180105085905.9298-1-kenneth@whitecape.org (cherry picked from commit ab062639edb0412daf6de540725276b9a5d217f9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_reg.h | 2 ++ drivers/gpu/drm/i915/intel_engine_cs.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 333f40bc03bb..7923dfd9963c 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7027,6 +7027,8 @@ enum { #define GEN9_SLICE_COMMON_ECO_CHICKEN0 _MMIO(0x7308) #define DISABLE_PIXEL_MASK_CAMMING (1<<14) +#define GEN9_SLICE_COMMON_ECO_CHICKEN1 _MMIO(0x731c) + #define GEN7_L3SQCREG1 _MMIO(0xB010) #define VLV_B0_WA_L3SQCREG1_VALUE 0x00D30000 diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index ab5bf4e2e28e..6074e04dc99f 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -1390,6 +1390,11 @@ static int glk_init_workarounds(struct intel_engine_cs *engine) if (ret) return ret; + /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */ + ret = wa_ring_whitelist_reg(engine, GEN9_SLICE_COMMON_ECO_CHICKEN1); + if (ret) + return ret; + /* WaToEnableHwFixForPushConstHWBug:glk */ WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); From 5005c8514285ae4f28e862f8d91faaa2015e03a3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 6 Jan 2018 10:56:18 +0000 Subject: [PATCH 182/236] drm/i915: Don't adjust priority on an already signaled fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we retire a signaled fence, we free the dependency tree. 
However, we skip clearing the list so that if we then try to adjust the priority of the signaled fence, we may walk the list of freed dependencies. [ 3083.156757] ================================================================== [ 3083.156806] BUG: KASAN: use-after-free in execlists_schedule+0x199/0x660 [i915] [ 3083.156810] Read of size 8 at addr ffff8806bf20f400 by task Xorg/831 [ 3083.156815] CPU: 0 PID: 831 Comm: Xorg Not tainted 4.15.0-rc6-no-psn+ #1 [ 3083.156817] Hardware name: Notebook N24_25BU/N24_25BU, BIOS 5.12 02/17/2017 [ 3083.156818] Call Trace: [ 3083.156823] dump_stack+0x5c/0x7a [ 3083.156827] print_address_description+0x6b/0x290 [ 3083.156830] kasan_report+0x28f/0x380 [ 3083.156872] ? execlists_schedule+0x199/0x660 [i915] [ 3083.156914] execlists_schedule+0x199/0x660 [i915] [ 3083.156956] ? intel_crtc_atomic_check+0x146/0x4e0 [i915] [ 3083.156997] ? execlists_submit_request+0xe0/0xe0 [i915] [ 3083.157038] ? i915_vma_misplaced.part.4+0x25/0xb0 [i915] [ 3083.157079] ? __i915_vma_do_pin+0x7c8/0xc80 [i915] [ 3083.157121] ? intel_atomic_state_alloc+0x44/0x60 [i915] [ 3083.157130] ? drm_atomic_helper_page_flip+0x3e/0xb0 [drm_kms_helper] [ 3083.157145] ? drm_mode_page_flip_ioctl+0x7d2/0x850 [drm] [ 3083.157159] ? drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.157172] ? drm_ioctl+0x45b/0x560 [drm] [ 3083.157211] i915_gem_object_wait_priority+0x14c/0x2c0 [i915] [ 3083.157251] ? i915_gem_get_aperture_ioctl+0x150/0x150 [i915] [ 3083.157290] ? i915_vma_pin_fence+0x1d8/0x320 [i915] [ 3083.157331] ? intel_pin_and_fence_fb_obj+0x175/0x250 [i915] [ 3083.157372] ? intel_rotation_info_size+0x60/0x60 [i915] [ 3083.157413] ? intel_link_compute_m_n+0x80/0x80 [i915] [ 3083.157428] ? drm_dev_printk+0x1b0/0x1b0 [drm] [ 3083.157443] ? drm_dev_printk+0x1b0/0x1b0 [drm] [ 3083.157485] intel_prepare_plane_fb+0x2f8/0x5a0 [i915] [ 3083.157527] ? 
intel_crtc_get_vblank_counter+0x80/0x80 [i915] [ 3083.157536] drm_atomic_helper_prepare_planes+0xa0/0x1c0 [drm_kms_helper] [ 3083.157587] intel_atomic_commit+0x12e/0x4e0 [i915] [ 3083.157605] drm_atomic_helper_page_flip+0xa2/0xb0 [drm_kms_helper] [ 3083.157621] drm_mode_page_flip_ioctl+0x7d2/0x850 [drm] [ 3083.157638] ? drm_mode_cursor2_ioctl+0x10/0x10 [drm] [ 3083.157652] ? drm_lease_owner+0x1a/0x30 [drm] [ 3083.157668] ? drm_mode_cursor2_ioctl+0x10/0x10 [drm] [ 3083.157681] drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.157696] drm_ioctl+0x45b/0x560 [drm] [ 3083.157711] ? drm_mode_cursor2_ioctl+0x10/0x10 [drm] [ 3083.157725] ? drm_getstats+0x20/0x20 [drm] [ 3083.157729] ? timerqueue_del+0x49/0x80 [ 3083.157732] ? __remove_hrtimer+0x62/0xb0 [ 3083.157735] ? hrtimer_try_to_cancel+0x173/0x210 [ 3083.157738] do_vfs_ioctl+0x13b/0x880 [ 3083.157741] ? ioctl_preallocate+0x140/0x140 [ 3083.157744] ? _raw_spin_unlock_irq+0xe/0x30 [ 3083.157746] ? do_setitimer+0x234/0x370 [ 3083.157750] ? SyS_setitimer+0x19e/0x1b0 [ 3083.157752] ? SyS_alarm+0x140/0x140 [ 3083.157755] ? __rcu_read_unlock+0x66/0x80 [ 3083.157757] ? 
__fget+0xc4/0x100 [ 3083.157760] SyS_ioctl+0x74/0x80 [ 3083.157763] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 3083.157765] RIP: 0033:0x7f6135d0c6a7 [ 3083.157767] RSP: 002b:00007fff01451888 EFLAGS: 00003246 ORIG_RAX: 0000000000000010 [ 3083.157769] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f6135d0c6a7 [ 3083.157771] RDX: 00007fff01451950 RSI: 00000000c01864b0 RDI: 000000000000000c [ 3083.157772] RBP: 00007f613076f600 R08: 0000000000000001 R09: 0000000000000000 [ 3083.157773] R10: 0000000000000060 R11: 0000000000003246 R12: 0000000000000000 [ 3083.157774] R13: 0000000000000060 R14: 000000000000001b R15: 0000000000000060 [ 3083.157779] Allocated by task 831: [ 3083.157783] kmem_cache_alloc+0xc0/0x200 [ 3083.157822] i915_gem_request_await_dma_fence+0x2c4/0x5d0 [i915] [ 3083.157861] i915_gem_request_await_object+0x321/0x370 [i915] [ 3083.157900] i915_gem_do_execbuffer+0x1165/0x19c0 [i915] [ 3083.157937] i915_gem_execbuffer2+0x1ad/0x550 [i915] [ 3083.157950] drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.157962] drm_ioctl+0x45b/0x560 [drm] [ 3083.157964] do_vfs_ioctl+0x13b/0x880 [ 3083.157966] SyS_ioctl+0x74/0x80 [ 3083.157968] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 3083.157971] Freed by task 831: [ 3083.157973] kmem_cache_free+0x77/0x220 [ 3083.158012] i915_gem_request_retire+0x72c/0xa70 [i915] [ 3083.158051] i915_gem_request_alloc+0x1e9/0x8b0 [i915] [ 3083.158089] i915_gem_do_execbuffer+0xa96/0x19c0 [i915] [ 3083.158127] i915_gem_execbuffer2+0x1ad/0x550 [i915] [ 3083.158140] drm_ioctl_kernel+0xa7/0xf0 [drm] [ 3083.158153] drm_ioctl+0x45b/0x560 [drm] [ 3083.158155] do_vfs_ioctl+0x13b/0x880 [ 3083.158156] SyS_ioctl+0x74/0x80 [ 3083.158158] entry_SYSCALL_64_fastpath+0x1a/0x7d [ 3083.158162] The buggy address belongs to the object at ffff8806bf20f400 which belongs to the cache i915_dependency of size 64 [ 3083.158166] The buggy address is located 0 bytes inside of 64-byte region [ffff8806bf20f400, ffff8806bf20f440) [ 3083.158168] The buggy address belongs to the page: [ 
3083.158171] page:00000000d43decc4 count:1 mapcount:0 mapping: (null) index:0x0 [ 3083.158174] flags: 0x17ffe0000000100(slab) [ 3083.158179] raw: 017ffe0000000100 0000000000000000 0000000000000000 0000000180200020 [ 3083.158182] raw: ffffea001afc16c0 0000000500000005 ffff880731b881c0 0000000000000000 [ 3083.158184] page dumped because: kasan: bad access detected [ 3083.158187] Memory state around the buggy address: [ 3083.158190] ffff8806bf20f300: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158192] ffff8806bf20f380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158195] >ffff8806bf20f400: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158196] ^ [ 3083.158199] ffff8806bf20f480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158201] ffff8806bf20f500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc [ 3083.158203] ================================================================== Reported-by: Alexandru Chirvasitu Reported-by: Mike Keehan Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104436 Fixes: 1f181225f8ec ("drm/i915/execlists: Keep request->priority for its lifetime") Signed-off-by: Chris Wilson Cc: Alexandru Chirvasitu Cc: Michał Winiarski Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Tested-by: Alexandru Chirvasitu Reviewed-by: Michał Winiarski Link: https://patchwork.freedesktop.org/patch/msgid/20180106105618.13532-1-chris@chris-wilson.co.uk (cherry picked from commit c218ee03b9315073ce43992792554dafa0626eb8) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/intel_lrc.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 18de6569d04a..5cfba89ed586 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -467,7 +467,7 @@ static void __fence_set_priority(struct dma_fence *fence, int prio) struct drm_i915_gem_request *rq; struct intel_engine_cs *engine; - if 
(!dma_fence_is_i915(fence)) + if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence)) return; rq = to_request(fence); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index d36e25607435..e71a8cd50498 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -974,6 +974,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) GEM_BUG_ON(prio == I915_PRIORITY_INVALID); + if (i915_gem_request_completed(request)) + return; + if (prio <= READ_ONCE(request->priotree.priority)) return; From 2a266f23550be997d783f27e704b9b40c4010292 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 10 Jan 2018 21:44:42 +0800 Subject: [PATCH 183/236] KVM MMU: check pending exception before injecting APF For example, when two APF's for page ready happen after one exit and the first one becomes pending, the second one will result in #DF. Instead, just handle the second page fault synchronously. Reported-by: Ross Zwisler Message-ID: Reported-by: Alec Blayne Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c4deb1f34faa..e577bacd4bd0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu))) + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) return false; if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) From ab271bd4dfd568060ffcf5a21b667c7c5df7ab99 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 17:26:59 +0100 Subject: [PATCH 184/236] x86: kvm: propagate register_shrinker return code Patch "mm,vmscan: mark register_shrinker() as __must_check" is queued for 4.16 in 
linux-mm and adds a warning about the unchecked call to register_shrinker: arch/x86/kvm/mmu.c:5485:2: warning: ignoring return value of 'register_shrinker', declared with attribute warn_unused_result [-Wunused-result] This changes the kvm_mmu_module_init() function to fail itself when the call to register_shrinker fails. Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e577bacd4bd0..2b8eb4da4d08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5466,30 +5466,34 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + int ret = -ENOMEM; + kvm_mmu_clear_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); if (!pte_list_desc_cache) - goto nomem; + goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) - goto nomem; + goto out; if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto nomem; + goto out; - register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker); + if (ret) + goto out; return 0; -nomem: +out: mmu_destroy_caches(); - return -ENOMEM; + return ret; } /* From bd89525a823ce6edddcedbe9aed79faa1b9cf544 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jan 2018 16:55:24 +0100 Subject: [PATCH 185/236] KVM: x86: emulate #UD while in guest mode This reverts commits ae1f57670703656cc9f293722c3b8b6782f8ab3f and ac9b305caa0df6f5b75d294e4b86c1027648991e. If the hardware doesn't support MOVBE, but L0 sets CPUID.01H:ECX.MOVBE in L1's emulated CPUID information, then L1 is likely to pass that CPUID bit through to L2. 
L2 will expect MOVBE to work, but if L1 doesn't intercept #UD, then any MOVBE instruction executed in L2 will raise #UD, and the exception will be delivered in L2. Commit ac9b305caa0df6f5b75d294e4b86c1027648991e is a better and more complete version of ae1f57670703 ("KVM: nVMX: Do not emulate #UD while in guest mode"); however, neither considers the above case. Suggested-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +-------- arch/x86/kvm/vmx.c | 5 +---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bb31c801f1fc..3158dac87f82 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -361,7 +361,6 @@ static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; struct nested_state *g; - u32 h_intercept_exceptions; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -372,14 +371,9 @@ static void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested; - /* No need to intercept #UD if L1 doesn't intercept it */ - h_intercept_exceptions = - h->intercept_exceptions & ~(1U << UD_VECTOR); - c->intercept_cr = h->intercept_cr | g->intercept_cr; c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = - h_intercept_exceptions | g->intercept_exceptions; + c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; c->intercept = h->intercept | g->intercept; } @@ -2202,7 +2196,6 @@ static int ud_interception(struct vcpu_svm *svm) { int er; - WARN_ON_ONCE(is_guest_mode(&svm->vcpu)); er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c14d65f676a..427fd3200dd8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1887,7 +1887,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) | + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | 
(1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == @@ -1905,8 +1905,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; - else - eb |= 1u << UD_VECTOR; vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -5917,7 +5915,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; /* already handled by vmx_vcpu_run() */ if (is_invalid_opcode(intr_info)) { - WARN_ON_ONCE(is_guest_mode(vcpu)); er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; From 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Wed, 10 Jan 2018 10:12:03 -0800 Subject: [PATCH 186/236] KVM: x86: Add memory barrier on vmcs field lookup This adds a memory barrier when performing a lookup into the vmcs_field_to_offset_table. This is related to CVE-2017-5753. Signed-off-by: Andrew Honig Reviewed-by: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6f4f095f8f4..7f8fcc5ce664 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -884,8 +884,16 @@ static inline short vmcs_field_to_offset(unsigned long field) { BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || - vmcs_field_to_offset_table[field] == 0) + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + return -ENOENT; + + /* + * FIXME: Mitigation for CVE-2017-5753. To be replaced with a + * generic mechanism. 
+ */ + asm("lfence"); + + if (vmcs_field_to_offset_table[field] == 0) return -ENOENT; return vmcs_field_to_offset_table[field]; From f32ab7547161b9fa7ebfbc4f18ea1eb3fd49fe25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 11 Jan 2018 14:23:29 +0100 Subject: [PATCH 187/236] x86/PCI: Add "pci=big_root_window" option for AMD 64-bit windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only try to enable a 64-bit window on AMD CPUs when "pci=big_root_window" is specified. This taints the kernel because the new 64-bit window uses address space we don't know anything about, and it may contain unreported devices or memory that would conflict with the window. The pci_amd_enable_64bit_bar() quirk that enables the window is specific to AMD CPUs. The generic solution would be to have the firmware enable the window and describe it in the host bridge's _CRS method, or at least describe it in the _PRS method so the OS would have the option of enabling it. Signed-off-by: Christian König [bhelgaas: changelog, extend doc, mention taint in dmesg] Signed-off-by: Bjorn Helgaas --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 5 +++++ arch/x86/pci/fixup.c | 7 ++++++- 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6571fbfdb2a1..619638362416 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3094,6 +3094,12 @@ pcie_scan_all Scan all possible PCIe devices. Otherwise we only look for one device below a PCIe downstream port. + big_root_window Try to add a big 64bit memory window to the PCIe + root complex on AMD CPUs. Some GFX hardware + can resize a BAR to allow access to all VRAM.
+ Adding the window is slightly risky (it may + conflict with unreported devices), so this + taints the kernel. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7a5d6695abd3..eb66fa9cd0fc 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -38,6 +38,7 @@ do { \ #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 #define PCI_NOASSIGN_BARS 0x200000 +#define PCI_BIG_ROOT_WINDOW 0x400000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7a5350d08cef..563049c483a1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "nocrs")) { pci_probe |= PCI_ROOT_NO_CRS; return NULL; +#ifdef CONFIG_PHYS_ADDR_T_64BIT + } else if (!strcmp(str, "big_root_window")) { + pci_probe |= PCI_BIG_ROOT_WINDOW; + return NULL; +#endif } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e663d6bf1328..8bad19c7473d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -667,6 +667,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) struct resource *res, *conflict; struct pci_dev *other; + if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) + return; + /* Check that we are the only device of that type */ other = pci_get_device(dev->vendor, dev->device, NULL); if (other != dev || @@ -714,7 +717,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) res->start = conflict->end + 1; } - dev_info(&dev->dev, "adding root bus resource %pR\n", res); + dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", + res); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | 
AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; From b8626f1dc29d3eee444bfaa92146ec7b291ef41c Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 11 Jan 2018 14:47:40 +0100 Subject: [PATCH 188/236] usb: misc: usb3503: make sure reset is low for at least 100us When using a GPIO which is high by default, and initializing the driver in USB Hub mode, initialization fails with: [ 111.757794] usb3503 0-0008: SP_ILOCK failed (-5) The reason seems to be that the chip is not properly reset. Probe does initialize reset low, however some lines later the code already sets it back high, which is not long enough. Make sure reset is asserted for at least 100us by inserting a delay after initializing the reset pin during probe. Signed-off-by: Stefan Agner Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb3503.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/misc/usb3503.c b/drivers/usb/misc/usb3503.c index 465dbf68b463..f723f7b8c9ac 100644 --- a/drivers/usb/misc/usb3503.c +++ b/drivers/usb/misc/usb3503.c @@ -279,6 +279,8 @@ static int usb3503_probe(struct usb3503 *hub) if (gpio_is_valid(hub->gpio_reset)) { err = devm_gpio_request_one(dev, hub->gpio_reset, GPIOF_OUT_INIT_LOW, "usb3503 reset"); + /* Datasheet defines a hardware reset to be at least 100us */ + usleep_range(100, 10000); if (err) { dev_err(dev, "unable to request GPIO %d as reset pin (%d)\n", From 1a2e91e795def04e15fac87b8e16b635691d0b82 Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Tue, 9 Jan 2018 13:27:17 -0600 Subject: [PATCH 189/236] Documentation: usb: fix typo in UVC gadgetfs config command This seems to be a copy&paste error. With the fix the uvc gadget now can be created by following the instructions.
Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- Documentation/usb/gadget-testing.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt index 441a4b9b666f..5908a21fddb6 100644 --- a/Documentation/usb/gadget-testing.txt +++ b/Documentation/usb/gadget-testing.txt @@ -693,7 +693,7 @@ such specification consists of a number of lines with an inverval value in each line. The rules stated above are best illustrated with an example: # mkdir functions/uvc.usb0/control/header/h -# cd functions/uvc.usb0/control/header/h +# cd functions/uvc.usb0/control/ # ln -s header/h class/fs # ln -s header/h class/ss # mkdir -p functions/uvc.usb0/streaming/uncompressed/u/360p From 03a551734cfc2b93f83950a595974e3c9cbd82fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 11 Jan 2018 14:23:30 +0100 Subject: [PATCH 190/236] x86/PCI: Move and shrink AMD 64-bit window to avoid conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid problems with BIOS implementations which don't report all used resources to the OS by only allocating a 256GB window directly below the hardware limit (from the BKDG, sec 2.4.6). Fixes a silent reboot loop reported by Aaro Koskinen on an AMD-based MSI MS-7699/760GA-P43(FX) system. This was apparently caused by RAM or other unreported hardware that conflicted with the new window.
Link: https://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf Link: https://lkml.kernel.org/r/20180105220412.fzpwqe4zljdawr36@darkstar.musicnaut.iki.fi Fixes: fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") Reported-by: Aaro Koskinen Signed-off-by: Christian König [bhelgaas: changelog, comment, Fixes:] Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 8bad19c7473d..f6a26e3cb476 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -662,10 +662,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); */ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) { - unsigned i; u32 base, limit, high; - struct resource *res, *conflict; struct pci_dev *other; + struct resource *res; + unsigned i; + int r; if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) return; @@ -702,19 +703,20 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) if (!res) return; + /* + * Allocate a 256GB window directly below the 0xfd00000000 hardware + * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). 
+ */ res->name = "PCI Bus 0000:00"; res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_WINDOW; - res->start = 0x100000000ull; + res->start = 0xbd00000000ull; res->end = 0xfd00000000ull - 1; - /* Just grab the free area behind system memory for this */ - while ((conflict = request_resource_conflict(&iomem_resource, res))) { - if (conflict->end >= res->end) { - kfree(res); - return; - } - res->start = conflict->end + 1; + r = request_resource(&iomem_resource, res); + if (r) { + kfree(res); + return; } dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", From 445b69e3b75e42362a5bdc13c8b8f61599e2228a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Jan 2018 14:49:39 -0800 Subject: [PATCH 191/236] x86/pti: Make unpoison of pgd for trusted boot work for real The initial fix for trusted boot and PTI potentially misses the pgd clearing if pud_alloc() sets a PGD. It probably works in *practice* because for two adjacent calls to map_tboot_page() that share a PGD entry, the first will clear NX, *then* allocate and set the PGD (without NX clear). The second call will *not* allocate but will clear the NX bit. Defer the NX clearing to a point after it is known that all top-level allocations have occurred. Add a comment to clarify why.
[ tglx: Massaged changelog ] Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Andrea Arcangeli Cc: Jon Masters Cc: "Tim Chen" Cc: gnomes@lxorguk.ukuu.org.uk Cc: peterz@infradead.org Cc: ning.sun@intel.com Cc: tboot-devel@lists.sourceforge.net Cc: andi@firstfloor.org Cc: luto@kernel.org Cc: law@redhat.com Cc: pbonzini@redhat.com Cc: torvalds@linux-foundation.org Cc: gregkh@linux-foundation.org Cc: dwmw@amazon.co.uk Cc: nickc@redhat.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com --- arch/x86/kernel/tboot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 75869a4b6c41..a2486f444073 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -127,7 +127,6 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, p4d = p4d_alloc(&tboot_mm, pgd, vaddr); if (!p4d) return -1; - pgd->pgd &= ~_PAGE_NX; pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; @@ -139,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in p4d_alloc() _or_ + * pud_alloc() depending on 4/5-level paging. + */ + pgd->pgd &= ~_PAGE_NX; + return 0; } From 39b735332cb8b33a27c28592d969e4016c86c3ea Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 11 Jan 2018 21:46:23 +0000 Subject: [PATCH 192/236] objtool: Detect jumps to retpoline thunks A direct jump to a retpoline thunk is really an indirect jump in disguise. Change the objtool instruction type accordingly. 
Objtool needs to know where indirect branches are so it can detect switch statement jump tables. This fixes a bunch of warnings with CONFIG_RETPOLINE like: arch/x86/events/intel/uncore_nhmex.o: warning: objtool: nhmex_rbox_msr_enable_event()+0x44: sibling call from callable instruction with modified stack frame kernel/signal.o: warning: objtool: copy_siginfo_to_user()+0x91: sibling call from callable instruction with modified stack frame ... Signed-off-by: Josh Poimboeuf Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-2-git-send-email-dwmw@amazon.co.uk --- tools/objtool/check.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 9b341584eb1b..de053fb7049b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -456,6 +456,13 @@ static int add_jump_destinations(struct objtool_file *file) } else if (rela->sym->sec->idx) { dest_sec = rela->sym->sec; dest_off = rela->sym->sym.st_value + rela->addend + 4; + } else if (strstr(rela->sym->name, "_indirect_thunk_")) { + /* + * Retpoline jumps are really dynamic jumps in + * disguise, so convert them accordingly. + */ + insn->type = INSN_JUMP_DYNAMIC; + continue; } else { /* sibling call */ insn->jump_dest = 0; From 258c76059cece01bebae098e81bacb1af2edad17 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 11 Jan 2018 21:46:24 +0000 Subject: [PATCH 193/236] objtool: Allow alternatives to be ignored Getting objtool to understand retpolines is going to be a bit of a challenge. For now, take advantage of the fact that retpolines are patched in with alternatives. 
Just read the original (sane) non-alternative instruction, and ignore the patched-in retpoline. This allows objtool to understand the control flow *around* the retpoline, even if it can't yet follow what's inside. This means the ORC unwinder will fail to unwind from inside a retpoline, but will work fine otherwise. Signed-off-by: Josh Poimboeuf Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-3-git-send-email-dwmw@amazon.co.uk --- tools/objtool/check.c | 62 ++++++++++++++++++++++++++++++++++++++----- tools/objtool/check.h | 2 +- 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index de053fb7049b..f40d46e24bcc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -427,6 +427,40 @@ static void add_ignores(struct objtool_file *file) } } +/* + * FIXME: For now, just ignore any alternatives which add retpolines. This is + * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline. + * But it at least allows objtool to understand the control flow *around* the + * retpoline. 
+ */ +static int add_nospec_ignores(struct objtool_file *file) +{ + struct section *sec; + struct rela *rela; + struct instruction *insn; + + sec = find_section_by_name(file->elf, ".rela.discard.nospec"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", sec->name); + return -1; + } + + insn = find_insn(file, rela->sym->sec, rela->addend); + if (!insn) { + WARN("bad .discard.nospec entry"); + return -1; + } + + insn->ignore_alts = true; + } + + return 0; +} + /* * Find the destination instructions for all jumps. */ @@ -509,11 +543,18 @@ static int add_call_destinations(struct objtool_file *file) dest_off = insn->offset + insn->len + insn->immediate; insn->call_dest = find_symbol_by_offset(insn->sec, dest_off); + /* + * FIXME: Thanks to retpolines, it's now considered + * normal for a function to call within itself. So + * disable this warning for now. + */ +#if 0 if (!insn->call_dest) { WARN_FUNC("can't find call dest symbol at offset 0x%lx", insn->sec, insn->offset, dest_off); return -1; } +#endif } else if (rela->sym->type == STT_SECTION) { insn->call_dest = find_symbol_by_offset(rela->sym->sec, rela->addend+4); @@ -678,12 +719,6 @@ static int add_special_section_alts(struct objtool_file *file) return ret; list_for_each_entry_safe(special_alt, tmp, &special_alts, list) { - alt = malloc(sizeof(*alt)); - if (!alt) { - WARN("malloc failed"); - ret = -1; - goto out; - } orig_insn = find_insn(file, special_alt->orig_sec, special_alt->orig_off); @@ -694,6 +729,10 @@ static int add_special_section_alts(struct objtool_file *file) goto out; } + /* Ignore retpoline alternatives. 
*/ + if (orig_insn->ignore_alts) + continue; + new_insn = NULL; if (!special_alt->group || special_alt->new_len) { new_insn = find_insn(file, special_alt->new_sec, @@ -719,6 +758,13 @@ static int add_special_section_alts(struct objtool_file *file) goto out; } + alt = malloc(sizeof(*alt)); + if (!alt) { + WARN("malloc failed"); + ret = -1; + goto out; + } + alt->insn = new_insn; list_add_tail(&alt->list, &orig_insn->alts); @@ -1035,6 +1081,10 @@ static int decode_sections(struct objtool_file *file) add_ignores(file); + ret = add_nospec_ignores(file); + if (ret) + return ret; + ret = add_jump_destinations(file); if (ret) return ret; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 47d9ea70a83d..dbadb304a410 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -44,7 +44,7 @@ struct instruction { unsigned int len; unsigned char type; unsigned long immediate; - bool alt_group, visited, dead_end, ignore, hint, save, restore; + bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts; struct symbol *call_dest; struct instruction *jump_dest; struct list_head alts; From 76b043848fd22dbf7f8bf3a1452f8c70d557b860 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:25 +0000 Subject: [PATCH 194/236] x86/retpoline: Add initial retpoline support Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide the corresponding thunks. Provide assembler macros for invoking the thunks in the same way that GCC does, from native and inline assembler. This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In some circumstances, IBRS microcode features may be used instead, and the retpoline can be disabled. On AMD CPUs if lfence is serialising, the retpoline can be dramatically simplified to a simple "lfence; jmp *\reg". 
A future patch, after it has been verified that lfence really is serialising in all circumstances, can enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition to X86_FEATURE_RETPOLINE. Do not align the retpoline in the altinstr section, because there is no guarantee that it stays aligned when it's copied over the oldinstr during alternative patching. [ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] [ tglx: Put actual function CALL/JMP in front of the macros, convert to symbolic labels ] [ dwmw2: Convert back to numeric labels, merge objtool fixes ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk --- arch/x86/Kconfig | 13 +++ arch/x86/Makefile | 10 ++ arch/x86/include/asm/asm-prototypes.h | 25 +++++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/nospec-branch.h | 128 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 4 + arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 48 ++++++++++ 8 files changed, 231 insertions(+) create mode 100644 arch/x86/include/asm/nospec-branch.h create mode 100644 arch/x86/lib/retpoline.S diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e23d21ac745a..d1819161cc6c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -429,6 +429,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + default y + help + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. 
Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + + Without compiler support, at least indirect branches in assembler + code are eliminated. Since this includes the syscall entry path, + it is not entirely pointless. + config INTEL_RDT bool "Intel Resource Director Technology support" default n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a20eacd9c7e9..974c61864978 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -235,6 +235,16 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE + else + $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) + endif +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ff700d81e91e..0927cdc4f946 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -11,7 +11,32 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ 
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1641c2f96363..f275447862f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -203,6 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..e20e92ef2ca8 --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +.macro ANNOTATE_NOSPEC_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.nospec + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. 
+ */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.nospec\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ANNOTATE_NOSPEC_ALTERNATIVE \ + ALTERNATIVE( \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. 
+ */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 372ba3fb400f..7a671d1ae3cb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -905,6 +905,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +#ifdef CONFIG_RETPOLINE + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +#endif + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 457f681ef379..d435c89875c1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S new file mode 100644 index 000000000000..cb45c6cb465f --- /dev/null +++ b/arch/x86/lib/retpoline.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include +#include + +.macro THUNK reg + .section .text.__x86.indirect_thunk.\reg + +ENTRY(__x86_indirect_thunk_\reg) + CFI_STARTPROC + JMP_NOSPEC %\reg + CFI_ENDPROC +ENDPROC(__x86_indirect_thunk_\reg) +.endm + +/* + * Despite being 
an assembler file we can't just use .irp here + * because __KSYM_DEPS__ only uses the C preprocessor and would + * only see one instance of "__x86_indirect_thunk_\reg" rather + * than one per register with the correct names. So we do it + * the simple and nasty way... + */ +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + +GENERATE_THUNK(_ASM_AX) +GENERATE_THUNK(_ASM_BX) +GENERATE_THUNK(_ASM_CX) +GENERATE_THUNK(_ASM_DX) +GENERATE_THUNK(_ASM_SI) +GENERATE_THUNK(_ASM_DI) +GENERATE_THUNK(_ASM_BP) +GENERATE_THUNK(_ASM_SP) +#ifdef CONFIG_64BIT +GENERATE_THUNK(r8) +GENERATE_THUNK(r9) +GENERATE_THUNK(r10) +GENERATE_THUNK(r11) +GENERATE_THUNK(r12) +GENERATE_THUNK(r13) +GENERATE_THUNK(r14) +GENERATE_THUNK(r15) +#endif From da285121560e769cc31797bba6422eea71d473e0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:26 +0000 Subject: [PATCH 195/236] x86/spectre: Add boot time option to select Spectre v2 mitigation Add a spectre_v2= option to select the mitigation used for the indirect branch speculation vulnerability. Currently, the only option available is retpoline, in its various forms. This will be expanded to cover the new IBRS/IBPB microcode features. The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a serializing instruction, which is indicated by the LFENCE_RDTSC feature. 
[ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS integration becomes simple ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk --- .../admin-guide/kernel-parameters.txt | 28 ++++ arch/x86/include/asm/nospec-branch.h | 10 ++ arch/x86/kernel/cpu/bugs.c | 158 +++++++++++++++++- arch/x86/kernel/cpu/common.c | 4 - 4 files changed, 195 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 905991745d26..8122b5f98ea1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2599,6 +2599,11 @@ nosmt [KNL,S390] Disable symmetric multithreading (SMT). Equivalent to smt=1. + nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2 + (indirect branch prediction) vulnerability. System may + allow data leaks with this option, which is equivalent + to spectre_v2=off. + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. @@ -3908,6 +3913,29 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt + spectre_v2= [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability. 
+ + on - unconditionally enable + off - unconditionally disable + auto - kernel detects whether your CPU model is + vulnerable + + Selecting 'on' will, and 'auto' may, choose a + mitigation method at run time according to the + CPU, the available microcode, the setting of the + CONFIG_RETPOLINE configuration option, and the + compiler with which the kernel was built. + + Specific mitigations can also be selected manually: + + retpoline - replace indirect branches + retpoline,generic - google's original retpoline + retpoline,amd - AMD-specific minimal thunk + + Not specifying this option is equivalent to + spectre_v2=auto. + spia_io_base= [HW,MTD] spia_fio_base= spia_pedr= diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e20e92ef2ca8..ea034fa6e261 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -124,5 +124,15 @@ # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 76ad6cb44b40..e4dc26185aa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include #include #include @@ -21,6 +24,8 @@ #include #include +static void __init spectre_v2_select_mitigation(void); + void __init check_bugs(void) { identify_boot_cpu(); @@ -30,6 +35,9 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. 
@@ -62,6 +70,153 @@ void __init check_bugs(void) #endif } +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret > 0) { + if (match_option(arg, ret, "off")) { + goto disable; + } else if (match_option(arg, ret, "on")) { + spec2_print_if_secure("force enabled on command line."); + return SPECTRE_V2_CMD_FORCE; + } else if (match_option(arg, ret, "retpoline")) { + spec2_print_if_insecure("retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE; + } else if (match_option(arg, ret, "retpoline,amd")) { + if 
(boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + spec2_print_if_insecure("AMD retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_AMD; + } else if (match_option(arg, ret, "retpoline,generic")) { + spec2_print_if_insecure("generic retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (match_option(arg, ret, "auto")) { + return SPECTRE_V2_CMD_AUTO; + } + } + + if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_AUTO; +disable: + spec2_print_if_insecure("disabled on command line."); + return SPECTRE_V2_CMD_NONE; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: + goto retpoline_auto; + + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? 
SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); +} + +#undef pr_fmt + #ifdef CONFIG_SYSFS ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -86,6 +241,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + + return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7a671d1ae3cb..372ba3fb400f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -905,10 +905,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -#ifdef CONFIG_RETPOLINE - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); -#endif - fpu__init_system(c); #ifdef CONFIG_X86_32 From 9697fa39efd3fc3692f2949d4045f393ec58450b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:27 +0000 Subject: [PATCH 196/236] x86/retpoline/crypto: Convert crypto assembler indirect jumps Convert all indirect jumps in crypto assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk --- arch/x86/crypto/aesni-intel_asm.S | 5 +++-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 3 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 16627fec80b2..3d09e3aca18d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,7 @@ #include #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index f7c495e2863c..a14af6eb09cb 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,6 +17,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way: vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 
eee5b3982cfd..b66bbfa62f50 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -12,6 +12,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way: vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 7a7de27c6f41..d9b734d0c8cc 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@ #include #include +#include ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ continue_block: movzxw (bufp, %rax, 2), len lea crc_array(%rip), bufp lea (bufp, len, 1), bufp - jmp *bufp + JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: From 2641f08bb7fc63a636a2b18173221d7040a3512e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:28 +0000 Subject: [PATCH 197/236] x86/retpoline/entry: Convert entry assembler indirect jumps Convert indirect jumps in core 32/64bit entry assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return address after the 'call' instruction must be *precisely* at the .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, and the use of alternatives will mess that up unless we play horrid games to prepend with NOPs and make the variants the same length. It's not worth it; in the case where we ALTERNATIVE out the retpoline, the first instruction at __x86.indirect_thunk.rax is going to be a bare jmp *%rax anyway. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/entry/entry_64.S | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ace8f321a5a1..a1f28a54f23a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -290,7 +291,7 @@ ENTRY(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - call *%ebx + CALL_NOSPEC %ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -919,7 +920,7 @@ common_exception: movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(common_exception) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ed31d00dc5ee..59874bc1aed2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "calling.h" @@ -187,7 +188,7 @@ ENTRY(entry_SYSCALL_64_trampoline) */ pushq %rdi movq $entry_SYSCALL_64_stage2, %rdi - jmp *%rdi + JMP_NOSPEC %rdi END(entry_SYSCALL_64_trampoline) .popsection @@ -266,7 +267,12 @@ entry_SYSCALL_64_fastpath: * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. 
*/ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif .Lentry_SYSCALL_64_after_fastpath_call: movq %rax, RAX(%rsp) @@ -438,7 +444,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - jmp *%rax /* Called from C */ + JMP_NOSPEC %rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -517,7 +523,7 @@ ENTRY(ret_from_fork) 1: /* kernel thread */ movq %r12, %rdi - call *%rbx + CALL_NOSPEC %rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() From 9351803bd803cdbeb9b5a7850b7b6f464806e3db Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:29 +0000 Subject: [PATCH 198/236] x86/retpoline/ftrace: Convert ftrace assembler indirect jumps Convert all indirect jumps in ftrace assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/ftrace_32.S | 6 ++++-- arch/x86/kernel/ftrace_64.S | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index b6c6468e10bc..4c8440de3355 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CC_USING_FENTRY # define function_hook __fentry__ @@ -197,7 +198,8 @@ ftrace_stub: movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl 
ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -241,5 +243,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index c832291d948a..7cb8ba08beb9 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -7,7 +7,7 @@ #include #include #include - +#include .code64 .section .entry.text, "ax" @@ -286,8 +286,8 @@ trace: * ip and parent ip are used and the list function is called when * function tracing is enabled. */ - call *ftrace_trace_function - + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace @@ -329,5 +329,5 @@ GLOBAL(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi #endif From e70e5892b28c18f517f29ab6e83bd57705104b31 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:30 +0000 Subject: [PATCH 199/236] x86/retpoline/hyperv: Convert assembler indirect jumps Convert all indirect jumps in hyperv inline asm code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-9-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/mshyperv.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 581bb54dd464..5119e4b555cc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent @@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return U64_MAX; __asm__ __volatile__("mov %4, %%r8\n" - "call *%5" + CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), "m" (hv_hypercall_pg) + : "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("call *%7" + __asm__ __volatile__(CALL_NOSPEC : "=A" (hv_status), "+c" (input_address_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input_address_hi), "D"(output_address_hi), "S"(output_address_lo), - "m" (hv_hypercall_pg) + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory"); #endif /* !x86_64 */ return hv_status; @@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) #ifdef CONFIG_X86_64 { - __asm__ __volatile__("call *%4" + __asm__ 
__volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "m" (hv_hypercall_pg) + : THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } #else @@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) u32 input1_hi = upper_32_bits(input1); u32 input1_lo = lower_32_bits(input1); - __asm__ __volatile__ ("call *%5" + __asm__ __volatile__ (CALL_NOSPEC : "=A"(hv_status), "+c"(input1_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input1_hi), - "m" (hv_hypercall_pg) + THUNK_TARGET(hv_hypercall_pg) : "cc", "edi", "esi"); } #endif From ea08816d5b185ab3d09e95e393f265af54560350 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:31 +0000 Subject: [PATCH 200/236] x86/retpoline/xen: Convert Xen hypercall indirect jumps Convert indirect call in Xen hypercall to use non-speculative sequence, when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7cb282e9e587..bfd882617613 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -217,9 +218,9 @@ privcmd_call(unsigned call, __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" 
(&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); From 5096732f6f695001fa2d6f1335a2680b37912c69 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:32 +0000 Subject: [PATCH 201/236] x86/retpoline/checksum32: Convert assembler indirect jumps Convert all indirect jumps in 32bit checksum assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk --- arch/x86/lib/checksum_32.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4d34bb548b41..46e71a74e612 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -29,7 +29,8 @@ #include #include #include - +#include + /* * computes a partial checksum, e.g. for TCP/UDP fragments */ @@ -156,7 +157,7 @@ ENTRY(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) From 7614e913db1f40fff819b36216484dc3808995d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 11 Jan 2018 21:46:33 +0000 Subject: [PATCH 202/236] x86/retpoline/irq32: Convert assembler indirect jumps Convert all indirect jumps in 32bit irq inline asm code to use non speculative sequences. 
Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/irq_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a83b3346a0e1..c1bdbd3d3232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } From 117cc7a908c83697b0b737d15ae1eb5943afe35b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 11:11:27 +0000 Subject: [PATCH 203/236] x86/retpoline: Fill return stack buffer on vmexit In accordance with the Intel and AMD documentation, we need to overwrite all entries in the RSB on exiting a guest, to prevent malicious branch target predictions from affecting the host kernel. This is needed both for retpoline and for IBRS. 
[ak: numbers again for the RSB stuffing labels] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/nospec-branch.h | 78 +++++++++++++++++++++++++++- arch/x86/kvm/svm.c | 4 ++ arch/x86/kvm/vmx.c | 4 ++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ea034fa6e261..402a11c803c3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,6 +7,48 @@ #include #include +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include ") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. 
+ */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; + #ifdef __ASSEMBLY__ /* @@ -74,6 +116,20 @@ #else call *\reg #endif +.endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: +#endif .endm #else /* __ASSEMBLY__ */ @@ -119,7 +175,7 @@ X86_FEATURE_RETPOLINE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) -#else /* No retpoline */ +#else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -134,5 +190,25 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. 
+ */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + unsigned long loops = RSB_CLEAR_LOOPS / 2; + + asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=&r" (loops), ASM_CALL_CONSTRAINT + : "r" (loops) : "memory" ); +#endif +} #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0e68f0b3cbf7..2744b97345b8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "trace.h" @@ -4985,6 +4986,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 62ee4362e1c1..d1e25dba3112 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -9403,6 +9404,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); From 3c22a73fb87366851dcf48d852357a6d808921cc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 11 Jan 2018 13:52:08 -0600 Subject: [PATCH 204/236] ASoC: Intel: bytcht_es8316: fix HID handling Same problem as with previous machine drivers, the codec dai uses a hard-coded name of "i2c-ESSX8316:00" but ACPI provides "i2c-ESSX8316:01" in some systems. 
Fix by overriding the hard-coded value with the codec name derived from the HID information Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=189261 Signed-off-by: Pierre-Louis Bossart Reviewed-by: Andy Shevchenko Acked-By: Vinod Koul Signed-off-by: Mark Brown --- sound/soc/intel/boards/Kconfig | 1 + sound/soc/intel/boards/bytcht_es8316.c | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/Kconfig b/sound/soc/intel/boards/Kconfig index de598dcbef30..d4e103615f51 100644 --- a/sound/soc/intel/boards/Kconfig +++ b/sound/soc/intel/boards/Kconfig @@ -139,6 +139,7 @@ config SND_SOC_INTEL_BYT_CHT_DA7213_MACH config SND_SOC_INTEL_BYT_CHT_ES8316_MACH tristate "Baytrail & Cherrytrail with ES8316 codec" depends on X86_INTEL_LPSS && I2C && ACPI + select SND_SOC_ACPI select SND_SOC_ES8316 help This adds support for ASoC machine driver for Intel(R) Baytrail & diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index 8088396717e3..ae24f6205f05 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -232,15 +232,39 @@ static struct snd_soc_card byt_cht_es8316_card = { .fully_routed = true, }; +static char codec_name[16]; /* i2c-:00 with HID being 8 chars */ + static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) { - int ret = 0; struct byt_cht_es8316_private *priv; + struct snd_soc_acpi_mach *mach; + const char *i2c_name = NULL; + int dai_index = 0; + int i; + int ret = 0; priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_ATOMIC); if (!priv) return -ENOMEM; + mach = (&pdev->dev)->platform_data; + /* fix index of codec dai */ + for (i = 0; i < ARRAY_SIZE(byt_cht_es8316_dais); i++) { + if (!strcmp(byt_cht_es8316_dais[i].codec_name, + "i2c-ESSX8316:00")) { + dai_index = i; + break; + } + } + + /* fixup codec name based on HID */ + i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + if (i2c_name) { + 
snprintf(codec_name, sizeof(codec_name), + "%s%s", "i2c-", i2c_name); + byt_cht_es8316_dais[dai_index].codec_name = codec_name; + } + /* register the soc card */ byt_cht_es8316_card.dev = &pdev->dev; snd_soc_card_set_drvdata(&byt_cht_es8316_card, priv); From 2be2d57986431626e905ee344086affa44c5bb9b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 11 Jan 2018 13:52:09 -0600 Subject: [PATCH 205/236] ASoC: acpi: remove hard-coded i2c-device name length Remove hard-coded [16] array size, replace with clearer description and dependency on ACPI_ID_LEN No functionality change Suggested-by: Andy Shevchenko Signed-off-by: Pierre-Louis Bossart Reviewed-by: Andy Shevchenko Acked-By: Vinod Koul Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 3 +++ sound/soc/intel/boards/bytcht_da7213.c | 2 +- sound/soc/intel/boards/bytcht_es8316.c | 2 +- sound/soc/intel/boards/bytcr_rt5640.c | 2 +- sound/soc/intel/boards/bytcr_rt5651.c | 2 +- sound/soc/intel/boards/cht_bsw_rt5645.c | 4 ++-- sound/soc/intel/boards/cht_bsw_rt5672.c | 2 +- 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index d1aaf876cd26..83320af8def2 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -27,6 +27,9 @@ struct snd_soc_acpi_package_context { bool data_valid; }; +/* codec name is used in DAIs is i2c-<HID>:00 with HID being 8 chars */ +#define SND_ACPI_I2C_ID_LEN (4 + ACPI_ID_LEN + 3 + 1) + #if IS_ENABLED(CONFIG_ACPI) /* translation fron HID to I2C name, needed for DAI codec_name */ const char *snd_soc_acpi_find_name_from_hid(const u8 hid[ACPI_ID_LEN]); diff --git a/sound/soc/intel/boards/bytcht_da7213.c b/sound/soc/intel/boards/bytcht_da7213.c index c4d82ad41bd7..6219c04d4731 100644 --- a/sound/soc/intel/boards/bytcht_da7213.c +++ b/sound/soc/intel/boards/bytcht_da7213.c @@ -219,7 +219,7 @@ static struct snd_soc_card bytcht_da7213_card = { .num_dapm_routes = ARRAY_SIZE(audio_map), }; -static char codec_name[16]; 
/* i2c-<HID>:00 with HID being 8 chars */ +static char codec_name[SND_ACPI_I2C_ID_LEN]; static int bytcht_da7213_probe(struct platform_device *pdev) { diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index ae24f6205f05..079f35cd4eaf 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -232,7 +232,7 @@ static struct snd_soc_card byt_cht_es8316_card = { .fully_routed = true, }; -static char codec_name[16]; /* i2c-<HID>:00 with HID being 8 chars */ +static char codec_name[SND_ACPI_I2C_ID_LEN]; static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) { diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index f2c0fc415e52..4548f75498d0 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -713,7 +713,7 @@ static struct snd_soc_card byt_rt5640_card = { .fully_routed = true, }; -static char byt_rt5640_codec_name[16]; /* i2c-<HID>:00 with HID being 8 chars */ +static char byt_rt5640_codec_name[SND_ACPI_I2C_ID_LEN]; static char byt_rt5640_codec_aif_name[12]; /* = "rt5640-aif[1|2]" */ static char byt_rt5640_cpu_dai_name[10]; /* = "ssp[0|2]-port" */ diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 22c9cc5d135e..5a6b7dedb773 100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -509,7 +509,7 @@ static struct snd_soc_card byt_rt5651_card = { .fully_routed = true, }; -static char byt_rt5651_codec_name[16]; /* i2c-<HID>:00 with HID being 8 chars */ +static char byt_rt5651_codec_name[SND_ACPI_I2C_ID_LEN]; static int snd_byt_rt5651_mc_probe(struct platform_device *pdev) { diff --git a/sound/soc/intel/boards/cht_bsw_rt5645.c b/sound/soc/intel/boards/cht_bsw_rt5645.c index 18d129caa974..cef6a8c31c8d 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5645.c +++ b/sound/soc/intel/boards/cht_bsw_rt5645.c @@ -49,7 +49,7 @@ struct 
cht_acpi_card { struct cht_mc_private { struct snd_soc_jack jack; struct cht_acpi_card *acpi_card; - char codec_name[16]; + char codec_name[SND_ACPI_I2C_ID_LEN]; struct clk *mclk; }; @@ -499,7 +499,7 @@ static struct cht_acpi_card snd_soc_cards[] = { {"10EC5650", CODEC_TYPE_RT5650, &snd_soc_card_chtrt5650}, }; -static char cht_rt5645_codec_name[16]; /* i2c-:00 with HID being 8 chars */ +static char cht_rt5645_codec_name[SND_ACPI_I2C_ID_LEN]; static char cht_rt5645_codec_aif_name[12]; /* = "rt5645-aif[1|2]" */ static char cht_rt5645_cpu_dai_name[10]; /* = "ssp[0|2]-port" */ diff --git a/sound/soc/intel/boards/cht_bsw_rt5672.c b/sound/soc/intel/boards/cht_bsw_rt5672.c index f8f21eee9b2d..1f3d38dc4fcb 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5672.c +++ b/sound/soc/intel/boards/cht_bsw_rt5672.c @@ -35,7 +35,7 @@ struct cht_mc_private { struct snd_soc_jack headset; - char codec_name[16]; + char codec_name[SND_ACPI_I2C_ID_LEN]; struct clk *mclk; }; From ef3d687c795c82c44aef1bf25fcd4900ba60be9a Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Thu, 11 Jan 2018 16:27:04 +0000 Subject: [PATCH 206/236] ASoC: cs42l73: Remove trailing semicolon The trailing semicolon is an empty statement that does no operation. Removing it since it doesn't do anything. 
Signed-off-by: Luis de Bethencourt Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l73.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs42l73.c b/sound/soc/codecs/cs42l73.c index dde37e569ade..aebaa97490b6 100644 --- a/sound/soc/codecs/cs42l73.c +++ b/sound/soc/codecs/cs42l73.c @@ -1355,7 +1355,7 @@ static int cs42l73_i2c_probe(struct i2c_client *i2c_client, ret = regmap_read(cs42l73->regmap, CS42L73_REVID, &reg); if (ret < 0) { dev_err(&i2c_client->dev, "Get Revision ID failed\n"); - return ret;; + return ret; } dev_info(&i2c_client->dev, From 3a1479599a610cd49cbf91da68963ade90ee0fa3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Jan 2018 23:16:17 +0200 Subject: [PATCH 207/236] ASoC: Intel - Convert to use acpi_dev_get_first_match_name() Instead of home grown snd_soc_acpi_find_name_from_hid() use acpi_dev_get_first_match_name(). Tested-by: Pierre-Louis Bossart Signed-off-by: Andy Shevchenko Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 7 ------ sound/soc/intel/boards/bytcht_da7213.c | 2 +- sound/soc/intel/boards/bytcht_es8316.c | 2 +- sound/soc/intel/boards/bytcr_rt5640.c | 2 +- sound/soc/intel/boards/bytcr_rt5651.c | 2 +- sound/soc/intel/boards/cht_bsw_rt5645.c | 2 +- sound/soc/intel/boards/cht_bsw_rt5672.c | 2 +- sound/soc/soc-acpi.c | 33 ------------------------- 8 files changed, 6 insertions(+), 46 deletions(-) diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index 83320af8def2..082224275f52 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -31,16 +31,9 @@ struct snd_soc_acpi_package_context { #define SND_ACPI_I2C_ID_LEN (4 + ACPI_ID_LEN + 3 + 1) #if IS_ENABLED(CONFIG_ACPI) -/* translation fron HID to I2C name, needed for DAI codec_name */ -const char *snd_soc_acpi_find_name_from_hid(const u8 hid[ACPI_ID_LEN]); bool snd_soc_acpi_find_package_from_hid(const u8 hid[ACPI_ID_LEN], struct snd_soc_acpi_package_context *ctx); #else -static inline const char * 
-snd_soc_acpi_find_name_from_hid(const u8 hid[ACPI_ID_LEN]) -{ - return NULL; -} static inline bool snd_soc_acpi_find_package_from_hid(const u8 hid[ACPI_ID_LEN], struct snd_soc_acpi_package_context *ctx) diff --git a/sound/soc/intel/boards/bytcht_da7213.c b/sound/soc/intel/boards/bytcht_da7213.c index 6219c04d4731..2179dedb28ad 100644 --- a/sound/soc/intel/boards/bytcht_da7213.c +++ b/sound/soc/intel/boards/bytcht_da7213.c @@ -243,7 +243,7 @@ static int bytcht_da7213_probe(struct platform_device *pdev) } /* fixup codec name based on HID */ - i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + i2c_name = acpi_dev_get_first_match_name(mach->id, NULL, -1); if (i2c_name) { snprintf(codec_name, sizeof(codec_name), "%s%s", "i2c-", i2c_name); diff --git a/sound/soc/intel/boards/bytcht_es8316.c b/sound/soc/intel/boards/bytcht_es8316.c index 079f35cd4eaf..305e7f4fe55a 100644 --- a/sound/soc/intel/boards/bytcht_es8316.c +++ b/sound/soc/intel/boards/bytcht_es8316.c @@ -258,7 +258,7 @@ static int snd_byt_cht_es8316_mc_probe(struct platform_device *pdev) } /* fixup codec name based on HID */ - i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + i2c_name = acpi_dev_get_first_match_name(mach->id, NULL, -1); if (i2c_name) { snprintf(codec_name, sizeof(codec_name), "%s%s", "i2c-", i2c_name); diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index 4548f75498d0..b6a1cfeec830 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -762,7 +762,7 @@ static int snd_byt_rt5640_mc_probe(struct platform_device *pdev) } /* fixup codec name based on HID */ - i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + i2c_name = acpi_dev_get_first_match_name(mach->id, NULL, -1); if (i2c_name) { snprintf(byt_rt5640_codec_name, sizeof(byt_rt5640_codec_name), "%s%s", "i2c-", i2c_name); diff --git a/sound/soc/intel/boards/bytcr_rt5651.c b/sound/soc/intel/boards/bytcr_rt5651.c index 5a6b7dedb773..456526a93dd5 
100644 --- a/sound/soc/intel/boards/bytcr_rt5651.c +++ b/sound/soc/intel/boards/bytcr_rt5651.c @@ -539,7 +539,7 @@ static int snd_byt_rt5651_mc_probe(struct platform_device *pdev) } /* fixup codec name based on HID */ - i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + i2c_name = acpi_dev_get_first_match_name(mach->id, NULL, -1); if (i2c_name) { snprintf(byt_rt5651_codec_name, sizeof(byt_rt5651_codec_name), "%s%s", "i2c-", i2c_name); diff --git a/sound/soc/intel/boards/cht_bsw_rt5645.c b/sound/soc/intel/boards/cht_bsw_rt5645.c index cef6a8c31c8d..976ea6bf9539 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5645.c +++ b/sound/soc/intel/boards/cht_bsw_rt5645.c @@ -566,7 +566,7 @@ static int snd_cht_mc_probe(struct platform_device *pdev) } /* fixup codec name based on HID */ - i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + i2c_name = acpi_dev_get_first_match_name(mach->id, NULL, -1); if (i2c_name) { snprintf(cht_rt5645_codec_name, sizeof(cht_rt5645_codec_name), "%s%s", "i2c-", i2c_name); diff --git a/sound/soc/intel/boards/cht_bsw_rt5672.c b/sound/soc/intel/boards/cht_bsw_rt5672.c index 1f3d38dc4fcb..c14a52d2f714 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5672.c +++ b/sound/soc/intel/boards/cht_bsw_rt5672.c @@ -396,7 +396,7 @@ static int snd_cht_mc_probe(struct platform_device *pdev) /* fixup codec name based on HID */ if (mach) { - i2c_name = snd_soc_acpi_find_name_from_hid(mach->id); + i2c_name = acpi_dev_get_first_match_name(mach->id, NULL, -1); if (i2c_name) { snprintf(drv->codec_name, sizeof(drv->codec_name), "i2c-%s", i2c_name); diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c index 7f43c9bf3d09..3d7e1ff79139 100644 --- a/sound/soc/soc-acpi.c +++ b/sound/soc/soc-acpi.c @@ -16,39 +16,6 @@ #include -static acpi_status snd_soc_acpi_find_name(acpi_handle handle, u32 level, - void *context, void **ret) -{ - struct acpi_device *adev; - const char *name = NULL; - - if (acpi_bus_get_device(handle, &adev)) - return AE_OK; - - if 
(adev->status.present && adev->status.functional) { - name = acpi_dev_name(adev); - *(const char **)ret = name; - return AE_CTRL_TERMINATE; - } - - return AE_OK; -} - -const char *snd_soc_acpi_find_name_from_hid(const u8 hid[ACPI_ID_LEN]) -{ - const char *name = NULL; - acpi_status status; - - status = acpi_get_devices(hid, snd_soc_acpi_find_name, NULL, - (void **)&name); - - if (ACPI_FAILURE(status) || name[0] == '\0') - return NULL; - - return name; -} -EXPORT_SYMBOL_GPL(snd_soc_acpi_find_name_from_hid); - struct snd_soc_acpi_mach * snd_soc_acpi_find_machine(struct snd_soc_acpi_mach *machines) { From 949293d45d6b0951e2dfdfd670a9c0092b10fd27 Mon Sep 17 00:00:00 2001 From: Christian Fischer Date: Fri, 12 Jan 2018 14:22:59 +0100 Subject: [PATCH 208/236] ASoC: mxs-sgtl5000: add audio-routing support Add dapm_widgets to machine-driver (from imx-sgtl5000). If the "audio-routing"-property is present at probing the dapm-widgets getting linked to the card. Signed-off-by: Christian Fischer Signed-off-by: Mark Brown --- .../bindings/sound/mxs-audio-sgtl5000.txt | 33 ++++++++++++++++--- sound/soc/mxs/mxs-sgtl5000.c | 20 +++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/mxs-audio-sgtl5000.txt b/Documentation/devicetree/bindings/sound/mxs-audio-sgtl5000.txt index 601c518eddaa..4eb980bd0287 100644 --- a/Documentation/devicetree/bindings/sound/mxs-audio-sgtl5000.txt +++ b/Documentation/devicetree/bindings/sound/mxs-audio-sgtl5000.txt @@ -1,10 +1,31 @@ * Freescale MXS audio complex with SGTL5000 codec Required properties: -- compatible: "fsl,mxs-audio-sgtl5000" -- model: The user-visible name of this sound complex -- saif-controllers: The phandle list of the MXS SAIF controller -- audio-codec: The phandle of the SGTL5000 audio codec +- compatible : "fsl,mxs-audio-sgtl5000" +- model : The user-visible name of this sound complex +- saif-controllers : The phandle list of the MXS SAIF controller +- audio-codec : The 
phandle of the SGTL5000 audio codec +- audio-routing : A list of the connections between audio components. + Each entry is a pair of strings, the first being the + connection's sink, the second being the connection's + source. Valid names could be power supplies, SGTL5000 + pins, and the jacks on the board: + + Power supplies: + * Mic Bias + + SGTL5000 pins: + * MIC_IN + * LINE_IN + * HP_OUT + * LINE_OUT + + Board connectors: + * Mic Jack + * Line In Jack + * Headphone Jack + * Line Out Jack + * Ext Spk Example: @@ -14,4 +35,8 @@ sound { model = "imx28-evk-sgtl5000"; saif-controllers = <&saif0 &saif1>; audio-codec = <&sgtl5000>; + audio-routing = + "MIC_IN", "Mic Jack", + "Mic Jack", "Mic Bias", + "Headphone Jack", "HP_OUT"; }; diff --git a/sound/soc/mxs/mxs-sgtl5000.c b/sound/soc/mxs/mxs-sgtl5000.c index 2ed3240cc682..b593f76212e0 100644 --- a/sound/soc/mxs/mxs-sgtl5000.c +++ b/sound/soc/mxs/mxs-sgtl5000.c @@ -93,6 +93,14 @@ static struct snd_soc_dai_link mxs_sgtl5000_dai[] = { }, }; +static const struct snd_soc_dapm_widget mxs_sgtl5000_dapm_widgets[] = { + SND_SOC_DAPM_MIC("Mic Jack", NULL), + SND_SOC_DAPM_LINE("Line In Jack", NULL), + SND_SOC_DAPM_HP("Headphone Jack", NULL), + SND_SOC_DAPM_SPK("Line Out Jack", NULL), + SND_SOC_DAPM_SPK("Ext Spk", NULL), +}; + static struct snd_soc_card mxs_sgtl5000 = { .name = "mxs_sgtl5000", .owner = THIS_MODULE, @@ -141,6 +149,18 @@ static int mxs_sgtl5000_probe(struct platform_device *pdev) card->dev = &pdev->dev; + if (of_find_property(np, "audio-routing", NULL)) { + card->dapm_widgets = mxs_sgtl5000_dapm_widgets; + card->num_dapm_widgets = ARRAY_SIZE(mxs_sgtl5000_dapm_widgets); + + ret = snd_soc_of_parse_audio_routing(card, "audio-routing"); + if (ret) { + dev_err(&pdev->dev, "failed to parse audio-routing (%d)\n", + ret); + return ret; + } + } + ret = devm_snd_soc_register_card(&pdev->dev, card); if (ret) { dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", From 0dda0b3fb255048a221f736c8a2a24c674da8bf3 Mon Sep 17 
00:00:00 2001 From: John Johansen Date: Fri, 8 Dec 2017 17:43:18 -0800 Subject: [PATCH 209/236] apparmor: fix ptrace label match when matching stacked labels Given a label with a profile stack of A//&B or A//&C ... A ptrace rule should be able to specify a generic trace pattern with a rule like ptrace trace A//&**, however this is failing because while the correct label match routine is called, it is being done post label decomposition so it is always being done against a profile instead of the stacked label. To fix this refactor the cross check to pass the full peer label in to the label_match. Fixes: 290f458a4f16 ("apparmor: allow ptrace checks to be finer grained than just capability") Cc: Stable Reported-by: Matthew Garrett Tested-by: Matthew Garrett Signed-off-by: John Johansen --- security/apparmor/include/perms.h | 3 ++ security/apparmor/ipc.c | 53 +++++++++++++++++++------------ 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/security/apparmor/include/perms.h b/security/apparmor/include/perms.h index 2b27bb79aec4..d7b7e7115160 100644 --- a/security/apparmor/include/perms.h +++ b/security/apparmor/include/perms.h @@ -133,6 +133,9 @@ extern struct aa_perms allperms; #define xcheck_labels_profiles(L1, L2, FN, args...) 
\ xcheck_ns_labels((L1), (L2), xcheck_ns_profile_label, (FN), args) +#define xcheck_labels(L1, L2, P, FN1, FN2) \ + xcheck(fn_for_each((L1), (P), (FN1)), fn_for_each((L2), (P), (FN2))) + void aa_perm_mask_to_str(char *str, const char *chrs, u32 mask); void aa_audit_perm_names(struct audit_buffer *ab, const char **names, u32 mask); diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index 7ca0032e7ba9..b40678f3c1d5 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -64,40 +64,48 @@ static void audit_ptrace_cb(struct audit_buffer *ab, void *va) FLAGS_NONE, GFP_ATOMIC); } +/* assumes check for PROFILE_MEDIATES is already done */ /* TODO: conditionals */ static int profile_ptrace_perm(struct aa_profile *profile, - struct aa_profile *peer, u32 request, - struct common_audit_data *sa) + struct aa_label *peer, u32 request, + struct common_audit_data *sa) { struct aa_perms perms = { }; - /* need because of peer in cross check */ - if (profile_unconfined(profile) || - !PROFILE_MEDIATES(profile, AA_CLASS_PTRACE)) - return 0; - - aad(sa)->peer = &peer->label; - aa_profile_match_label(profile, &peer->label, AA_CLASS_PTRACE, request, + aad(sa)->peer = peer; + aa_profile_match_label(profile, peer, AA_CLASS_PTRACE, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, sa, audit_ptrace_cb); } -static int cross_ptrace_perm(struct aa_profile *tracer, - struct aa_profile *tracee, u32 request, - struct common_audit_data *sa) +static int profile_tracee_perm(struct aa_profile *tracee, + struct aa_label *tracer, u32 request, + struct common_audit_data *sa) { + if (profile_unconfined(tracee) || unconfined(tracer) || + !PROFILE_MEDIATES(tracee, AA_CLASS_PTRACE)) + return 0; + + return profile_ptrace_perm(tracee, tracer, request, sa); +} + +static int profile_tracer_perm(struct aa_profile *tracer, + struct aa_label *tracee, u32 request, + struct common_audit_data *sa) +{ + if (profile_unconfined(tracer)) + 
return 0; + if (PROFILE_MEDIATES(tracer, AA_CLASS_PTRACE)) - return xcheck(profile_ptrace_perm(tracer, tracee, request, sa), - profile_ptrace_perm(tracee, tracer, - request << PTRACE_PERM_SHIFT, - sa)); - /* policy uses the old style capability check for ptrace */ - if (profile_unconfined(tracer) || tracer == tracee) + return profile_ptrace_perm(tracer, tracee, request, sa); + + /* profile uses the old style capability check for ptrace */ + if (&tracer->label == tracee) return 0; aad(sa)->label = &tracer->label; - aad(sa)->peer = &tracee->label; + aad(sa)->peer = tracee; aad(sa)->request = 0; aad(sa)->error = aa_capable(&tracer->label, CAP_SYS_PTRACE, 1); @@ -115,10 +123,13 @@ static int cross_ptrace_perm(struct aa_profile *tracer, int aa_may_ptrace(struct aa_label *tracer, struct aa_label *tracee, u32 request) { + struct aa_profile *profile; + u32 xrequest = request << PTRACE_PERM_SHIFT; DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_NONE, OP_PTRACE); - return xcheck_labels_profiles(tracer, tracee, cross_ptrace_perm, - request, &sa); + return xcheck_labels(tracer, tracee, profile, + profile_tracer_perm(profile, tracee, request, &sa), + profile_tracee_perm(profile, tracer, xrequest, &sa)); } From 1a3881d305592d947ed47887306919d50112394d Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 11 Jan 2018 13:07:54 -0800 Subject: [PATCH 210/236] apparmor: Fix regression in profile conflict logic The intended behaviour in apparmor profile matching is to flag a conflict if two profiles match equally well. However, right now a conflict is generated if another profile has the same match length even if that profile doesn't actually match. Fix the logic so we only generate a conflict if the profiles match. 
Fixes: 844b8292b631 ("apparmor: ensure that undecidable profile attachments fail") Cc: Stable Signed-off-by: Matthew Garrett Signed-off-by: John Johansen --- security/apparmor/domain.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 04ba9d0718ea..6a54d2ffa840 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -330,10 +330,7 @@ static struct aa_profile *__attach_match(const char *name, continue; if (profile->xmatch) { - if (profile->xmatch_len == len) { - conflict = true; - continue; - } else if (profile->xmatch_len > len) { + if (profile->xmatch_len >= len) { unsigned int state; u32 perm; @@ -342,6 +339,10 @@ static struct aa_profile *__attach_match(const char *name, perm = dfa_user_allow(profile->xmatch, state); /* any accepting state means a valid match. */ if (perm & MAY_EXEC) { + if (profile->xmatch_len == len) { + conflict = true; + continue; + } candidate = profile; len = profile->xmatch_len; conflict = false; From 352909b49ba0d74929b96af6dfbefc854ab6ebb5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Jan 2018 17:16:51 -0800 Subject: [PATCH 211/236] selftests/x86: Add test_vsyscall This tests that the vsyscall entries do what they're expected to do. It also confirms that attempts to read the vsyscall page behave as expected. If changes are made to the vsyscall code or its memory map handling, running this test in all three of vsyscall=none, vsyscall=emulate, and vsyscall=native are helpful. (Because it's easy, this also compares the vsyscall results to their vDSO equivalents.) Note to KAISER backporters: please test this under all three vsyscall modes. Also, in the emulate and native modes, make sure that test_vsyscall_64 agrees with the command line or config option as to which mode you're in. It's quite easy to mess up the kernel such that native mode accidentally emulates or vice versa. 
Greg, etc: please backport this to all your Meltdown-patched kernels. It'll help make sure the patches didn't regress vsyscalls. CSigned-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/2b9c5a174c1d60fd7774461d518aa75598b1d8fd.1515719552.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/test_vsyscall.c | 500 ++++++++++++++++++++ 2 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/test_vsyscall.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 7b1adeee4b0f..91fbfa8fdc15 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -7,7 +7,7 @@ include ../lib.mk TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \ - protection_keys test_vdso + protection_keys test_vdso test_vsyscall TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c new file mode 100644 index 000000000000..7a744fa7b786 --- /dev/null +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -0,0 +1,500 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __x86_64__ +# define VSYS(x) (x) +#else +# define VSYS(x) 0 +#endif + +#ifndef SYS_getcpu +# ifdef __x86_64__ +# define 
SYS_getcpu 309 +# else +# define SYS_getcpu 318 +# endif +#endif + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +/* vsyscalls and vDSO */ +bool should_read_vsyscall = false; + +typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); +gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000); +gtod_t vdso_gtod; + +typedef int (*vgettime_t)(clockid_t, struct timespec *); +vgettime_t vdso_gettime; + +typedef long (*time_func_t)(time_t *t); +time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400); +time_func_t vdso_time; + +typedef long (*getcpu_t)(unsigned *, unsigned *, void *); +getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800); +getcpu_t vdso_getcpu; + +static void init_vdso(void) +{ + void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + printf("[WARN]\tfailed to find vDSO\n"); + return; + } + + vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday"); + if (!vdso_gtod) + printf("[WARN]\tfailed to find gettimeofday in vDSO\n"); + + vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + if (!vdso_gettime) + printf("[WARN]\tfailed to find clock_gettime in vDSO\n"); + + vdso_time = (time_func_t)dlsym(vdso, "__vdso_time"); + if (!vdso_time) + printf("[WARN]\tfailed to find time in vDSO\n"); + + vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); + if (!vdso_getcpu) { + /* getcpu() was never wired up in the 32-bit vDSO. */ + printf("[%s]\tfailed to find getcpu in vDSO\n", + sizeof(long) == 8 ? 
"WARN" : "NOTE"); + } +} + +static int init_vsys(void) +{ +#ifdef __x86_64__ + int nerrs = 0; + FILE *maps; + char line[128]; + bool found = false; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); + should_read_vsyscall = true; + return 0; + } + + while (fgets(line, sizeof(line), maps)) { + char r, x; + void *start, *end; + char name[128]; + if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", + &start, &end, &r, &x, name) != 5) + continue; + + if (strcmp(name, "[vsyscall]")) + continue; + + printf("\tvsyscall map: %s", line); + + if (start != (void *)0xffffffffff600000 || + end != (void *)0xffffffffff601000) { + printf("[FAIL]\taddress range is nonsense\n"); + nerrs++; + } + + printf("\tvsyscall permissions are %c-%c\n", r, x); + should_read_vsyscall = (r == 'r'); + if (x != 'x') { + vgtod = NULL; + vtime = NULL; + vgetcpu = NULL; + } + + found = true; + break; + } + + fclose(maps); + + if (!found) { + printf("\tno vsyscall map in /proc/self/maps\n"); + should_read_vsyscall = false; + vgtod = NULL; + vtime = NULL; + vgetcpu = NULL; + } + + return nerrs; +#else + return 0; +#endif +} + +/* syscalls */ +static inline long sys_gtod(struct timeval *tv, struct timezone *tz) +{ + return syscall(SYS_gettimeofday, tv, tz); +} + +static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) +{ + return syscall(SYS_clock_gettime, id, ts); +} + +static inline long sys_time(time_t *t) +{ + return syscall(SYS_time, t); +} + +static inline long sys_getcpu(unsigned * cpu, unsigned * node, + void* cache) +{ + return syscall(SYS_getcpu, cpu, node, cache); +} + +static jmp_buf jmpbuf; + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + siglongjmp(jmpbuf, 1); +} + +static double tv_diff(const struct timeval *a, const struct timeval *b) +{ + return (double)(a->tv_sec - b->tv_sec) + + (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6; +} + +static int check_gtod(const 
struct timeval *tv_sys1, + const struct timeval *tv_sys2, + const struct timezone *tz_sys, + const char *which, + const struct timeval *tv_other, + const struct timezone *tz_other) +{ + int nerrs = 0; + double d1, d2; + + if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) { + printf("[FAIL] %s tz mismatch\n", which); + nerrs++; + } + + d1 = tv_diff(tv_other, tv_sys1); + d2 = tv_diff(tv_sys2, tv_other); + printf("\t%s time offsets: %lf %lf\n", which, d1, d2); + + if (d1 < 0 || d2 < 0) { + printf("[FAIL]\t%s time was inconsistent with the syscall\n", which); + nerrs++; + } else { + printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which); + } + + return nerrs; +} + +static int test_gtod(void) +{ + struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys; + struct timezone tz_sys, tz_vdso, tz_vsys; + long ret_vdso = -1; + long ret_vsys = -1; + int nerrs = 0; + + printf("[RUN]\ttest gettimeofday()\n"); + + if (sys_gtod(&tv_sys1, &tz_sys) != 0) + err(1, "syscall gettimeofday"); + if (vdso_gtod) + ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); + if (vgtod) + ret_vsys = vgtod(&tv_vsys, &tz_vsys); + if (sys_gtod(&tv_sys2, &tz_sys) != 0) + err(1, "syscall gettimeofday"); + + if (vdso_gtod) { + if (ret_vdso == 0) { + nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); + } else { + printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso); + nerrs++; + } + } + + if (vgtod) { + if (ret_vsys == 0) { + nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); + } else { + printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys); + nerrs++; + } + } + + return nerrs; +} + +static int test_time(void) { + int nerrs = 0; + + printf("[RUN]\ttest time()\n"); + long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0; + long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1; + t_sys1 = sys_time(&t2_sys1); + if (vdso_time) + t_vdso = vdso_time(&t2_vdso); + if (vtime) + t_vsys = 
vtime(&t2_vsys); + t_sys2 = sys_time(&t2_sys2); + if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { + printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2); + nerrs++; + return nerrs; + } + + if (vdso_time) { + if (t_vdso < 0 || t_vdso != t2_vdso) { + printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso); + nerrs++; + } else if (t_vdso < t_sys1 || t_vdso > t_sys2) { + printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2); + nerrs++; + } else { + printf("[OK]\tvDSO time() is okay\n"); + } + } + + if (vtime) { + if (t_vsys < 0 || t_vsys != t2_vsys) { + printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); + nerrs++; + } else if (t_vsys < t_sys1 || t_vsys > t_sys2) { + printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2); + nerrs++; + } else { + printf("[OK]\tvsyscall time() is okay\n"); + } + } + + return nerrs; +} + +static int test_getcpu(int cpu) +{ + int nerrs = 0; + long ret_sys, ret_vdso = -1, ret_vsys = -1; + + printf("[RUN]\tgetcpu() on CPU %d\n", cpu); + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { + printf("[SKIP]\tfailed to force CPU %d\n", cpu); + return nerrs; + } + + unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; + unsigned node = 0; + bool have_node = false; + ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); + if (vdso_getcpu) + ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); + if (vgetcpu) + ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); + + if (ret_sys == 0) { + if (cpu_sys != cpu) { + printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu); + nerrs++; + } + + have_node = true; + node = node_sys; + } + + if (vdso_getcpu) { + if (ret_vdso) { + printf("[FAIL]\tvDSO getcpu() failed\n"); + nerrs++; + } else { + if (!have_node) { + have_node = 
true; + node = node_vdso; + } + + if (cpu_vdso != cpu) { + printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu); + nerrs++; + } else { + printf("[OK]\tvDSO reported correct CPU\n"); + } + + if (node_vdso != node) { + printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node); + nerrs++; + } else { + printf("[OK]\tvDSO reported correct node\n"); + } + } + } + + if (vgetcpu) { + if (ret_vsys) { + printf("[FAIL]\tvsyscall getcpu() failed\n"); + nerrs++; + } else { + if (!have_node) { + have_node = true; + node = node_vsys; + } + + if (cpu_vsys != cpu) { + printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu); + nerrs++; + } else { + printf("[OK]\tvsyscall reported correct CPU\n"); + } + + if (node_vsys != node) { + printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node); + nerrs++; + } else { + printf("[OK]\tvsyscall reported correct node\n"); + } + } + } + + return nerrs; +} + +static int test_vsys_r(void) +{ +#ifdef __x86_64__ + printf("[RUN]\tChecking read access to the vsyscall page\n"); + bool can_read; + if (sigsetjmp(jmpbuf, 1) == 0) { + *(volatile int *)0xffffffffff600000; + can_read = true; + } else { + can_read = false; + } + + if (can_read && !should_read_vsyscall) { + printf("[FAIL]\tWe have read access, but we shouldn't\n"); + return 1; + } else if (!can_read && should_read_vsyscall) { + printf("[FAIL]\tWe don't have read access, but we should\n"); + return 1; + } else { + printf("[OK]\tgot expected result\n"); + } +#endif + + return 0; +} + + +#ifdef __x86_64__ +#define X86_EFLAGS_TF (1UL << 8) +static volatile sig_atomic_t num_vsyscall_traps; + +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags"); +} + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) 
+{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + unsigned long ip = ctx->uc_mcontext.gregs[REG_RIP]; + + if (((ip ^ 0xffffffffff600000UL) & ~0xfffUL) == 0) + num_vsyscall_traps++; +} + +static int test_native_vsyscall(void) +{ + time_t tmp; + bool is_native; + + if (!vtime) + return 0; + + printf("[RUN]\tchecking for native vsyscall\n"); + sethandler(SIGTRAP, sigtrap, 0); + set_eflags(get_eflags() | X86_EFLAGS_TF); + vtime(&tmp); + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + + /* + * If vsyscalls are emulated, we expect a single trap in the + * vsyscall page -- the call instruction will trap with RIP + * pointing to the entry point before emulation takes over. + * In native mode, we expect two traps, since whatever code + * the vsyscall page contains will be more than just a ret + * instruction. + */ + is_native = (num_vsyscall_traps > 1); + + printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n", + (is_native ? "native" : "emulated"), + (int)num_vsyscall_traps); + + return 0; +} +#endif + +int main(int argc, char **argv) +{ + int nerrs = 0; + + init_vdso(); + nerrs += init_vsys(); + + nerrs += test_gtod(); + nerrs += test_time(); + nerrs += test_getcpu(0); + nerrs += test_getcpu(1); + + sethandler(SIGSEGV, sigsegv, 0); + nerrs += test_vsys_r(); + +#ifdef __x86_64__ + nerrs += test_native_vsyscall(); +#endif + + return nerrs ? 1 : 0; +} From 36c1681678b507346e7397a235a7303dad665fc3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 11 Jan 2018 18:28:08 +0900 Subject: [PATCH 212/236] genksyms: drop *.hash.c from .gitignore This is a left-over of commit bb3290d91695 ("Remove gperf usage from toolchain"). We do not generate a hash function any more. 
Signed-off-by: Masahiro Yamada --- scripts/genksyms/.gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/genksyms/.gitignore b/scripts/genksyms/.gitignore index 86dc07a01b43..e7836b47f060 100644 --- a/scripts/genksyms/.gitignore +++ b/scripts/genksyms/.gitignore @@ -1,4 +1,3 @@ -*.hash.c *.lex.c *.tab.c *.tab.h From bed6760cf2c40778a58f2e399c8947b3b3c55518 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 12 Jan 2018 16:53:07 -0800 Subject: [PATCH 213/236] MAINTAINERS, nilfs2: change project home URLs The domain of NILFS project home was changed to "nilfs.sourceforge.io" to enable https access (the previous domain "nilfs.sourceforge.net" is redirected to the new one). Modify URLs of the project home to reflect this change and to replace their protocol from http to https. Link: http://lkml.kernel.org/r/1515416141-5614-1-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/nilfs2.txt | 4 ++-- MAINTAINERS | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index c0727dc36271..f2f3f8592a6f 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -25,8 +25,8 @@ available from the following download page. At least "mkfs.nilfs2", cleaner or garbage collector) are required. Details on the tools are described in the man pages included in the package. 
-Project web page: http://nilfs.sourceforge.net/ -Download page: http://nilfs.sourceforge.net/en/download.html +Project web page: https://nilfs.sourceforge.io/ +Download page: https://nilfs.sourceforge.io/en/download.html List info: http://vger.kernel.org/vger-lists.html#linux-nilfs Caveats diff --git a/MAINTAINERS b/MAINTAINERS index d76af75a653a..18994806e441 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9638,8 +9638,8 @@ F: include/uapi/linux/sunrpc/ NILFS2 FILESYSTEM M: Ryusuke Konishi L: linux-nilfs@vger.kernel.org -W: http://nilfs.sourceforge.net/ -W: http://nilfs.osdn.jp/ +W: https://nilfs.sourceforge.io/ +W: https://nilfs.osdn.jp/ T: git git://github.com/konis/nilfs2.git S: Supported F: Documentation/filesystems/nilfs2.txt From d9570ee3bd1d4f20ce63485f5ef05663866fe6c0 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 12 Jan 2018 16:53:10 -0800 Subject: [PATCH 214/236] kmemleak: allow to coexist with fault injection kmemleak does one slab allocation per user allocation. So if slab fault injection is enabled to any degree, kmemleak instantly fails to allocate and turns itself off. However, it's useful to use kmemleak with fault injection to find leaks on error paths. On the other hand, checking kmemleak itself is not so useful because (1) it's a debugging tool and (2) it has a very regular allocation pattern (basically a single allocation site, so it either works or not). Turn off fault injection for kmemleak allocations. 
Link: http://lkml.kernel.org/r/20180109192243.19316-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d73c14294f3a..f656ca27f6c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -127,7 +127,7 @@ /* GFP bitmask for kmemleak internal allocations */ #define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ - __GFP_NOWARN) + __GFP_NOWARN | __GFP_NOFAIL) /* scanning area inside a memory block */ struct kmemleak_scan_area { From a0b1280368d1e91ab72f849ef095b4f07a39bbf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 12 Jan 2018 16:53:14 -0800 Subject: [PATCH 215/236] kdump: write correct address of mem_section into vmcoreinfo Depending on configuration mem_section can now be an array or a pointer to an array allocated dynamically. In most cases, we can continue to refer to it as 'mem_section' regardless of what it is. But there's one exception: '&mem_section' means "address of the array" if mem_section is an array, but if mem_section is a pointer, it would mean "address of the pointer". We've stepped onto this in kdump code. VMCOREINFO_SYMBOL(mem_section) writes down address of pointer into vmcoreinfo, not array as we wanted. Let's introduce VMCOREINFO_SYMBOL_ARRAY() that would handle the situation correctly for both cases. Link: http://lkml.kernel.org/r/20180112162532.35896-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Fixes: 83e3c48729d9 ("mm/sparsemem: Allocate mem_section at runtime for CONFIG_SPARSEMEM_EXTREME=y") Acked-by: Baoquan He Acked-by: Dave Young Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 06097ef30449..b511f6d24b42 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -42,6 +42,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SYMBOL_ARRAY(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name) #define VMCOREINFO_SIZE(name) \ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ (unsigned long)sizeof(name)) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); From 0f908ccbeca99ddf0ad60afa710e72aded4a5ea7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 12 Jan 2018 16:53:17 -0800 Subject: [PATCH 216/236] tools/objtool/Makefile: don't assume sync-check.sh is executable patch(1) loses the x bit. So if a user follows our patching instructions in Documentation/admin-guide/README.rst, their kernel will not compile. 
Fixes: 3bd51c5a371de ("objtool: Move kernel headers/code sync check to a script") Reported-by: Nicolas Bock Reported-by: Joakim Tjernlund Cc: Ingo Molnar Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/objtool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index ae0272f9a091..e6acc281dd37 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -46,7 +46,7 @@ $(OBJTOOL_IN): fixdep FORCE @$(MAKE) $(build)=objtool $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN) - @./sync-check.sh + @$(CONFIG_SHELL) ./sync-check.sh $(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@ From f10ee3dcc9f0aba92a5c4c064628be5200765dc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 00:23:57 +0100 Subject: [PATCH 217/236] x86/pti: Fix !PCID and sanitize defines The switch to the user space page tables in the low level ASM code unconditionally sets bit 12 and bit 11 of CR3. Bit 12 is switching the base address of the page directory to the user part, bit 11 is switching the PCID to the PCID associated with the user page tables. This fails on a machine which lacks PCID support because bit 11 is set in CR3. Bit 11 is reserved when PCID is inactive. While the Intel SDM claims that the reserved bits are ignored when PCID is disabled, the AMD APM states that they should be cleared. This went unnoticed as the AMD APM was not checked when the code was developed and reviewed and test systems with Intel CPUs never failed to boot. The report is against a CentOS 6 host where the guest fails to boot, so it's not yet clear whether this is a virt issue or can happen on real hardware too, but that's irrelevant as the AMD APM clearly asks for clearing the reserved bits. Make sure that on non PCID machines bit 11 is not set by the page table switching code.
Andy suggested to rename the related bits and masks so they are clearly describing what they should be used for, which is done as well for clarity. That split could have been done with alternatives but the macro hell is horrible and ugly. This can be done on top if someone cares to remove the extra orq. For now it's a straight forward fix. Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") Reported-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Willy Tarreau Cc: David Woodhouse Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos --- arch/x86/entry/calling.h | 36 ++++++++++++++------------ arch/x86/include/asm/processor-flags.h | 2 +- arch/x86/include/asm/tlbflush.h | 6 ++--- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 45a63e00a6af..3f48f695d5e6 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two * halves: */ -#define PTI_SWITCH_PGTABLES_MASK (1<= (1 << X86_CR3_PTI_SWITCH_BIT)); + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); /* * The ASID being passed in here should have respected the * MAX_ASID_AVAILABLE and thus never have the switch bit set. */ - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); #endif /* * The dynamically-assigned ASIDs that get passed in are small @@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid) { u16 ret = kern_pcid(asid); #ifdef CONFIG_PAGE_TABLE_ISOLATION - ret |= 1 << X86_CR3_PTI_SWITCH_BIT; + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret; } From a237f762681e2a394ca67f21df2feb2b76a3609b Mon Sep 17 00:00:00 2001 From: "W. 
Trevor King" Date: Fri, 12 Jan 2018 15:24:59 -0800 Subject: [PATCH 218/236] security/Kconfig: Correct the Documentation reference for PTI When the config option for PTI was added a reference to documentation was added as well. But the documentation did not exist at that point. The final documentation has a different file name. Fix it up to point to the proper file. Fixes: 385ce0ea ("x86/mm/pti: Add Kconfig") Signed-off-by: W. Trevor King Signed-off-by: Thomas Gleixner Cc: Dave Hansen Cc: linux-mm@kvack.org Cc: linux-security-module@vger.kernel.org Cc: James Morris Cc: "Serge E. Hallyn" Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/3009cc8ccbddcd897ec1e0cb6dda524929de0d14.1515799398.git.wking@tremily.us --- security/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/Kconfig b/security/Kconfig index 3d4debd0257e..b0cb9a5f9448 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -63,7 +63,7 @@ config PAGE_TABLE_ISOLATION ensuring that the majority of kernel addresses are not mapped into userspace. - See Documentation/x86/pagetable-isolation.txt for more details. + See Documentation/x86/pti.txt for more details. config SECURITY_INFINIBAND bool "Infiniband Security Hooks" From 99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 14 Jan 2018 11:27:13 +0100 Subject: [PATCH 219/236] x86,perf: Disable intel_bts when PTI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intel_bts driver does not use the 'normal' BTS buffer which is exposed through the cpu_entry_area but instead uses the memory allocated for the perf AUX buffer. This obviously comes apart when using PTI because then the kernel mapping; which includes that AUX buffer memory; disappears. Fixing this requires to expose a mapping which is visible in all context and that's not trivial. As a quick fix disable this driver when PTI is enabled to prevent malfunction. 
Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") Reported-by: Vince Weaver Reported-by: Robert Święcki Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: greg@kroah.com Cc: hughd@google.com Cc: luto@amacapital.net Cc: Vince Weaver Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180114102713.GB6166@worktop.programming.kicks-ass.net --- arch/x86/events/intel/bts.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 141e07b06216..24ffa1e88cf9 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -582,6 +582,24 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; + if (boot_cpu_has(X86_FEATURE_PTI)) { + /* + * BTS hardware writes through a virtual memory map we must + * either use the kernel physical map, or the user mapping of + * the AUX buffer. + * + * However, since this driver supports per-CPU and per-task inherit + * we cannot use the user mapping since it will not be availble + * if we're not running the owning process. + * + * With PTI we can't use the kernal map either, because its not + * there when we run userspace. + * + * For now, disable this driver when using PTI. + */ + return -ENODEV; + } + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; From b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 22:13:29 +0100 Subject: [PATCH 220/236] x86/retpoline: Remove compile time warning Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler does not have retpoline support. 
Linus rationale for this is: It's wrong because it will just make people turn off RETPOLINE, and the asm updates - and return stack clearing - that are independent of the compiler are likely the most important parts because they are likely the ones easiest to target. And it's annoying because most people won't be able to do anything about it. The number of people building their own compiler? Very small. So if their distro hasn't got a compiler yet (and pretty much nobody does), the warning is just annoying crap. It is already properly reported as part of the sysfs interface. The compile-time warning only encourages bad things. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Requested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com --- arch/x86/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 974c61864978..504b1a4535ac 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -240,8 +240,6 @@ ifdef CONFIG_RETPOLINE RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE - else - $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) 
endif endif From a8750ddca918032d6349adbf9a4b6555e7db20da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Jan 2018 15:32:30 -0800 Subject: [PATCH 221/236] Linux 4.15-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c4aa6210a2a4..bf5b8cbb9469 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Fearless Coyote # *DOCUMENTATION* From 123af9043e93cb6f235207d260d50f832cdb5439 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 15 Jan 2018 11:08:38 +0300 Subject: [PATCH 222/236] ASoC: au1x: Fix timeout tests in au1xac97c_ac97_read() The loop timeout doesn't work because it's a post op and ends with "tmo" set to -1. I changed it from a post-op to a pre-op and I changed the starting value from 5 to 6 so we still iterate 5 times. I left the other starting value (0x10000) as it was because it's a large number. Fixes: b3c70c9ea62a ("ASoC: Alchemy AC97C/I2SC audio support") Signed-off-by: Dan Carpenter Signed-off-by: Mark Brown --- sound/soc/au1x/ac97c.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/au1x/ac97c.c b/sound/soc/au1x/ac97c.c index 29a97d52e8ad..66d6c52e7761 100644 --- a/sound/soc/au1x/ac97c.c +++ b/sound/soc/au1x/ac97c.c @@ -91,8 +91,8 @@ static unsigned short au1xac97c_ac97_read(struct snd_ac97 *ac97, do { mutex_lock(&ctx->lock); - tmo = 5; - while ((RD(ctx, AC97_STATUS) & STAT_CP) && tmo--) + tmo = 6; + while ((RD(ctx, AC97_STATUS) & STAT_CP) && --tmo) udelay(21); /* wait an ac97 frame time */ if (!tmo) { pr_debug("ac97rd timeout #1\n"); @@ -105,7 +105,7 @@ static unsigned short au1xac97c_ac97_read(struct snd_ac97 *ac97, * poll, Forrest, poll...
*/ tmo = 0x10000; - while ((RD(ctx, AC97_STATUS) & STAT_CP) && tmo--) + while ((RD(ctx, AC97_STATUS) & STAT_CP) && --tmo) asm volatile ("nop"); data = RD(ctx, AC97_CMDRESP); From abaca806fd13afd069e04e883de8ec75924b0598 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Mon, 15 Jan 2018 09:57:39 +0100 Subject: [PATCH 223/236] IIO: ADC: stm32-dfsdm: code optimization Use of_device_get_match_data to optimize the source code. No check is needed on dev_data as match table is defined in driver. Signed-off-by: Arnaud Pouliquen Acked-by: Jonathan Cameron Signed-off-by: Mark Brown --- drivers/iio/adc/stm32-dfsdm-adc.c | 9 +-------- drivers/iio/adc/stm32-dfsdm-core.c | 8 +------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c index 5e871404f565..3fe9b34ac6af 100644 --- a/drivers/iio/adc/stm32-dfsdm-adc.c +++ b/drivers/iio/adc/stm32-dfsdm-adc.c @@ -1087,18 +1087,11 @@ static int stm32_dfsdm_adc_probe(struct platform_device *pdev) struct device_node *np = dev->of_node; const struct stm32_dfsdm_dev_data *dev_data; struct iio_dev *iio; - const struct of_device_id *of_id; char *name; int ret, irq, val; - of_id = of_match_node(stm32_dfsdm_adc_match, np); - if (!of_id->data) { - dev_err(&pdev->dev, "Data associated to device is missing\n"); - return -EINVAL; - } - - dev_data = (const struct stm32_dfsdm_dev_data *)of_id->data; + dev_data = of_device_get_match_data(dev); iio = devm_iio_device_alloc(dev, sizeof(*adc)); if (!iio) { dev_err(dev, "%s: Failed to allocate IIO\n", __func__); diff --git a/drivers/iio/adc/stm32-dfsdm-core.c b/drivers/iio/adc/stm32-dfsdm-core.c index 6cd655f8239b..6290332cfd3f 100644 --- a/drivers/iio/adc/stm32-dfsdm-core.c +++ b/drivers/iio/adc/stm32-dfsdm-core.c @@ -242,7 +242,6 @@ MODULE_DEVICE_TABLE(of, stm32_dfsdm_of_match); static int stm32_dfsdm_probe(struct platform_device *pdev) { struct dfsdm_priv *priv; - const struct of_device_id *of_id; const struct 
stm32_dfsdm_dev_data *dev_data; struct stm32_dfsdm *dfsdm; int ret; @@ -253,13 +252,8 @@ static int stm32_dfsdm_probe(struct platform_device *pdev) priv->pdev = pdev; - of_id = of_match_node(stm32_dfsdm_of_match, pdev->dev.of_node); - if (!of_id->data) { - dev_err(&pdev->dev, "Data associated to device is missing\n"); - return -EINVAL; - } + dev_data = of_device_get_match_data(&pdev->dev); - dev_data = (const struct stm32_dfsdm_dev_data *)of_id->data; dfsdm = &priv->dfsdm; dfsdm->fl_list = devm_kcalloc(&pdev->dev, dev_data->num_filters, sizeof(*dfsdm->fl_list), GFP_KERNEL); From 1175d0f9f4031ce02845f6f843f58a9caaee7817 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Mon, 15 Jan 2018 10:00:26 +0100 Subject: [PATCH 224/236] IIO: ADC: stm32-dfsdm: fix static check warning iio_priv does not return an error pointer, so check is not valid. Patch suppresses it. Signed-off-by: Arnaud Pouliquen Acked-by: Jonathan Cameron Signed-off-by: Mark Brown --- drivers/iio/adc/stm32-dfsdm-adc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c index 3fe9b34ac6af..daa026d6a94f 100644 --- a/drivers/iio/adc/stm32-dfsdm-adc.c +++ b/drivers/iio/adc/stm32-dfsdm-adc.c @@ -1099,10 +1099,6 @@ static int stm32_dfsdm_adc_probe(struct platform_device *pdev) } adc = iio_priv(iio); - if (IS_ERR(adc)) { - dev_err(dev, "%s: Failed to allocate ADC\n", __func__); - return PTR_ERR(adc); - } adc->dfsdm = dev_get_drvdata(dev->parent); iio->dev.parent = dev; From 3d3dd0d3ac207e8d28f6289896b99c1c0dad2fbe Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 16 Jan 2018 01:59:01 +0000 Subject: [PATCH 225/236] ASoC: tlv320dac33: fix regression by adding back .read/.write commit c4305af43a8 ("ASoC: use internal reg_cache on tlv320dac33") removed .read/.write from driver, but it might breaks non-regmap driver, because ALSA SoC framework might call it. 
To fix this regression, this patch back .read/.write Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- sound/soc/codecs/tlv320dac33.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sound/soc/codecs/tlv320dac33.c b/sound/soc/codecs/tlv320dac33.c index 675f5b1b90a6..8c71d2f876ff 100644 --- a/sound/soc/codecs/tlv320dac33.c +++ b/sound/soc/codecs/tlv320dac33.c @@ -246,6 +246,19 @@ static int dac33_write(struct snd_soc_codec *codec, unsigned int reg, return ret; } +static int dac33_write_locked(struct snd_soc_codec *codec, unsigned int reg, + unsigned int value) +{ + struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec); + int ret; + + mutex_lock(&dac33->mutex); + ret = dac33_write(codec, reg, value); + mutex_unlock(&dac33->mutex); + + return ret; +} + #define DAC33_I2C_ADDR_AUTOINC 0x80 static int dac33_write16(struct snd_soc_codec *codec, unsigned int reg, unsigned int value) @@ -1422,6 +1435,8 @@ static int dac33_soc_remove(struct snd_soc_codec *codec) } static const struct snd_soc_codec_driver soc_codec_dev_tlv320dac33 = { + .read = dac33_read_reg_cache, + .write = dac33_write_locked, .set_bias_level = dac33_set_bias_level, .idle_bias_off = true, From 7604d8068e6253c143b27ce7e164ee8e7a9da5b9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 16 Jan 2018 01:59:53 +0000 Subject: [PATCH 226/236] ASoC: uda1380: fix regression by adding back .read/.write commit c001bf633a9 ("ASoC: use internal reg_cache on uda1380") removed .read/.write from driver, but it might breaks non-regmap driver, because ALSA SoC framework might call it. 
To fix this regression, this patch adds back .read/.write Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- sound/soc/codecs/uda1380.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/uda1380.c b/sound/soc/codecs/uda1380.c index 46a495b4da8d..c73e6a192224 100644 --- a/sound/soc/codecs/uda1380.c +++ b/sound/soc/codecs/uda1380.c @@ -726,6 +726,8 @@ static int uda1380_probe(struct snd_soc_codec *codec) static const struct snd_soc_codec_driver soc_codec_dev_uda1380 = { .probe = uda1380_probe, + .read = uda1380_read_reg_cache, + .write = uda1380_write, .set_bias_level = uda1380_set_bias_level, .suspend_bias_off = true, From 3c89724e994f4aee6ae2637ccd4e12aa4f92666c Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 16 Jan 2018 02:00:18 +0000 Subject: [PATCH 227/236] ASoC: cx20442: fix regression by adding back .read/.write commit 39b5a0f80c07f ("ASoC: cx20442: don't use reg_cache") removed .read/.write from driver, but it might break non-regmap drivers, because ALSA SoC framework might call it. To fix this regression, this patch adds back .read/.write. It also uses a cx20442 internal reg_cache, which is needed for .read/.write.
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- sound/soc/codecs/cx20442.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sound/soc/codecs/cx20442.c b/sound/soc/codecs/cx20442.c index 6b6f8e44369b..95bb10ba80dc 100644 --- a/sound/soc/codecs/cx20442.c +++ b/sound/soc/codecs/cx20442.c @@ -28,6 +28,7 @@ struct cx20442_priv { struct tty_struct *tty; struct regulator *por; + u8 reg_cache; }; #define CX20442_PM 0x0 @@ -88,6 +89,17 @@ static const struct snd_soc_dapm_route cx20442_audio_map[] = { {"ADC", NULL, "Input Mixer"}, }; +static unsigned int cx20442_read_reg_cache(struct snd_soc_codec *codec, + unsigned int reg) +{ + struct cx20442_priv *cx20442 = snd_soc_codec_get_drvdata(codec); + + if (reg >= 1) + return -EINVAL; + + return cx20442->reg_cache; +} + enum v253_vls { V253_VLS_NONE = 0, V253_VLS_T, @@ -112,8 +124,6 @@ enum v253_vls { V253_VLS_TEST, }; -#if 0 -/* FIXME : these function will be re-used */ static int cx20442_pm_to_v253_vls(u8 value) { switch (value & ~(1 << CX20442_AGC)) { @@ -147,11 +157,10 @@ static int cx20442_write(struct snd_soc_codec *codec, unsigned int reg, unsigned int value) { struct cx20442_priv *cx20442 = snd_soc_codec_get_drvdata(codec); - u8 *reg_cache = codec->reg_cache; int vls, vsp, old, len; char buf[18]; - if (reg >= codec->driver->reg_cache_size) + if (reg >= 1) return -EINVAL; /* tty and write pointers required for talking to the modem @@ -159,8 +168,8 @@ static int cx20442_write(struct snd_soc_codec *codec, unsigned int reg, if (!cx20442->tty || !cx20442->tty->ops->write) return -EIO; - old = reg_cache[reg]; - reg_cache[reg] = value; + old = cx20442->reg_cache; + cx20442->reg_cache = value; vls = cx20442_pm_to_v253_vls(value); if (vls < 0) @@ -190,7 +199,6 @@ static int cx20442_write(struct snd_soc_codec *codec, unsigned int reg, return 0; } -#endif /* * Line discpline related code @@ -384,12 +392,12 @@ static int cx20442_codec_remove(struct snd_soc_codec *codec) 
return 0; } -static const u8 cx20442_reg; - static const struct snd_soc_codec_driver cx20442_codec_dev = { .probe = cx20442_codec_probe, .remove = cx20442_codec_remove, .set_bias_level = cx20442_set_bias_level, + .read = cx20442_read_reg_cache, + .write = cx20442_write, .component_driver = { .dapm_widgets = cx20442_dapm_widgets, From d0ff8ba57d9654e6f7a2e18f192bac3b93268fef Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 16 Jan 2018 02:00:59 +0000 Subject: [PATCH 228/236] ASoC: add Component level .read/.write In current ALSA SoC, Codec only has .read/.write callback. Codec will be merged into Component in next generation ALSA SoC, thus current Codec specific feature need to be merged into it. This is glue patch for it. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 3 +++ sound/soc/soc-io.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/include/sound/soc.h b/include/sound/soc.h index 1a7323238c49..6e865c2bcffe 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -802,6 +802,9 @@ struct snd_soc_component_driver { int (*suspend)(struct snd_soc_component *); int (*resume)(struct snd_soc_component *); + unsigned int (*read)(struct snd_soc_component *, unsigned int); + int (*write)(struct snd_soc_component *, unsigned int, unsigned int); + /* pcm creation and destruction */ int (*pcm_new)(struct snd_soc_pcm_runtime *); void (*pcm_free)(struct snd_pcm *); diff --git a/sound/soc/soc-io.c b/sound/soc/soc-io.c index 20340ade20a7..2bc1c4c17896 100644 --- a/sound/soc/soc-io.c +++ b/sound/soc/soc-io.c @@ -34,6 +34,10 @@ int snd_soc_component_read(struct snd_soc_component *component, ret = regmap_read(component->regmap, reg, val); else if (component->read) ret = component->read(component, reg, val); + else if (component->driver->read) { + *val = component->driver->read(component, reg); + ret = 0; + } else ret = -EIO; @@ -70,6 +74,8 @@ int snd_soc_component_write(struct snd_soc_component *component, return 
regmap_write(component->regmap, reg, val); else if (component->write) return component->write(component, reg, val); + else if (component->driver->write) + return component->driver->write(component, reg, val); else return -EIO; } From fb2fcaeaad504ae9dad26f7b26a8ea840d00535f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 Jan 2018 10:17:08 +0000 Subject: [PATCH 229/236] ASoC: Intel: remove second duplicated assignment to pointer 'res' The second assignment to res is identical to the previous assignment so it is redundant and can be removed. Cleans up clang warning: sound/soc/intel/skylake/skl-topology.c:191:25: warning: Value stored to 'res' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Mark Brown --- sound/soc/intel/skylake/skl-topology.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 28bc16a8e09a..73af6e19ebbd 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -190,7 +190,6 @@ skl_tplg_free_pipe_mcps(struct skl *skl, struct skl_module_cfg *mconfig) u8 res_idx = mconfig->res_idx; struct skl_module_res *res = &mconfig->module->resources[res_idx]; - res = &mconfig->module->resources[res_idx]; skl->resource.mcps -= res->cps; } From e29a22a86a20ea7651ff8c731ab034c31bd9764e Mon Sep 17 00:00:00 2001 From: Corentin LABBE Date: Wed, 17 Jan 2018 19:43:24 +0100 Subject: [PATCH 230/236] ASoC: Intel: remove select on non-existing SND_SOC_INTEL_COMMON SND_SST_ATOM_HIFI2_PLATFORM_PCI select SND_SOC_INTEL_COMMON which do not exists anymore. So remove this select. 
Fixes: c6059879be29 ("ASoC: Intel: Fix Kconfig with top-level selector") Signed-off-by: Corentin Labbe Acked-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- sound/soc/intel/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/intel/Kconfig b/sound/soc/intel/Kconfig index b0bd1938b71e..f2c9e8c5970a 100644 --- a/sound/soc/intel/Kconfig +++ b/sound/soc/intel/Kconfig @@ -77,7 +77,6 @@ config SND_SST_ATOM_HIFI2_PLATFORM_PCI depends on X86 && PCI select SND_SST_IPC_PCI select SND_SOC_COMPRESS - select SND_SOC_INTEL_COMMON help If you have a Intel Medfield or Merrifield/Edison platform, then enable this option by saying Y or m. Distros will typically not From 8af5748fa48698a433ba9a1766204bda283dffa8 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 17 Jan 2018 13:48:54 -0200 Subject: [PATCH 231/236] ASoC: sgtl5000: Do not print error on probe deferral When the MCLK is not yet available when the codec is probed, probe deferral will happen and in this case we should not print an error message. 
Signed-off-by: Fabio Estevam Signed-off-by: Mark Brown --- sound/soc/codecs/sgtl5000.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/sgtl5000.c b/sound/soc/codecs/sgtl5000.c index f2bb4feba3b6..633cdcfc933d 100644 --- a/sound/soc/codecs/sgtl5000.c +++ b/sound/soc/codecs/sgtl5000.c @@ -1332,10 +1332,13 @@ static int sgtl5000_i2c_probe(struct i2c_client *client, sgtl5000->mclk = devm_clk_get(&client->dev, NULL); if (IS_ERR(sgtl5000->mclk)) { ret = PTR_ERR(sgtl5000->mclk); - dev_err(&client->dev, "Failed to get mclock: %d\n", ret); /* Defer the probe to see if the clk will be provided later */ if (ret == -ENOENT) ret = -EPROBE_DEFER; + + if (ret != -EPROBE_DEFER) + dev_err(&client->dev, "Failed to get mclock: %d\n", + ret); goto disable_regs; } From d04c413f2ab3aa5998bf86f7a2f6235ed82b2ee2 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 17 Jan 2018 13:48:55 -0200 Subject: [PATCH 232/236] ASoC: mxs-sgtl5000: Do not print error on probe deferral Probe deferral may happen, so do not print an error message in this case. 
Signed-off-by: Fabio Estevam Signed-off-by: Mark Brown --- sound/soc/mxs/mxs-sgtl5000.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/mxs/mxs-sgtl5000.c b/sound/soc/mxs/mxs-sgtl5000.c index 2ed3240cc682..5a871f25f438 100644 --- a/sound/soc/mxs/mxs-sgtl5000.c +++ b/sound/soc/mxs/mxs-sgtl5000.c @@ -143,8 +143,9 @@ static int mxs_sgtl5000_probe(struct platform_device *pdev) ret = devm_snd_soc_register_card(&pdev->dev, card); if (ret) { - dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", - ret); + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", + ret); return ret; } From 8d5737a5f53902a916ee1e1cb248c9b8b883b2ea Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Wed, 17 Jan 2018 13:50:50 +0100 Subject: [PATCH 233/236] ASoC: bcm2835: fix hw_params error when device is in prepared state If bcm2835 is configured as bitclock master calling hw_params() after prepare() fails with EBUSY. This also makes it impossible to use bcm2835 in full duplex mode. The error is caused by the split clock setup: clk_set_rate is called in hw_params, clk_prepare_enable in prepare. As hw_params doesn't check if the clock was already enabled clk_set_rate fails with EBUSY. Fix this by moving clock startup from prepare to hw_params and let hw_params properly deal with an already set up or enabled clock. 
Signed-off-by: Matthias Reichl Signed-off-by: Mark Brown --- sound/soc/bcm/bcm2835-i2s.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sound/soc/bcm/bcm2835-i2s.c b/sound/soc/bcm/bcm2835-i2s.c index 2e449d7173fc..d5f73a8ab893 100644 --- a/sound/soc/bcm/bcm2835-i2s.c +++ b/sound/soc/bcm/bcm2835-i2s.c @@ -130,6 +130,7 @@ struct bcm2835_i2s_dev { struct regmap *i2s_regmap; struct clk *clk; bool clk_prepared; + int clk_rate; }; static void bcm2835_i2s_start_clock(struct bcm2835_i2s_dev *dev) @@ -419,10 +420,19 @@ static int bcm2835_i2s_hw_params(struct snd_pcm_substream *substream, } /* Clock should only be set up here if CPU is clock master */ - if (bit_clock_master) { - ret = clk_set_rate(dev->clk, bclk_rate); - if (ret) - return ret; + if (bit_clock_master && + (!dev->clk_prepared || dev->clk_rate != bclk_rate)) { + if (dev->clk_prepared) + bcm2835_i2s_stop_clock(dev); + + if (dev->clk_rate != bclk_rate) { + ret = clk_set_rate(dev->clk, bclk_rate); + if (ret) + return ret; + dev->clk_rate = bclk_rate; + } + + bcm2835_i2s_start_clock(dev); } /* Setup the frame format */ @@ -618,8 +628,6 @@ static int bcm2835_i2s_prepare(struct snd_pcm_substream *substream, struct bcm2835_i2s_dev *dev = snd_soc_dai_get_drvdata(dai); uint32_t cs_reg; - bcm2835_i2s_start_clock(dev); - /* * Clear both FIFOs if the one that should be started * is not empty at the moment. 
This should only happen From 166a5a33d5a7bfa62c039eb10e69589b09fd0557 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jan 2018 12:17:47 +0100 Subject: [PATCH 234/236] IIO: ADC: stm32-dfsdm: remove unused variable again The merge between commit abaca806fd13 ("IIO: ADC: stm32-dfsdm: code optimization") and commit 2353758bc2d4 ("IIO: ADC: stm32-dfsdm: avoid unused-variable warning") left one variable behind that is no longer needed and can be removed, as shown by the gcc warning: drivers/iio/adc/stm32-dfsdm-core.c: In function 'stm32_dfsdm_probe': drivers/iio/adc/stm32-dfsdm-core.c:245:29: error: unused variable 'of_id' [-Werror=unused-variable] Fixes: d84b4c7c706f ("Merge branch 'topic/iio' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into asoc-st-dfsdm") Signed-off-by: Arnd Bergmann Signed-off-by: Mark Brown --- drivers/iio/adc/stm32-dfsdm-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iio/adc/stm32-dfsdm-core.c b/drivers/iio/adc/stm32-dfsdm-core.c index 84277bcc465f..6290332cfd3f 100644 --- a/drivers/iio/adc/stm32-dfsdm-core.c +++ b/drivers/iio/adc/stm32-dfsdm-core.c @@ -242,7 +242,6 @@ MODULE_DEVICE_TABLE(of, stm32_dfsdm_of_match); static int stm32_dfsdm_probe(struct platform_device *pdev) { struct dfsdm_priv *priv; - const struct of_device_id *of_id; const struct stm32_dfsdm_dev_data *dev_data; struct stm32_dfsdm *dfsdm; int ret; From 031734b7d6532633d0cde73475c30646bf37cd6d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 18 Jan 2018 01:13:54 +0000 Subject: [PATCH 235/236] ASoC: soc-core: add missing EXPORT_SYMBOL_GPL() for snd_soc_rtdcom_lookup Reported-by: Atsushi Nemoto Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index b9ca939fd05c..9b79c2199781 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -598,6 +598,7 @@ struct snd_soc_component 
*snd_soc_rtdcom_lookup(struct snd_soc_pcm_runtime *rtd, return NULL; } +EXPORT_SYMBOL_GPL(snd_soc_rtdcom_lookup); struct snd_pcm_substream *snd_soc_get_dai_substream(struct snd_soc_card *card, const char *dai_link, int stream) From 700c17ca8968f473631594e8a7c2cc880ba2c891 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Thu, 18 Jan 2018 13:31:26 +0800 Subject: [PATCH 236/236] ASoC: use seq_file to dump the contents of dai_list,platform_list and codec_list Now the debugfs files dais/platforms/codecs have a size limit PAGE_SIZE and the user can not see the whole contents of dai_list/platform_list/codec_list when they are larger than this limit. This patch uses seq_file instead to make sure dais/platforms/codecs show the full contents of dai_list/platform_list/codec_list. Signed-off-by: Donglin Peng Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 111 +++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 74 deletions(-) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index c0edac80df34..7b582112e3fc 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -349,120 +349,84 @@ static void soc_init_codec_debugfs(struct snd_soc_component *component) "ASoC: Failed to create codec register debugfs file\n"); } -static ssize_t codec_list_read_file(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static int codec_list_seq_show(struct seq_file *m, void *v) { - char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - ssize_t len, ret = 0; struct snd_soc_codec *codec; - if (!buf) - return -ENOMEM; - mutex_lock(&client_mutex); - list_for_each_entry(codec, &codec_list, list) { - len = snprintf(buf + ret, PAGE_SIZE - ret, "%s\n", - codec->component.name); - if (len >= 0) - ret += len; - if (ret > PAGE_SIZE) { - ret = PAGE_SIZE; - break; - } - } + list_for_each_entry(codec, &codec_list, list) + seq_printf(m, "%s\n", codec->component.name); mutex_unlock(&client_mutex); - if (ret >= 0) - ret = simple_read_from_buffer(user_buf, 
count, ppos, buf, ret); + return 0; +} - kfree(buf); - - return ret; +static int codec_list_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, codec_list_seq_show, NULL); } static const struct file_operations codec_list_fops = { - .read = codec_list_read_file, - .llseek = default_llseek,/* read accesses f_pos */ + .open = codec_list_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; -static ssize_t dai_list_read_file(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static int dai_list_seq_show(struct seq_file *m, void *v) { - char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - ssize_t len, ret = 0; struct snd_soc_component *component; struct snd_soc_dai *dai; - if (!buf) - return -ENOMEM; - mutex_lock(&client_mutex); - list_for_each_entry(component, &component_list, list) { - list_for_each_entry(dai, &component->dai_list, list) { - len = snprintf(buf + ret, PAGE_SIZE - ret, "%s\n", - dai->name); - if (len >= 0) - ret += len; - if (ret > PAGE_SIZE) { - ret = PAGE_SIZE; - break; - } - } - } + list_for_each_entry(component, &component_list, list) + list_for_each_entry(dai, &component->dai_list, list) + seq_printf(m, "%s\n", dai->name); mutex_unlock(&client_mutex); - ret = simple_read_from_buffer(user_buf, count, ppos, buf, ret); + return 0; +} - kfree(buf); - - return ret; +static int dai_list_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, dai_list_seq_show, NULL); } static const struct file_operations dai_list_fops = { - .read = dai_list_read_file, - .llseek = default_llseek,/* read accesses f_pos */ + .open = dai_list_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; -static ssize_t platform_list_read_file(struct file *file, - char __user *user_buf, - size_t count, loff_t *ppos) +static int platform_list_seq_show(struct seq_file *m, void *v) { - char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - ssize_t len, ret = 0; struct 
snd_soc_platform *platform; - if (!buf) - return -ENOMEM; - mutex_lock(&client_mutex); - list_for_each_entry(platform, &platform_list, list) { - len = snprintf(buf + ret, PAGE_SIZE - ret, "%s\n", - platform->component.name); - if (len >= 0) - ret += len; - if (ret > PAGE_SIZE) { - ret = PAGE_SIZE; - break; - } - } + list_for_each_entry(platform, &platform_list, list) + seq_printf(m, "%s\n", platform->component.name); mutex_unlock(&client_mutex); - ret = simple_read_from_buffer(user_buf, count, ppos, buf, ret); + return 0; +} - kfree(buf); - - return ret; +static int platform_list_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, platform_list_seq_show, NULL); } static const struct file_operations platform_list_fops = { - .read = platform_list_read_file, - .llseek = default_llseek,/* read accesses f_pos */ + .open = platform_list_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static void soc_init_card_debugfs(struct snd_soc_card *card) @@ -491,7 +455,6 @@ static void soc_cleanup_card_debugfs(struct snd_soc_card *card) debugfs_remove_recursive(card->debugfs_card_root); } - static void snd_soc_debugfs_init(void) { snd_soc_debugfs_root = debugfs_create_dir("asoc", NULL);