From 92f43c452c5313a7914eab2b08d966a6c5007baa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2015 00:05:14 +0200 Subject: [PATCH 0001/1466] kbuild/mkspec: Simplify vmlinux.bz2 creation No need for the intermediary vmlinux.orig - bzip2 can keep the original files used for compression with --keep. Signed-off-by: Borislav Petkov Signed-off-by: Michal Marek --- scripts/package/mkspec | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index d9ab94b17de0..89f9669d4f00 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -111,10 +111,8 @@ echo 'cp System.map $RPM_BUILD_ROOT'"/boot/System.map-$KERNELRELEASE" echo 'cp .config $RPM_BUILD_ROOT'"/boot/config-$KERNELRELEASE" echo "%ifnarch ppc64" -echo 'cp vmlinux vmlinux.orig' -echo 'bzip2 -9 vmlinux' +echo 'bzip2 -9 --keep vmlinux' echo 'mv vmlinux.bz2 $RPM_BUILD_ROOT'"/boot/vmlinux-$KERNELRELEASE.bz2" -echo 'mv vmlinux.orig vmlinux' echo "%endif" if ! $PREBUILT; then From dca0c0246fb739bccdd19ff2bfd0f02ccffdb07c Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Thu, 16 Apr 2015 16:42:46 +0300 Subject: [PATCH 0002/1466] deb-pkg: move setting debarch for a separate function create_package() function tries to resolve used architecture for every package. Split setting the architecture into a new function, set_debarch(), called once on startup. This allows using debarch from other parts of script as needed. 
v2: Follow Michals suggestion on setting variables at top scope and also setting the fallback $debarch in the new function Signed-off-by: Riku Voipio Signed-off-by: Michal Marek --- scripts/package/builddeb | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 88dbf23b6970..fccabe5fb72b 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -25,8 +25,13 @@ create_package() { chown -R root:root "$pdir" chmod -R go-w "$pdir" + # Create the package + dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch}" -p$pname -P"$pdir" + dpkg --build "$pdir" .. +} + +set_debarch() { # Attempt to find the correct Debian architecture - local forcearch="" debarch="" case "$UTS_MACHINE" in i386|ia64|alpha) debarch="$UTS_MACHINE" ;; @@ -47,6 +52,7 @@ create_package() { arm*) debarch=arm$(grep -q CONFIG_AEABI=y $KCONFIG_CONFIG && echo el || true) ;; *) + debarch=$(dpkg --print-architecture) echo "" >&2 echo "** ** ** WARNING ** ** **" >&2 echo "" >&2 @@ -59,13 +65,8 @@ create_package() { if [ -n "$KBUILD_DEBARCH" ] ; then debarch="$KBUILD_DEBARCH" fi - if [ -n "$debarch" ] ; then - forcearch="-DArchitecture=$debarch" - fi + forcearch="-DArchitecture=$debarch" - # Create the package - dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch:-$(dpkg --print-architecture)}" -p$pname -P"$pdir" - dpkg --build "$pdir" .. 
} # Some variables and settings used throughout the script @@ -86,6 +87,9 @@ fwpackagename=linux-firmware-image-$version kernel_headers_packagename=linux-headers-$version libc_headers_packagename=linux-libc-dev dbg_packagename=$packagename-dbg +debarch= +forcearch= +set_debarch if [ "$ARCH" = "um" ] ; then packagename=user-mode-linux-$version From 64178cb62c329350fe06622cd215264d849b27b1 Mon Sep 17 00:00:00 2001 From: Andrey Skvortsov Date: Mon, 16 Mar 2015 11:20:54 +0300 Subject: [PATCH 0003/1466] builddeb: fix stripped module signatures if CONFIG_DEBUG_INFO and CONFIG_MODULE_SIG_ALL are set If CONFIG_MODULE_SIG_ALL is set, then user expects that all modules are automatically signed in the result package, as it's for rpm-pkg, binrpm-pkg, tar, tar-*. For deb-pkg this is correct only if CONFIG_DEBUG_INFO is NOT set. In that case deb-package contains signed modules. But if CONFIG_DEBUG_INFO is set, builddeb creates separate package with debug information. To do that, debug information from all modules is copied into separate files by objcopy. And loadable kernel modules are stripped afterwards. Stripping removes previously (during modules_install) added signatures from loadable kernel modules. Therefore final deb-package contains unsigned modules despite of set option CONFIG_MODULE_SIG_ALL. This patch resigns all stripped modules if CONFIG_MODULE_SIG_ALL is set to solve this problem. 
Signed-off-by: Andrey Skvortsov Acked-by: maximilian attems Signed-off-by: Michal Marek --- scripts/package/builddeb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index fccabe5fb72b..222770c1b775 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -166,6 +166,12 @@ if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then # then add a link to those $OBJCOPY --add-gnu-debuglink=$dbg_dir/usr/lib/debug/$module $tmpdir/$module done + + # resign stripped modules + MODULE_SIG_ALL="$(grep -s '^CONFIG_MODULE_SIG_ALL=y' $KCONFIG_CONFIG || true)" + if [ -n "$MODULE_SIG_ALL" ]; then + INSTALL_MOD_PATH="$tmpdir" $MAKE KBUILD_SRC= modules_sign + fi fi fi From ca2a9d2cf6cf3dd852c3926ac7e30ee774da4638 Mon Sep 17 00:00:00 2001 From: "Arnaud Patard (Rtp)" Date: Tue, 3 Feb 2015 13:16:33 +0100 Subject: [PATCH 0004/1466] deb-pkg: Add device tree blobs to the package When building a package with make deb-pkg (say, for arm), the dtb files are not added to the package. Given that things are still evolving on arm, it makes sense to have them along with the kernel and modules. 
Signed-off-by: Arnaud Patard Reviewed-by: Ben Hutchings Acked-by: maximilian attems Signed-off-by: Michal Marek --- scripts/package/builddeb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 222770c1b775..d30116b57e7e 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -147,6 +147,13 @@ else cp arch/$ARCH/boot/$KBUILD_IMAGE "$tmpdir/$installed_image_path" fi +if grep -q "^CONFIG_OF=y" $KCONFIG_CONFIG ; then + # Only some architectures with OF support have this target + if grep -q dtbs_install "${srctree}/arch/$SRCARCH/Makefile"; then + $MAKE KBUILD_SRC= INSTALL_DTBS_PATH="$tmpdir/usr/lib/$packagename" dtbs_install + fi +fi + if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then INSTALL_MOD_PATH="$tmpdir" $MAKE KBUILD_SRC= modules_install rm -f "$tmpdir/lib/modules/$version/build" From f9beafc9d8bf7febf673df9b41e13596ca669f75 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:27 -0300 Subject: [PATCH 0005/1466] coccinelle: pm_runtime: Insert blank line Insert a blank line in order to improve the readability of the generated patch and also make it consistent with the other .cocci files. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/api/pm_runtime.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/api/pm_runtime.cocci b/scripts/coccinelle/api/pm_runtime.cocci index f01789e967ec..b7042d074078 100644 --- a/scripts/coccinelle/api/pm_runtime.cocci +++ b/scripts/coccinelle/api/pm_runtime.cocci @@ -1,5 +1,5 @@ /// Make sure pm_runtime_* calls does not use unnecessary IS_ERR_VALUE -// +/// // Keywords: pm_runtime // Confidence: Medium // Copyright (C) 2013 Texas Instruments Incorporated - GPLv2. 
From fe8c46b632505a880c527bc9ae246e868aa3ece5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:28 -0300 Subject: [PATCH 0006/1466] coccinelle: returnvar: Use imperative mood According to Documentation/SubmittingPatches: "Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour." So do as recommended. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/returnvar.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/returnvar.cocci b/scripts/coccinelle/misc/returnvar.cocci index 605955a91c44..d8286ef5307f 100644 --- a/scripts/coccinelle/misc/returnvar.cocci +++ b/scripts/coccinelle/misc/returnvar.cocci @@ -1,5 +1,5 @@ /// -/// Removes unneeded variable used to store return value. +/// Remove unneeded variable used to store return value. /// // Confidence: Moderate // Copyright: (C) 2012 Peter Senna Tschudin, INRIA/LIP6. GPLv2. From dd494ac0de48ded6a7ec0525f253116fde5c7be5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:29 -0300 Subject: [PATCH 0007/1466] coccinelle: ifaddr: Fix the sentence Make the sentence sensible. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/ifaddr.cocci | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/coccinelle/misc/ifaddr.cocci b/scripts/coccinelle/misc/ifaddr.cocci index 8aebd1875e75..c2663c677ac1 100644 --- a/scripts/coccinelle/misc/ifaddr.cocci +++ b/scripts/coccinelle/misc/ifaddr.cocci @@ -1,5 +1,4 @@ -/// the address of a variable or field is non-zero is likely always to bo -/// non-zero +/// The address of a variable or field is likely always to be non-zero. /// // Confidence: High // Copyright: (C) 2012 Julia Lawall, INRIA/LIP6. GPLv2. 
From ca34cba43168830dd96f8f6407282131733e6fb4 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 9 May 2015 17:09:30 -0300 Subject: [PATCH 0008/1466] coccinelle: simple_open: Use imperative mood According to Documentation/SubmittingPatches: "Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour." So do as recommended. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/api/simple_open.cocci | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/coccinelle/api/simple_open.cocci b/scripts/coccinelle/api/simple_open.cocci index b67e174f3d95..bd1a2a4ee106 100644 --- a/scripts/coccinelle/api/simple_open.cocci +++ b/scripts/coccinelle/api/simple_open.cocci @@ -1,5 +1,5 @@ -/// This removes an open coded simple_open() function -/// and replaces file operations references to the function +/// Remove an open coded simple_open() function +/// and replace file operations references to the function /// with simple_open() instead. /// // Confidence: High From 4341f6e5ce448dd79c3e663513213b936ba34c83 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 May 2015 08:02:34 -0300 Subject: [PATCH 0009/1466] scripts/coccinelle/misc/semicolon.cocci: Use imperative mood According to Documentation/SubmittingPatches: "Describe your changes in imperative mood, e.g. "make xyzzy do frotz" instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy to do frotz", as if you are giving orders to the codebase to change its behaviour." So do as recommended. 
Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/semicolon.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/semicolon.cocci b/scripts/coccinelle/misc/semicolon.cocci index a47eba2edc9e..6740c659a2b3 100644 --- a/scripts/coccinelle/misc/semicolon.cocci +++ b/scripts/coccinelle/misc/semicolon.cocci @@ -1,5 +1,5 @@ /// -/// Removes unneeded semicolon. +/// Remove unneeded semicolon. /// // Confidence: Moderate // Copyright: (C) 2012 Peter Senna Tschudin, INRIA/LIP6. GPLv2. From 74de120d8096f72bdf95aba7234428c798d931cd Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 20 May 2015 08:02:35 -0300 Subject: [PATCH 0010/1466] scripts/coccinelle/misc/irqf_oneshot.cocci: Fix grammar Correct form is 'always requested'. Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/irqf_oneshot.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/irqf_oneshot.cocci b/scripts/coccinelle/misc/irqf_oneshot.cocci index a24a754ae1d7..b17ac8b99894 100644 --- a/scripts/coccinelle/misc/irqf_oneshot.cocci +++ b/scripts/coccinelle/misc/irqf_oneshot.cocci @@ -1,4 +1,4 @@ -/// Make sure threaded IRQs without a primary handler are always request with +/// Make sure threaded IRQs without a primary handler are always requested with /// IRQF_ONESHOT /// // From 4c8f20bb8e0ba6eecf62958bbf0502a2dc445ce6 Mon Sep 17 00:00:00 2001 From: Dmitry Kalinkin Date: Thu, 21 May 2015 19:19:13 +0800 Subject: [PATCH 0011/1466] coccinelle: api: add vma_pages.cocci This semantic patch replaces explicit computations of vma page count with explicit function call. 
Signed-off-by: Dmitry Kalinkin Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/api/vma_pages.cocci | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 scripts/coccinelle/api/vma_pages.cocci diff --git a/scripts/coccinelle/api/vma_pages.cocci b/scripts/coccinelle/api/vma_pages.cocci new file mode 100644 index 000000000000..3e52e11ea1dc --- /dev/null +++ b/scripts/coccinelle/api/vma_pages.cocci @@ -0,0 +1,60 @@ +/// +/// Use vma_pages function on vma object instead of explicit computation. +/// +// Confidence: High +// Keywords: vma_pages vma +// Comment: Based on resource_size.cocci + +virtual context +virtual patch +virtual org +virtual report + +//---------------------------------------------------------- +// For context mode +//---------------------------------------------------------- + +@r_context depends on context && !patch && !org && !report@ +struct vm_area_struct *vma; +@@ + +* (vma->vm_end - vma->vm_start) >> PAGE_SHIFT + +//---------------------------------------------------------- +// For patch mode +//---------------------------------------------------------- + +@r_patch depends on !context && patch && !org && !report@ +struct vm_area_struct *vma; +@@ + +- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) ++ vma_pages(vma) + +//---------------------------------------------------------- +// For org mode +//---------------------------------------------------------- + +@r_org depends on !context && !patch && (org || report)@ +struct vm_area_struct *vma; +position p; +@@ + + (vma->vm_end@p - vma->vm_start) >> PAGE_SHIFT + +@script:python depends on report@ +p << r_org.p; +x << r_org.vma; +@@ + +msg="WARNING: Consider using vma_pages helper on %s" % (x) +coccilib.report.print_report(p[0], msg) + +@script:python depends on org@ +p << r_org.p; +x << r_org.vma; +@@ + +msg="WARNING: Consider using vma_pages helper on %s" % (x) +msg_safe=msg.replace("[","@(").replace("]",")") +coccilib.org.print_todo(p[0], 
msg_safe) From 9473a62f779d78bae646e7ef1a792d53ad4ac29e Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 24 May 2015 17:45:54 -0300 Subject: [PATCH 0012/1466] coccinelle: irqf_oneshot.cocci: Improve the generated commit log Improve the commit log of the generated patch by mentioning the commit that requires threaded IRQs without a primary handler to be requested with the IRQF_ONESHOT flag. Signed-off-by: Fabio Estevam Acked-by: Valentin Rothberg Signed-off-by: Michal Marek --- scripts/coccinelle/misc/irqf_oneshot.cocci | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/coccinelle/misc/irqf_oneshot.cocci b/scripts/coccinelle/misc/irqf_oneshot.cocci index b17ac8b99894..b421150a2eff 100644 --- a/scripts/coccinelle/misc/irqf_oneshot.cocci +++ b/scripts/coccinelle/misc/irqf_oneshot.cocci @@ -1,5 +1,8 @@ -/// Make sure threaded IRQs without a primary handler are always requested with -/// IRQF_ONESHOT +/// Since commit 1c6c69525b40 ("genirq: Reject bogus threaded irq requests") +/// threaded IRQs without a primary handler need to be requested with +/// IRQF_ONESHOT, otherwise the request will fail. +/// +/// So pass the IRQF_ONESHOT flag in this case. /// // // Confidence: Good From f94c56f4f33dd34551af6bcc1afde5082fdf6e86 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 1 Jun 2015 22:52:20 -0300 Subject: [PATCH 0013/1466] coccinelle: simple_return: Add a blank line Insert a blank line in order to improve the readability of the generated patch and also make it consistent with the other .cocci files. 
Signed-off-by: Fabio Estevam Acked-by: Julia Lawall Signed-off-by: Michal Marek --- scripts/coccinelle/misc/simple_return.cocci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/coccinelle/misc/simple_return.cocci b/scripts/coccinelle/misc/simple_return.cocci index 47f7084b6360..e8b6313b116f 100644 --- a/scripts/coccinelle/misc/simple_return.cocci +++ b/scripts/coccinelle/misc/simple_return.cocci @@ -1,6 +1,6 @@ /// Simplify a trivial if-return sequence. Possibly combine with a /// preceding function call. -// +/// // Confidence: High // Copyright: (C) 2014 Julia Lawall, INRIA/LIP6. GPLv2. // Copyright: (C) 2014 Gilles Muller, INRIA/LiP6. GPLv2. From d0fe116b4554d79125f384f7ba23722b41c3cb93 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 24 Apr 2015 10:27:40 -0700 Subject: [PATCH 0014/1466] gitignore: Add MIPS vmlinux.32 to the list MIPS64 kernel builds will produce a vmlinux.32 kernel image for compatibility; ignore it. Signed-off-by: Florian Fainelli Signed-off-by: Michal Marek --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4ad4a98b884b..34d6bad9317b 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,7 @@ Module.symvers /TAGS /linux /vmlinux +/vmlinux.32 /vmlinux-gdb.py /vmlinuz /System.map From a37161c0588c0d3ff4afb08ef83106a80bde604e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Apr 2015 14:02:41 -0700 Subject: [PATCH 0015/1466] Kbuild: Add ID files to .gitignore I use GNU id-utils to find code (essentially a database backed grep), which generates an ID file to maintain its data. Add ID to the .gitignore file. 
Signed-off-by: Andi Kleen Signed-off-by: Michal Marek --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 34d6bad9317b..98b91fccff45 100644 --- a/.gitignore +++ b/.gitignore @@ -90,6 +90,9 @@ GRTAGS GSYMS GTAGS +# id-utils files +ID + *.orig *~ \#*# From 21a59991ce0cd9a0b54b135305e3fcf880f2aaf1 Mon Sep 17 00:00:00 2001 From: Jim Davis Date: Mon, 8 Jun 2015 13:19:08 -0700 Subject: [PATCH 0016/1466] scripts/package/Makefile: rpmbuild is needed for rpm targets Before rpm release 4.1, in 2002, either the rpm command or the rpmbuild command could be used in the rpm-pkg or binrpm-pkg targets, and the Makefile chose the rpm command if the rpmbuild command wasn't found. After release 4.1, however, the rpm command could no longer be used in place of the rpmbuild command. As the rpmbuild command is not installed by default, this can lead to failures with the rpm-pkg and binrpm-pkg targets: rpm --define "_builddir ." --target \ x86_64 -bb ./binkernel.spec rpm --target: unknown option scripts/package/Makefile:60: recipe for target 'binrpm-pkg' failed Change the Makefile to use rpmbuild unconditionally to avoid this. Signed-off-by: Jim Davis Signed-off-by: Michal Marek --- scripts/package/Makefile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/package/Makefile b/scripts/package/Makefile index 99ca6e76eb0a..8b11d5adec7f 100644 --- a/scripts/package/Makefile +++ b/scripts/package/Makefile @@ -21,10 +21,6 @@ # Note that the rpm-pkg target cannot be used with KBUILD_OUTPUT, # but the binrpm-pkg target can; for some reason O= gets ignored. 
-# Do we have rpmbuild, otherwise fall back to the older rpm -RPM := $(shell if [ -x "/usr/bin/rpmbuild" ]; then echo rpmbuild; \ - else echo rpm; fi) - # Remove hyphens since they have special meaning in RPM filenames KERNELPATH := kernel-$(subst -,_,$(KERNELRELEASE)) # Include only those top-level files that are needed by make, plus the GPL copy @@ -51,7 +47,7 @@ rpm-pkg rpm: FORCE rm -f $(objtree)/.scmversion $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version mv -f $(objtree)/.tmp_version $(objtree)/.version - $(RPM) $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz + rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz rm $(KERNELPATH).tar.gz kernel.spec # binrpm-pkg @@ -62,7 +58,7 @@ binrpm-pkg: FORCE $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version mv -f $(objtree)/.tmp_version $(objtree)/.version - $(RPM) $(RPMOPTS) --define "_builddir $(objtree)" --target \ + rpmbuild --define "_builddir $(objtree)" --target \ $(UTS_MACHINE) -bb $(objtree)/binkernel.spec rm binkernel.spec From fe052a1810ec4687ee7d606290561af504047707 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 29 Jun 2015 13:08:19 +0300 Subject: [PATCH 0017/1466] target: Use struct t10_pi_tuple It's not a good idea to keep a target-specific definition of the same t10-pi tuple. 
(Fix v4.2-rc1 patch fuzz - nab) Signed-off-by: Sagi Grimberg Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_device.c | 2 +- drivers/target/target_core_sbc.c | 10 +++++----- include/target/target_core_base.h | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 09e682b1c549..db7034292053 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -754,7 +754,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) dev->dev_link_magic = SE_DEV_LINK_MAGIC; dev->se_hba = hba; dev->transport = hba->backend->ops; - dev->prot_length = sizeof(struct se_dif_v1_tuple); + dev->prot_length = sizeof(struct t10_pi_tuple); dev->hba_index = hba->hba_index; INIT_LIST_HEAD(&dev->dev_list); diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index e318ddbe15da..ac7215039e5a 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -1191,7 +1191,7 @@ void sbc_dif_generate(struct se_cmd *cmd) { struct se_device *dev = cmd->se_dev; - struct se_dif_v1_tuple *sdt; + struct t10_pi_tuple *sdt; struct scatterlist *dsg = cmd->t_data_sg, *psg; sector_t sector = cmd->t_task_lba; void *daddr, *paddr; @@ -1203,7 +1203,7 @@ sbc_dif_generate(struct se_cmd *cmd) daddr = kmap_atomic(sg_page(dsg)) + dsg->offset; for (j = 0; j < psg->length; - j += sizeof(struct se_dif_v1_tuple)) { + j += sizeof(*sdt)) { __u16 crc; unsigned int avail; @@ -1256,7 +1256,7 @@ sbc_dif_generate(struct se_cmd *cmd) } static sense_reason_t -sbc_dif_v1_verify(struct se_cmd *cmd, struct se_dif_v1_tuple *sdt, +sbc_dif_v1_verify(struct se_cmd *cmd, struct t10_pi_tuple *sdt, __u16 crc, sector_t sector, unsigned int ei_lba) { __be16 csum; @@ -1346,7 +1346,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors, unsigned int ei_lba, struct scatterlist *psg, int psg_off) { struct 
se_device *dev = cmd->se_dev; - struct se_dif_v1_tuple *sdt; + struct t10_pi_tuple *sdt; struct scatterlist *dsg = cmd->t_data_sg; sector_t sector = start; void *daddr, *paddr; @@ -1361,7 +1361,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors, for (i = psg_off; i < psg->length && sector < start + sectors; - i += sizeof(struct se_dif_v1_tuple)) { + i += sizeof(*sdt)) { __u16 crc; unsigned int avail; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 17ae2d6a4891..a6816444d81b 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -426,12 +427,6 @@ enum target_core_dif_check { TARGET_DIF_CHECK_REFTAG = 0x1 << 2, }; -struct se_dif_v1_tuple { - __be16 guard_tag; - __be16 app_tag; - __be32 ref_tag; -}; - /* for sam_task_attr */ #define TCM_SIMPLE_TAG 0x20 #define TCM_HEAD_TAG 0x21 From 04dc91ce2cca5927159c689aa1f47663f8c51530 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 13 Jul 2015 12:26:44 +0200 Subject: [PATCH 0018/1466] regmap: Add better support for devices without readback support Currently regmap requires that a reg_read callback is supplied, otherwise a warning is emitted each time regmap_read() is called. This means a device or bus without readback support needs to supply a dummy reg_read callback. Apart from that regmap_read() will still work fine if a cache is used. Remove the warning and let regmap_readable() return false if no reg_read callback is supplied. This means a device no longer has to supply a dummy callback if it does not support readback and it also doesn't have to have a readable_reg callback that always returns false since this is now implicit. 
Signed-off-by: Lars-Peter Clausen Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 7111d04f2621..8894b992043e 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -93,6 +93,9 @@ bool regmap_writeable(struct regmap *map, unsigned int reg) bool regmap_readable(struct regmap *map, unsigned int reg) { + if (!map->reg_read) + return false; + if (map->max_register && reg > map->max_register) return false; @@ -2097,8 +2100,6 @@ static int _regmap_read(struct regmap *map, unsigned int reg, int ret; void *context = _regmap_map_get_context(map); - WARN_ON(!map->reg_read); - if (!map->cache_bypass) { ret = regcache_read(map, reg, val); if (ret == 0) From 671a2781ff01abf4fdc8904881fc3abd3a8279af Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 10 Jul 2015 17:19:55 -0400 Subject: [PATCH 0019/1466] security: add ioctl specific auditing to lsm_audit Add information about ioctl calls to the LSM audit data. Log the file path and command number. Signed-off-by: Jeff Vander Stoep Acked-by: Nick Kralevich [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 7 +++++++ security/lsm_audit.c | 15 +++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 1cc89e9df480..ffb9c9da4f39 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -40,6 +40,11 @@ struct lsm_network_audit { } fam; }; +struct lsm_ioctlop_audit { + struct path path; + u16 cmd; +}; + /* Auxiliary data to use in generating the audit record. 
*/ struct common_audit_data { char type; @@ -53,6 +58,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_KMOD 8 #define LSM_AUDIT_DATA_INODE 9 #define LSM_AUDIT_DATA_DENTRY 10 +#define LSM_AUDIT_DATA_IOCTL_OP 11 union { struct path path; struct dentry *dentry; @@ -68,6 +74,7 @@ struct common_audit_data { } key_struct; #endif char *kmod_name; + struct lsm_ioctlop_audit *op; } u; /* this union contains LSM specific data */ union { diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 1d34277dc402..9f6c649c65e9 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -245,6 +245,21 @@ static void dump_common_audit_data(struct audit_buffer *ab, } break; } + case LSM_AUDIT_DATA_IOCTL_OP: { + struct inode *inode; + + audit_log_d_path(ab, " path=", &a->u.op->path); + + inode = a->u.op->path.dentry->d_inode; + if (inode) { + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", inode->i_ino); + } + + audit_log_format(ab, " ioctlcmd=%hx", a->u.op->cmd); + break; + } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; From fa1aa143ac4a682c7f5fd52a3cf05f5a6fe44a0a Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 10 Jul 2015 17:19:56 -0400 Subject: [PATCH 0020/1466] selinux: extended permissions for ioctls Add extended permissions logic to selinux. Extended permissions provides additional permissions in 256 bit increments. Extend the generic ioctl permission check to use the extended permissions for per-command filtering. Source/target/class sets including the ioctl permission may additionally include a set of commands. Example: allowxperm : ioctl unpriv_app_socket_cmds auditallowxperm : ioctl priv_gpu_cmds Where unpriv_app_socket_cmds and priv_gpu_cmds are macros representing commonly granted sets of ioctl commands. When ioctl commands are omitted only the permissions are checked. 
This feature is intended to provide finer granularity for the ioctl permission that may be too imprecise. For example, the same driver may use ioctls to provide important and benign functionality such as driver version or socket type as well as dangerous capabilities such as debugging features, read/write/execute to physical memory or access to sensitive data. Per-command filtering provides a mechanism to reduce the attack surface of the kernel, and limit applications to the subset of commands required. The format of the policy binary has been modified to include ioctl commands, and the policy version number has been incremented to POLICYDB_VERSION_XPERMS_IOCTL=30 to account for the format change. The extended permissions logic is deliberately generic to allow components to be reused e.g. netlink filters Signed-off-by: Jeff Vander Stoep Acked-by: Nick Kralevich Signed-off-by: Paul Moore --- security/selinux/avc.c | 415 +++++++++++++++++++++++++++- security/selinux/hooks.c | 42 ++- security/selinux/include/avc.h | 6 + security/selinux/include/security.h | 32 ++- security/selinux/ss/avtab.c | 104 ++++++- security/selinux/ss/avtab.h | 33 ++- security/selinux/ss/conditional.c | 38 ++- security/selinux/ss/conditional.h | 6 +- security/selinux/ss/policydb.c | 5 + security/selinux/ss/services.c | 213 ++++++++++++-- security/selinux/ss/services.h | 6 + 11 files changed, 837 insertions(+), 63 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 3c17dda9571d..2d5e1b04cd50 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ struct avc_entry { u32 tsid; u16 tclass; struct av_decision avd; + struct avc_xperms_node *xp_node; }; struct avc_node { @@ -56,6 +58,16 @@ struct avc_node { struct rcu_head rhead; }; +struct avc_xperms_decision_node { + struct extended_perms_decision xpd; + struct list_head xpd_list; /* list of extended_perms_decision 
*/ +}; + +struct avc_xperms_node { + struct extended_perms xp; + struct list_head xpd_head; /* list head of extended_perms_decision */ +}; + struct avc_cache { struct hlist_head slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */ spinlock_t slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */ @@ -80,6 +92,9 @@ DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 }; static struct avc_cache avc_cache; static struct avc_callback_node *avc_callbacks; static struct kmem_cache *avc_node_cachep; +static struct kmem_cache *avc_xperms_data_cachep; +static struct kmem_cache *avc_xperms_decision_cachep; +static struct kmem_cache *avc_xperms_cachep; static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) { @@ -170,7 +185,17 @@ void __init avc_init(void) atomic_set(&avc_cache.lru_hint, 0); avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC, NULL); + avc_xperms_cachep = kmem_cache_create("avc_xperms_node", + sizeof(struct avc_xperms_node), + 0, SLAB_PANIC, NULL); + avc_xperms_decision_cachep = kmem_cache_create( + "avc_xperms_decision_node", + sizeof(struct avc_xperms_decision_node), + 0, SLAB_PANIC, NULL); + avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data", + sizeof(struct extended_perms_data), + 0, SLAB_PANIC, NULL); audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n"); } @@ -205,9 +230,261 @@ int avc_get_hash_stats(char *page) slots_used, AVC_CACHE_SLOTS, max_chain_len); } +/* + * using a linked list for extended_perms_decision lookup because the list is + * always small. i.e. 
less than 5, typically 1 + */ +static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver, + struct avc_xperms_node *xp_node) +{ + struct avc_xperms_decision_node *xpd_node; + + list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) { + if (xpd_node->xpd.driver == driver) + return &xpd_node->xpd; + } + return NULL; +} + +static inline unsigned int +avc_xperms_has_perm(struct extended_perms_decision *xpd, + u8 perm, u8 which) +{ + unsigned int rc = 0; + + if ((which == XPERMS_ALLOWED) && + (xpd->used & XPERMS_ALLOWED)) + rc = security_xperm_test(xpd->allowed->p, perm); + else if ((which == XPERMS_AUDITALLOW) && + (xpd->used & XPERMS_AUDITALLOW)) + rc = security_xperm_test(xpd->auditallow->p, perm); + else if ((which == XPERMS_DONTAUDIT) && + (xpd->used & XPERMS_DONTAUDIT)) + rc = security_xperm_test(xpd->dontaudit->p, perm); + return rc; +} + +static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node, + u8 driver, u8 perm) +{ + struct extended_perms_decision *xpd; + security_xperm_set(xp_node->xp.drivers.p, driver); + xpd = avc_xperms_decision_lookup(driver, xp_node); + if (xpd && xpd->allowed) + security_xperm_set(xpd->allowed->p, perm); +} + +static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node) +{ + struct extended_perms_decision *xpd; + + xpd = &xpd_node->xpd; + if (xpd->allowed) + kmem_cache_free(avc_xperms_data_cachep, xpd->allowed); + if (xpd->auditallow) + kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow); + if (xpd->dontaudit) + kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit); + kmem_cache_free(avc_xperms_decision_cachep, xpd_node); +} + +static void avc_xperms_free(struct avc_xperms_node *xp_node) +{ + struct avc_xperms_decision_node *xpd_node, *tmp; + + if (!xp_node) + return; + + list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) { + list_del(&xpd_node->xpd_list); + avc_xperms_decision_free(xpd_node); + } + kmem_cache_free(avc_xperms_cachep, xp_node); +} + +static 
void avc_copy_xperms_decision(struct extended_perms_decision *dest, + struct extended_perms_decision *src) +{ + dest->driver = src->driver; + dest->used = src->used; + if (dest->used & XPERMS_ALLOWED) + memcpy(dest->allowed->p, src->allowed->p, + sizeof(src->allowed->p)); + if (dest->used & XPERMS_AUDITALLOW) + memcpy(dest->auditallow->p, src->auditallow->p, + sizeof(src->auditallow->p)); + if (dest->used & XPERMS_DONTAUDIT) + memcpy(dest->dontaudit->p, src->dontaudit->p, + sizeof(src->dontaudit->p)); +} + +/* + * similar to avc_copy_xperms_decision, but only copy decision + * information relevant to this perm + */ +static inline void avc_quick_copy_xperms_decision(u8 perm, + struct extended_perms_decision *dest, + struct extended_perms_decision *src) +{ + /* + * compute index of the u32 of the 256 bits (8 u32s) that contain this + * command permission + */ + u8 i = perm >> 5; + + dest->used = src->used; + if (dest->used & XPERMS_ALLOWED) + dest->allowed->p[i] = src->allowed->p[i]; + if (dest->used & XPERMS_AUDITALLOW) + dest->auditallow->p[i] = src->auditallow->p[i]; + if (dest->used & XPERMS_DONTAUDIT) + dest->dontaudit->p[i] = src->dontaudit->p[i]; +} + +static struct avc_xperms_decision_node + *avc_xperms_decision_alloc(u8 which) +{ + struct avc_xperms_decision_node *xpd_node; + struct extended_perms_decision *xpd; + + xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd_node) + return NULL; + + xpd = &xpd_node->xpd; + if (which & XPERMS_ALLOWED) { + xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd->allowed) + goto error; + } + if (which & XPERMS_AUDITALLOW) { + xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd->auditallow) + goto error; + } + if (which & XPERMS_DONTAUDIT) { + xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, + GFP_ATOMIC | __GFP_NOMEMALLOC); + if (!xpd->dontaudit) + goto error; + 
} + return xpd_node; +error: + avc_xperms_decision_free(xpd_node); + return NULL; +} + +static int avc_add_xperms_decision(struct avc_node *node, + struct extended_perms_decision *src) +{ + struct avc_xperms_decision_node *dest_xpd; + + node->ae.xp_node->xp.len++; + dest_xpd = avc_xperms_decision_alloc(src->used); + if (!dest_xpd) + return -ENOMEM; + avc_copy_xperms_decision(&dest_xpd->xpd, src); + list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); + return 0; +} + +static struct avc_xperms_node *avc_xperms_alloc(void) +{ + struct avc_xperms_node *xp_node; + + xp_node = kmem_cache_zalloc(avc_xperms_cachep, + GFP_ATOMIC|__GFP_NOMEMALLOC); + if (!xp_node) + return xp_node; + INIT_LIST_HEAD(&xp_node->xpd_head); + return xp_node; +} + +static int avc_xperms_populate(struct avc_node *node, + struct avc_xperms_node *src) +{ + struct avc_xperms_node *dest; + struct avc_xperms_decision_node *dest_xpd; + struct avc_xperms_decision_node *src_xpd; + + if (src->xp.len == 0) + return 0; + dest = avc_xperms_alloc(); + if (!dest) + return -ENOMEM; + + memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p)); + dest->xp.len = src->xp.len; + + /* for each source xpd allocate a destination xpd and copy */ + list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) { + dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used); + if (!dest_xpd) + goto error; + avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd); + list_add(&dest_xpd->xpd_list, &dest->xpd_head); + } + node->ae.xp_node = dest; + return 0; +error: + avc_xperms_free(dest); + return -ENOMEM; + +} + +static inline u32 avc_xperms_audit_required(u32 requested, + struct av_decision *avd, + struct extended_perms_decision *xpd, + u8 perm, + int result, + u32 *deniedp) +{ + u32 denied, audited; + + denied = requested & ~avd->allowed; + if (unlikely(denied)) { + audited = denied & avd->auditdeny; + if (audited && xpd) { + if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT)) + audited &= ~requested; + } + 
} else if (result) { + audited = denied = requested; + } else { + audited = requested & avd->auditallow; + if (audited && xpd) { + if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW)) + audited &= ~requested; + } + } + + *deniedp = denied; + return audited; +} + +static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass, + u32 requested, struct av_decision *avd, + struct extended_perms_decision *xpd, + u8 perm, int result, + struct common_audit_data *ad) +{ + u32 audited, denied; + + audited = avc_xperms_audit_required( + requested, avd, xpd, perm, result, &denied); + if (likely(!audited)) + return 0; + return slow_avc_audit(ssid, tsid, tclass, requested, + audited, denied, result, ad, 0); +} + static void avc_node_free(struct rcu_head *rhead) { struct avc_node *node = container_of(rhead, struct avc_node, rhead); + avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); } @@ -221,6 +498,7 @@ static void avc_node_delete(struct avc_node *node) static void avc_node_kill(struct avc_node *node) { + avc_xperms_free(node->ae.xp_node); kmem_cache_free(avc_node_cachep, node); avc_cache_stats_incr(frees); atomic_dec(&avc_cache.active_nodes); @@ -367,6 +645,7 @@ static int avc_latest_notif_update(int seqno, int is_insert) * @tsid: target security identifier * @tclass: target security class * @avd: resulting av decision + * @xp_node: resulting extended permissions * * Insert an AVC entry for the SID pair * (@ssid, @tsid) and class @tclass. @@ -378,7 +657,9 @@ static int avc_latest_notif_update(int seqno, int is_insert) * the access vectors into a cache entry, returns * avc_node inserted. Otherwise, this function returns NULL. 
*/ -static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) +static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, + struct av_decision *avd, + struct avc_xperms_node *xp_node) { struct avc_node *pos, *node = NULL; int hvalue; @@ -391,10 +672,15 @@ static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_dec if (node) { struct hlist_head *head; spinlock_t *lock; + int rc = 0; hvalue = avc_hash(ssid, tsid, tclass); avc_node_populate(node, ssid, tsid, tclass, avd); - + rc = avc_xperms_populate(node, xp_node); + if (rc) { + kmem_cache_free(avc_node_cachep, node); + return NULL; + } head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; @@ -523,14 +809,17 @@ out: * @perms : Permission mask bits * @ssid,@tsid,@tclass : identifier of an AVC entry * @seqno : sequence number when decision was made + * @xpd: extended_perms_decision to be added to the node * * if a valid AVC entry doesn't exist,this function returns -ENOENT. * if kmalloc() called internal returns NULL, this function returns -ENOMEM. * otherwise, this function updates the AVC entry. The original AVC-entry object * will release later by RCU. 
*/ -static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, - u32 seqno) +static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid, + u32 tsid, u16 tclass, u32 seqno, + struct extended_perms_decision *xpd, + u32 flags) { int hvalue, rc = 0; unsigned long flag; @@ -574,9 +863,19 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd); + if (orig->ae.xp_node) { + rc = avc_xperms_populate(node, orig->ae.xp_node); + if (rc) { + kmem_cache_free(avc_node_cachep, node); + goto out_unlock; + } + } + switch (event) { case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; + if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS)) + avc_xperms_allow_perm(node->ae.xp_node, driver, xperm); break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: @@ -594,6 +893,9 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, case AVC_CALLBACK_AUDITDENY_DISABLE: node->ae.avd.auditdeny &= ~perms; break; + case AVC_CALLBACK_ADD_XPERMS: + avc_add_xperms_decision(node, xpd); + break; } avc_node_replace(node, orig); out_unlock: @@ -665,18 +967,20 @@ int avc_ss_reset(u32 seqno) * results in a bigger stack frame. 
*/ static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid, - u16 tclass, struct av_decision *avd) + u16 tclass, struct av_decision *avd, + struct avc_xperms_node *xp_node) { rcu_read_unlock(); - security_compute_av(ssid, tsid, tclass, avd); + INIT_LIST_HEAD(&xp_node->xpd_head); + security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp); rcu_read_lock(); - return avc_insert(ssid, tsid, tclass, avd); + return avc_insert(ssid, tsid, tclass, avd, xp_node); } static noinline int avc_denied(u32 ssid, u32 tsid, - u16 tclass, u32 requested, - unsigned flags, - struct av_decision *avd) + u16 tclass, u32 requested, + u8 driver, u8 xperm, unsigned flags, + struct av_decision *avd) { if (flags & AVC_STRICT) return -EACCES; @@ -684,11 +988,91 @@ static noinline int avc_denied(u32 ssid, u32 tsid, if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE)) return -EACCES; - avc_update_node(AVC_CALLBACK_GRANT, requested, ssid, - tsid, tclass, avd->seqno); + avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid, + tsid, tclass, avd->seqno, NULL, flags); return 0; } +/* + * The avc extended permissions logic adds an additional 256 bits of + * permissions to an avc node when extended permissions for that node are + * specified in the avtab. If the additional 256 permissions are not adequate, + * as is the case with ioctls, then multiple sets may be chained together and + * the driver field is used to specify which set contains the permission. 
+ */ +int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 xperm, struct common_audit_data *ad) +{ + struct avc_node *node; + struct av_decision avd; + u32 denied; + struct extended_perms_decision local_xpd; + struct extended_perms_decision *xpd = NULL; + struct extended_perms_data allowed; + struct extended_perms_data auditallow; + struct extended_perms_data dontaudit; + struct avc_xperms_node local_xp_node; + struct avc_xperms_node *xp_node; + int rc = 0, rc2; + + xp_node = &local_xp_node; + BUG_ON(!requested); + + rcu_read_lock(); + + node = avc_lookup(ssid, tsid, tclass); + if (unlikely(!node)) { + node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node); + } else { + memcpy(&avd, &node->ae.avd, sizeof(avd)); + xp_node = node->ae.xp_node; + } + /* if extended permissions are not defined, only consider av_decision */ + if (!xp_node || !xp_node->xp.len) + goto decision; + + local_xpd.allowed = &allowed; + local_xpd.auditallow = &auditallow; + local_xpd.dontaudit = &dontaudit; + + xpd = avc_xperms_decision_lookup(driver, xp_node); + if (unlikely(!xpd)) { + /* + * Compute the extended_perms_decision only if the driver + * is flagged + */ + if (!security_xperm_test(xp_node->xp.drivers.p, driver)) { + avd.allowed &= ~requested; + goto decision; + } + rcu_read_unlock(); + security_compute_xperms_decision(ssid, tsid, tclass, driver, + &local_xpd); + rcu_read_lock(); + avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm, + ssid, tsid, tclass, avd.seqno, &local_xpd, 0); + } else { + avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd); + } + xpd = &local_xpd; + + if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED)) + avd.allowed &= ~requested; + +decision: + denied = requested & ~(avd.allowed); + if (unlikely(denied)) + rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm, + AVC_EXTENDED_PERMS, &avd); + + rcu_read_unlock(); + + rc2 = avc_xperms_audit(ssid, tsid, tclass, requested, + &avd, xpd, xperm, rc, ad); 
+ if (rc2) + return rc2; + return rc; +} /** * avc_has_perm_noaudit - Check permissions but perform no auditing. @@ -716,6 +1100,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, struct av_decision *avd) { struct avc_node *node; + struct avc_xperms_node xp_node; int rc = 0; u32 denied; @@ -725,13 +1110,13 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid, node = avc_lookup(ssid, tsid, tclass); if (unlikely(!node)) - node = avc_compute_av(ssid, tsid, tclass, avd); + node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node); else memcpy(avd, &node->ae.avd, sizeof(*avd)); denied = requested & ~(avd->allowed); if (unlikely(denied)) - rc = avc_denied(ssid, tsid, tclass, requested, flags, avd); + rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd); rcu_read_unlock(); return rc; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 692e3cc8ce23..a049b7216270 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3216,6 +3216,46 @@ static void selinux_file_free_security(struct file *file) file_free_security(file); } +/* + * Check whether a task has the ioctl permission and cmd + * operation to an inode. 
+ */ +int ioctl_has_perm(const struct cred *cred, struct file *file, + u32 requested, u16 cmd) +{ + struct common_audit_data ad; + struct file_security_struct *fsec = file->f_security; + struct inode *inode = file_inode(file); + struct inode_security_struct *isec = inode->i_security; + struct lsm_ioctlop_audit ioctl; + u32 ssid = cred_sid(cred); + int rc; + u8 driver = cmd >> 8; + u8 xperm = cmd & 0xff; + + ad.type = LSM_AUDIT_DATA_IOCTL_OP; + ad.u.op = &ioctl; + ad.u.op->cmd = cmd; + ad.u.op->path = file->f_path; + + if (ssid != fsec->sid) { + rc = avc_has_perm(ssid, fsec->sid, + SECCLASS_FD, + FD__USE, + &ad); + if (rc) + goto out; + } + + if (unlikely(IS_PRIVATE(inode))) + return 0; + + rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass, + requested, driver, xperm, &ad); +out: + return rc; +} + static int selinux_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3258,7 +3298,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd, * to the file's ioctl() function. */ default: - error = file_has_perm(cred, file, FILE__IOCTL); + error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd); } return error; } diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index ddf8eec03f21..db12ff14277b 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -142,6 +142,7 @@ static inline int avc_audit(u32 ssid, u32 tsid, } #define AVC_STRICT 1 /* Ignore permissive mode. 
*/ +#define AVC_EXTENDED_PERMS 2 /* update extended permissions */ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, unsigned flags, @@ -151,6 +152,10 @@ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct common_audit_data *auditdata); +int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested, + u8 driver, u8 perm, struct common_audit_data *ad); + + u32 avc_policy_seqno(void); #define AVC_CALLBACK_GRANT 1 @@ -161,6 +166,7 @@ u32 avc_policy_seqno(void); #define AVC_CALLBACK_AUDITALLOW_DISABLE 32 #define AVC_CALLBACK_AUDITDENY_ENABLE 64 #define AVC_CALLBACK_AUDITDENY_DISABLE 128 +#define AVC_CALLBACK_ADD_XPERMS 256 int avc_add_callback(int (*callback)(u32 event), u32 events); diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 36993ad1c067..6a681d26bf20 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -35,13 +35,14 @@ #define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS 27 #define POLICYDB_VERSION_DEFAULT_TYPE 28 #define POLICYDB_VERSION_CONSTRAINT_NAMES 29 +#define POLICYDB_VERSION_XPERMS_IOCTL 30 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_CONSTRAINT_NAMES +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_XPERMS_IOCTL #endif /* Mask for just the mount related flags */ @@ -109,11 +110,38 @@ struct av_decision { u32 flags; }; +#define XPERMS_ALLOWED 1 +#define XPERMS_AUDITALLOW 2 +#define XPERMS_DONTAUDIT 4 + +#define security_xperm_set(perms, x) (perms[x >> 5] |= 1 << (x & 0x1f)) +#define security_xperm_test(perms, x) (1 & (perms[x >> 5] >> (x & 0x1f))) +struct extended_perms_data { + u32 p[8]; +}; + +struct extended_perms_decision { + u8 used; + u8 driver; + struct extended_perms_data *allowed; + struct 
extended_perms_data *auditallow; + struct extended_perms_data *dontaudit; +}; + +struct extended_perms { + u16 len; /* length associated decision chain */ + struct extended_perms_data drivers; /* flag drivers that are used */ +}; + /* definitions of av_decision.flags */ #define AVD_FLAGS_PERMISSIVE 0x0001 void security_compute_av(u32 ssid, u32 tsid, - u16 tclass, struct av_decision *avd); + u16 tclass, struct av_decision *avd, + struct extended_perms *xperms); + +void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass, + u8 driver, struct extended_perms_decision *xpermd); void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd); diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index b64f2772b030..3628d3a868b6 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -24,6 +24,7 @@ #include "policydb.h" static struct kmem_cache *avtab_node_cachep; +static struct kmem_cache *avtab_xperms_cachep; /* Based on MurmurHash3, written by Austin Appleby and placed in the * public domain. 
@@ -70,11 +71,24 @@ avtab_insert_node(struct avtab *h, int hvalue, struct avtab_key *key, struct avtab_datum *datum) { struct avtab_node *newnode; + struct avtab_extended_perms *xperms; newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL); if (newnode == NULL) return NULL; newnode->key = *key; - newnode->datum = *datum; + + if (key->specified & AVTAB_XPERMS) { + xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL); + if (xperms == NULL) { + kmem_cache_free(avtab_node_cachep, newnode); + return NULL; + } + *xperms = *(datum->u.xperms); + newnode->datum.u.xperms = xperms; + } else { + newnode->datum.u.data = datum->u.data; + } + if (prev) { newnode->next = prev->next; prev->next = newnode; @@ -107,8 +121,12 @@ static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_dat if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && - (specified & cur->key.specified)) + (specified & cur->key.specified)) { + /* extended perms may not be unique */ + if (specified & AVTAB_XPERMS) + break; return -EEXIST; + } if (key->source_type < cur->key.source_type) break; if (key->source_type == cur->key.source_type && @@ -271,6 +289,9 @@ void avtab_destroy(struct avtab *h) while (cur) { temp = cur; cur = cur->next; + if (temp->key.specified & AVTAB_XPERMS) + kmem_cache_free(avtab_xperms_cachep, + temp->datum.u.xperms); kmem_cache_free(avtab_node_cachep, temp); } } @@ -359,7 +380,10 @@ static uint16_t spec_order[] = { AVTAB_AUDITALLOW, AVTAB_TRANSITION, AVTAB_CHANGE, - AVTAB_MEMBER + AVTAB_MEMBER, + AVTAB_XPERMS_ALLOWED, + AVTAB_XPERMS_AUDITALLOW, + AVTAB_XPERMS_DONTAUDIT }; int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, @@ -369,10 +393,11 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, { __le16 buf16[4]; u16 enabled; - __le32 buf32[7]; u32 items, items2, val, vers = pol->policyvers; struct avtab_key key; struct avtab_datum datum; + 
struct avtab_extended_perms xperms; + __le32 buf32[ARRAY_SIZE(xperms.perms.p)]; int i, rc; unsigned set; @@ -429,11 +454,15 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, printk(KERN_ERR "SELinux: avtab: entry has both access vectors and types\n"); return -EINVAL; } + if (val & AVTAB_XPERMS) { + printk(KERN_ERR "SELinux: avtab: entry has extended permissions\n"); + return -EINVAL; + } for (i = 0; i < ARRAY_SIZE(spec_order); i++) { if (val & spec_order[i]) { key.specified = spec_order[i] | enabled; - datum.data = le32_to_cpu(buf32[items++]); + datum.u.data = le32_to_cpu(buf32[items++]); rc = insertf(a, &key, &datum, p); if (rc) return rc; @@ -476,14 +505,42 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, return -EINVAL; } - rc = next_entry(buf32, fp, sizeof(u32)); - if (rc) { - printk(KERN_ERR "SELinux: avtab: truncated entry\n"); - return rc; + if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) && + (key.specified & AVTAB_XPERMS)) { + printk(KERN_ERR "SELinux: avtab: policy version %u does not " + "support extended permissions rules and one " + "was specified\n", vers); + return -EINVAL; + } else if (key.specified & AVTAB_XPERMS) { + memset(&xperms, 0, sizeof(struct avtab_extended_perms)); + rc = next_entry(&xperms.specified, fp, sizeof(u8)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + rc = next_entry(&xperms.driver, fp, sizeof(u8)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + rc = next_entry(buf32, fp, sizeof(u32)*ARRAY_SIZE(xperms.perms.p)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++) + xperms.perms.p[i] = le32_to_cpu(buf32[i]); + datum.u.xperms = &xperms; + } else { + rc = next_entry(buf32, fp, sizeof(u32)); + if (rc) { + printk(KERN_ERR "SELinux: avtab: truncated entry\n"); + return rc; + } + datum.u.data = le32_to_cpu(*buf32); } - datum.data = 
le32_to_cpu(*buf32); if ((key.specified & AVTAB_TYPE) && - !policydb_type_isvalid(pol, datum.data)) { + !policydb_type_isvalid(pol, datum.u.data)) { printk(KERN_ERR "SELinux: avtab: invalid type\n"); return -EINVAL; } @@ -543,8 +600,9 @@ bad: int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) { __le16 buf16[4]; - __le32 buf32[1]; + __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)]; int rc; + unsigned int i; buf16[0] = cpu_to_le16(cur->key.source_type); buf16[1] = cpu_to_le16(cur->key.target_type); @@ -553,8 +611,22 @@ int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) rc = put_entry(buf16, sizeof(u16), 4, fp); if (rc) return rc; - buf32[0] = cpu_to_le32(cur->datum.data); - rc = put_entry(buf32, sizeof(u32), 1, fp); + + if (cur->key.specified & AVTAB_XPERMS) { + rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp); + if (rc) + return rc; + rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp); + if (rc) + return rc; + for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++) + buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]); + rc = put_entry(buf32, sizeof(u32), + ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp); + } else { + buf32[0] = cpu_to_le32(cur->datum.u.data); + rc = put_entry(buf32, sizeof(u32), 1, fp); + } if (rc) return rc; return 0; @@ -588,9 +660,13 @@ void avtab_cache_init(void) avtab_node_cachep = kmem_cache_create("avtab_node", sizeof(struct avtab_node), 0, SLAB_PANIC, NULL); + avtab_xperms_cachep = kmem_cache_create("avtab_extended_perms", + sizeof(struct avtab_extended_perms), + 0, SLAB_PANIC, NULL); } void avtab_cache_destroy(void) { kmem_cache_destroy(avtab_node_cachep); + kmem_cache_destroy(avtab_xperms_cachep); } diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h index adb451cd44f9..d946c9dc3c9c 100644 --- a/security/selinux/ss/avtab.h +++ b/security/selinux/ss/avtab.h @@ -23,6 +23,7 @@ #ifndef _SS_AVTAB_H_ #define _SS_AVTAB_H_ +#include 
"security.h" #include <linux/flex_array.h> struct avtab_key { @@ -37,13 +38,43 @@ struct avtab_key { #define AVTAB_MEMBER 0x0020 #define AVTAB_CHANGE 0x0040 #define AVTAB_TYPE (AVTAB_TRANSITION | AVTAB_MEMBER | AVTAB_CHANGE) +/* extended permissions */ +#define AVTAB_XPERMS_ALLOWED 0x0100 +#define AVTAB_XPERMS_AUDITALLOW 0x0200 +#define AVTAB_XPERMS_DONTAUDIT 0x0400 +#define AVTAB_XPERMS (AVTAB_XPERMS_ALLOWED | \ + AVTAB_XPERMS_AUDITALLOW | \ + AVTAB_XPERMS_DONTAUDIT) #define AVTAB_ENABLED_OLD 0x80000000 /* reserved for used in cond_avtab */ #define AVTAB_ENABLED 0x8000 /* reserved for used in cond_avtab */ u16 specified; /* what field is specified */ }; +/* + * For operations that require more than the 32 permissions provided by the avc, + * extended permissions may be used to provide 256 bits of permissions. + */ +struct avtab_extended_perms { +/* These are not flags. All 256 values may be used */ +#define AVTAB_XPERMS_IOCTLFUNCTION 0x01 +#define AVTAB_XPERMS_IOCTLDRIVER 0x02 + /* extension of the avtab_key specified */ + u8 specified; /* ioctl, netfilter, ... */ + /* + * if 256 bits is not adequate, as is often the case with ioctls, then + * multiple extended perms may be used and the driver field + * specifies which permissions are included. 
+ */ + u8 driver; + /* 256 bits of permissions */ + struct extended_perms_data perms; +}; + struct avtab_datum { - u32 data; /* access vector or type value */ + union { + u32 data; /* access vector or type value */ + struct avtab_extended_perms *xperms; + } u; }; struct avtab_node { diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index 62c6773be0b7..18643bf9894d 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -15,6 +15,7 @@ #include "security.h" #include "conditional.h" +#include "services.h" /* * cond_evaluate_expr evaluates a conditional expr @@ -612,21 +613,39 @@ int cond_write_list(struct policydb *p, struct cond_node *list, void *fp) return 0; } -/* Determine whether additional permissions are granted by the conditional - * av table, and if so, add them to the result - */ -void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd) + +void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, + struct extended_perms_decision *xpermd) { struct avtab_node *node; - if (!ctab || !key || !avd) + if (!ctab || !key || !xpermd) + return; + + for (node = avtab_search_node(ctab, key); node; + node = avtab_search_node_next(node, key->specified)) { + if (node->key.specified & AVTAB_ENABLED) + services_compute_xperms_decision(xpermd, node); + } + return; + +} +/* Determine whether additional permissions are granted by the conditional + * av table, and if so, add them to the result + */ +void cond_compute_av(struct avtab *ctab, struct avtab_key *key, + struct av_decision *avd, struct extended_perms *xperms) +{ + struct avtab_node *node; + + if (!ctab || !key || !avd || !xperms) return; for (node = avtab_search_node(ctab, key); node; node = avtab_search_node_next(node, key->specified)) { if ((u16)(AVTAB_ALLOWED|AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED))) - avd->allowed |= node->datum.data; + avd->allowed |= node->datum.u.data; if 
((u16)(AVTAB_AUDITDENY|AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITDENY|AVTAB_ENABLED))) /* Since a '0' in an auditdeny mask represents a @@ -634,10 +653,13 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi * the '&' operand to ensure that all '0's in the mask * are retained (much unlike the allow and auditallow cases). */ - avd->auditdeny &= node->datum.data; + avd->auditdeny &= node->datum.u.data; if ((u16)(AVTAB_AUDITALLOW|AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITALLOW|AVTAB_ENABLED))) - avd->auditallow |= node->datum.data; + avd->auditallow |= node->datum.u.data; + if ((node->key.specified & AVTAB_ENABLED) && + (node->key.specified & AVTAB_XPERMS)) + services_compute_xperms_drivers(xperms, node); } return; } diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h index 4d1f87466508..ddb43e7e1c75 100644 --- a/security/selinux/ss/conditional.h +++ b/security/selinux/ss/conditional.h @@ -73,8 +73,10 @@ int cond_read_list(struct policydb *p, void *fp); int cond_write_bool(void *key, void *datum, void *ptr); int cond_write_list(struct policydb *p, struct cond_node *list, void *fp); -void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd); - +void cond_compute_av(struct avtab *ctab, struct avtab_key *key, + struct av_decision *avd, struct extended_perms *xperms); +void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key, + struct extended_perms_decision *xpermd); int evaluate_cond_node(struct policydb *p, struct cond_node *node); #endif /* _CONDITIONAL_H_ */ diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 74aa224267c1..992a31530825 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -148,6 +148,11 @@ static struct policydb_compat_info policydb_compat[] = { .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, + { + .version = POLICYDB_VERSION_XPERMS_IOCTL, + .sym_num = SYM_NUM, + 
.ocon_num = OCON_NUM, + }, }; static struct policydb_compat_info *policydb_lookup_compat(int version) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 9e2d82070915..b7df12ba61d8 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -93,9 +93,10 @@ static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len); static void context_struct_compute_av(struct context *scontext, - struct context *tcontext, - u16 tclass, - struct av_decision *avd); + struct context *tcontext, + u16 tclass, + struct av_decision *avd, + struct extended_perms *xperms); struct selinux_mapping { u16 value; /* policy value */ @@ -565,7 +566,8 @@ static void type_attribute_bounds_av(struct context *scontext, context_struct_compute_av(&lo_scontext, tcontext, tclass, - &lo_avd); + &lo_avd, + NULL); if ((lo_avd.allowed & avd->allowed) == avd->allowed) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; @@ -580,7 +582,8 @@ static void type_attribute_bounds_av(struct context *scontext, context_struct_compute_av(scontext, &lo_tcontext, tclass, - &lo_avd); + &lo_avd, + NULL); if ((lo_avd.allowed & avd->allowed) == avd->allowed) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; @@ -596,7 +599,8 @@ static void type_attribute_bounds_av(struct context *scontext, context_struct_compute_av(&lo_scontext, &lo_tcontext, tclass, - &lo_avd); + &lo_avd, + NULL); if ((lo_avd.allowed & avd->allowed) == avd->allowed) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; @@ -613,13 +617,39 @@ static void type_attribute_bounds_av(struct context *scontext, } /* - * Compute access vectors based on a context structure pair for - * the permissions in a particular class. 
+ * flag which drivers have permissions + * only looking for ioctl based extended permissions + */ +void services_compute_xperms_drivers( + struct extended_perms *xperms, + struct avtab_node *node) +{ + unsigned int i; + + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + /* if one or more driver has all permissions allowed */ + for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) + xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; + } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + /* if allowing permissions within a driver */ + security_xperm_set(xperms->drivers.p, + node->datum.u.xperms->driver); + } + + /* If no ioctl commands are allowed, ignore auditallow and auditdeny */ + if (node->key.specified & AVTAB_XPERMS_ALLOWED) + xperms->len = 1; +} + +/* + * Compute access vectors and extended permissions based on a context + * structure pair for the permissions in a particular class. */ static void context_struct_compute_av(struct context *scontext, - struct context *tcontext, - u16 tclass, - struct av_decision *avd) + struct context *tcontext, + u16 tclass, + struct av_decision *avd, + struct extended_perms *xperms) { struct constraint_node *constraint; struct role_allow *ra; @@ -633,6 +663,10 @@ static void context_struct_compute_av(struct context *scontext, avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; + if (xperms) { + memset(&xperms->drivers, 0, sizeof(xperms->drivers)); + xperms->len = 0; + } if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) { if (printk_ratelimit()) @@ -647,7 +681,7 @@ static void context_struct_compute_av(struct context *scontext, * this permission check, then use it. 
*/ avkey.target_class = tclass; - avkey.specified = AVTAB_AV; + avkey.specified = AVTAB_AV | AVTAB_XPERMS; sattr = flex_array_get(policydb.type_attr_map_array, scontext->type - 1); BUG_ON(!sattr); tattr = flex_array_get(policydb.type_attr_map_array, tcontext->type - 1); @@ -660,15 +694,18 @@ static void context_struct_compute_av(struct context *scontext, node; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) - avd->allowed |= node->datum.data; + avd->allowed |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITALLOW) - avd->auditallow |= node->datum.data; + avd->auditallow |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITDENY) - avd->auditdeny &= node->datum.data; + avd->auditdeny &= node->datum.u.data; + else if (xperms && (node->key.specified & AVTAB_XPERMS)) + services_compute_xperms_drivers(xperms, node); } /* Check conditional av table for additional permissions */ - cond_compute_av(&policydb.te_cond_avtab, &avkey, avd); + cond_compute_av(&policydb.te_cond_avtab, &avkey, + avd, xperms); } } @@ -899,6 +936,139 @@ static void avd_init(struct av_decision *avd) avd->flags = 0; } +void services_compute_xperms_decision(struct extended_perms_decision *xpermd, + struct avtab_node *node) +{ + unsigned int i; + + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + if (xpermd->driver != node->datum.u.xperms->driver) + return; + } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + if (!security_xperm_test(node->datum.u.xperms->perms.p, + xpermd->driver)) + return; + } else { + BUG(); + } + + if (node->key.specified == AVTAB_XPERMS_ALLOWED) { + xpermd->used |= XPERMS_ALLOWED; + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + memset(xpermd->allowed->p, 0xff, + sizeof(xpermd->allowed->p)); + } + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++) + 
xpermd->allowed->p[i] |= + node->datum.u.xperms->perms.p[i]; + } + } else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) { + xpermd->used |= XPERMS_AUDITALLOW; + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + memset(xpermd->auditallow->p, 0xff, + sizeof(xpermd->auditallow->p)); + } + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++) + xpermd->auditallow->p[i] |= + node->datum.u.xperms->perms.p[i]; + } + } else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) { + xpermd->used |= XPERMS_DONTAUDIT; + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { + memset(xpermd->dontaudit->p, 0xff, + sizeof(xpermd->dontaudit->p)); + } + if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { + for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++) + xpermd->dontaudit->p[i] |= + node->datum.u.xperms->perms.p[i]; + } + } else { + BUG(); + } +} + +void security_compute_xperms_decision(u32 ssid, + u32 tsid, + u16 orig_tclass, + u8 driver, + struct extended_perms_decision *xpermd) +{ + u16 tclass; + struct context *scontext, *tcontext; + struct avtab_key avkey; + struct avtab_node *node; + struct ebitmap *sattr, *tattr; + struct ebitmap_node *snode, *tnode; + unsigned int i, j; + + xpermd->driver = driver; + xpermd->used = 0; + memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); + memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p)); + memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); + + read_lock(&policy_rwlock); + if (!ss_initialized) + goto allow; + + scontext = sidtab_search(&sidtab, ssid); + if (!scontext) { + printk(KERN_ERR "SELinux: %s: unrecognized SID %d\n", + __func__, ssid); + goto out; + } + + tcontext = sidtab_search(&sidtab, tsid); + if (!tcontext) { + printk(KERN_ERR "SELinux: %s: unrecognized SID %d\n", + __func__, tsid); + goto out; + } + + tclass = unmap_class(orig_tclass); + if (unlikely(orig_tclass 
&& !tclass)) { + if (policydb.allow_unknown) + goto allow; + goto out; + } + + + if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) { + pr_warn_ratelimited("SELinux: Invalid class %hu\n", tclass); + goto out; + } + + avkey.target_class = tclass; + avkey.specified = AVTAB_XPERMS; + sattr = flex_array_get(policydb.type_attr_map_array, + scontext->type - 1); + BUG_ON(!sattr); + tattr = flex_array_get(policydb.type_attr_map_array, + tcontext->type - 1); + BUG_ON(!tattr); + ebitmap_for_each_positive_bit(sattr, snode, i) { + ebitmap_for_each_positive_bit(tattr, tnode, j) { + avkey.source_type = i + 1; + avkey.target_type = j + 1; + for (node = avtab_search_node(&policydb.te_avtab, &avkey); + node; + node = avtab_search_node_next(node, avkey.specified)) + services_compute_xperms_decision(xpermd, node); + + cond_compute_xperms(&policydb.te_cond_avtab, + &avkey, xpermd); + } + } +out: + read_unlock(&policy_rwlock); + return; +allow: + memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); + goto out; +} /** * security_compute_av - Compute access vector decisions. @@ -906,6 +1076,7 @@ static void avd_init(struct av_decision *avd) * @tsid: target security identifier * @tclass: target security class * @avd: access vector decisions + * @xperms: extended permissions * * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. 
@@ -913,13 +1084,15 @@ static void avd_init(struct av_decision *avd) void security_compute_av(u32 ssid, u32 tsid, u16 orig_tclass, - struct av_decision *avd) + struct av_decision *avd, + struct extended_perms *xperms) { u16 tclass; struct context *scontext = NULL, *tcontext = NULL; read_lock(&policy_rwlock); avd_init(avd); + xperms->len = 0; if (!ss_initialized) goto allow; @@ -947,7 +1120,7 @@ void security_compute_av(u32 ssid, goto allow; goto out; } - context_struct_compute_av(scontext, tcontext, tclass, avd); + context_struct_compute_av(scontext, tcontext, tclass, avd, xperms); map_decision(orig_tclass, avd, policydb.allow_unknown); out: read_unlock(&policy_rwlock); @@ -993,7 +1166,7 @@ void security_compute_av_user(u32 ssid, goto out; } - context_struct_compute_av(scontext, tcontext, tclass, avd); + context_struct_compute_av(scontext, tcontext, tclass, avd, NULL); out: read_unlock(&policy_rwlock); return; @@ -1515,7 +1688,7 @@ static int security_compute_sid(u32 ssid, if (avdatum) { /* Use the type from the type transition/member/change rule. 
*/ - newcontext.type = avdatum->data; + newcontext.type = avdatum->u.data; } /* if we have a objname this is a file trans check so check those rules */ diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h index e8d907e903cd..6abcd8729ec3 100644 --- a/security/selinux/ss/services.h +++ b/security/selinux/ss/services.h @@ -11,5 +11,11 @@ extern struct policydb policydb; +void services_compute_xperms_drivers(struct extended_perms *xperms, + struct avtab_node *node); + +void services_compute_xperms_decision(struct extended_perms_decision *xpermd, + struct avtab_node *node); + #endif /* _SS_SERVICES_H_ */ From 9629d04ae06812f217846b69728c969afee690b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 10 Jul 2015 17:19:56 -0400 Subject: [PATCH 0021/1466] selinux: reduce locking overhead in inode_free_security() The inode_free_security() function just took the superblock's isec_lock before checking and trying to remove the inode security struct from the linked list. In many cases, the list was empty and so the lock taking is wasteful as no useful work is done. On multi-socket systems with a large number of CPUs, there can also be a fair amount of spinlock contention on the isec_lock if many tasks are exiting at the same time. This patch changes the code to check the state of the list first before taking the lock and attempting to dequeue it. The list_del_init() can be called more than once on the same list with no harm as long as they are properly serialized. It should not be possible to have inode_free_security() called concurrently with list_add(). For better safety, however, we use list_empty_careful() here even though it is still not completely safe in case that happens. 
Signed-off-by: Waiman Long Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a049b7216270..4de09f0227b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -254,10 +254,21 @@ static void inode_free_security(struct inode *inode) struct inode_security_struct *isec = inode->i_security; struct superblock_security_struct *sbsec = inode->i_sb->s_security; - spin_lock(&sbsec->isec_lock); - if (!list_empty(&isec->list)) + /* + * As not all inode security structures are in a list, we check for + * empty list outside of the lock to make sure that we won't waste + * time taking a lock doing nothing. + * + * The list_del_init() function can be safely called more than once. + * It should not be possible for this function to be called with + * concurrent list_add(), but for better safety against future changes + * in the code, we use list_empty_careful() here. + */ + if (!list_empty_careful(&isec->list)) { + spin_lock(&sbsec->isec_lock); list_del_init(&isec->list); - spin_unlock(&sbsec->isec_lock); + spin_unlock(&sbsec->isec_lock); + } /* * The inode may still be referenced in a path walk and From 5dee25d08eac01472904b0ab32ce35edee5c0518 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 10 Jul 2015 17:19:57 -0400 Subject: [PATCH 0022/1466] selinux: initialize sock security class to default value Initialize the security class of sock security structures to the generic socket class. This is similar to what is already done in inode_alloc_security for files. Generally the sclass field will later be set by socket_post_create or sk_clone or sock_graft, but for protocol implementations that fail to call any of these for newly accepted sockets, we want some sane default that will yield a legitimate avc denied message with non-garbage values for class and permission.
Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4de09f0227b4..ef310f82717d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4559,6 +4559,7 @@ static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority sksec->peer_sid = SECINITSID_UNLABELED; sksec->sid = SECINITSID_UNLABELED; + sksec->sclass = SECCLASS_SOCKET; selinux_netlbl_sk_security_reset(sksec); sk->sk_security = sksec; From bd1741f4cf05d7709348f591d16eeb5f786de673 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 10 Jul 2015 17:19:57 -0400 Subject: [PATCH 0023/1466] selinux: Augment BUG_ON assertion for secclass_map. Ensure that we catch any cases where tclass == 0. Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/avc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 2d5e1b04cd50..324acc62f7e0 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -116,6 +116,7 @@ static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av) return; } + BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)); perms = secclass_map[tclass-1].perms; audit_log_format(ab, " {"); @@ -164,7 +165,7 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla kfree(scontext); } - BUG_ON(tclass >= ARRAY_SIZE(secclass_map)); + BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)); audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name); } From c3c188b2c3ed29effe8693672ee1c84184103b4e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 10 Jul 2015 17:19:58 -0400 Subject: [PATCH 0024/1466] selinux: Create a common helper to determine an inode label [ver #3] Create a common helper function to determine the label for a new inode. 
This is then used by: - may_create() - selinux_dentry_init_security() - selinux_inode_init_security() This will change the behaviour of the functions slightly, bringing them all into line. Suggested-by: Stephen Smalley Signed-off-by: David Howells Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/hooks.c | 87 +++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ef310f82717d..f4be0a110788 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1709,6 +1709,32 @@ out: return rc; } +/* + * Determine the label for an inode that might be unioned. + */ +static int selinux_determine_inode_label(const struct inode *dir, + const struct qstr *name, + u16 tclass, + u32 *_new_isid) +{ + const struct superblock_security_struct *sbsec = dir->i_sb->s_security; + const struct inode_security_struct *dsec = dir->i_security; + const struct task_security_struct *tsec = current_security(); + + if ((sbsec->flags & SE_SBINITIALIZED) && + (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { + *_new_isid = sbsec->mntpoint_sid; + } else if ((sbsec->flags & SBLABEL_MNT) && + tsec->create_sid) { + *_new_isid = tsec->create_sid; + } else { + return security_transition_sid(tsec->sid, dsec->sid, tclass, + name, _new_isid); + } + + return 0; +} + /* Check whether a task can create a file. 
*/ static int may_create(struct inode *dir, struct dentry *dentry, @@ -1725,7 +1751,6 @@ static int may_create(struct inode *dir, sbsec = dir->i_sb->s_security; sid = tsec->sid; - newsid = tsec->create_sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; @@ -1736,12 +1761,10 @@ static int may_create(struct inode *dir, if (rc) return rc; - if (!newsid || !(sbsec->flags & SBLABEL_MNT)) { - rc = security_transition_sid(sid, dsec->sid, tclass, - &dentry->d_name, &newsid); - if (rc) - return rc; - } + rc = selinux_determine_inode_label(dir, &dentry->d_name, tclass, + &newsid); + if (rc) + return rc; rc = avc_has_perm(sid, newsid, tclass, FILE__CREATE, &ad); if (rc) @@ -2715,32 +2738,14 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, struct qstr *name, void **ctx, u32 *ctxlen) { - const struct cred *cred = current_cred(); - struct task_security_struct *tsec; - struct inode_security_struct *dsec; - struct superblock_security_struct *sbsec; - struct inode *dir = d_backing_inode(dentry->d_parent); u32 newsid; int rc; - tsec = cred->security; - dsec = dir->i_security; - sbsec = dir->i_sb->s_security; - - if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) { - newsid = tsec->create_sid; - } else { - rc = security_transition_sid(tsec->sid, dsec->sid, - inode_mode_to_security_class(mode), - name, - &newsid); - if (rc) { - printk(KERN_WARNING - "%s: security_transition_sid failed, rc=%d\n", - __func__, -rc); - return rc; - } - } + rc = selinux_determine_inode_label(d_inode(dentry->d_parent), name, + inode_mode_to_security_class(mode), + &newsid); + if (rc) + return rc; return security_sid_to_context(newsid, (char **)ctx, ctxlen); } @@ -2763,22 +2768,12 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, sid = tsec->sid; newsid = tsec->create_sid; - if ((sbsec->flags & SE_SBINITIALIZED) && - (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) - newsid = sbsec->mntpoint_sid; - else if (!newsid || 
!(sbsec->flags & SBLABEL_MNT)) { - rc = security_transition_sid(sid, dsec->sid, - inode_mode_to_security_class(inode->i_mode), - qstr, &newsid); - if (rc) { - printk(KERN_WARNING "%s: " - "security_transition_sid failed, rc=%d (dev=%s " - "ino=%ld)\n", - __func__, - -rc, inode->i_sb->s_id, inode->i_ino); - return rc; - } - } + rc = selinux_determine_inode_label( + dir, qstr, + inode_mode_to_security_class(inode->i_mode), + &newsid); + if (rc) + return rc; /* Possibly defer initialization to selinux_complete_init. */ if (sbsec->flags & SE_SBINITIALIZED) { From fda4d578ed0a7e1d116f56a15efea0e4ba78acad Mon Sep 17 00:00:00 2001 From: Laurent Bigonville Date: Tue, 7 Jul 2015 23:10:52 +0200 Subject: [PATCH 0025/1466] selinux: explicitly declare the role "base_r" This fixes the compilation of policy generated by mdp with the recent version of checkpolicy. Signed-off-by: Laurent Bigonville Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- scripts/selinux/mdp/mdp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c index 62b34ce1f50d..e10beb11b696 100644 --- a/scripts/selinux/mdp/mdp.c +++ b/scripts/selinux/mdp/mdp.c @@ -98,6 +98,7 @@ int main(int argc, char *argv[]) /* types, roles, and allows */ fprintf(fout, "type base_t;\n"); + fprintf(fout, "role base_r;\n"); fprintf(fout, "role base_r types { base_t };\n"); for (i = 0; secclass_map[i].name; i++) fprintf(fout, "allow base_t base_t:%s *;\n", From 7ce0c22b07871402f9fb8939e09fb4c10811438a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 14 Jul 2015 09:32:32 +0100 Subject: [PATCH 0026/1466] metag/irq: Use access helper irq_data_get_affinity_mask() This is a preparatory patch for moving irq_data struct members. 
Signed-off-by: Jiang Liu Signed-off-by: Thomas Gleixner Signed-off-by: James Hogan --- arch/metag/kernel/irq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/metag/kernel/irq.c b/arch/metag/kernel/irq.c index 4f8f1f87ef11..a336094a7a6c 100644 --- a/arch/metag/kernel/irq.c +++ b/arch/metag/kernel/irq.c @@ -270,23 +270,25 @@ void migrate_irqs(void) for_each_active_irq(i) { struct irq_data *data = irq_get_irq_data(i); + struct cpumask *mask; unsigned int newcpu; if (irqd_is_per_cpu(data)) continue; - if (!cpumask_test_cpu(cpu, data->affinity)) + mask = irq_data_get_affinity_mask(data); + if (!cpumask_test_cpu(cpu, mask)) continue; - newcpu = cpumask_any_and(data->affinity, cpu_online_mask); + newcpu = cpumask_any_and(mask, cpu_online_mask); if (newcpu >= nr_cpu_ids) { pr_info_ratelimited("IRQ%u no longer affine to CPU%u\n", i, cpu); - cpumask_setall(data->affinity); + cpumask_setall(mask); } - irq_set_affinity(i, data->affinity); + irq_set_affinity(i, mask); } } #endif /* CONFIG_HOTPLUG_CPU */ From fa3eec7791b0fe27e3112804a71ba445ff336a6b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 1 Jul 2015 23:51:43 +0100 Subject: [PATCH 0027/1466] regmap: Silence warning on invalid zero length read Zero length reads make no sense in a regmap context and are likely to trigger bugs further down the stack so insert an error check, also silencing compiler warnings about use of ret in cases where we iterate per register. 
Reported-by: Russell King Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 8894b992043e..9c1f856842a3 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -2180,6 +2180,8 @@ int regmap_raw_read(struct regmap *map, unsigned int reg, void *val, return -EINVAL; if (reg % map->reg_stride) return -EINVAL; + if (val_count == 0) + return -EINVAL; map->lock(map->lock_arg); From 8225d3853f34f6cf9caff15d8c385a528e0d7cb1 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 21 Nov 2014 10:06:01 -0500 Subject: [PATCH 0028/1466] seccomp: Replace smp_read_barrier_depends() with lockless_dereference() Recently lockless_dereference() was added which can be used in place of hard-coding smp_read_barrier_depends(). The following PATCH makes the change. Signed-off-by: Pranith Kumar Signed-off-by: Kees Cook --- kernel/seccomp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4f44028943e6..980fd26da22e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) */ static u32 seccomp_run_filters(struct seccomp_data *sd) { - struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; + /* Make sure cross-thread synced filter points somewhere sane. */ + struct seccomp_filter *f = + lockless_dereference(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) return SECCOMP_RET_KILL; - /* Make sure cross-thread synced filter points somewhere sane. 
*/ - smp_read_barrier_depends(); - if (!sd) { populate_seccomp_data(&sd_local); sd = &sd_local; From 13c4a90119d28cfcb6b5bdd820c233b86c2b0237 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Sat, 13 Jun 2015 09:02:48 -0600 Subject: [PATCH 0029/1466] seccomp: add ptrace options for suspend/resume This patch is the first step in enabling checkpoint/restore of processes with seccomp enabled. One of the things CRIU does while dumping tasks is inject code into them via ptrace to collect information that is only available to the process itself. However, if we are in a seccomp mode where these processes are prohibited from making these syscalls, then what CRIU does kills the task. This patch adds a new ptrace option, PTRACE_O_SUSPEND_SECCOMP, that enables a task from the init user namespace which has CAP_SYS_ADMIN and no seccomp filters to disable (and re-enable) seccomp filters for another task so that they can be successfully dumped (and restored). We restrict the set of processes that can disable seccomp through ptrace because although today ptrace can be used to bypass seccomp, there is some discussion of closing this loophole in the future and we would like this patch to not depend on that behavior and be future proofed for when it is removed. Note that seccomp can be suspended before any filters are actually installed; this behavior is useful on criu restore, so that we can suspend seccomp, restore the filters, unmap our restore code from the restored process' address space, and then resume the task by detaching and have the filters resumed as well. v2 changes: * require that the tracer have no seccomp filters installed * drop TIF_NOTSC manipulation from the patch * change from ptrace command to a ptrace option and use this ptrace option as the flag to check. This means that as soon as the tracer detaches/dies, seccomp is re-enabled and as a corollary that one can not disable seccomp across PTRACE_ATTACHs.
v3 changes: * get rid of various #ifdefs everywhere * report more sensible errors when PTRACE_O_SUSPEND_SECCOMP is incorrectly used v4 changes: * get rid of may_suspend_seccomp() in favor of a capable() check in ptrace directly v5 changes: * check that seccomp is not enabled (or suspended) on the tracer Signed-off-by: Tycho Andersen CC: Will Drewry CC: Roland McGrath CC: Pavel Emelyanov CC: Serge E. Hallyn Acked-by: Oleg Nesterov Acked-by: Andy Lutomirski [kees: access seccomp.mode through seccomp_mode() instead] Signed-off-by: Kees Cook --- include/linux/ptrace.h | 1 + include/uapi/linux/ptrace.h | 6 ++++-- kernel/ptrace.c | 13 +++++++++++++ kernel/seccomp.c | 8 ++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 987a73a40ef8..061265f92876 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -34,6 +34,7 @@ #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) +#define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index cf1019e15f5b..a7a697986614 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args { #define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP) /* eventless options */ -#define PTRACE_O_EXITKILL (1 << 20) +#define PTRACE_O_EXITKILL (1 << 20) +#define PTRACE_O_SUSPEND_SECCOMP (1 << 21) -#define PTRACE_O_MASK (0x000000ff | PTRACE_O_EXITKILL) +#define PTRACE_O_MASK (\ + 0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP) #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c8e0e050a36a..787320de68e0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long 
data) if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; + if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { + if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || + !config_enabled(CONFIG_SECCOMP)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED || + current->ptrace & PT_SUSPEND_SECCOMP) + return -EPERM; + } + /* Avoid intermediate state when all opts are cleared */ flags = child->ptrace; flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 980fd26da22e..645e42d6fa4d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -590,6 +590,10 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return; + if (mode == 0) return; else if (mode == SECCOMP_MODE_STRICT) @@ -691,6 +695,10 @@ u32 seccomp_phase1(struct seccomp_data *sd) int this_syscall = sd ? sd->nr : syscall_get_nr(current, task_pt_regs(current)); + if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) + return SECCOMP_PHASE1_OK; + switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ From 221272f97ca528048a577a3ff23d7774286ca5fd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2015 15:29:16 -0700 Subject: [PATCH 0030/1466] seccomp: swap hard-coded zeros to defined name For clarity, if CONFIG_SECCOMP isn't defined, seccomp_mode() is returning "disabled". This makes that more clear, along with another 0-use, and results in no operational change.
Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- kernel/seccomp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index a19ddacdac30..f4265039a94c 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -78,7 +78,7 @@ static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3) static inline int seccomp_mode(struct seccomp *s) { - return 0; + return SECCOMP_MODE_DISABLED; } #endif /* CONFIG_SECCOMP */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 645e42d6fa4d..383bd6caca81 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -594,7 +594,7 @@ void secure_computing_strict(int this_syscall) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; - if (mode == 0) + if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); From 2de9d6006c190bb0f706e8404de94cd94293801f Mon Sep 17 00:00:00 2001 From: Nariman Poushin Date: Thu, 16 Jul 2015 16:36:22 +0100 Subject: [PATCH 0031/1466] regmap: Apply optional delay in multi_reg_write/register_patch Add an optional delay_us field in reg_sequence to allow the client to specify a delay (in microseconds) to be applied after any given write in a sequence of writes. 
We treat a delay in a sequence the same way we treat a page change as they are logically similar in that you can coalesce all write before a delay (in the same way you can coalesce all writes before a page change is needed) Signed-off-by: Nariman Poushin Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 54 ++++++++++++++++++++++++++++++++---- include/linux/regmap.h | 5 +++- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 2cbb4502747d..b3a5aa5cd580 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -18,6 +18,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -1807,10 +1808,12 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map, int i, n; struct reg_sequence *base; unsigned int this_page = 0; + unsigned int page_change = 0; /* * the set of registers are not neccessarily in order, but * since the order of write must be preserved this algorithm - * chops the set each time the page changes + * chops the set each time the page changes. This also applies + * if there is a delay required at any point in the sequence. */ base = regs; for (i = 0, n = 0; i < num_regs; i++, n++) { @@ -1826,16 +1829,48 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map, this_page = win_page; if (win_page != this_page) { this_page = win_page; + page_change = 1; + } + } + + /* If we have both a page change and a delay make sure to + * write the regs and apply the delay before we change the + * page. 
+ */ + + if (page_change || regs[i].delay_us) { + + /* For situations where the first write requires + * a delay we need to make sure we don't call + * raw_multi_reg_write with n=0 + * This can't occur with page breaks as we + * never write on the first iteration + */ + if (regs[i].delay_us && i == 0) + n = 1; + ret = _regmap_raw_multi_reg_write(map, base, n); if (ret != 0) return ret; + + if (regs[i].delay_us) + udelay(regs[i].delay_us); + base += n; n = 0; - } - ret = _regmap_select_page(map, &base[n].reg, range, 1); - if (ret != 0) - return ret; + + if (page_change) { + ret = _regmap_select_page(map, + &base[n].reg, + range, 1); + if (ret != 0) + return ret; + + page_change = 0; + } + } + } if (n > 0) return _regmap_raw_multi_reg_write(map, base, n); @@ -1854,6 +1889,9 @@ static int _regmap_multi_reg_write(struct regmap *map, ret = _regmap_write(map, regs[i].reg, regs[i].def); if (ret != 0) return ret; + + if (regs[i].delay_us) + udelay(regs[i].delay_us); } return 0; } @@ -1893,8 +1931,12 @@ static int _regmap_multi_reg_write(struct regmap *map, for (i = 0; i < num_regs; i++) { unsigned int reg = regs[i].reg; struct regmap_range_node *range; + + /* Coalesce all the writes between a page break or a delay + * in a sequence + */ range = _regmap_range_lookup(map, reg); - if (range) { + if (range || regs[i].delay_us) { size_t len = sizeof(struct reg_sequence)*num_regs; struct reg_sequence *base = kmemdup(regs, len, GFP_KERNEL); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c9ef2ec69142..5a7cf2136c81 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -51,14 +51,17 @@ struct reg_default { }; /** - * Register/value pairs for sequences of writes + * Register/value pairs for sequences of writes with an optional delay in + * microseconds to be applied after each write. * * @reg: Register address. * @def: Register value. 
+ * @delay_us: Delay to be applied after the register write in microseconds */ struct reg_sequence { unsigned int reg; unsigned int def; + unsigned int delay_us; }; #ifdef CONFIG_REGMAP From b2c0b2cbb282f0cf42518ffacbe197e6f2884168 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2014 23:57:13 +0100 Subject: [PATCH 0032/1466] nmi: create generic NMI backtrace implementation x86s NMI backtrace implementation (for arch_trigger_all_cpu_backtrace()) is fairly generic in nature - the only architecture specific bits are the act of raising the NMI to other CPUs, and reporting the status of the NMI handler. These are fairly simple to factor out, and produce a generic implementation which can be shared between ARM and x86. Reviewed-by: Thomas Gleixner Signed-off-by: Russell King --- include/linux/nmi.h | 6 ++ lib/Makefile | 2 +- lib/nmi_backtrace.c | 162 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 lib/nmi_backtrace.c diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f94da0e65dea..5791e3229068 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -49,6 +49,12 @@ static inline bool trigger_allbutself_cpu_backtrace(void) arch_trigger_all_cpu_backtrace(false); return true; } + +/* generic implementation */ +void nmi_trigger_all_cpu_backtrace(bool include_self, + void (*raise)(cpumask_t *mask)); +bool nmi_cpu_backtrace(struct pt_regs *regs); + #else static inline bool trigger_all_cpu_backtrace(void) { diff --git a/lib/Makefile b/lib/Makefile index 6897b527581a..392169c5bc4e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ proportions.o flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o + earlycpio.o seq_buf.o nmi_backtrace.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o 
lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c new file mode 100644 index 000000000000..88d3d32e5923 --- /dev/null +++ b/lib/nmi_backtrace.c @@ -0,0 +1,162 @@ +/* + * NMI backtrace support + * + * Gratuitously copied from arch/x86/kernel/apic/hw_nmi.c by Russell King, + * with the following header: + * + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + */ +#include +#include +#include +#include +#include + +#ifdef arch_trigger_all_cpu_backtrace +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +static cpumask_t printtrace_mask; + +#define NMI_BUF_SIZE 4096 + +struct nmi_seq_buf { + unsigned char buffer[NMI_BUF_SIZE]; + struct seq_buf seq; +}; + +/* Safe printing in NMI context */ +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; + +static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +void nmi_trigger_all_cpu_backtrace(bool include_self, + void (*raise)(cpumask_t *mask)) +{ + struct nmi_seq_buf *s; + int i, cpu, this_cpu = get_cpu(); + + if (test_and_set_bit(0, &backtrace_flag)) { + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + put_cpu(); + return; + } + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + + cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); + + /* + * Set up per_cpu seq_buf buffers that the NMIs running on the other + * CPUs will write to. 
+ */ + for_each_cpu(cpu, to_cpumask(backtrace_mask)) { + s = &per_cpu(nmi_print_seq, cpu); + seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); + } + + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("Sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + raise(to_cpumask(backtrace_mask)); + } + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + touch_softlockup_watchdog(); + } + + /* + * Now that all the NMIs have triggered, we can dump out their + * back traces safely to the console. + */ + for_each_cpu(cpu, &printtrace_mask) { + int len, last_i = 0; + + s = &per_cpu(nmi_print_seq, cpu); + len = seq_buf_used(&s->seq); + if (!len) + continue; + + /* Print line by line. */ + for (i = 0; i < len; i++) { + if (s->buffer[i] == '\n') { + print_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < len) { + print_seq_line(s, last_i, len - 1); + pr_cont("\n"); + } + } + + clear_bit(0, &backtrace_flag); + smp_mb__after_atomic(); + put_cpu(); +} + +/* + * It is not safe to call printk() directly from NMI handlers. + * It may be fine if the NMI detected a lock up and we have no choice + * but to do so, but doing a NMI on all other CPUs to get a back trace + * can be done with a sysrq-l. We don't want that to lock up, which + * can happen if the NMI interrupts a printk in progress. + * + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes + * the content into a per cpu seq_buf buffer. Then when the NMIs are + * all done, we can safely dump the contents of the seq_buf to a printk() + * from a non NMI context. 
+ */ +static int nmi_vprintk(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + unsigned int len = seq_buf_used(&s->seq); + + seq_buf_vprintf(&s->seq, fmt, args); + return seq_buf_used(&s->seq) - len; +} + +bool nmi_cpu_backtrace(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + printk_func_t printk_func_save = this_cpu_read(printk_func); + + /* Replace printk to write into the NMI seq */ + this_cpu_write(printk_func, nmi_vprintk); + pr_warn("NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + this_cpu_write(printk_func, printk_func_save); + + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return true; + } + + return false; +} +NOKPROBE_SYMBOL(nmi_cpu_backtrace); +#endif From 4d7489ffba0aef4d2c708b6ff1428efd6ccf41df Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 10 Jul 2015 21:47:36 +0100 Subject: [PATCH 0033/1466] nmi: x86: convert to generic nmi handler Convert x86 to use the generic nmi handler code which can be shared between architectures. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: Russell King --- arch/x86/kernel/apic/hw_nmi.c | 133 +--------------------------------- 1 file changed, 4 insertions(+), 129 deletions(-) diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6873ab925d00..045e424fb368 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace -/* For reliability, we're prepared to waste bits here. 
*/ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ -static unsigned long backtrace_flag; - -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); + apic->send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; - int this_cpu = get_cpu(); - - if (test_and_set_bit(0, &backtrace_flag)) { - /* - * If there is already a trigger_all_cpu_backtrace() in progress - * (backtrace_flag == 1), don't output double cpu dump infos. - */ - put_cpu(); - return; - } - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) - cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - - if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); - apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); - } - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - touch_softlockup_watchdog(); - } - - /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. 
- */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); - put_cpu(); -} - -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. 
- */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; + nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); } static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - int cpu; - - cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - this_cpu_write(printk_func, printk_func_save); - - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; - } return NMI_DONE; } From 96f0e00378d4a1fc1b79933ef84e1595015de808 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2014 23:57:13 +0100 Subject: [PATCH 0034/1466] ARM: add basic support for on-demand backtrace of other CPUs As we now have generic infrastructure to support backtracing of other CPUs in the system on lockups, we can start to implement this for ARM. Initially, we add an IPI based implementation, as the GIC code needs modification to support the generation of FIQ IPIs, and not all ARM platforms have the ability to raise a FIQ in the non-secure world. This provides us with a "best efforts" implementation in the absence of FIQs. 
Signed-off-by: Russell King --- arch/arm/include/asm/irq.h | 5 +++++ arch/arm/kernel/smp.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index 53c15dec7af6..be1d07d59ee9 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -35,6 +35,11 @@ extern void (*handle_arch_irq)(struct pt_regs *); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); #endif +#ifdef CONFIG_SMP +extern void arch_trigger_all_cpu_backtrace(bool); +#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x) +#endif + #endif #endif diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 90dfbedfbfb8..3a20c386fd33 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + IPI_CPU_BACKTRACE = 15, }; static DECLARE_COMPLETION(cpu_running); @@ -630,6 +632,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs) irq_exit(); break; + case IPI_CPU_BACKTRACE: + irq_enter(); + nmi_cpu_backtrace(regs); + irq_exit(); + break; + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); @@ -724,3 +732,13 @@ static int __init register_cpufreq_notifier(void) core_initcall(register_cpufreq_notifier); #endif + +static void raise_nmi(cpumask_t *mask) +{ + smp_cross_call(mask, IPI_CPU_BACKTRACE); +} + +void arch_trigger_all_cpu_backtrace(bool include_self) +{ + nmi_trigger_all_cpu_backtrace(include_self, raise_nmi); +} From 5c31252c4a86dc591c23f1a951edd52ad791ef0e Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 1 Jul 2015 10:21:47 +0200 Subject: [PATCH 0035/1466] pwm: Add the pwm_is_enabled() helper Some PWM drivers are testing the PWMF_ENABLED flag. Create a helper function to hide the logic behind enabled test. 
This will allow us to smoothly move from the current approach to an atomic PWM update approach. Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 4 ++-- drivers/pwm/pwm-atmel-tcb.c | 2 +- drivers/pwm/pwm-atmel.c | 6 +++--- drivers/pwm/pwm-bcm-kona.c | 4 ++-- drivers/pwm/pwm-ep93xx.c | 4 ++-- drivers/pwm/pwm-imx.c | 2 +- drivers/pwm/pwm-mxs.c | 4 ++-- drivers/pwm/pwm-renesas-tpu.c | 2 +- drivers/pwm/pwm-tegra.c | 6 +++--- drivers/pwm/pwm-tiecap.c | 10 +++++----- drivers/pwm/pwm-tiehrpwm.c | 6 +++--- drivers/pwm/sysfs.c | 2 +- include/linux/pwm.h | 5 +++++ 13 files changed, 31 insertions(+), 26 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 3a7769fe53de..f7c11d2dec37 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -455,7 +455,7 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity) if (!pwm->chip->ops->set_polarity) return -ENOSYS; - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (pwm_is_enabled(pwm)) return -EBUSY; err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity); @@ -853,7 +853,7 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s) if (test_bit(PWMF_REQUESTED, &pwm->flags)) seq_puts(s, " requested"); - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (pwm_is_enabled(pwm)) seq_puts(s, " enabled"); seq_puts(s, "\n"); diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index d14e0677c92d..6da01b3bf6f4 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -347,7 +347,7 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, tcbpwm->duty = duty; /* If the PWM is enabled, call enable to apply the new conf */ - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (pwm_is_enabled(pwm)) atmel_tcb_pwm_enable(chip, pwm); return 0; diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index a947c9095d9d..b3b294de88e0 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ 
-114,7 +114,7 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, u32 val; int ret; - if (test_bit(PWMF_ENABLED, &pwm->flags) && (period_ns != pwm->period)) { + if (pwm_is_enabled(pwm) && (period_ns != pwm->period)) { dev_err(chip->dev, "cannot change PWM period while enabled\n"); return -EBUSY; } @@ -176,7 +176,7 @@ static void atmel_pwm_config_v1(struct pwm_chip *chip, struct pwm_device *pwm, * If the PWM channel is enabled, only update CDTY by using the update * register, it needs to set bit 10 of CMR to 0 */ - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (pwm_is_enabled(pwm)) return; /* * If the PWM channel is disabled, write value to duty and period @@ -191,7 +191,7 @@ static void atmel_pwm_config_v2(struct pwm_chip *chip, struct pwm_device *pwm, { struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip); - if (test_bit(PWMF_ENABLED, &pwm->flags)) { + if (pwm_is_enabled(pwm)) { /* * If the PWM channel is enabled, using the duty update register * to update the value. 
diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c index 7af8fea2dc5b..dfdcf88279ae 100644 --- a/drivers/pwm/pwm-bcm-kona.c +++ b/drivers/pwm/pwm-bcm-kona.c @@ -134,7 +134,7 @@ static int kona_pwmc_config(struct pwm_chip *chip, struct pwm_device *pwm, } /* If the PWM channel is enabled, write the settings to the HW */ - if (test_bit(PWMF_ENABLED, &pwm->flags)) { + if (pwm_is_enabled(pwm)) { value = readl(kp->base + PRESCALE_OFFSET); value &= ~PRESCALE_MASK(chan); value |= prescale << PRESCALE_SHIFT(chan); @@ -287,7 +287,7 @@ static int kona_pwmc_remove(struct platform_device *pdev) unsigned int chan; for (chan = 0; chan < kp->chip.npwm; chan++) - if (test_bit(PWMF_ENABLED, &kp->chip.pwms[chan].flags)) + if (pwm_is_enabled(&kp->chip.pwms[chan])) clk_disable_unprepare(kp->clk); return pwmchip_remove(&kp->chip); diff --git a/drivers/pwm/pwm-ep93xx.c b/drivers/pwm/pwm-ep93xx.c index e593e9c45c51..bbf10ae02f0e 100644 --- a/drivers/pwm/pwm-ep93xx.c +++ b/drivers/pwm/pwm-ep93xx.c @@ -82,7 +82,7 @@ static int ep93xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * The clock needs to be enabled to access the PWM registers. * Configuration can be changed at any time. 
*/ - if (!test_bit(PWMF_ENABLED, &pwm->flags)) { + if (!pwm_is_enabled(pwm)) { ret = clk_enable(ep93xx_pwm->clk); if (ret) return ret; @@ -113,7 +113,7 @@ static int ep93xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ret = -EINVAL; } - if (!test_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm_is_enabled(pwm)) clk_disable(ep93xx_pwm->clk); return ret; diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c index 66d6f0c5c421..008dc646225e 100644 --- a/drivers/pwm/pwm-imx.c +++ b/drivers/pwm/pwm-imx.c @@ -114,7 +114,7 @@ static int imx_pwm_config_v2(struct pwm_chip *chip, unsigned long long c; unsigned long period_cycles, duty_cycles, prescale; unsigned int period_ms; - bool enable = test_bit(PWMF_ENABLED, &pwm->flags); + bool enable = pwm_is_enabled(pwm); int wait_count = 0, fifoav; u32 cr, sr; diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c index b430811e14f5..9a596324ebef 100644 --- a/drivers/pwm/pwm-mxs.c +++ b/drivers/pwm/pwm-mxs.c @@ -77,7 +77,7 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * If the PWM channel is disabled, make sure to turn on the clock * before writing the register. Otherwise, keep it enabled. */ - if (!test_bit(PWMF_ENABLED, &pwm->flags)) { + if (!pwm_is_enabled(pwm)) { ret = clk_prepare_enable(mxs->clk); if (ret) return ret; @@ -92,7 +92,7 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, /* * If the PWM is not enabled, turn the clock off again to save power. */ - if (!test_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm_is_enabled(pwm)) clk_disable_unprepare(mxs->clk); return 0; diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index ee63f9e9d0fb..075c1a764ba2 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -301,7 +301,7 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, pwm->duty = duty; /* If the channel is disabled we're done. 
*/ - if (!test_bit(PWMF_ENABLED, &_pwm->flags)) + if (!pwm_is_enabled(_pwm)) return 0; if (duty_only && pwm->timer_on) { diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index cabd7d8e05cc..d4de0607b502 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -112,7 +112,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * If the PWM channel is disabled, make sure to turn on the clock * before writing the register. Otherwise, keep it enabled. */ - if (!test_bit(PWMF_ENABLED, &pwm->flags)) { + if (!pwm_is_enabled(pwm)) { err = clk_prepare_enable(pc->clk); if (err < 0) return err; @@ -124,7 +124,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, /* * If the PWM is not enabled, turn the clock off again to save power. */ - if (!test_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm_is_enabled(pwm)) clk_disable_unprepare(pc->clk); return 0; @@ -214,7 +214,7 @@ static int tegra_pwm_remove(struct platform_device *pdev) for (i = 0; i < NUM_PWM; i++) { struct pwm_device *pwm = &pc->chip.pwms[i]; - if (!test_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm_is_enabled(pwm)) if (clk_prepare_enable(pc->clk) < 0) continue; diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index e557befdf4e6..616af764a276 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -97,7 +97,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, writew(reg_val, pc->mmio_base + ECCTL2); - if (!test_bit(PWMF_ENABLED, &pwm->flags)) { + if (!pwm_is_enabled(pwm)) { /* Update active registers if not running */ writel(duty_cycles, pc->mmio_base + CAP2); writel(period_cycles, pc->mmio_base + CAP1); @@ -111,7 +111,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, writel(period_cycles, pc->mmio_base + CAP3); } - if (!test_bit(PWMF_ENABLED, &pwm->flags)) { + if (!pwm_is_enabled(pwm)) { reg_val = readw(pc->mmio_base + ECCTL2); /* Disable APWM mode to put APWM 
output Low */ reg_val &= ~ECCTL2_APWM_MODE; @@ -179,7 +179,7 @@ static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) static void ecap_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { - if (test_bit(PWMF_ENABLED, &pwm->flags)) { + if (pwm_is_enabled(pwm)) { dev_warn(chip->dev, "Removing PWM device without disabling\n"); pm_runtime_put_sync(chip->dev); } @@ -306,7 +306,7 @@ static int ecap_pwm_suspend(struct device *dev) ecap_pwm_save_context(pc); /* Disable explicitly if PWM is running */ - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (pwm_is_enabled(pwm)) pm_runtime_put_sync(dev); return 0; @@ -318,7 +318,7 @@ static int ecap_pwm_resume(struct device *dev) struct pwm_device *pwm = pc->chip.pwms; /* Enable explicitly if PWM was running */ - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (pwm_is_enabled(pwm)) pm_runtime_get_sync(dev); ecap_pwm_restore_context(pc); diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index 694b3cf7694b..6a41e66015b6 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -407,7 +407,7 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - if (test_bit(PWMF_ENABLED, &pwm->flags)) { + if (pwm_is_enabled(pwm)) { dev_warn(chip->dev, "Removing PWM device without disabling\n"); pm_runtime_put_sync(chip->dev); } @@ -565,7 +565,7 @@ static int ehrpwm_pwm_suspend(struct device *dev) for (i = 0; i < pc->chip.npwm; i++) { struct pwm_device *pwm = &pc->chip.pwms[i]; - if (!test_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm_is_enabled(pwm)) continue; /* Disable explicitly if PWM is running */ @@ -582,7 +582,7 @@ static int ehrpwm_pwm_resume(struct device *dev) for (i = 0; i < pc->chip.npwm; i++) { struct pwm_device *pwm = &pc->chip.pwms[i]; - if (!test_bit(PWMF_ENABLED, &pwm->flags)) + if (!pwm_is_enabled(pwm)) continue; /* Enable explicitly if PWM was running */ diff --git a/drivers/pwm/sysfs.c 
b/drivers/pwm/sysfs.c index 4bd0c639e16d..eecf21d68108 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -97,7 +97,7 @@ static ssize_t pwm_enable_show(struct device *child, char *buf) { const struct pwm_device *pwm = child_to_pwm_device(child); - int enabled = test_bit(PWMF_ENABLED, &pwm->flags); + int enabled = pwm_is_enabled(pwm); return sprintf(buf, "%d\n", enabled); } diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 36262d08a9da..ec34f4d9a9ee 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -92,6 +92,11 @@ struct pwm_device { enum pwm_polarity polarity; }; +static inline bool pwm_is_enabled(const struct pwm_device *pwm) +{ + return test_bit(PWMF_ENABLED, &pwm->flags); +} + static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period) { if (pwm) From a1cf42171a2e3c33cbc12bb037795caf0589149b Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 1 Jul 2015 10:21:48 +0200 Subject: [PATCH 0036/1466] pwm: Constify PWM device where possible The PWM argument is not modified in PWM property accessors, make it a const argument so that the accessors can be used from sysfs. Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- include/linux/pwm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pwm.h b/include/linux/pwm.h index ec34f4d9a9ee..d8f691339a45 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -103,7 +103,7 @@ static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period) pwm->period = period; } -static inline unsigned int pwm_get_period(struct pwm_device *pwm) +static inline unsigned int pwm_get_period(const struct pwm_device *pwm) { return pwm ? 
pwm->period : 0; } @@ -114,7 +114,7 @@ static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty) pwm->duty_cycle = duty; } -static inline unsigned int pwm_get_duty_cycle(struct pwm_device *pwm) +static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm) { return pwm ? pwm->duty_cycle : 0; } From 011e76314818b6a24d5347b2d83b8a577e6aaae6 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 1 Jul 2015 10:21:49 +0200 Subject: [PATCH 0037/1466] pwm: Add pwm_get_polarity() helper function Some drivers are directly accessing the ->polarity field in pwm_device. Add a helper to retrieve the current polarity so that we can easily move this field elsewhere (required to support atomic update). Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- include/linux/pwm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pwm.h b/include/linux/pwm.h index d8f691339a45..6f286df30021 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -124,6 +124,11 @@ static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm) */ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity); +static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm) +{ + return pwm ? pwm->polarity : PWM_POLARITY_NORMAL; +} + /** * struct pwm_ops - PWM controller operations * @request: optional hook for requesting a PWM From 15da7b5001e498fa7dc619d4d7951f9665b071e4 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 1 Jul 2015 10:21:50 +0200 Subject: [PATCH 0038/1466] pwm: Make use of pwm_get_xxx() helpers where appropriate Use the pwm_get_xxx() helpers instead of directly accessing the fields in struct pwm_device. This will allow us to smoothly move to the atomic update approach. 
Signed-off-by: Boris Brezillon Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 2 +- drivers/pwm/pwm-bcm-kona.c | 3 ++- drivers/pwm/pwm-imx.c | 3 ++- drivers/pwm/pwm-rockchip.c | 2 +- drivers/pwm/sysfs.c | 11 ++++++----- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index b3b294de88e0..0e4bd4e8e582 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -114,7 +114,7 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, u32 val; int ret; - if (pwm_is_enabled(pwm) && (period_ns != pwm->period)) { + if (pwm_is_enabled(pwm) && (period_ns != pwm_get_period(pwm))) { dev_err(chip->dev, "cannot change PWM period while enabled\n"); return -EBUSY; } diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c index dfdcf88279ae..920cd1b5aa9d 100644 --- a/drivers/pwm/pwm-bcm-kona.c +++ b/drivers/pwm/pwm-bcm-kona.c @@ -194,7 +194,8 @@ static int kona_pwmc_enable(struct pwm_chip *chip, struct pwm_device *pwm) return ret; } - ret = kona_pwmc_config(chip, pwm, pwm->duty_cycle, pwm->period); + ret = kona_pwmc_config(chip, pwm, pwm_get_duty_cycle(pwm), + pwm_get_period(pwm)); if (ret < 0) { clk_disable_unprepare(kp->clk); return ret; diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c index 008dc646225e..d600fd5cd4ba 100644 --- a/drivers/pwm/pwm-imx.c +++ b/drivers/pwm/pwm-imx.c @@ -129,7 +129,8 @@ static int imx_pwm_config_v2(struct pwm_chip *chip, sr = readl(imx->mmio_base + MX3_PWMSR); fifoav = sr & MX3_PWMSR_FIFOAV_MASK; if (fifoav == MX3_PWMSR_FIFOAV_4WORDS) { - period_ms = DIV_ROUND_UP(pwm->period, NSEC_PER_MSEC); + period_ms = DIV_ROUND_UP(pwm_get_period(pwm), + NSEC_PER_MSEC); msleep(period_ms); sr = readl(imx->mmio_base + MX3_PWMSR); diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 9442df244101..7d9cc9049522 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -83,7 +83,7 @@ static void 
rockchip_pwm_set_enable_v2(struct pwm_chip *chip, PWM_CONTINUOUS; u32 val; - if (pwm->polarity == PWM_POLARITY_INVERSED) + if (pwm_get_polarity(pwm) == PWM_POLARITY_INVERSED) enable_conf |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE; else enable_conf |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE; diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c index eecf21d68108..ac0abecfbaa0 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -46,7 +46,7 @@ static ssize_t pwm_period_show(struct device *child, { const struct pwm_device *pwm = child_to_pwm_device(child); - return sprintf(buf, "%u\n", pwm->period); + return sprintf(buf, "%u\n", pwm_get_period(pwm)); } static ssize_t pwm_period_store(struct device *child, @@ -61,7 +61,7 @@ static ssize_t pwm_period_store(struct device *child, if (ret) return ret; - ret = pwm_config(pwm, pwm->duty_cycle, val); + ret = pwm_config(pwm, pwm_get_duty_cycle(pwm), val); return ret ? : size; } @@ -72,7 +72,7 @@ static ssize_t pwm_duty_cycle_show(struct device *child, { const struct pwm_device *pwm = child_to_pwm_device(child); - return sprintf(buf, "%u\n", pwm->duty_cycle); + return sprintf(buf, "%u\n", pwm_get_duty_cycle(pwm)); } static ssize_t pwm_duty_cycle_store(struct device *child, @@ -87,7 +87,7 @@ static ssize_t pwm_duty_cycle_store(struct device *child, if (ret) return ret; - ret = pwm_config(pwm, val, pwm->period); + ret = pwm_config(pwm, val, pwm_get_period(pwm)); return ret ? : size; } @@ -134,7 +134,8 @@ static ssize_t pwm_polarity_show(struct device *child, { const struct pwm_device *pwm = child_to_pwm_device(child); - return sprintf(buf, "%s\n", pwm->polarity ? "inversed" : "normal"); + return sprintf(buf, "%s\n", + pwm_get_polarity(pwm) ? 
"inversed" : "normal"); } static ssize_t pwm_polarity_store(struct device *child, From 0642ef6f2992eba46c41abb5ceb7d4fa14ba888e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Jun 2015 14:32:54 +0100 Subject: [PATCH 0039/1466] debugfs: Export bool read/write functions The file read/write functions for bools have no special dependencies on debugfs internals and are sufficiently non-trivial to be worth exporting so clients can re-use the implementation. Signed-off-by: Richard Fitzgerald Acked-by: Greg Kroah-Hartman Signed-off-by: Mark Brown --- fs/debugfs/file.c | 14 ++++++++------ include/linux/debugfs.h | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 284f9aa0028b..6c55ade071c3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); -static ssize_t read_file_bool(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[3]; u32 *val = file->private_data; @@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } +EXPORT_SYMBOL_GPL(debugfs_read_file_bool); -static ssize_t write_file_bool(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; @@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, return count; } +EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { - .read = read_file_bool, - .write = write_file_bool, + .read = 
debugfs_read_file_bool, + .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 420311bcee38..9beb636b97eb 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -116,6 +116,12 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, bool debugfs_initialized(void); +ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + +ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos); + #else #include @@ -282,6 +288,20 @@ static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev, return ERR_PTR(-ENODEV); } +static inline ssize_t debugfs_read_file_bool(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + return -ENODEV; +} + +static inline ssize_t debugfs_write_file_bool(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + return -ENODEV; +} + #endif #endif From d3dc5430d68fb91a62d971648170b34d46ab85bc Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 23 Jun 2015 14:32:55 +0100 Subject: [PATCH 0040/1466] regmap: debugfs: Allow writes to cache state settings Allow the user to write the cache_only and cache_bypass settings. This can be useful for debugging. Since this can lead to the hardware getting out-of-sync with the cache, at least for the period that the cache state is forced, the kernel is tainted and the action is recorded in the kernel log. When disabling cache_only through debugfs a cache sync will be performed. 
Signed-off-by: Richard Fitzgerald Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-debugfs.c | 90 ++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 5799a0b9e6cc..6a61e4fa73a2 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -469,6 +469,87 @@ static const struct file_operations regmap_access_fops = { .llseek = default_llseek, }; +static ssize_t regmap_cache_only_write_file(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct regmap *map = container_of(file->private_data, + struct regmap, cache_only); + ssize_t result; + bool was_enabled, require_sync = false; + int err; + + map->lock(map->lock_arg); + + was_enabled = map->cache_only; + + result = debugfs_write_file_bool(file, user_buf, count, ppos); + if (result < 0) { + map->unlock(map->lock_arg); + return result; + } + + if (map->cache_only && !was_enabled) { + dev_warn(map->dev, "debugfs cache_only=Y forced\n"); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } else if (!map->cache_only && was_enabled) { + dev_warn(map->dev, "debugfs cache_only=N forced: syncing cache\n"); + require_sync = true; + } + + map->unlock(map->lock_arg); + + if (require_sync) { + err = regcache_sync(map); + if (err) + dev_err(map->dev, "Failed to sync cache %d\n", err); + } + + return result; +} + +static const struct file_operations regmap_cache_only_fops = { + .open = simple_open, + .read = debugfs_read_file_bool, + .write = regmap_cache_only_write_file, +}; + +static ssize_t regmap_cache_bypass_write_file(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct regmap *map = container_of(file->private_data, + struct regmap, cache_bypass); + ssize_t result; + bool was_enabled; + + map->lock(map->lock_arg); + + was_enabled = map->cache_bypass; + + result = debugfs_write_file_bool(file, 
user_buf, count, ppos); + if (result < 0) + goto out; + + if (map->cache_bypass && !was_enabled) { + dev_warn(map->dev, "debugfs cache_bypass=Y forced\n"); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } else if (!map->cache_bypass && was_enabled) { + dev_warn(map->dev, "debugfs cache_bypass=N forced\n"); + } + +out: + map->unlock(map->lock_arg); + + return result; +} + +static const struct file_operations regmap_cache_bypass_fops = { + .open = simple_open, + .read = debugfs_read_file_bool, + .write = regmap_cache_bypass_write_file, +}; + void regmap_debugfs_init(struct regmap *map, const char *name) { struct rb_node *next; @@ -530,12 +611,13 @@ void regmap_debugfs_init(struct regmap *map, const char *name) } if (map->cache_type) { - debugfs_create_bool("cache_only", 0400, map->debugfs, - &map->cache_only); + debugfs_create_file("cache_only", 0600, map->debugfs, + &map->cache_only, &regmap_cache_only_fops); debugfs_create_bool("cache_dirty", 0400, map->debugfs, &map->cache_dirty); - debugfs_create_bool("cache_bypass", 0400, map->debugfs, - &map->cache_bypass); + debugfs_create_file("cache_bypass", 0600, map->debugfs, + &map->cache_bypass, + &regmap_cache_bypass_fops); } next = rb_first(&map->range_tree); From 9fe6b778ca93e6171dbb8e54df557a278a91abea Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:34 +0300 Subject: [PATCH 0041/1466] tracing: Prefer kcalloc over kzalloc with multiply Use kcalloc for allocating an array instead of kzalloc with multiply, as that is what kcalloc is used for. Found with checkpatch. 
Link: http://lkml.kernel.org/r/1433835155-6894-2-git-send-email-gilf@ezchip.com Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abcbf7ff8743..5d219384b4d1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; From 72917235fd5f08638be1d52dcdb0fee3ce2cc95f Mon Sep 17 00:00:00 2001 From: Gil Fruchter Date: Tue, 9 Jun 2015 10:32:35 +0300 Subject: [PATCH 0042/1466] tracing: Fix for non-continuous cpu ids Currently an exception occurs due to access beyond the buffer_iter range while using a cpu index bigger than num_possible_cpus(). Below is an example of such an exception when we use cpus 0,1,16,17. In order to fix the buffer allocation size for non-continuous cpu ids, we allocate according to the max cpu id and not according to the number of possible cpus. 
Example: $ cat /sys/kernel/debug/tracing/per_cpu/cpu1/trace Path: /bin/busybox CPU: 0 PID: 82 Comm: cat Not tainted 4.0.0 #29 task: 80734c80 ti: 80012000 task.ti: 80012000 [ECR ]: 0x00220100 => Invalid Read @ 0x00000000 by insn @ 0x800abafc [EFA ]: 0x00000000 [BLINK ]: ring_buffer_read_finish+0x24/0x64 [ERET ]: rb_check_pages+0x20/0x188 [STAT32]: 0x00001a00 : BTA: 0x800abafc SP: 0x80013f0c FP: 0x57719cf8 LPS: 0x200036b4 LPE: 0x200036b8 LPC: 0x00000000 r00: 0x8002aca0 r01: 0x00001606 r02: 0x00000000 r03: 0x00000001 r04: 0x00000000 r05: 0x804b4954 r06: 0x00030003 r07: 0x8002a260 r08: 0x00000286 r09: 0x00080002 r10: 0x00001006 r11: 0x807351a4 r12: 0x00000001 Stack Trace: rb_check_pages+0x20/0x188 ring_buffer_read_finish+0x24/0x64 tracing_release+0x4e/0x170 __fput+0x62/0x158 task_work_run+0xa2/0xd4 do_notify_resume+0x52/0x7c resume_user_mode_begin+0xdc/0xe0 Link: http://lkml.kernel.org/r/1433835155-6894-3-git-send-email-gilf@ezchip.com Signed-off-by: Noam Camus Signed-off-by: Gil Fruchter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d219384b4d1..59814adc39d6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kcalloc(num_possible_cpus(), sizeof(*iter->buffer_iter), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; From 5e2d5ef8ec1e3854daec41a3697a8d2ce05ff2ef Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:55:06 +0530 Subject: [PATCH 0043/1466] ftrace: correct the counter increment for trace_buffer data In ftrace_dump, for disabling buffer, iter.tr->trace_buffer.data is used. But for enabling, iter.trace_buffer->data is used. 
Even though, both point to same buffer, for readability, same convention should be used. Link: http://lkml.kernel.org/r/1434972306-20043-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59814adc39d6..6e79408674aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; From fcc742eaad7cbcbbb2a96edc8f1d22adbaa804cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:13:14 -0400 Subject: [PATCH 0044/1466] ring-buffer: Add event descriptor to simplify passing data Add rb_event_info descriptor to pass event info to functions a bit easier than using a bunch of parameters. This will also allow for changing the code around a bit to find better fast paths. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 91 ++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 39 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6260717c18e3..ba8f25ffcf6f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -399,6 +399,17 @@ struct rb_irq_work { bool wakeup_full; }; +/* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + /* * Used for which event context the event is in. 
* NMI = 0 @@ -2000,9 +2011,12 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) */ static void rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, unsigned length, - int add_timestamp, u64 delta) + struct ring_buffer_event *event, + struct rb_event_info *info) { + unsigned length = info->length; + u64 delta = info->delta; + /* Only a commit updates the timestamp */ if (unlikely(!rb_event_is_commit(cpu_buffer, event))) delta = 0; @@ -2011,7 +2025,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, * If we need to add a timestamp, then we * add it to the start of the resevered space. */ - if (unlikely(add_timestamp)) { + if (unlikely(info->add_timestamp)) { event = rb_add_time_stamp(event, delta); length -= RB_LEN_TIME_EXTEND; delta = 0; @@ -2203,10 +2217,11 @@ static unsigned rb_calculate_event_length(unsigned length) static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_page *tail_page, - unsigned long tail, unsigned long length) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; + unsigned long length = info->length; /* * Only the event that crossed the page boundary @@ -2276,13 +2291,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, */ static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 ts) + unsigned long tail, struct rb_event_info *info) { + struct buffer_page *tail_page = info->tail_page; struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; + u64 ts; next_page = tail_page; @@ -2368,25 +2384,24 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, out_again: - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); /* 
fail and let the caller try again */ return ERR_PTR(-EAGAIN); out_reset: /* reset write */ - rb_reset_tail(cpu_buffer, tail_page, tail, length); + rb_reset_tail(cpu_buffer, tail, info); return NULL; } static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long length, u64 ts, - u64 delta, int add_timestamp) + struct rb_event_info *info) { - struct buffer_page *tail_page; struct ring_buffer_event *event; + struct buffer_page *tail_page; unsigned long tail, write; /* @@ -2394,33 +2409,32 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * hold in the time field of the event, then we append a * TIME EXTEND event ahead of the data event. */ - if (unlikely(add_timestamp)) - length += RB_LEN_TIME_EXTEND; + if (unlikely(info->add_timestamp)) + info->length += RB_LEN_TIME_EXTEND; - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); + tail_page = info->tail_page = cpu_buffer->tail_page; + write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ write &= RB_WRITE_MASK; - tail = write - length; + tail = write - info->length; /* * If this is the first commit on the page, then it has the same * timestamp as the page itself. */ if (!tail) - delta = 0; + info->delta = 0; /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) - return rb_move_tail(cpu_buffer, length, tail, - tail_page, ts); + return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(cpu_buffer, event, length, add_timestamp, delta); + rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); @@ -2429,10 +2443,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * its timestamp. 
*/ if (!tail) - tail_page->page->time_stamp = ts; + tail_page->page->time_stamp = info->ts; /* account for these added bytes */ - local_add(length, &cpu_buffer->entries_bytes); + local_add(info->length, &cpu_buffer->entries_bytes); return event; } @@ -2521,9 +2535,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + struct rb_event_info info; int nr_loops = 0; - int add_timestamp; u64 diff; rb_start_commit(cpu_buffer); @@ -2543,10 +2556,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, } #endif - length = rb_calculate_event_length(length); + info.length = rb_calculate_event_length(length); again: - add_timestamp = 0; - delta = 0; + info.add_timestamp = 0; + info.delta = 0; /* * We allow for interrupts to reenter here and do a trace. @@ -2560,35 +2573,35 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer); - diff = ts - cpu_buffer->write_stamp; + info.ts = rb_time_stamp(cpu_buffer->buffer); + diff = info.ts - cpu_buffer->write_stamp; /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ - if (likely(ts >= cpu_buffer->write_stamp)) { - delta = diff; - if (unlikely(test_time_stamp(delta))) { + if (likely(info.ts >= cpu_buffer->write_stamp)) { + info.delta = diff; + if (unlikely(test_time_stamp(info.delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK local_clock_stable = sched_clock_stable(); #endif - WARN_ONCE(delta > (1ULL << 59), + WARN_ONCE(info.delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)delta, - (unsigned long long)ts, + (unsigned long long)info.delta, + (unsigned long long)info.ts, (unsigned long long)cpu_buffer->write_stamp, local_clock_stable ? 
"" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - add_timestamp = 1; + info.add_timestamp = 1; } } - event = __rb_reserve_next(cpu_buffer, length, ts, - delta, add_timestamp); + event = __rb_reserve_next(cpu_buffer, &info); + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; From 9826b2733a4399149072058a11f611357479229d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 May 2015 17:36:45 -0400 Subject: [PATCH 0045/1466] ring-buffer: Move the adding of the extended timestamp out of line Requiring a extended time stamp is an uncommon occurrence, and it is best to do it out of line when needed. Add a noinline function that handles the extended timestamp and have it called with an unlikely to completely move it out of the fast path. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba8f25ffcf6f..a78d4ee4bc58 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2396,6 +2396,29 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static noinline void +rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + WARN_ONCE(info->delta > (1ULL << 59), + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable() ? 
"" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + info->add_timestamp = 1; +} + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -2582,22 +2605,8 @@ rb_reserve_next_event(struct ring_buffer *buffer, /* Did the write stamp get updated already? */ if (likely(info.ts >= cpu_buffer->write_stamp)) { info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) { - int local_clock_stable = 1; -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable(); -#endif - WARN_ONCE(info.delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", - (unsigned long long)info.delta, - (unsigned long long)info.ts, - (unsigned long long)cpu_buffer->write_stamp, - local_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info.add_timestamp = 1; - } + if (unlikely(test_time_stamp(info.delta))) + rb_handle_timestamp(cpu_buffer, &info); } event = __rb_reserve_next(cpu_buffer, &info); From a4543a2fa9ef31d6d0f854a4e14f8f82e7996d8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 09:40:18 -0400 Subject: [PATCH 0046/1466] ring-buffer: Get timestamp after event is allocated Move the capturing of the timestamp to after an event is allocated. If the event is not a commit (where it is an event that preempted another event), then no timestamp is needed, because the delta of nested events is always zero. If the event starts on a new page, no delta needs to be calculated as the full timestamp will be added to the page header, and the event will have a delta of zero. 
Now if the event requires a time extend (the delta does not fit in the 27 bit delta slot in the header), then the event is discarded, the length is extended to hold the TIME_EXTEND event that allows for a 59 bit delta, and the commit is tried again. If the event can't be discarded (another event came in after it), then the TIME_EXTEND is added directly to the allocated event and the rest of the event is given padding. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 161 ++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 47 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a78d4ee4bc58..b5ed553e0a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2009,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) * and with this, we can determine what to place into the * data field. */ -static void +static void __always_inline rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event, struct rb_event_info *info) @@ -2017,10 +2017,6 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned length = info->length; u64 delta = info->delta; - /* Only a commit updates the timestamp */ - if (unlikely(!rb_event_is_commit(cpu_buffer, event))) - delta = 0; - /* * If we need to add a timestamp, then we * add it to the start of the resevered space. @@ -2286,6 +2282,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2300,6 +2298,16 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, int ret; u64 ts; + /* + * If the event had a timestamp attached to it, remove it. + * The first event on a page (nested or not) always uses + * the full timestamp of the new page. 
+ */ + if (info->add_timestamp) { + info->add_timestamp = 0; + info->length -= RB_LEN_TIME_EXTEND; + } + next_page = tail_page; rb_inc_page(cpu_buffer, &next_page); @@ -2386,6 +2394,11 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now to update timestamps */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2403,10 +2416,23 @@ static inline bool sched_clock_stable(void) } #endif +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); +static inline void rb_event_discard(struct ring_buffer_event *event); +static void +rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event); + static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, struct rb_event_info *info) { + struct ring_buffer_event *padding; + int length; + int size; + WARN_ONCE(info->delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)info->delta, @@ -2416,7 +2442,61 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" " echo global > /sys/kernel/debug/tracing/trace_clock\n"); - info->add_timestamp = 1; + + /* + * Discarding this event to add a timestamp in front, but + * we still need to update the length of it to perform the discard. + */ + rb_update_event(cpu_buffer, event, info); + + if (rb_try_to_discard(cpu_buffer, event)) { + info->add_timestamp = 1; + /* + * The time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. + */ + info->length += RB_LEN_TIME_EXTEND; + return; + } + + /* + * Humpf! 
An event came in after this one, and because it is not a + * commit, it will have a delta of zero, thus, it will take on + * the timestamp of the previous commit, which happened a long time + * ago (we need to add a timestamp, remember?). + * We need to add the timestamp here. A timestamp is a fixed size + * of 8 bytes. That means the rest of the event needs to be + * padding. + */ + size = info->length - RB_LEN_TIME_EXTEND; + + /* The padding will have a delta of 1 */ + if (size) + info->delta--; + + padding = rb_add_time_stamp(event, info->delta); + + if (size) { + length = info->length; + info->delta = 0; + info->length = size; + rb_update_event(cpu_buffer, padding, info); + + rb_event_discard(padding); + + /* Still visible, need to update write_stamp */ + rb_update_write_stamp(cpu_buffer, event); + + /* Still need to commit the padding. */ + rb_end_commit(cpu_buffer); + + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + + /* The next iteration still uses the original length */ + info->length = length; + } } static struct ring_buffer_event * @@ -2426,14 +2506,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write; - - /* - * If the time delta since the last event is too big to - * hold in the time field of the event, then we append a - * TIME EXTEND event ahead of the data event. - */ - if (unlikely(info->add_timestamp)) - info->length += RB_LEN_TIME_EXTEND; + bool is_commit; tail_page = info->tail_page = cpu_buffer->tail_page; write = local_add_return(info->length, &tail_page->write); @@ -2442,32 +2515,43 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - info->length; - /* - * If this is the first commit on the page, then it has the same - * timestamp as the page itself. 
- */ - if (!tail) - info->delta = 0; - /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, tail, info); /* We reserved something on the buffer */ - event = __rb_page_index(tail_page, tail); + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself, otherwise we need to figure out + * the delta. + */ + info->ts = rb_time_stamp(cpu_buffer->buffer); + is_commit = rb_event_is_commit(cpu_buffer, event); + + /* Commits are special (non nested events) */ + info->delta = is_commit ? info->ts - cpu_buffer->write_stamp : 0; + + if (!tail) { + /* + * If this is the first commit on the page, set the + * page to its timestamp. + */ + tail_page->page->time_stamp = info->ts; + info->delta = 0; + + } else if (unlikely(test_time_stamp(info->delta)) && + !info->add_timestamp) { + rb_handle_timestamp(cpu_buffer, event, info); + return ERR_PTR(-EAGAIN); + } + kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); - /* - * If this is the first commit on the page, then update - * its timestamp. - */ - if (!tail) - tail_page->page->time_stamp = info->ts; - /* account for these added bytes */ local_add(info->length, &cpu_buffer->entries_bytes); @@ -2560,7 +2644,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_event *event; struct rb_event_info info; int nr_loops = 0; - u64 diff; rb_start_commit(cpu_buffer); @@ -2578,12 +2661,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } #endif - info.length = rb_calculate_event_length(length); - again: info.add_timestamp = 0; - info.delta = 0; - + again: /* * We allow for interrupts to reenter here and do a trace. 
* If one does, it will cause this original code to loop @@ -2596,19 +2676,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - info.ts = rb_time_stamp(cpu_buffer->buffer); - diff = info.ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (likely(info.ts >= cpu_buffer->write_stamp)) { - info.delta = diff; - if (unlikely(test_time_stamp(info.delta))) - rb_handle_timestamp(cpu_buffer, &info); - } - event = __rb_reserve_next(cpu_buffer, &info); if (unlikely(PTR_ERR(event) == -EAGAIN)) From 7d75e6833b579adb3de2c7b917de1204eeafea47 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 10:29:10 -0400 Subject: [PATCH 0047/1466] ring-buffer: Make sure event has enough room for extend and padding Now that events only add time extends after it is committed, in case an event comes in before it can discard the allocated event, the time extend needs to be stored within the event. If the event is bigger than the size needed for the time extend, padding must be added. The minimum padding size is 8 bytes. Thus if the event is 12 bytes (size of time extend + 4), there will not be enough room to add both the time extend and padding. Make sure all events are either 8 bytes or 16 or more bytes. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b5ed553e0a45..781ce359976c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2208,6 +2208,21 @@ static unsigned rb_calculate_event_length(unsigned length) length += RB_EVNT_HDR_SIZE; length = ALIGN(length, RB_ARCH_ALIGNMENT); + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. 
If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). + */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + return length; } From d90fd77402d3de56a9ca3df04e5d868d0979dc59 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 May 2015 12:12:27 -0400 Subject: [PATCH 0048/1466] ring-buffer: Reorganize function locations Functions in ring-buffer.c have gotten interleaved between different use cases. Move the functions around to get like functions closer together. This may or may not help gcc keep cache locality, but it makes it a little easier to work with the code. 
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 814 ++++++++++++++++++------------------- 1 file changed, 403 insertions(+), 411 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 781ce359976c..1cce0fbf92ce 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static inline int -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - -static void -rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long max_count; - - /* - * We only race with interrupts and NMIs on this CPU. - * If we own the commit event, then we can commit - * all others that interrupted us, since the interruptions - * are in stack format (they finish before they come - * back to us). This allows us to do a simple loop to - * assign the commit to the tail. 
- */ - again: - max_count = cpu_buffer->nr_pages * 100; - - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { - if (RB_WARN_ON(cpu_buffer, !(--max_count))) - return; - if (RB_WARN_ON(cpu_buffer, - rb_is_reader_page(cpu_buffer->tail_page))) - return; - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - /* add barrier to keep gcc from optimizing too much */ - barrier(); - } - while (rb_commit_index(cpu_buffer) != - rb_page_write(cpu_buffer->commit_page)) { - - local_set(&cpu_buffer->commit_page->page->commit, - rb_page_write(cpu_buffer->commit_page)); - RB_WARN_ON(cpu_buffer, - local_read(&cpu_buffer->commit_page->page->commit) & - ~RB_WRITE_MASK); - barrier(); - } - - /* again, keep gcc from optimizing */ - barrier(); - - /* - * If an interrupt came in just after the first while loop - * and pushed the tail page forward, we will be left with - * a dangling commit that will never go forward. - */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) - goto again; -} - static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; @@ -1979,63 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } -/* Slow path, do not inline */ -static noinline struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) -{ - event->type_len = RINGBUF_TYPE_TIME_EXTEND; - - /* Not the first event on the page? 
*/ - if (rb_event_index(event)) { - event->time_delta = delta & TS_MASK; - event->array[0] = delta >> TS_SHIFT; - } else { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - - return skip_time_extend(event); -} - -/** - * rb_update_event - update event type and data - * @event: the event to update - * @type: the type of event - * @length: the size of the event field in the ring buffer - * - * Update the type and data fields of the event. The length - * is the actual size that is written to the ring buffer, - * and with this, we can determine what to place into the - * data field. - */ -static void __always_inline -rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event, - struct rb_event_info *info) -{ - unsigned length = info->length; - u64 delta = info->delta; - - /* - * If we need to add a timestamp, then we - * add it to the start of the resevered space. - */ - if (unlikely(info->add_timestamp)) { - event = rb_add_time_stamp(event, delta); - length -= RB_LEN_TIME_EXTEND; - delta = 0; - } - - event->time_delta = delta; - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { - event->type_len = 0; - event->array[0] = length; - } else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); -} - /* * rb_handle_head_page - writer hit the head page * @@ -2194,38 +2070,6 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static unsigned rb_calculate_event_length(unsigned length) -{ - struct ring_buffer_event event; /* Used only for sizeof array */ - - /* zero length can cause confusions */ - if (!length) - length++; - - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - length += sizeof(event.array[0]); - - length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ARCH_ALIGNMENT); - - /* - * In case the time delta is larger than the 27 bits for it - * in the header, we need to add a timestamp. 
If another - * event comes in when trying to discard this one to increase - * the length, then the timestamp will be added in the allocated - * space of this event. If length is bigger than the size needed - * for the TIME_EXTEND, then padding has to be used. The events - * length must be either RB_LEN_TIME_EXTEND, or greater than or equal - * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. - * As length is a multiple of 4, we only need to worry if it - * is 12 (RB_LEN_TIME_EXTEND + 4). - */ - if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) - length += RB_ALIGNMENT; - - return length; -} - static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) @@ -2424,6 +2268,95 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + +/** + * rb_update_event - update event type and data + * @event: the event to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static void __always_inline +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + + /* + * If we need to add a timestamp, then we + * add it to the start of the resevered space. 
+ */ + if (unlikely(info->add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; + } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length++; + + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); + + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). 
+ */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + + return length; +} + #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { @@ -2433,11 +2366,322 @@ static inline bool sched_clock_stable(void) static inline int rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); -static inline void rb_event_discard(struct ring_buffer_event *event); + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + old_index += write_mask; + new_index += write_mask; + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); + return 1; + } + } + + /* could not discard */ + return 0; +} + +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). 
This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. 
+ */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + static void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); + struct ring_buffer_event *event) +{ + u64 delta; + + /* + * The event first in the commit queue updates the + * time stamp. 
+ */ + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + local_inc(&cpu_buffer->entries); + rb_update_write_stamp(cpu_buffer, event); + rb_end_commit(cpu_buffer); +} + +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + bool pagebusy; + + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } + + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + + if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. 
To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ + +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned int val = cpu_buffer->current_context; + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = RB_CTX_NMI; + else if (in_irq()) + bit = RB_CTX_IRQ; + else + bit = RB_CTX_SOFTIRQ; + } else + bit = RB_CTX_NORMAL; + + if (unlikely(val & (1 << bit))) + return 1; + + val |= (1 << bit); + cpu_buffer->current_context = val; + + return 0; +} + +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->current_context &= cpu_buffer->current_context - 1; +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. 
+ */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static noinline void rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer, @@ -2573,84 +2817,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static inline int -rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; - - new_index = rb_event_index(event); - old_index = new_index + rb_event_ts_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - unsigned long write_mask = - local_read(&bpage->write) & ~RB_WRITE_MASK; - unsigned long event_length = rb_event_length(event); - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. 
- */ - old_index += write_mask; - new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { - /* update counters */ - local_sub(event_length, &cpu_buffer->entries_bytes); - return 1; - } - } - - /* could not discard */ - return 0; -} - -static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - local_inc(&cpu_buffer->committing); - local_inc(&cpu_buffer->commits); -} - -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned long commits; - - if (RB_WARN_ON(cpu_buffer, - !local_read(&cpu_buffer->committing))) - return; - - again: - commits = local_read(&cpu_buffer->commits); - /* synchronize with interrupts */ - barrier(); - if (local_read(&cpu_buffer->committing) == 1) - rb_set_commit_to_write(cpu_buffer); - - local_dec(&cpu_buffer->committing); - - /* synchronize with interrupts */ - barrier(); - - /* - * Need to account for interrupts coming in between the - * updating of the commit page and the clearing of the - * committing counter. - */ - if (unlikely(local_read(&cpu_buffer->commits) != commits) && - !local_read(&cpu_buffer->committing)) { - local_inc(&cpu_buffer->committing); - goto again; - } -} - static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, @@ -2706,75 +2872,6 @@ rb_reserve_next_event(struct ring_buffer *buffer, return NULL; } -/* - * The lock and unlock are done within a preempt disable section. - * The current_context per_cpu variable can only be modified - * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. To pass this - * information from the lock to the unlock without having to - * access the 'in_interrupt()' functions again (which do show - * a bit of overhead in something as critical as function tracing, - * we use a bitmask trick. 
- * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. - * - * This works because this is the order of contexts that can - * preempt other contexts. A SoftIRQ never preempts an IRQ - * context. - * - * When the context is determined, the corresponding bit is - * checked and set (if it was set, then a recursion of that context - * happened). - * - * On unlock, we need to clear this bit. To do so, just subtract - * 1 from the current_context and AND it to itself. - * - * (binary) - * 101 - 1 = 100 - * 101 & 100 = 100 (clearing bit zero) - * - * 1010 - 1 = 1001 - * 1010 & 1001 = 1000 (clearing bit 1) - * - * The least significant bit can be cleared this way, and it - * just so happens that it is the same bit corresponding to - * the current context. - */ - -static __always_inline int -trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) -{ - unsigned int val = cpu_buffer->current_context; - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = RB_CTX_NMI; - else if (in_irq()) - bit = RB_CTX_IRQ; - else - bit = RB_CTX_SOFTIRQ; - } else - bit = RB_CTX_NORMAL; - - if (unlikely(val & (1 << bit))) - return 1; - - val |= (1 << bit); - cpu_buffer->current_context = val; - - return 0; -} - -static __always_inline void -trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->current_context &= cpu_buffer->current_context - 1; -} - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2833,111 +2930,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); -static void -rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - u64 delta; - - /* - * The event first in the commit queue updates the - * time stamp. 
- */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * A commit event that is first on a page - * updates the write timestamp with the page stamp - */ - if (!rb_event_index(event)) - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; - else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { - delta = event->array[0]; - delta <<= TS_SHIFT; - delta += event->time_delta; - cpu_buffer->write_stamp += delta; - } else - cpu_buffer->write_stamp += event->time_delta; - } -} - -static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - local_inc(&cpu_buffer->entries); - rb_update_write_stamp(cpu_buffer, event); - rb_end_commit(cpu_buffer); -} - -static __always_inline void -rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) -{ - bool pagebusy; - - if (buffer->irq_work.waiters_pending) { - buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); - } - - if (cpu_buffer->irq_work.waiters_pending) { - cpu_buffer->irq_work.waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } - - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - - if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) { - cpu_buffer->irq_work.wakeup_full = true; - cpu_buffer->irq_work.full_waiters_pending = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); - } -} - -/** - * ring_buffer_unlock_commit - commit a reserved - * @buffer: The buffer to commit to - * @event: The event pointer to commit. - * - * This commits the data to the ring buffer, and releases any locks held. - * - * Must be paired with ring_buffer_lock_reserve. 
- */ -int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - struct ring_buffer_per_cpu *cpu_buffer; - int cpu = raw_smp_processor_id(); - - cpu_buffer = buffer->buffers[cpu]; - - rb_commit(cpu_buffer, event); - - rb_wakeups(buffer, cpu_buffer); - - trace_recursive_unlock(cpu_buffer); - - preempt_enable_notrace(); - - return 0; -} -EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); - -static inline void rb_event_discard(struct ring_buffer_event *event) -{ - if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) - event = skip_time_extend(event); - - /* array[0] holds the actual length for the discarded event */ - event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; - event->type_len = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - /* * Decrement the entries to the page that an event is on. * The event does not even need to exist, only the pointer From 72ac426a5bb0cec572d26b4456f8c1e14601694e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 13:24:54 -0400 Subject: [PATCH 0049/1466] tracing: Clean up stack tracing and fix fentry updates Akashi Takahiro was porting the stack tracer to arm64 and found some issues with it. One was that it repeats the top function, due to the stack frame added by the mcount caller and added by itself. This was added when fentry came in, and before fentry created its own stack frame. But x86's fentry now creates its own stack frame, and there's no need to insert the function again. This also cleans up the code a bit, where it doesn't need to do something special for fentry, and doesn't include insertion of a duplicate entry for the called function being traced. 
Link: http://lkml.kernel.org/r/55A646EE.6030402@linaro.org Some-suggestions-by: Jungseok Lee Some-suggestions-by: Mark Rutland Reported-by: AKASHI Takahiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 68 +++++++++++++------------------------- 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3f34496244e9..b746399ab59c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,12 +18,6 @@ #define STACK_TRACE_ENTRIES 500 -#ifdef CC_USING_FENTRY -# define fentry 1 -#else -# define fentry 0 -#endif - static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; @@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; */ static struct stack_trace max_stack_trace = { .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[1], + .entries = &stack_dump_trace[0], }; static unsigned long max_stack_size; @@ -55,7 +49,7 @@ static inline void print_max_stack(void) pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); for (i = 0; i < max_stack_trace.nr_entries; i++) { if (stack_dump_trace[i] == ULONG_MAX) @@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack) unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); - int i; + int i, x; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; @@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; max_stack_trace.nr_entries = 0; - - if (using_ftrace_ops_list_func()) - max_stack_trace.skip = 4; - else - max_stack_trace.skip = 3; + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); - /* - * Add the passed in ip from the 
function tracer. - * Searching for this on the stack will skip over - * most of the overhead from the stack tracer itself. - */ - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } /* * Now find where in the stack these are. */ - i = 0; + x = 0; start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack) while (i < max_stack_trace.nr_entries) { int found = 0; - stack_dump_index[i] = this_size; + stack_dump_index[x] = this_size; p = start; for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; if (*p == stack_dump_trace[i]) { - this_size = stack_dump_index[i++] = + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_dump_index[x++] = (top - p) * sizeof(unsigned long); found = 1; /* Start the search from here */ @@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack) * out what that is, then figure it out * now. */ - if (unlikely(!tracer_frame) && i == 1) { + if (unlikely(!tracer_frame)) { tracer_frame = (p - stack) * sizeof(unsigned long); max_stack_size -= tracer_frame; @@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + max_stack_trace.nr_entries = x; + for (; x < i; x++) + stack_dump_trace[x] = ULONG_MAX; + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); @@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * The ip is pretty useless because the function tracer - * was called before that function set up its stack frame. - * In this case, we use the parent ip. 
- * - * By adding the return address of either the parent ip - * or the current ip we can disregard most of the stack usage - * caused by the stack tracer itself. - * - * The function tracer always reports the address of where the - * mcount call was, but the stack will hold the return address. - */ - if (fentry) - ip = parent_ip; - else - ip += MCOUNT_INSN_SIZE; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); @@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) return NULL; m->private = (void *)n; @@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries - 1); + max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); From 8e436ca042d904533a1e14fdc85f0facdfca752f Mon Sep 17 00:00:00 2001 From: Umesh Tiwari Date: Mon, 22 Jun 2015 16:58:08 +0530 Subject: [PATCH 0050/1466] ftrace: add tracing_thresh to function profile This patch extends tracing_thresh functionality to function profile tracer. If tracing_thresh is set, print those entries only, whose average is > tracing thresh. 
Link: http://lkml.kernel.org/r/1434972488-8571-1-git-send-email-umesh.t@samsung.com Signed-off-by: Umesh Tiwari [ Removed unnecessary 'moved' comment ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea..f46dbb5cdf76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -613,13 +613,18 @@ static int function_stat_show(struct seq_file *m, void *v) goto out; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = rec->time; + do_div(avg, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - avg = rec->time; - do_div(avg, rec->counter); /* Sample standard deviation (s^2) */ if (rec->counter <= 1) From 82c355e81afbf16bc1ab379899a79eb66e2b7504 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 16 Jul 2015 21:58:52 -0400 Subject: [PATCH 0051/1466] ftrace: Fix function_graph duration spacing with 7-digits Jungseok Lee noticed the following: Currently, row's width of 7-digit duration numbers not aligned with other cases like the following example. 3) $ 3999884 us | } 3) | finish_task_switch() { 3) 0.365 us | _raw_spin_unlock_irq(); 3) 3.333 us | } 3) $ 3999976 us | } 3) $ 3999979 us | } /* schedule */ As adding a single white space in case of 7-digit numbers, the format could be unified easily as follows. 3) $ 2237472 us | } 3) | finish_task_switch() { 3) 0.364 us | _raw_spin_unlock_irq(); 3) 3.125 us | } 3) $ 2237556 us | } 3) $ 2237559 us | } /* schedule */ Instead of making a special case for 7-digit numbers, the logic of the len and the space loop is slightly modified to make the two cases have the same format. 
Link: http://lkml.kernel.org/r/1436626300-1679-2-git-send-email-jungseoklee85@gmail.com Reported-by: Jungseok Lee Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8968bf720c12..ca98445782ac 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) snprintf(nsecs_str, slen, "%03lu", nsecs_rem); trace_seq_printf(s, ".%s", nsecs_str); - len += strlen(nsecs_str); + len += strlen(nsecs_str) + 1; } trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) + for (i = len; i < 8; i++) trace_seq_putc(s, ' '); } From b838e1d96c613019095ba008afbee800977b0582 Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Sat, 11 Jul 2015 14:51:40 +0000 Subject: [PATCH 0052/1466] tracing: Introduce two additional marks for delay A fine granulity support for delay would be very useful when profiling VM logics, such as page allocation including page reclaim and memory compaction with function graph. Thus, this patch adds two additional marks with two changes. - An equal sign in mark selection function is removed to align code behavior with comments and documentation. - The function graph example related to delay in ftrace.txt is updated to cover all supported marks. 
Link: http://lkml.kernel.org/r/1436626300-1679-3-git-send-email-jungseoklee85@gmail.com Cc: Byungchul Park Signed-off-by: Jungseok Lee Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 51 +++++++++++++++++++++++++--------- kernel/trace/trace_output.c | 4 ++- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 7ddb1e319f84..072d3c4d5753 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -686,6 +686,8 @@ The above is mostly meaningful for kernel developers. The marks are determined by the difference between this current trace and the next trace. '$' - greater than 1 second + '@' - greater than 100 milisecond + '*' - greater than 10 milisecond '#' - greater than 1000 microsecond '!' - greater than 100 microsecond '+' - greater than 10 microsecond @@ -1939,26 +1941,49 @@ want, depending on your needs. ie: - 0) | up_write() { - 0) 0.646 us | _spin_lock_irqsave(); - 0) 0.684 us | _spin_unlock_irqrestore(); - 0) 3.123 us | } - 0) 0.548 us | fput(); - 0) + 58.628 us | } + 3) # 1837.709 us | } /* __switch_to */ + 3) | finish_task_switch() { + 3) 0.313 us | _raw_spin_unlock_irq(); + 3) 3.177 us | } + 3) # 1889.063 us | } /* __schedule */ + 3) ! 140.417 us | } /* __schedule */ + 3) # 2034.948 us | } /* schedule */ + 3) * 33998.59 us | } /* schedule_preempt_disabled */ [...] - 0) | putname() { - 0) | kmem_cache_free() { - 0) 0.518 us | __phys_addr(); - 0) 1.757 us | } - 0) 2.861 us | } - 0) ! 115.305 us | } - 0) ! 116.402 us | } + 1) 0.260 us | msecs_to_jiffies(); + 1) 0.313 us | __rcu_read_unlock(); + 1) + 61.770 us | } + 1) + 64.479 us | } + 1) 0.313 us | rcu_bh_qs(); + 1) 0.313 us | __local_bh_enable(); + 1) ! 217.240 us | } + 1) 0.365 us | idle_cpu(); + 1) | rcu_irq_exit() { + 1) 0.417 us | rcu_eqs_enter_common.isra.47(); + 1) 3.125 us | } + 1) ! 227.812 us | } + 1) ! 457.395 us | } + 1) @ 119760.2 us | } + + [...] 
+ + 2) | handle_IPI() { + 1) 6.979 us | } + 2) 0.417 us | scheduler_ipi(); + 1) 9.791 us | } + 1) + 12.917 us | } + 2) 3.490 us | } + 1) + 15.729 us | } + 1) + 18.542 us | } + 2) $ 3594274 us | } + means that the function exceeded 10 usecs. ! means that the function exceeded 100 usecs. # means that the function exceeded 1000 usecs. + * means that the function exceeded 10 msecs. + @ means that the function exceeded 100 msecs. $ means that the function exceeded 1 sec. diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index dfab253727dc..8e481a84aeea 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -496,6 +496,8 @@ static const struct trace_mark { char sym; } mark[] = { MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ MARK(1000000ULL , '#'), /* 1000 usecs */ MARK(100000ULL , '!'), /* 100 usecs */ MARK(10000ULL , '+'), /* 10 usecs */ @@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d) int size = ARRAY_SIZE(mark); for (i = 0; i < size; i++) { - if (d >= mark[i].val) + if (d > mark[i].val) break; } From c93bf928fea22c61f6b5c04786b325c9bfbc0462 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Sun, 12 Jul 2015 17:52:24 +0800 Subject: [PATCH 0053/1466] ftrace: Format MCOUNT_ADDR address as type unsigned long Always we use type unsigned long to format the ip address, since the value of ip address is never the negative. This patch uses type unsigned long, instead of long, to format the ip address. The code is more clearly to be viewed by using type unsigned long, although it is correct by using either unsigned long or long. Link: http://lkml.kernel.org/r/1436694744-16747-1-git-send-email-mhuang@redhat.com Cc: Minfei Huang Cc: "H. Peter Anvin" Cc: James Hogan Cc: Michal Simek Cc: Benjamin Herrenschmidt Cc: "David S. 
Miller" Signed-off-by: Minfei Huang Signed-off-by: Steven Rostedt --- arch/metag/include/asm/ftrace.h | 2 +- arch/microblaze/include/asm/ftrace.h | 2 +- arch/powerpc/include/asm/ftrace.h | 2 +- arch/sh/include/asm/ftrace.h | 2 +- arch/sparc/include/asm/ftrace.h | 2 +- arch/x86/include/asm/ftrace.h | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/metag/include/asm/ftrace.h b/arch/metag/include/asm/ftrace.h index 2901f0f7d944..a2269d60a945 100644 --- a/arch/metag/include/asm/ftrace.h +++ b/arch/metag/include/asm/ftrace.h @@ -6,7 +6,7 @@ #ifndef __ASSEMBLY__ extern void mcount_wrapper(void); -#define MCOUNT_ADDR ((long)(mcount_wrapper)) +#define MCOUNT_ADDR ((unsigned long)(mcount_wrapper)) static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index fd2fa2eca62f..da0144f40d99 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -3,7 +3,7 @@ #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 8 /* sizeof mcount call */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index e3661872fbea..ef89b1465573 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -2,7 +2,7 @@ #define _ASM_POWERPC_FTRACE #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #ifdef __ASSEMBLY__ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index e79fb6ebaa42..1f157b86eaa7 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -9,7 +9,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_ADDR ((unsigned long)(mcount)) #ifdef 
CONFIG_DYNAMIC_FTRACE #define CALL_ADDR ((long)(ftrace_call)) diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 9ec94ad116fb..3192a8e42fd6 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -2,7 +2,7 @@ #define _ASM_SPARC64_FTRACE #ifdef CONFIG_MCOUNT -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f45acad3c4b6..24938852db30 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ From 8ab30c1538b14424015e45063c41d509b24c1dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:53 +0100 Subject: [PATCH 0054/1466] KVM: add comments for kvm_debug_exit_arch struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring into line with the comments for the other structures and their KVM_EXIT_* cases. Also update api.txt to reflect use in kvm_run documentation. 
Signed-off-by: Alex Bennée Reviewed-by: David Hildenbrand Reviewed-by: Andrew Jones Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 4 +++- include/uapi/linux/kvm.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a7926a90156f..9f746eab333d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3111,11 +3111,13 @@ data_offset describes where the data is located (KVM_EXIT_IO_OUT) or where kvm expects application code to place the data for the next KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + /* KVM_EXIT_DEBUG */ struct { struct kvm_debug_exit_arch arch; } debug; -Unused. +If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event +for which architecture specific information is returned. /* KVM_EXIT_MMIO */ struct { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 716ad4ae4d4b..4ab3c6a8d563 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -237,6 +237,7 @@ struct kvm_run { __u32 count; __u64 data_offset; /* relative to kvm_run start */ } io; + /* KVM_EXIT_DEBUG */ struct { struct kvm_debug_exit_arch arch; } debug; @@ -285,6 +286,7 @@ struct kvm_run { __u32 data; __u8 is_write; } dcr; + /* KVM_EXIT_INTERNAL_ERROR */ struct { __u32 suberror; /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */ @@ -295,6 +297,7 @@ struct kvm_run { struct { __u64 gprs[32]; } osi; + /* KVM_EXIT_PAPR_HCALL */ struct { __u64 nr; __u64 ret; From 21b6f32f9471284f6d4621fc8be71719266db557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:54 +0100 Subject: [PATCH 0055/1466] KVM: arm64: guest debug, define API headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit defines the API headers for guest debugging. 
There are two architecture specific debug structures: - kvm_guest_debug_arch, allows us to pass in HW debug registers - kvm_debug_exit_arch, signals exception and possible faulting address The type of debugging being used is controlled by the architecture specific control bits of the kvm_guest_debug->control flags in the ioctl structure. Signed-off-by: Alex Bennée Reviewed-by: David Hildenbrand Reviewed-by: Andrew Jones Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/uapi/asm/kvm.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index d26832022127..d82f3f316ba4 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -100,12 +100,39 @@ struct kvm_sregs { struct kvm_fpu { }; +/* + * See v8 ARM ARM D7.3: Debug Registers + * + * The architectural limit is 16 debug registers of each type although + * in practice there are usually less (see ID_AA64DFR0_EL1). + * + * Although the control registers are architecturally defined as 32 + * bits wide we use a 64 bit structure here to keep parity with + * KVM_GET/SET_ONE_REG behaviour which treats all system registers as + * 64 bit values. It also allows for the possibility of the + * architecture expanding the control registers without having to + * change the userspace ABI. 
+ */ +#define KVM_ARM_MAX_DBG_REGS 16 struct kvm_guest_debug_arch { + __u64 dbg_bcr[KVM_ARM_MAX_DBG_REGS]; + __u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS]; + __u64 dbg_wcr[KVM_ARM_MAX_DBG_REGS]; + __u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS]; }; struct kvm_debug_exit_arch { + __u32 hsr; + __u64 far; /* used for watchpoints */ }; +/* + * Architecture specific defines for kvm_guest_debug->control + */ + +#define KVM_GUESTDBG_USE_SW_BP (1 << 16) +#define KVM_GUESTDBG_USE_HW (1 << 17) + struct kvm_sync_regs { }; From 0e6f07f29cfb8d79dbbdb12560a73f7121ba324e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:55 +0100 Subject: [PATCH 0056/1466] KVM: arm: guest debug, add stub KVM_SET_GUEST_DEBUG ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a stub function to support the KVM_SET_GUEST_DEBUG ioctl. Any unsupported flag will return -EINVAL. For now, only KVM_GUESTDBG_ENABLE is supported, although it won't have any effects. Signed-off-by: Alex Bennée . Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 2 +- arch/arm/kvm/arm.c | 7 ------- arch/arm/kvm/guest.c | 6 ++++++ arch/arm64/kvm/guest.c | 27 +++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 9f746eab333d..19adfd385882 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2671,7 +2671,7 @@ handled. 
4.87 KVM_SET_GUEST_DEBUG Capability: KVM_CAP_SET_GUEST_DEBUG -Architectures: x86, s390, ppc +Architectures: x86, s390, ppc, arm64 Type: vcpu ioctl Parameters: struct kvm_guest_debug (in) Returns: 0 on success; -1 on error diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index bc738d2b8392..1b693cb2d5b2 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -301,13 +301,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_arm_set_running_vcpu(NULL); } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - - int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index d503fbb787d3..96e935bbc38c 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -290,3 +290,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { return -EINVAL; } + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 9535bd555d1d..0ba86775235d 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -331,3 +331,30 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { return -EINVAL; } + +#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE) + +/** + * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging + * @vcpu: the vcpu pointer + * @dbg: the ioctl data buffer + * + * This sets up and enables the VM for guest debugging. Userspace + * passes in a control flag to enable different debug types and + * potentially other architecture specific information in the rest of + * the structure.
+ */ +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) + return -EINVAL; + + if (dbg->control & KVM_GUESTDBG_ENABLE) { + vcpu->guest_debug = dbg->control; + } else { + /* If not enabled clear all flags */ + vcpu->guest_debug = 0; + } + return 0; +} From 56c7f5e77f797fd0dcf2376ce1496f4238e6be33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:56 +0100 Subject: [PATCH 0057/1466] KVM: arm: introduce kvm_arm_init/setup/clear_debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a precursor for later patches which will need to do more to setup debug state before entering the hyp.S switch code. The existing functionality for setting mdcr_el2 has been moved out of hyp.S and now uses the value kept in vcpu->arch.mdcr_el2. As the assembler used to previously mask and preserve MDCR_EL2.HPMN I've had to add a mechanism to save the value of mdcr_el2 as a per-cpu variable during the initialisation code. The kernel never sets this number so we are assuming the bootcode has set up the correct value here. This also moves the conditional setting of the TDA bit from the hyp code into the C code which is currently used for the lazy debug register context switch code. 
Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 4 ++ arch/arm/kvm/arm.c | 6 +++ arch/arm64/include/asm/kvm_asm.h | 2 + arch/arm64/include/asm/kvm_host.h | 5 ++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/debug.c | 81 +++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp.S | 19 +++----- 8 files changed, 108 insertions(+), 12 deletions(-) create mode 100644 arch/arm64/kvm/debug.c diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index e896d2c196e6..2b0bc8c57552 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -231,4 +231,8 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arm_init_debug(void) {} +static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} +static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 1b693cb2d5b2..77151b111d32 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -543,6 +543,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) continue; } + kvm_arm_setup_debug(vcpu); + /************************************************************** * Enter the guest */ @@ -557,6 +559,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Back from guest *************************************************************/ + kvm_arm_clear_debug(vcpu); + /* * We may have taken a host interrupt in HYP mode (ie * while executing the guest). 
This interrupt is still @@ -914,6 +918,8 @@ static void cpu_init_hyp_mode(void *dummy) vector_ptr = (unsigned long)__kvm_hyp_vector; __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); + + kvm_arm_init_debug(); } static int hyp_init_cpu_notify(struct notifier_block *self, diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 3c5fe685a2d6..f5e40dae291a 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -132,6 +132,8 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u32 __kvm_get_mdcr_el2(void); + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2709db2a7eac..c90c6a41c448 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -103,6 +103,7 @@ struct kvm_vcpu_arch { /* HYP configuration */ u64 hcr_el2; + u32 mdcr_el2; /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -227,4 +228,8 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +void kvm_arm_init_debug(void); +void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); +void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index c99701a34d7b..5c900d49b906 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -117,6 +117,7 @@ int main(void) DEFINE(VCPU_HPFAR_EL2, offsetof(struct kvm_vcpu, arch.fault.hpfar_el2)); DEFINE(VCPU_DEBUG_FLAGS, offsetof(struct kvm_vcpu, arch.debug_flags)); DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); + DEFINE(VCPU_MDCR_EL2, offsetof(struct kvm_vcpu, arch.mdcr_el2)); DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, 
arch.irq_lines)); DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context)); DEFINE(VCPU_TIMER_CNTV_CTL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl)); diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index f90f4aa7f88d..1949fe5f5424 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o -kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o +kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c new file mode 100644 index 000000000000..faf0e1fdba9e --- /dev/null +++ b/arch/arm64/kvm/debug.c @@ -0,0 +1,81 @@ +/* + * Debug and Guest Debug support + * + * Copyright (C) 2015 - Linaro Ltd + * Author: Alex Bennée + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +#include + +static DEFINE_PER_CPU(u32, mdcr_el2); + +/** + * kvm_arm_init_debug - grab what we need for debug + * + * Currently the sole task of this function is to retrieve the initial + * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has + * presumably been set-up by some knowledgeable bootcode. 
+ * + * It is called once per-cpu during CPU hyp initialisation. + */ + +void kvm_arm_init_debug(void) +{ + __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); +} + + +/** + * kvm_arm_setup_debug - set up debug related stuff + * + * @vcpu: the vcpu pointer + * + * This is called before each entry into the hypervisor to setup any + * debug related registers. Currently this just ensures we will trap + * access to: + * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) + * - Debug ROM Address (MDCR_EL2_TDRA) + * - OS related registers (MDCR_EL2_TDOSA) + * + * Additionally, KVM only traps guest accesses to the debug registers if + * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY + * flag on vcpu->arch.debug_flags). Since the guest must not interfere + * with the hardware state when debugging the guest, we must ensure that + * trapping is enabled whenever we are debugging the guest using the + * debug registers. + */ + +void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) +{ + bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY); + + vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | + MDCR_EL2_TPMCR | + MDCR_EL2_TDRA | + MDCR_EL2_TDOSA); + + /* Trap on access to debug registers? */ + if (trap_debug) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + +} + +void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) +{ + /* Nothing to do yet */ +} diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 17a8fb14f428..b3176e6e51d1 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -770,17 +770,8 @@ mov x2, #(1 << 15) // Trap CP15 Cr=15 msr hstr_el2, x2 - mrs x2, mdcr_el2 - and x2, x2, #MDCR_EL2_HPMN_MASK - orr x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR) - orr x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA) - - // Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap - // if not dirty. 
- ldr x3, [x0, #VCPU_DEBUG_FLAGS] - tbnz x3, #KVM_ARM64_DEBUG_DIRTY_SHIFT, 1f - orr x2, x2, #MDCR_EL2_TDA -1: + // Monitor Debug Config - see kvm_arm_setup_debug() + ldr x2, [x0, #VCPU_MDCR_EL2] msr mdcr_el2, x2 .endm @@ -1285,4 +1276,10 @@ ENTRY(__kvm_hyp_vector) ventry el1_error_invalid // Error 32-bit EL1 ENDPROC(__kvm_hyp_vector) + +ENTRY(__kvm_get_mdcr_el2) + mrs x0, mdcr_el2 + ret +ENDPROC(__kvm_get_mdcr_el2) + .popsection From 4bd611ca60afa155bca25b40312ed61c4d46237f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:57 +0100 Subject: [PATCH 0058/1466] KVM: arm64: guest debug, add SW break point support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for SW breakpoints inserted by userspace. We do this by trapping all guest software debug exceptions to the hypervisor (MDCR_EL2.TDE). The exit handler sets an exit reason of KVM_EXIT_DEBUG with the kvm_debug_exit_arch structure holding the exception syndrome information. It will be up to userspace to extract the PC (via GET_ONE_REG) and determine if the debug event was for a breakpoint it inserted. If not, userspace will need to re-inject the correct exception and restart the hypervisor to deliver the debug exception to the guest. Any other guest software debug exception (e.g. single step or HW assisted breakpoints) will cause an error and the VM to be killed. This is addressed by later patches which add support for the other debug types.
Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 2 +- arch/arm64/kvm/debug.c | 3 +++ arch/arm64/kvm/guest.c | 2 +- arch/arm64/kvm/handle_exit.c | 36 +++++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 19adfd385882..0f498da354f2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2693,7 +2693,7 @@ when running. Common control bits are: The top 16 bits of the control field are architecture specific control flags which can include the following: - - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86] + - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390] - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index faf0e1fdba9e..8d1bfa438310 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -73,6 +73,9 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) if (trap_debug) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + /* Trap breakpoints? 
*/ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; } void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 0ba86775235d..22d22c54fd8d 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -332,7 +332,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return -EINVAL; } -#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE) +#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP) /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 524fa25671fc..27f38a9f9ea2 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -82,6 +82,40 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +/** + * kvm_handle_guest_debug - handle a debug exception instruction + * + * @vcpu: the vcpu pointer + * @run: access to the kvm_run structure for results + * + * We route all debug exceptions through the same handler. If both the + * guest and host are using the same debug facilities it will be up to + * userspace to re-inject the correct exception for guest delivery. 
+ * + * @return: 0 (while setting run->exit_reason), -1 for error + */ +static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + u32 hsr = kvm_vcpu_get_hsr(vcpu); + int ret = 0; + + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.hsr = hsr; + + switch (hsr >> ESR_ELx_EC_SHIFT) { + case ESR_ELx_EC_BKPT32: + case ESR_ELx_EC_BRK64: + break; + default: + kvm_err("%s: un-handled case hsr: %#08x\n", + __func__, (unsigned int) hsr); + ret = -1; + break; + } + + return ret; +} + static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_WFx] = kvm_handle_wfx, [ESR_ELx_EC_CP15_32] = kvm_handle_cp15_32, @@ -96,6 +130,8 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_SYS64] = kvm_handle_sys_reg, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, + [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) From 337b99bf7edfb2044781447e7ca386edb1fdba9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:58 +0100 Subject: [PATCH 0059/1466] KVM: arm64: guest debug, add support for single-step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for single-stepping the guest. To do this we need to manipulate the guests PSTATE.SS and MDSCR_EL1.SS bits to trigger stepping. We take care to preserve MDSCR_EL1 and trap access to it to ensure we don't affect the apparent state of the guest. As we have to enable trapping of all software debug exceptions we suppress the ability of the guest to single-step itself. If we didn't we would have to deal with the exception arriving while the guest was in kernelspace when the guest is expecting to single-step userspace. This is something we don't want to unwind in the kernel. 
Once the host is no longer debugging the guest its ability to single-step userspace is restored. Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 +++++ arch/arm64/kvm/debug.c | 68 +++++++++++++++++++++++++++++-- arch/arm64/kvm/guest.c | 4 +- arch/arm64/kvm/handle_exit.c | 2 + 4 files changed, 80 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c90c6a41c448..cfb675466e86 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -123,6 +123,17 @@ struct kvm_vcpu_arch { * here. */ + /* + * Guest registers we preserve during guest debugging. + * + * These shadow registers are updated by the kvm_handle_sys_reg + * trap handler if the guest accesses or updates them while we + * are using guest debug. + */ + struct { + u32 mdscr_el1; + } guest_debug_preserved; + /* Don't run the guest */ bool pause; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 8d1bfa438310..d439eb8f3239 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -19,10 +19,38 @@ #include +#include +#include #include +#include + +/* These are the bits of MDSCR_EL1 we may manipulate */ +#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \ + DBG_MDSCR_KDE | \ + DBG_MDSCR_MDE) static DEFINE_PER_CPU(u32, mdcr_el2); +/** + * save/restore_guest_debug_regs + * + * For some debug operations we need to tweak some guest registers. As + * a result we need to save the state of those registers before we + * make those modifications. + * + * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled + * after we have restored the preserved value to the main context. 
+ */ +static void save_guest_debug_regs(struct kvm_vcpu *vcpu) +{ + vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1); +} + +static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) +{ + vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1; +} + /** * kvm_arm_init_debug - grab what we need for debug * @@ -38,7 +66,6 @@ void kvm_arm_init_debug(void) __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); } - /** * kvm_arm_setup_debug - set up debug related stuff * @@ -73,12 +100,45 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) if (trap_debug) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - /* Trap breakpoints? */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + /* Is Guest debugging in effect? */ + if (vcpu->guest_debug) { + /* Route all software debug exceptions to EL2 */ vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; + + /* Save guest debug state */ + save_guest_debug_regs(vcpu); + + /* + * Single Step (ARM ARM D2.12.3 The software step state + * machine) + * + * If we are doing Single Step we need to manipulate + * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the + * step has occurred the hypervisor will trap the + * debug exception and we return to userspace. + * + * If the guest attempts to single step its userspace + * we would have to deal with a trapped exception + * while in the guest kernel. Because this would be + * hard to unwind we suppress the guest's ability to + * do so by masking MDSCR_EL1.SS. + * + * This confuses guest debuggers which use + * single-step behind the scenes but everything + * returns to normal once the host is no longer + * debugging the system.
+ */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS; + } else { + vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS; + } + } } void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) { - /* Nothing to do yet */ + if (vcpu->guest_debug) + restore_guest_debug_regs(vcpu); } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 22d22c54fd8d..48de4f4aaa1a 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -332,7 +332,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return -EINVAL; } -#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP) +#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ + KVM_GUESTDBG_USE_SW_BP | \ + KVM_GUESTDBG_SINGLESTEP) /** * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 27f38a9f9ea2..e9de13ed477e 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -103,6 +103,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) run->debug.arch.hsr = hsr; switch (hsr >> ESR_ELx_EC_SHIFT) { + case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_BKPT32: case ESR_ELx_EC_BRK64: break; @@ -130,6 +131,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_SYS64] = kvm_handle_sys_reg, [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, + [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, }; From e0a1b9a937dc6df9eb402af8966b5d617c7c6b34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:29:59 +0100 Subject: [PATCH 0060/1466] KVM: arm64: re-factor hyp.S debug register code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a pre-cursor to sharing the code with 
the guest debug support. This replaces the big macro that fishes data out of a fixed location with a more general helper macro to restore a set of debug registers. It uses macro substitution so it can be re-used for debug control and value registers. It does however rely on the debug registers being 64 bit aligned (as they happen to be in the hyp ABI). Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp.S | 511 ++++++++++++------------------------------- 1 file changed, 135 insertions(+), 376 deletions(-) diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index b3176e6e51d1..8264f5b4817c 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -230,199 +230,52 @@ stp x24, x25, [x3, #160] .endm -.macro save_debug - // x2: base address for cpu context - // x3: tmp register +.macro save_debug type + // x4: pointer to register set + // x5: number of registers to skip + // x6..x22 trashed - mrs x26, id_aa64dfr0_el1 - ubfx x24, x26, #12, #4 // Extract BRPs - ubfx x25, x26, #20, #4 // Extract WRPs - mov w26, #15 - sub w24, w26, w24 // How many BPs to skip - sub w25, w26, w25 // How many WPs to skip - - add x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1) - - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 + adr x22, 1f + add x22, x22, x5, lsl #2 + br x22 1: - mrs x20, dbgbcr15_el1 - mrs x19, dbgbcr14_el1 - mrs x18, dbgbcr13_el1 - mrs x17, dbgbcr12_el1 - mrs x16, dbgbcr11_el1 - mrs x15, dbgbcr10_el1 - mrs x14, dbgbcr9_el1 - mrs x13, dbgbcr8_el1 - mrs x12, dbgbcr7_el1 - mrs x11, dbgbcr6_el1 - mrs x10, dbgbcr5_el1 - mrs x9, dbgbcr4_el1 - mrs x8, dbgbcr3_el1 - mrs x7, dbgbcr2_el1 - mrs x6, dbgbcr1_el1 - mrs x5, dbgbcr0_el1 - - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 + mrs x21, \type\()15_el1 + mrs x20, \type\()14_el1 + mrs x19, \type\()13_el1 + mrs x18, \type\()12_el1 + mrs x17, \type\()11_el1 + mrs x16, \type\()10_el1 + mrs x15, \type\()9_el1 + mrs x14, \type\()8_el1 + mrs x13, \type\()7_el1 + mrs x12, 
\type\()6_el1 + mrs x11, \type\()5_el1 + mrs x10, \type\()4_el1 + mrs x9, \type\()3_el1 + mrs x8, \type\()2_el1 + mrs x7, \type\()1_el1 + mrs x6, \type\()0_el1 + adr x22, 1f + add x22, x22, x5, lsl #2 + br x22 1: - str x20, [x3, #(15 * 8)] - str x19, [x3, #(14 * 8)] - str x18, [x3, #(13 * 8)] - str x17, [x3, #(12 * 8)] - str x16, [x3, #(11 * 8)] - str x15, [x3, #(10 * 8)] - str x14, [x3, #(9 * 8)] - str x13, [x3, #(8 * 8)] - str x12, [x3, #(7 * 8)] - str x11, [x3, #(6 * 8)] - str x10, [x3, #(5 * 8)] - str x9, [x3, #(4 * 8)] - str x8, [x3, #(3 * 8)] - str x7, [x3, #(2 * 8)] - str x6, [x3, #(1 * 8)] - str x5, [x3, #(0 * 8)] - - add x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1) - - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 -1: - mrs x20, dbgbvr15_el1 - mrs x19, dbgbvr14_el1 - mrs x18, dbgbvr13_el1 - mrs x17, dbgbvr12_el1 - mrs x16, dbgbvr11_el1 - mrs x15, dbgbvr10_el1 - mrs x14, dbgbvr9_el1 - mrs x13, dbgbvr8_el1 - mrs x12, dbgbvr7_el1 - mrs x11, dbgbvr6_el1 - mrs x10, dbgbvr5_el1 - mrs x9, dbgbvr4_el1 - mrs x8, dbgbvr3_el1 - mrs x7, dbgbvr2_el1 - mrs x6, dbgbvr1_el1 - mrs x5, dbgbvr0_el1 - - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 - -1: - str x20, [x3, #(15 * 8)] - str x19, [x3, #(14 * 8)] - str x18, [x3, #(13 * 8)] - str x17, [x3, #(12 * 8)] - str x16, [x3, #(11 * 8)] - str x15, [x3, #(10 * 8)] - str x14, [x3, #(9 * 8)] - str x13, [x3, #(8 * 8)] - str x12, [x3, #(7 * 8)] - str x11, [x3, #(6 * 8)] - str x10, [x3, #(5 * 8)] - str x9, [x3, #(4 * 8)] - str x8, [x3, #(3 * 8)] - str x7, [x3, #(2 * 8)] - str x6, [x3, #(1 * 8)] - str x5, [x3, #(0 * 8)] - - add x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1) - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 -1: - mrs x20, dbgwcr15_el1 - mrs x19, dbgwcr14_el1 - mrs x18, dbgwcr13_el1 - mrs x17, dbgwcr12_el1 - mrs x16, dbgwcr11_el1 - mrs x15, dbgwcr10_el1 - mrs x14, dbgwcr9_el1 - mrs x13, dbgwcr8_el1 - mrs x12, dbgwcr7_el1 - mrs x11, dbgwcr6_el1 - mrs x10, dbgwcr5_el1 - mrs x9, dbgwcr4_el1 - mrs x8, dbgwcr3_el1 - mrs x7, 
dbgwcr2_el1 - mrs x6, dbgwcr1_el1 - mrs x5, dbgwcr0_el1 - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 - -1: - str x20, [x3, #(15 * 8)] - str x19, [x3, #(14 * 8)] - str x18, [x3, #(13 * 8)] - str x17, [x3, #(12 * 8)] - str x16, [x3, #(11 * 8)] - str x15, [x3, #(10 * 8)] - str x14, [x3, #(9 * 8)] - str x13, [x3, #(8 * 8)] - str x12, [x3, #(7 * 8)] - str x11, [x3, #(6 * 8)] - str x10, [x3, #(5 * 8)] - str x9, [x3, #(4 * 8)] - str x8, [x3, #(3 * 8)] - str x7, [x3, #(2 * 8)] - str x6, [x3, #(1 * 8)] - str x5, [x3, #(0 * 8)] - - add x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1) - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 -1: - mrs x20, dbgwvr15_el1 - mrs x19, dbgwvr14_el1 - mrs x18, dbgwvr13_el1 - mrs x17, dbgwvr12_el1 - mrs x16, dbgwvr11_el1 - mrs x15, dbgwvr10_el1 - mrs x14, dbgwvr9_el1 - mrs x13, dbgwvr8_el1 - mrs x12, dbgwvr7_el1 - mrs x11, dbgwvr6_el1 - mrs x10, dbgwvr5_el1 - mrs x9, dbgwvr4_el1 - mrs x8, dbgwvr3_el1 - mrs x7, dbgwvr2_el1 - mrs x6, dbgwvr1_el1 - mrs x5, dbgwvr0_el1 - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 - -1: - str x20, [x3, #(15 * 8)] - str x19, [x3, #(14 * 8)] - str x18, [x3, #(13 * 8)] - str x17, [x3, #(12 * 8)] - str x16, [x3, #(11 * 8)] - str x15, [x3, #(10 * 8)] - str x14, [x3, #(9 * 8)] - str x13, [x3, #(8 * 8)] - str x12, [x3, #(7 * 8)] - str x11, [x3, #(6 * 8)] - str x10, [x3, #(5 * 8)] - str x9, [x3, #(4 * 8)] - str x8, [x3, #(3 * 8)] - str x7, [x3, #(2 * 8)] - str x6, [x3, #(1 * 8)] - str x5, [x3, #(0 * 8)] - - mrs x21, mdccint_el1 - str x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)] + str x21, [x4, #(15 * 8)] + str x20, [x4, #(14 * 8)] + str x19, [x4, #(13 * 8)] + str x18, [x4, #(12 * 8)] + str x17, [x4, #(11 * 8)] + str x16, [x4, #(10 * 8)] + str x15, [x4, #(9 * 8)] + str x14, [x4, #(8 * 8)] + str x13, [x4, #(7 * 8)] + str x12, [x4, #(6 * 8)] + str x11, [x4, #(5 * 8)] + str x10, [x4, #(4 * 8)] + str x9, [x4, #(3 * 8)] + str x8, [x4, #(2 * 8)] + str x7, [x4, #(1 * 8)] + str x6, [x4, #(0 * 8)] .endm .macro 
restore_sysregs @@ -467,195 +320,52 @@ msr mdscr_el1, x25 .endm -.macro restore_debug - // x2: base address for cpu context - // x3: tmp register +.macro restore_debug type + // x4: pointer to register set + // x5: number of registers to skip + // x6..x22 trashed - mrs x26, id_aa64dfr0_el1 - ubfx x24, x26, #12, #4 // Extract BRPs - ubfx x25, x26, #20, #4 // Extract WRPs - mov w26, #15 - sub w24, w26, w24 // How many BPs to skip - sub w25, w26, w25 // How many WPs to skip - - add x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1) - - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 + adr x22, 1f + add x22, x22, x5, lsl #2 + br x22 1: - ldr x20, [x3, #(15 * 8)] - ldr x19, [x3, #(14 * 8)] - ldr x18, [x3, #(13 * 8)] - ldr x17, [x3, #(12 * 8)] - ldr x16, [x3, #(11 * 8)] - ldr x15, [x3, #(10 * 8)] - ldr x14, [x3, #(9 * 8)] - ldr x13, [x3, #(8 * 8)] - ldr x12, [x3, #(7 * 8)] - ldr x11, [x3, #(6 * 8)] - ldr x10, [x3, #(5 * 8)] - ldr x9, [x3, #(4 * 8)] - ldr x8, [x3, #(3 * 8)] - ldr x7, [x3, #(2 * 8)] - ldr x6, [x3, #(1 * 8)] - ldr x5, [x3, #(0 * 8)] + ldr x21, [x4, #(15 * 8)] + ldr x20, [x4, #(14 * 8)] + ldr x19, [x4, #(13 * 8)] + ldr x18, [x4, #(12 * 8)] + ldr x17, [x4, #(11 * 8)] + ldr x16, [x4, #(10 * 8)] + ldr x15, [x4, #(9 * 8)] + ldr x14, [x4, #(8 * 8)] + ldr x13, [x4, #(7 * 8)] + ldr x12, [x4, #(6 * 8)] + ldr x11, [x4, #(5 * 8)] + ldr x10, [x4, #(4 * 8)] + ldr x9, [x4, #(3 * 8)] + ldr x8, [x4, #(2 * 8)] + ldr x7, [x4, #(1 * 8)] + ldr x6, [x4, #(0 * 8)] - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 + adr x22, 1f + add x22, x22, x5, lsl #2 + br x22 1: - msr dbgbcr15_el1, x20 - msr dbgbcr14_el1, x19 - msr dbgbcr13_el1, x18 - msr dbgbcr12_el1, x17 - msr dbgbcr11_el1, x16 - msr dbgbcr10_el1, x15 - msr dbgbcr9_el1, x14 - msr dbgbcr8_el1, x13 - msr dbgbcr7_el1, x12 - msr dbgbcr6_el1, x11 - msr dbgbcr5_el1, x10 - msr dbgbcr4_el1, x9 - msr dbgbcr3_el1, x8 - msr dbgbcr2_el1, x7 - msr dbgbcr1_el1, x6 - msr dbgbcr0_el1, x5 - - add x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1) - - adr x26, 
1f - add x26, x26, x24, lsl #2 - br x26 -1: - ldr x20, [x3, #(15 * 8)] - ldr x19, [x3, #(14 * 8)] - ldr x18, [x3, #(13 * 8)] - ldr x17, [x3, #(12 * 8)] - ldr x16, [x3, #(11 * 8)] - ldr x15, [x3, #(10 * 8)] - ldr x14, [x3, #(9 * 8)] - ldr x13, [x3, #(8 * 8)] - ldr x12, [x3, #(7 * 8)] - ldr x11, [x3, #(6 * 8)] - ldr x10, [x3, #(5 * 8)] - ldr x9, [x3, #(4 * 8)] - ldr x8, [x3, #(3 * 8)] - ldr x7, [x3, #(2 * 8)] - ldr x6, [x3, #(1 * 8)] - ldr x5, [x3, #(0 * 8)] - - adr x26, 1f - add x26, x26, x24, lsl #2 - br x26 -1: - msr dbgbvr15_el1, x20 - msr dbgbvr14_el1, x19 - msr dbgbvr13_el1, x18 - msr dbgbvr12_el1, x17 - msr dbgbvr11_el1, x16 - msr dbgbvr10_el1, x15 - msr dbgbvr9_el1, x14 - msr dbgbvr8_el1, x13 - msr dbgbvr7_el1, x12 - msr dbgbvr6_el1, x11 - msr dbgbvr5_el1, x10 - msr dbgbvr4_el1, x9 - msr dbgbvr3_el1, x8 - msr dbgbvr2_el1, x7 - msr dbgbvr1_el1, x6 - msr dbgbvr0_el1, x5 - - add x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1) - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 -1: - ldr x20, [x3, #(15 * 8)] - ldr x19, [x3, #(14 * 8)] - ldr x18, [x3, #(13 * 8)] - ldr x17, [x3, #(12 * 8)] - ldr x16, [x3, #(11 * 8)] - ldr x15, [x3, #(10 * 8)] - ldr x14, [x3, #(9 * 8)] - ldr x13, [x3, #(8 * 8)] - ldr x12, [x3, #(7 * 8)] - ldr x11, [x3, #(6 * 8)] - ldr x10, [x3, #(5 * 8)] - ldr x9, [x3, #(4 * 8)] - ldr x8, [x3, #(3 * 8)] - ldr x7, [x3, #(2 * 8)] - ldr x6, [x3, #(1 * 8)] - ldr x5, [x3, #(0 * 8)] - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 -1: - msr dbgwcr15_el1, x20 - msr dbgwcr14_el1, x19 - msr dbgwcr13_el1, x18 - msr dbgwcr12_el1, x17 - msr dbgwcr11_el1, x16 - msr dbgwcr10_el1, x15 - msr dbgwcr9_el1, x14 - msr dbgwcr8_el1, x13 - msr dbgwcr7_el1, x12 - msr dbgwcr6_el1, x11 - msr dbgwcr5_el1, x10 - msr dbgwcr4_el1, x9 - msr dbgwcr3_el1, x8 - msr dbgwcr2_el1, x7 - msr dbgwcr1_el1, x6 - msr dbgwcr0_el1, x5 - - add x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1) - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 -1: - ldr x20, [x3, #(15 * 8)] - ldr x19, [x3, #(14 * 8)] - ldr 
x18, [x3, #(13 * 8)] - ldr x17, [x3, #(12 * 8)] - ldr x16, [x3, #(11 * 8)] - ldr x15, [x3, #(10 * 8)] - ldr x14, [x3, #(9 * 8)] - ldr x13, [x3, #(8 * 8)] - ldr x12, [x3, #(7 * 8)] - ldr x11, [x3, #(6 * 8)] - ldr x10, [x3, #(5 * 8)] - ldr x9, [x3, #(4 * 8)] - ldr x8, [x3, #(3 * 8)] - ldr x7, [x3, #(2 * 8)] - ldr x6, [x3, #(1 * 8)] - ldr x5, [x3, #(0 * 8)] - - adr x26, 1f - add x26, x26, x25, lsl #2 - br x26 -1: - msr dbgwvr15_el1, x20 - msr dbgwvr14_el1, x19 - msr dbgwvr13_el1, x18 - msr dbgwvr12_el1, x17 - msr dbgwvr11_el1, x16 - msr dbgwvr10_el1, x15 - msr dbgwvr9_el1, x14 - msr dbgwvr8_el1, x13 - msr dbgwvr7_el1, x12 - msr dbgwvr6_el1, x11 - msr dbgwvr5_el1, x10 - msr dbgwvr4_el1, x9 - msr dbgwvr3_el1, x8 - msr dbgwvr2_el1, x7 - msr dbgwvr1_el1, x6 - msr dbgwvr0_el1, x5 - - ldr x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)] - msr mdccint_el1, x21 + msr \type\()15_el1, x21 + msr \type\()14_el1, x20 + msr \type\()13_el1, x19 + msr \type\()12_el1, x18 + msr \type\()11_el1, x17 + msr \type\()10_el1, x16 + msr \type\()9_el1, x15 + msr \type\()8_el1, x14 + msr \type\()7_el1, x13 + msr \type\()6_el1, x12 + msr \type\()5_el1, x11 + msr \type\()4_el1, x10 + msr \type\()3_el1, x9 + msr \type\()2_el1, x8 + msr \type\()1_el1, x7 + msr \type\()0_el1, x6 .endm .macro skip_32bit_state tmp, target @@ -883,12 +593,61 @@ __restore_sysregs: restore_sysregs ret +/* Save debug state */ __save_debug: - save_debug + // x2: ptr to CPU context + // x4/x5/x6-22/x24-26: trashed + + mrs x26, id_aa64dfr0_el1 + ubfx x24, x26, #12, #4 // Extract BRPs + ubfx x25, x26, #20, #4 // Extract WRPs + mov w26, #15 + sub w24, w26, w24 // How many BPs to skip + sub w25, w26, w25 // How many WPs to skip + + mov x5, x24 + add x4, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1) + save_debug dbgbcr + add x4, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1) + save_debug dbgbvr + + mov x5, x25 + add x4, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1) + save_debug dbgwcr + add x4, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1) + save_debug dbgwvr + + mrs x21, 
mdccint_el1 + str x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)] ret +/* Restore debug state */ __restore_debug: - restore_debug + // x2: ptr to CPU context + // x4/x5/x6-22/x24-26: trashed + + mrs x26, id_aa64dfr0_el1 + ubfx x24, x26, #12, #4 // Extract BRPs + ubfx x25, x26, #20, #4 // Extract WRPs + mov w26, #15 + sub w24, w26, w24 // How many BPs to skip + sub w25, w26, w25 // How many WPs to skip + + mov x5, x24 + add x4, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1) + restore_debug dbgbcr + add x4, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1) + restore_debug dbgbvr + + mov x5, x25 + add x4, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1) + restore_debug dbgwcr + add x4, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1) + restore_debug dbgwvr + + ldr x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)] + msr mdccint_el1, x21 + ret __save_fpsimd: From 84e690bfbed1d1ecb45d8eccd4c7b6c8e878da1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:30:00 +0100 Subject: [PATCH 0061/1466] KVM: arm64: introduce vcpu->arch.debug_ptr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a level of indirection for the debug registers. Instead of using the sys_regs[] directly we store registers in a structure in the vcpu. The new kvm_arm_reset_debug_ptr() sets the debug ptr to the guest context. Because we no longer give the sys_regs offset for the sys_reg_desc->reg field, but instead the index into a debug-specific struct we need to add a number of additional trap functions for each register. Also as the generic user-space access code no longer works we have introduced a new pair of function pointers to the sys_reg_desc structure to override the generic code when needed. 
Reviewed-by: Christoffer Dall Signed-off-by: Alex Bennée Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 1 + arch/arm/kvm/arm.c | 2 + arch/arm64/include/asm/kvm_asm.h | 24 +-- arch/arm64/include/asm/kvm_host.h | 17 +- arch/arm64/kernel/asm-offsets.c | 6 + arch/arm64/kvm/debug.c | 9 + arch/arm64/kvm/hyp.S | 24 ++- arch/arm64/kvm/sys_regs.c | 272 +++++++++++++++++++++++++++--- arch/arm64/kvm/sys_regs.h | 6 + 9 files changed, 315 insertions(+), 46 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 2b0bc8c57552..dcba0fa5176e 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,5 +234,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} +static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 77151b111d32..9ce5cf02ed17 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -278,6 +278,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) /* Set up the timer */ kvm_timer_vcpu_init(vcpu); + kvm_arm_reset_debug_ptr(vcpu); + return 0; } diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index f5e40dae291a..67fa0de3d483 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -46,24 +46,16 @@ #define CNTKCTL_EL1 20 /* Timer Control Register (EL1) */ #define PAR_EL1 21 /* Physical Address Register */ #define MDSCR_EL1 22 /* Monitor Debug System Control Register */ -#define DBGBCR0_EL1 23 /* Debug Breakpoint Control Registers (0-15) */ -#define DBGBCR15_EL1 38 -#define DBGBVR0_EL1 39 /* Debug Breakpoint Value Registers (0-15) */ -#define DBGBVR15_EL1 54 -#define DBGWCR0_EL1 55 /* Debug Watchpoint Control Registers 
(0-15) */ -#define DBGWCR15_EL1 70 -#define DBGWVR0_EL1 71 /* Debug Watchpoint Value Registers (0-15) */ -#define DBGWVR15_EL1 86 -#define MDCCINT_EL1 87 /* Monitor Debug Comms Channel Interrupt Enable Reg */ +#define MDCCINT_EL1 23 /* Monitor Debug Comms Channel Interrupt Enable Reg */ /* 32bit specific registers. Keep them at the end of the range */ -#define DACR32_EL2 88 /* Domain Access Control Register */ -#define IFSR32_EL2 89 /* Instruction Fault Status Register */ -#define FPEXC32_EL2 90 /* Floating-Point Exception Control Register */ -#define DBGVCR32_EL2 91 /* Debug Vector Catch Register */ -#define TEECR32_EL1 92 /* ThumbEE Configuration Register */ -#define TEEHBR32_EL1 93 /* ThumbEE Handler Base Register */ -#define NR_SYS_REGS 94 +#define DACR32_EL2 24 /* Domain Access Control Register */ +#define IFSR32_EL2 25 /* Instruction Fault Status Register */ +#define FPEXC32_EL2 26 /* Floating-Point Exception Control Register */ +#define DBGVCR32_EL2 27 /* Debug Vector Catch Register */ +#define TEECR32_EL1 28 /* ThumbEE Configuration Register */ +#define TEEHBR32_EL1 29 /* ThumbEE Handler Base Register */ +#define NR_SYS_REGS 30 /* 32bit mapping */ #define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index cfb675466e86..9b99402b14df 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -108,11 +108,25 @@ struct kvm_vcpu_arch { /* Exception Information */ struct kvm_vcpu_fault_info fault; - /* Debug state */ + /* Guest debug state */ u64 debug_flags; + /* + * We maintain more than a single set of debug registers to support + * debugging the guest from the host and to maintain separate host and + * guest state during world switches. vcpu_debug_state are the debug + * registers of the vcpu as the guest sees them. host_debug_state are + * the host registers which are saved and restored during world switches. 
+ * + * debug_ptr points to the set of debug registers that should be loaded + * onto the hardware when running the guest. + */ + struct kvm_guest_debug_arch *debug_ptr; + struct kvm_guest_debug_arch vcpu_debug_state; + /* Pointer to host CPU context */ kvm_cpu_context_t *host_cpu_context; + struct kvm_guest_debug_arch host_debug_state; /* VGIC state */ struct vgic_cpu vgic_cpu; @@ -242,5 +256,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); +void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5c900d49b906..d88630899a24 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -116,10 +116,16 @@ int main(void) DEFINE(VCPU_FAR_EL2, offsetof(struct kvm_vcpu, arch.fault.far_el2)); DEFINE(VCPU_HPFAR_EL2, offsetof(struct kvm_vcpu, arch.fault.hpfar_el2)); DEFINE(VCPU_DEBUG_FLAGS, offsetof(struct kvm_vcpu, arch.debug_flags)); + DEFINE(VCPU_DEBUG_PTR, offsetof(struct kvm_vcpu, arch.debug_ptr)); + DEFINE(DEBUG_BCR, offsetof(struct kvm_guest_debug_arch, dbg_bcr)); + DEFINE(DEBUG_BVR, offsetof(struct kvm_guest_debug_arch, dbg_bvr)); + DEFINE(DEBUG_WCR, offsetof(struct kvm_guest_debug_arch, dbg_wcr)); + DEFINE(DEBUG_WVR, offsetof(struct kvm_guest_debug_arch, dbg_wvr)); DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2)); DEFINE(VCPU_MDCR_EL2, offsetof(struct kvm_vcpu, arch.mdcr_el2)); DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines)); DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context)); + DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state)); DEFINE(VCPU_TIMER_CNTV_CTL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl)); DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval)); 
DEFINE(KVM_TIMER_CNTVOFF, offsetof(struct kvm, arch.timer.cntvoff)); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d439eb8f3239..e0947b77faaa 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -66,6 +66,15 @@ void kvm_arm_init_debug(void) __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); } +/** + * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state + */ + +void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state; +} + /** * kvm_arm_setup_debug - set up debug related stuff * diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 8264f5b4817c..d93c0a23630e 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -596,6 +596,7 @@ __restore_sysregs: /* Save debug state */ __save_debug: // x2: ptr to CPU context + // x3: ptr to debug reg struct // x4/x5/x6-22/x24-26: trashed mrs x26, id_aa64dfr0_el1 @@ -606,15 +607,15 @@ __save_debug: sub w25, w26, w25 // How many WPs to skip mov x5, x24 - add x4, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1) + add x4, x3, #DEBUG_BCR save_debug dbgbcr - add x4, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1) + add x4, x3, #DEBUG_BVR save_debug dbgbvr mov x5, x25 - add x4, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1) + add x4, x3, #DEBUG_WCR save_debug dbgwcr - add x4, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1) + add x4, x3, #DEBUG_WVR save_debug dbgwvr mrs x21, mdccint_el1 @@ -624,6 +625,7 @@ __save_debug: /* Restore debug state */ __restore_debug: // x2: ptr to CPU context + // x3: ptr to debug reg struct // x4/x5/x6-22/x24-26: trashed mrs x26, id_aa64dfr0_el1 @@ -634,15 +636,15 @@ __restore_debug: sub w25, w26, w25 // How many WPs to skip mov x5, x24 - add x4, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1) + add x4, x3, #DEBUG_BCR restore_debug dbgbcr - add x4, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1) + add x4, x3, #DEBUG_BVR restore_debug dbgbvr mov x5, x25 - add x4, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1) + add x4, x3, #DEBUG_WCR restore_debug dbgwcr - 
add x4, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1) + add x4, x3, #DEBUG_WVR restore_debug dbgwvr ldr x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)] @@ -682,6 +684,7 @@ ENTRY(__kvm_vcpu_run) bl __save_sysregs compute_debug_state 1f + add x3, x0, #VCPU_HOST_DEBUG_STATE bl __save_debug 1: activate_traps @@ -697,6 +700,8 @@ ENTRY(__kvm_vcpu_run) bl __restore_fpsimd skip_debug_state x3, 1f + ldr x3, [x0, #VCPU_DEBUG_PTR] + kern_hyp_va x3 bl __restore_debug 1: restore_guest_32bit_state @@ -717,6 +722,8 @@ __kvm_vcpu_return: bl __save_sysregs skip_debug_state x3, 1f + ldr x3, [x0, #VCPU_DEBUG_PTR] + kern_hyp_va x3 bl __save_debug 1: save_guest_32bit_state @@ -739,6 +746,7 @@ __kvm_vcpu_return: // already been saved. Note that we nuke the whole 64bit word. // If we ever add more flags, we'll have to be more careful... str xzr, [x0, #VCPU_DEBUG_FLAGS] + add x3, x0, #VCPU_HOST_DEBUG_STATE bl __restore_debug 1: restore_host_regs diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c370b4014799..158bae7c52cc 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -211,6 +211,203 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, return true; } +/* + * reg_to_dbg/dbg_to_reg + * + * A 32 bit write to a debug register leave top bits alone + * A 32 bit read from a debug register only returns the bottom bits + * + * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the + * hyp.S code switches between host and guest values in future. 
+ */ +static inline void reg_to_dbg(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + u64 *dbg_reg) +{ + u64 val = *vcpu_reg(vcpu, p->Rt); + + if (p->is_32bit) { + val &= 0xffffffffUL; + val |= ((*dbg_reg >> 32) << 32); + } + + *dbg_reg = val; + vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; +} + +static inline void dbg_to_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + u64 *dbg_reg) +{ + u64 val = *dbg_reg; + + if (p->is_32bit) + val &= 0xffffffffUL; + + *vcpu_reg(vcpu, p->Rt) = val; +} + +static inline bool trap_bvr(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + + if (p->is_write) + reg_to_dbg(vcpu, p, dbg_reg); + else + dbg_to_reg(vcpu, p, dbg_reg); + + return true; +} + +static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + + if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + + if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static inline void reset_bvr(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val; +} + +static inline bool trap_bcr(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + + if (p->is_write) + reg_to_dbg(vcpu, p, dbg_reg); + else + dbg_to_reg(vcpu, p, dbg_reg); + + return true; +} + +static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = 
&vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + + if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + + return 0; +} + +static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + + if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static inline void reset_bcr(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val; +} + +static inline bool trap_wvr(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + + if (p->is_write) + reg_to_dbg(vcpu, p, dbg_reg); + else + dbg_to_reg(vcpu, p, dbg_reg); + + return true; +} + +static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + + if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + + if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static inline void reset_wvr(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val; +} + +static inline bool trap_wcr(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + + if (p->is_write) + reg_to_dbg(vcpu, p, dbg_reg); + else + dbg_to_reg(vcpu, p, dbg_reg); + + return true; +} + +static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct 
kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + + if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + + if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) + return -EFAULT; + return 0; +} + +static inline void reset_wcr(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val; +} + static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair; @@ -240,16 +437,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ /* DBGBVRn_EL1 */ \ { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100), \ - trap_debug_regs, reset_val, (DBGBVR0_EL1 + (n)), 0 }, \ + trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr }, \ /* DBGBCRn_EL1 */ \ { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101), \ - trap_debug_regs, reset_val, (DBGBCR0_EL1 + (n)), 0 }, \ + trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr }, \ /* DBGWVRn_EL1 */ \ { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110), \ - trap_debug_regs, reset_val, (DBGWVR0_EL1 + (n)), 0 }, \ + trap_wvr, reset_wvr, n, 0, get_wvr, set_wvr }, \ /* DBGWCRn_EL1 */ \ { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111), \ - trap_debug_regs, reset_val, (DBGWCR0_EL1 + (n)), 0 } + trap_wcr, reset_wcr, n, 0, get_wcr, set_wcr } /* * Architected system registers. 
@@ -516,28 +713,55 @@ static bool trap_debug32(struct kvm_vcpu *vcpu, return true; } -#define DBG_BCR_BVR_WCR_WVR(n) \ - /* DBGBVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_debug32, \ - NULL, (cp14_DBGBVR0 + (n) * 2) }, \ - /* DBGBCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_debug32, \ - NULL, (cp14_DBGBCR0 + (n) * 2) }, \ - /* DBGWVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_debug32, \ - NULL, (cp14_DBGWVR0 + (n) * 2) }, \ - /* DBGWCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_debug32, \ - NULL, (cp14_DBGWCR0 + (n) * 2) } +/* AArch32 debug register mappings + * + * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0] + * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32] + * + * All control registers and watchpoint value registers are mapped to + * the lower 32 bits of their AArch64 equivalents. We share the trap + * handlers with the above AArch64 code which checks what mode the + * system is in. + */ -#define DBGBXVR(n) \ - { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_debug32, \ - NULL, cp14_DBGBXVR0 + n * 2 } +static inline bool trap_xvr(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + + if (p->is_write) { + u64 val = *dbg_reg; + + val &= 0xffffffffUL; + val |= *vcpu_reg(vcpu, p->Rt) << 32; + *dbg_reg = val; + + vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; + } else { + *vcpu_reg(vcpu, p->Rt) = *dbg_reg >> 32; + } + + return true; +} + +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ + /* DBGWCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n } + +#define DBGBXVR(n) \ + { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n } /* * Trapped cp14 registers. 
We generally ignore most of the external * debug, on the principle that they don't really make sense to a - * guest. Revisit this one day, whould this principle change. + * guest. Revisit this one day, would this principle change. */ static const struct sys_reg_desc cp14_regs[] = { /* DBGIDR */ @@ -1303,6 +1527,9 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return get_invariant_sys_reg(reg->id, uaddr); + if (r->get_user) + return (r->get_user)(vcpu, r, reg, uaddr); + return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id); } @@ -1321,6 +1548,9 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (!r) return set_invariant_sys_reg(reg->id, uaddr); + if (r->set_user) + return (r->set_user)(vcpu, r, reg, uaddr); + return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index d411e251412c..eaa324e4db4d 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -55,6 +55,12 @@ struct sys_reg_desc { /* Value (usually reset value) */ u64 val; + + /* Custom get/set_user functions, fallback to generic if NULL */ + int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr); + int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr); }; static inline void print_sys_reg_instr(const struct sys_reg_params *p) From 5540546bc93b49f98a0466fe3f96615286c76574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:30:01 +0100 Subject: [PATCH 0062/1466] KVM: arm64: guest debug, HW assisted debug support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for userspace to control the HW debug registers for guest debug. 
In the debug ioctl we copy an IMPDEF number of registers into a new register set called external_debug_state. We use the recently introduced vcpu parameter debug_ptr to select which register set is copied into the real registers when a world switch occurs. I've made some helper functions from hw_breakpoint.c more widely available for re-use. As with single step we need to tweak the guest registers to enable the exceptions so we need to save and restore those bits. Two new capabilities have been added to the KVM_CHECK_EXTENSION ioctl to allow userspace to query the number of hardware break and watch points available on the host hardware. Signed-off-by: Alex Bennée Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4ab3c6a8d563..a1e08e7bbf20 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -820,6 +820,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_DISABLE_QUIRKS 116 #define KVM_CAP_X86_SMM 117 #define KVM_CAP_MULTI_ADDRESS_SPACE 118 +#define KVM_CAP_GUEST_DEBUG_HW_BPS 119 +#define KVM_CAP_GUEST_DEBUG_HW_WPS 120 #ifdef KVM_CAP_IRQ_ROUTING From 834bf88726f0f11ddc7ff9679fc9458654c01a12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:30:02 +0100 Subject: [PATCH 0063/1466] KVM: arm64: enable KVM_CAP_SET_GUEST_DEBUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finally advertise the KVM capability for SET_GUEST_DEBUG. Once arm support is added this check can be moved to the common kvm_vm_ioctl_check_extension() code.
Signed-off-by: Alex Bennée Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 7 ++++- arch/arm64/include/asm/hw_breakpoint.h | 14 +++++++++ arch/arm64/include/asm/kvm_host.h | 6 +++- arch/arm64/kernel/hw_breakpoint.c | 12 -------- arch/arm64/kvm/debug.c | 40 ++++++++++++++++++++++---- arch/arm64/kvm/guest.c | 7 +++++ arch/arm64/kvm/handle_exit.c | 6 ++++ arch/arm64/kvm/reset.c | 16 +++++++++++ 8 files changed, 89 insertions(+), 19 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0f498da354f2..35affb5d9456 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2694,7 +2694,7 @@ The top 16 bits of the control field are architecture specific control flags which can include the following: - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] - - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390] + - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] @@ -2709,6 +2709,11 @@ updated to the correct (supplied) values. The second part of the structure is architecture specific and typically contains a set of debug registers. +For arm64 the number of debug registers is implementation defined and +can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and +KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number +indicating the number of supported registers. + When debug events exit the main run loop with the reason KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run structure containing architecture specific debug information. 
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 52b484b6aa1a..4c47cb2fbb52 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -16,6 +16,8 @@ #ifndef __ASM_HW_BREAKPOINT_H #define __ASM_HW_BREAKPOINT_H +#include + #ifdef __KERNEL__ struct arch_hw_breakpoint_ctrl { @@ -132,5 +134,17 @@ static inline void ptrace_hw_copy_thread(struct task_struct *task) extern struct pmu perf_ops_bp; +/* Determine number of BRP registers available. */ +static inline int get_num_brps(void) +{ + return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1; +} + +/* Determine number of WRP registers available. */ +static inline int get_num_wrps(void) +{ + return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1; +} + #endif /* __KERNEL__ */ #endif /* __ASM_BREAKPOINT_H */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9b99402b14df..409217f48456 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -116,13 +116,17 @@ struct kvm_vcpu_arch { * debugging the guest from the host and to maintain separate host and * guest state during world switches. vcpu_debug_state are the debug * registers of the vcpu as the guest sees them. host_debug_state are - * the host registers which are saved and restored during world switches. + * the host registers which are saved and restored during + * world switches. external_debug_state contains the debug + * values we want to debug the guest. This is set via the + * KVM_SET_GUEST_DEBUG ioctl. * * debug_ptr points to the set of debug registers that should be loaded * onto the hardware when running the guest. 
*/ struct kvm_guest_debug_arch *debug_ptr; struct kvm_guest_debug_arch vcpu_debug_state; + struct kvm_guest_debug_arch external_debug_state; /* Pointer to host CPU context */ kvm_cpu_context_t *host_cpu_context; diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 7a1a5da6c8c1..77bee00bd7ea 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -48,18 +48,6 @@ static DEFINE_PER_CPU(int, stepping_kernel_bp); static int core_num_brps; static int core_num_wrps; -/* Determine number of BRP registers available. */ -static int get_num_brps(void) -{ - return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1; -} - -/* Determine number of WRP registers available. */ -static int get_num_wrps(void) -{ - return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1; -} - int hw_breakpoint_slots(int type) { /* diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index e0947b77faaa..4a99e54d7f3d 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -105,10 +105,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) MDCR_EL2_TDRA | MDCR_EL2_TDOSA); - /* Trap on access to debug registers? */ - if (trap_debug) - vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; - /* Is Guest debugging in effect? */ if (vcpu->guest_debug) { /* Route all software debug exceptions to EL2 */ @@ -143,11 +139,45 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) } else { vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS; } + + /* + * HW Breakpoints and watchpoints + * + * We simply switch the debug_ptr to point to our new + * external_debug_state which has been populated by the + * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY + * mechanism ensures the registers are updated on the + * world switch. 
+ */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { + /* Enable breakpoints/watchpoints */ + vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE; + + vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; + vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; + trap_debug = true; + } } + + BUG_ON(!vcpu->guest_debug && + vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); + + /* Trap debug register access */ + if (trap_debug) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; } void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) { - if (vcpu->guest_debug) + if (vcpu->guest_debug) { restore_guest_debug_regs(vcpu); + + /* + * If we were using HW debug we need to restore the + * debug_ptr to the guest debug state. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + kvm_arm_reset_debug_ptr(vcpu); + + } } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 48de4f4aaa1a..6f1b249e0587 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -334,6 +334,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ + KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) /** @@ -354,6 +355,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; + + /* Hardware assisted Break and Watch points */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { + vcpu->arch.external_debug_state = dbg->arch; + } + } else { /* If not enabled clear all flags */ vcpu->guest_debug = 0; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e9de13ed477e..68a0759b1375 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -103,7 +103,11 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) run->debug.arch.hsr = hsr; switch (hsr >> ESR_ELx_EC_SHIFT) { + case ESR_ELx_EC_WATCHPT_LOW: + run->debug.arch.far = vcpu->arch.fault.far_el2; + /* fall through */ 
case ESR_ELx_EC_SOFTSTP_LOW: + case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_BKPT32: case ESR_ELx_EC_BRK64: break; @@ -132,6 +136,8 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort, [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug, + [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug, + [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, }; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 0b4326578985..b4af6185713f 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -56,6 +57,12 @@ static bool cpu_has_32bit_el1(void) return !!(pfr0 & 0x20); } +/** + * kvm_arch_dev_ioctl_check_extension + * + * We currently assume that the number of HW registers is uniform + * across all CPUs (see cpuinfo_sanity_check). + */ int kvm_arch_dev_ioctl_check_extension(long ext) { int r; @@ -64,6 +71,15 @@ int kvm_arch_dev_ioctl_check_extension(long ext) case KVM_CAP_ARM_EL1_32BIT: r = cpu_has_32bit_el1(); break; + case KVM_CAP_GUEST_DEBUG_HW_BPS: + r = get_num_brps(); + break; + case KVM_CAP_GUEST_DEBUG_HW_WPS: + r = get_num_wrps(); + break; + case KVM_CAP_SET_GUEST_DEBUG: + r = 1; + break; default: r = 0; } From eef8c85a3b81301f912e8802a1dd9f42dd884947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Tue, 7 Jul 2015 17:30:03 +0100 Subject: [PATCH 0064/1466] KVM: arm64: add trace points for guest_debug debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This includes trace points for: kvm_arch_setup_guest_debug kvm_arch_clear_guest_debug I've also added some generic register setting trace events and also a trace point to dump the array of hardware registers. 
Acked-by: Christoffer Dall Signed-off-by: Alex Bennée Signed-off-by: Marc Zyngier --- arch/arm64/kvm/debug.c | 36 ++++++++++- arch/arm64/kvm/guest.c | 4 ++ arch/arm64/kvm/sys_regs.c | 17 ++++++ arch/arm64/kvm/trace.h | 123 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 179 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 4a99e54d7f3d..47e5f0feaee8 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -18,12 +18,15 @@ */ #include +#include #include #include #include #include +#include "trace.h" + /* These are the bits of MDSCR_EL1 we may manipulate */ #define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \ DBG_MDSCR_KDE | \ @@ -44,11 +47,17 @@ static DEFINE_PER_CPU(u32, mdcr_el2); static void save_guest_debug_regs(struct kvm_vcpu *vcpu) { vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1); + + trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", + vcpu->arch.guest_debug_preserved.mdscr_el1); } static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) { vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1; + + trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", + vcpu_sys_reg(vcpu, MDSCR_EL1)); } /** @@ -99,6 +108,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY); + trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); + vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMCR | @@ -140,6 +151,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS; } + trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu)); + /* * HW Breakpoints and watchpoints * @@ -156,6 +169,14 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; trap_debug = true; + + trace_kvm_arm_set_regset("BKPTS", get_num_brps(), + 
&vcpu->arch.debug_ptr->dbg_bcr[0], + &vcpu->arch.debug_ptr->dbg_bvr[0]); + + trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), + &vcpu->arch.debug_ptr->dbg_wcr[0], + &vcpu->arch.debug_ptr->dbg_wvr[0]); } } @@ -165,10 +186,15 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) /* Trap debug register access */ if (trap_debug) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + + trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); + trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1)); } void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) { + trace_kvm_arm_clear_debug(vcpu->guest_debug); + if (vcpu->guest_debug) { restore_guest_debug_regs(vcpu); @@ -176,8 +202,16 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) * If we were using HW debug we need to restore the * debug_ptr to the guest debug state. */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { kvm_arm_reset_debug_ptr(vcpu); + trace_kvm_arm_set_regset("BKPTS", get_num_brps(), + &vcpu->arch.debug_ptr->dbg_bcr[0], + &vcpu->arch.debug_ptr->dbg_bvr[0]); + + trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), + &vcpu->arch.debug_ptr->dbg_wcr[0], + &vcpu->arch.debug_ptr->dbg_wvr[0]); + } } } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 6f1b249e0587..48868d893870 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -32,6 +32,8 @@ #include #include +#include "trace.h" + struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; @@ -350,6 +352,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { + trace_kvm_set_guest_debug(vcpu, dbg->control); + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) return -EINVAL; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 158bae7c52cc..b41607d270ac 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -38,6 +38,8 @@ #include "sys_regs.h" +#include 
"trace.h" + /* * All of this file is extremly similar to the ARM coproc.c, but the * types are different. My gut feeling is that it should be pretty @@ -208,6 +210,8 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, *vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, r->reg); } + trace_trap_reg(__func__, r->reg, p->is_write, *vcpu_reg(vcpu, p->Rt)); + return true; } @@ -258,6 +262,8 @@ static inline bool trap_bvr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, dbg_reg); + trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + return true; } @@ -298,6 +304,8 @@ static inline bool trap_bcr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, dbg_reg); + trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + return true; } @@ -339,6 +347,9 @@ static inline bool trap_wvr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, dbg_reg); + trace_trap_reg(__func__, rd->reg, p->is_write, + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]); + return true; } @@ -379,6 +390,8 @@ static inline bool trap_wcr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, dbg_reg); + trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + return true; } @@ -742,6 +755,8 @@ static inline bool trap_xvr(struct kvm_vcpu *vcpu, *vcpu_reg(vcpu, p->Rt) = *dbg_reg >> 32; } + trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + return true; } @@ -1223,6 +1238,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run) struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_hsr(vcpu); + trace_kvm_handle_sys_reg(esr); + params.is_aarch32 = false; params.is_32bit = false; params.Op0 = (esr >> 20) & 3; diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h index 157416e963f2..7fb0008c4fa3 100644 --- a/arch/arm64/kvm/trace.h +++ b/arch/arm64/kvm/trace.h @@ -44,6 +44,129 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->vcpu_pc, __entry->r0, __entry->imm) ); +TRACE_EVENT(kvm_arm_setup_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( 
+ __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + +TRACE_EVENT(kvm_arm_clear_debug, + TP_PROTO(__u32 guest_debug), + TP_ARGS(guest_debug), + + TP_STRUCT__entry( + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->guest_debug = guest_debug; + ), + + TP_printk("flags: 0x%08x", __entry->guest_debug) +); + +TRACE_EVENT(kvm_arm_set_dreg32, + TP_PROTO(const char *name, __u32 value), + TP_ARGS(name, value), + + TP_STRUCT__entry( + __field(const char *, name) + __field(__u32, value) + ), + + TP_fast_assign( + __entry->name = name; + __entry->value = value; + ), + + TP_printk("%s: 0x%08x", __entry->name, __entry->value) +); + +TRACE_EVENT(kvm_arm_set_regset, + TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), + TP_ARGS(type, len, control, value), + TP_STRUCT__entry( + __field(const char *, name) + __field(int, len) + __array(u64, ctrls, 16) + __array(u64, values, 16) + ), + TP_fast_assign( + __entry->name = type; + __entry->len = len; + memcpy(__entry->ctrls, control, len << 3); + memcpy(__entry->values, value, len << 3); + ), + TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, + __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), + __print_array(__entry->values, __entry->len, sizeof(__u64))) +); + +TRACE_EVENT(trap_reg, + TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), + TP_ARGS(fn, reg, is_write, write_value), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(int, reg) + __field(bool, is_write) + __field(u64, write_value) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->reg = reg; + __entry->is_write = is_write; + __entry->write_value = write_value; + ), + + TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) +); + 
+TRACE_EVENT(kvm_handle_sys_reg, + TP_PROTO(unsigned long hsr), + TP_ARGS(hsr), + + TP_STRUCT__entry( + __field(unsigned long, hsr) + ), + + TP_fast_assign( + __entry->hsr = hsr; + ), + + TP_printk("HSR 0x%08lx", __entry->hsr) +); + +TRACE_EVENT(kvm_set_guest_debug, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), + TP_ARGS(vcpu, guest_debug), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(__u32, guest_debug) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->guest_debug = guest_debug; + ), + + TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) +); + + #endif /* _TRACE_ARM64_KVM_H */ #undef TRACE_INCLUDE_PATH From 3bf2789cad9e6573dc19a6c3d123c2c049f2d90f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi Date: Mon, 22 Jun 2015 15:36:06 +0530 Subject: [PATCH 0065/1466] smack: allow mount opts setting over filesystems with binary mount data Add support for setting smack mount labels(using smackfsdef, smackfsroot, smackfshat, smackfsfloor, smackfstransmute) for filesystems with binary mount data like NFS. To achieve this, implement sb_parse_opts_str and sb_set_mnt_opts security operations in smack LSM similar to SELinux. 
Signed-off-by: Vivek Trivedi Signed-off-by: Amit Sahrawat Acked-by: Casey Schaufler --- security/smack/smack.h | 18 +++ security/smack/smack_lsm.c | 249 ++++++++++++++++++++++++++++++------- 2 files changed, 223 insertions(+), 44 deletions(-) diff --git a/security/smack/smack.h b/security/smack/smack.h index 244e035e5a99..69ab9eb7d6d9 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -143,6 +143,24 @@ struct smack_onlycap { struct smack_known *smk_label; }; +/* Super block security struct flags for mount options */ +#define FSDEFAULT_MNT 0x01 +#define FSFLOOR_MNT 0x02 +#define FSHAT_MNT 0x04 +#define FSROOT_MNT 0x08 +#define FSTRANS_MNT 0x10 + +#define NUM_SMK_MNT_OPTS 5 + +enum { + Opt_error = -1, + Opt_fsdefault = 1, + Opt_fsfloor = 2, + Opt_fshat = 3, + Opt_fsroot = 4, + Opt_fstransmute = 5, +}; + /* * Mount options */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index a143328f75eb..d962f887d3f4 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "smack.h" #define TRANS_TRUE "TRUE" @@ -64,6 +65,15 @@ static char *smk_bu_mess[] = { "Unconfined Object", /* SMACK_UNCONFINED_OBJECT */ }; +static const match_table_t tokens = { + {Opt_fsdefault, SMK_FSDEFAULT "%s"}, + {Opt_fsfloor, SMK_FSFLOOR "%s"}, + {Opt_fshat, SMK_FSHAT "%s"}, + {Opt_fsroot, SMK_FSROOT "%s"}, + {Opt_fstransmute, SMK_FSTRANS "%s"}, + {Opt_error, NULL}, +}; + static void smk_bu_mode(int mode, char *s) { int i = 0; @@ -577,76 +587,193 @@ static int smack_sb_copy_data(char *orig, char *smackopts) } /** - * smack_sb_kern_mount - Smack specific mount processing + * smack_parse_opts_str - parse Smack specific mount options + * @options: mount options string + * @opts: where to store converted mount opts + * + * Returns 0 on success or -ENOMEM on error. 
+ *
+ * converts Smack specific mount options to generic security option format
+ */
+static int smack_parse_opts_str(char *options,
+		struct security_mnt_opts *opts)
+{
+	char *p;
+	char *fsdefault = NULL, *fsfloor = NULL;
+	char *fshat = NULL, *fsroot = NULL, *fstransmute = NULL;
+	int rc = -ENOMEM, num_mnt_opts = 0;
+
+	opts->num_mnt_opts = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		substring_t args[MAX_OPT_ARGS];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+
+		switch (token) {
+		case Opt_fsdefault:
+			if (fsdefault)
+				goto out_opt_err;
+			fsdefault = match_strdup(&args[0]);
+			if (!fsdefault)
+				goto out_err;
+			break;
+		case Opt_fsfloor:
+			if (fsfloor)
+				goto out_opt_err;
+			fsfloor = match_strdup(&args[0]);
+			if (!fsfloor)
+				goto out_err;
+			break;
+		case Opt_fshat:
+			if (fshat)
+				goto out_opt_err;
+			fshat = match_strdup(&args[0]);
+			if (!fshat)
+				goto out_err;
+			break;
+		case Opt_fsroot:
+			if (fsroot)
+				goto out_opt_err;
+			fsroot = match_strdup(&args[0]);
+			if (!fsroot)
+				goto out_err;
+			break;
+		case Opt_fstransmute:
+			if (fstransmute)
+				goto out_opt_err;
+			fstransmute = match_strdup(&args[0]);
+			if (!fstransmute)
+				goto out_err;
+			break;
+		default:
+			rc = -EINVAL;
+			pr_warn("Smack: unknown mount option\n");
+			goto out_err;
+		}
+	}
+
+	opts->mnt_opts = kcalloc(NUM_SMK_MNT_OPTS, sizeof(char *), GFP_ATOMIC);
+	if (!opts->mnt_opts)
+		goto out_err;
+
+	opts->mnt_opts_flags = kcalloc(NUM_SMK_MNT_OPTS, sizeof(int),
+			GFP_ATOMIC);
+	if (!opts->mnt_opts_flags) {
+		/* no kfree: security_free_mnt_opts() frees opts->mnt_opts */
+		goto out_err;
+	}
+
+	if (fsdefault) {
+		opts->mnt_opts[num_mnt_opts] = fsdefault;
+		opts->mnt_opts_flags[num_mnt_opts++] = FSDEFAULT_MNT;
+	}
+	if (fsfloor) {
+		opts->mnt_opts[num_mnt_opts] = fsfloor;
+		opts->mnt_opts_flags[num_mnt_opts++] = FSFLOOR_MNT;
+	}
+	if (fshat) {
+		opts->mnt_opts[num_mnt_opts] = fshat;
+		opts->mnt_opts_flags[num_mnt_opts++] = FSHAT_MNT;
+	}
+	if (fsroot) {
+		opts->mnt_opts[num_mnt_opts] = fsroot;
+		opts->mnt_opts_flags[num_mnt_opts++] = FSROOT_MNT;
+	}
+	if (fstransmute) {
+		opts->mnt_opts[num_mnt_opts] = fstransmute;
+		opts->mnt_opts_flags[num_mnt_opts++] = FSTRANS_MNT;
+	}
+
+	opts->num_mnt_opts = num_mnt_opts;
+	return 0;
+
+out_opt_err:
+	rc = -EINVAL;
+	pr_warn("Smack: duplicate mount options\n");
+
+out_err:
+	kfree(fsdefault);
+	kfree(fsfloor);
+	kfree(fshat);
+	kfree(fsroot);
+	kfree(fstransmute);
+	return rc;
+}
+
+/**
+ * smack_set_mnt_opts - set Smack specific mount options
  * @sb: the file system superblock
- * @flags: the mount flags
- * @data: the smack mount options
+ * @opts: Smack mount options
+ * @kern_flags: mount option from kernel space or user space
+ * @set_kern_flags: where to store converted mount opts
  *
  * Returns 0 on success, an error code on failure
+ *
+ * Allow filesystems with binary mount data to explicitly set Smack mount
+ * labels.
  */
-static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
+static int smack_set_mnt_opts(struct super_block *sb,
+		struct security_mnt_opts *opts,
+		unsigned long kern_flags,
+		unsigned long *set_kern_flags)
 {
 	struct dentry *root = sb->s_root;
 	struct inode *inode = d_backing_inode(root);
 	struct superblock_smack *sp = sb->s_security;
 	struct inode_smack *isp;
 	struct smack_known *skp;
-	char *op;
-	char *commap;
+	int i;
+	int num_opts = opts->num_mnt_opts;
 	int transmute = 0;
-	int specified = 0;
 
 	if (sp->smk_initialized)
 		return 0;
 
 	sp->smk_initialized = 1;
 
-	for (op = data; op != NULL; op = commap) {
-		commap = strchr(op, ',');
-		if (commap != NULL)
-			*commap++ = '\0';
-
-		if (strncmp(op, SMK_FSHAT, strlen(SMK_FSHAT)) == 0) {
-			op += strlen(SMK_FSHAT);
-			skp = smk_import_entry(op, 0);
-			if (IS_ERR(skp))
-				return PTR_ERR(skp);
-			sp->smk_hat = skp;
-			specified = 1;
-
-		} else if (strncmp(op, SMK_FSFLOOR, strlen(SMK_FSFLOOR)) == 0) {
-			op += strlen(SMK_FSFLOOR);
-			skp = smk_import_entry(op, 0);
-			if (IS_ERR(skp))
-				return PTR_ERR(skp);
-			sp->smk_floor = skp;
-			specified = 1;
-
-		} else if (strncmp(op, SMK_FSDEFAULT,
-				   strlen(SMK_FSDEFAULT)) == 0) {
-			op += strlen(SMK_FSDEFAULT);
-			skp = smk_import_entry(op, 0);
+	for (i = 0; i < num_opts; i++) {
+		switch (opts->mnt_opts_flags[i]) {
+		case FSDEFAULT_MNT:
+			skp = smk_import_entry(opts->mnt_opts[i], 0);
 			if (IS_ERR(skp))
 				return PTR_ERR(skp);
 			sp->smk_default = skp;
-			specified = 1;
-
-		} else if (strncmp(op, SMK_FSROOT, strlen(SMK_FSROOT)) == 0) {
-			op += strlen(SMK_FSROOT);
-			skp = smk_import_entry(op, 0);
+			break;
+		case FSFLOOR_MNT:
+			skp = smk_import_entry(opts->mnt_opts[i], 0);
+			if (IS_ERR(skp))
+				return PTR_ERR(skp);
+			sp->smk_floor = skp;
+			break;
+		case FSHAT_MNT:
+			skp = smk_import_entry(opts->mnt_opts[i], 0);
+			if (IS_ERR(skp))
+				return PTR_ERR(skp);
+			sp->smk_hat = skp;
+			break;
+		case FSROOT_MNT:
+			skp = smk_import_entry(opts->mnt_opts[i], 0);
 			if (IS_ERR(skp))
 				return PTR_ERR(skp);
 			sp->smk_root = skp;
-			specified = 1;
-
-		} else if (strncmp(op, SMK_FSTRANS, strlen(SMK_FSTRANS)) == 0) {
-			op += strlen(SMK_FSTRANS);
-			skp = smk_import_entry(op, 0);
+			break;
+		case FSTRANS_MNT:
+			skp = smk_import_entry(opts->mnt_opts[i], 0);
 			if (IS_ERR(skp))
 				return PTR_ERR(skp);
 			sp->smk_root = skp;
 			transmute = 1;
-			specified = 1;
+			break;
+		default:
+			break;
 		}
 	}
 
@@ -654,7 +781,7 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 		/*
 		 * Unprivileged mounts don't get to specify Smack values.
 		 */
-		if (specified)
+		if (num_opts)
 			return -EPERM;
 		/*
 		 * Unprivileged mounts get root and default from the caller.
@@ -663,6 +790,7 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 		sp->smk_root = skp;
 		sp->smk_default = skp;
 	}
+
 	/*
 	 * Initialize the root inode.
*/ @@ -681,6 +809,37 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) return 0; } +/** + * smack_sb_kern_mount - Smack specific mount processing + * @sb: the file system superblock + * @flags: the mount flags + * @data: the smack mount options + * + * Returns 0 on success, an error code on failure + */ +static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) +{ + int rc = 0; + char *options = data; + struct security_mnt_opts opts; + + security_init_mnt_opts(&opts); + + if (!options) + goto out; + + rc = smack_parse_opts_str(options, &opts); + if (rc) + goto out_err; + +out: + rc = smack_set_mnt_opts(sb, &opts, 0, NULL); + +out_err: + security_free_mnt_opts(&opts); + return rc; +} + /** * smack_sb_statfs - Smack check on statfs * @dentry: identifies the file system in question @@ -4264,6 +4423,8 @@ struct security_hook_list smack_hooks[] = { LSM_HOOK_INIT(sb_copy_data, smack_sb_copy_data), LSM_HOOK_INIT(sb_kern_mount, smack_sb_kern_mount), LSM_HOOK_INIT(sb_statfs, smack_sb_statfs), + LSM_HOOK_INIT(sb_set_mnt_opts, smack_set_mnt_opts), + LSM_HOOK_INIT(sb_parse_opts_str, smack_parse_opts_str), LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds), LSM_HOOK_INIT(bprm_committing_creds, smack_bprm_committing_creds), From ca70d27e445fe721587598030b97357b35f61913 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 24 Jun 2015 07:41:07 +0800 Subject: [PATCH 0066/1466] sysfs: fix simple_return.cocci warnings security/smack/smackfs.c:2251:1-4: WARNING: end returns can be simpified and declaration on line 2250 can be dropped Simplify a trivial if-return sequence. Possibly combine with a preceding function call. 
Generated by: scripts/coccinelle/misc/simple_return.cocci Signed-off-by: Fengguang Wu Acked-by: Serge Hallyn Acked-by: Casey Schaufler --- security/smack/smackfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 2716d02119f3..81a2888a9908 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -2320,11 +2320,7 @@ static const struct file_operations smk_revoke_subj_ops = { */ static int smk_init_sysfs(void) { - int err; - err = sysfs_create_mount_point(fs_kobj, "smackfs"); - if (err) - return err; - return 0; + return sysfs_create_mount_point(fs_kobj, "smackfs"); } /** From ac4c90c82e4d38cee613f68d2fabd714338ecca7 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 1 Jul 2015 15:10:38 +0200 Subject: [PATCH 0067/1466] cpufreq: exynos: remove exynos5250 specific cpufreq driver support Exynos5250 based platforms have switched over to use generic cpufreq driver for cpufreq functionality. So the Exynos specific cpufreq support for these platforms can be removed. Cc: Thomas Abraham Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Javier Martinez Canillas Tested-by: Javier Martinez Canillas Acked-by: Viresh Kumar [k.kozlowski: Rebased the patch around exynos-cpufreq.c] Signed-off-by: Krzysztof Kozlowski Signed-off-by: Kukjin Kim --- drivers/cpufreq/Kconfig.arm | 11 -- drivers/cpufreq/Makefile | 1 - drivers/cpufreq/exynos-cpufreq.c | 3 - drivers/cpufreq/exynos-cpufreq.h | 17 --- drivers/cpufreq/exynos5250-cpufreq.c | 210 --------------------------- 5 files changed, 242 deletions(-) delete mode 100644 drivers/cpufreq/exynos5250-cpufreq.c diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index cc8a71c267b8..7a4c1820de8c 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -47,17 +47,6 @@ config ARM_EXYNOS4X12_CPUFREQ If in doubt, say N. 
-config ARM_EXYNOS5250_CPUFREQ - bool "SAMSUNG EXYNOS5250" - depends on SOC_EXYNOS5250 - depends on ARM_EXYNOS_CPUFREQ - default y - help - This adds the CPUFreq driver for Samsung EXYNOS5250 - SoC. - - If in doubt, say N. - config ARM_EXYNOS_CPU_FREQ_BOOST_SW bool "EXYNOS Frequency Overclocking - Software" depends on ARM_EXYNOS_CPUFREQ && THERMAL diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 2169bf792db7..06286bbace21 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -55,7 +55,6 @@ obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o obj-$(CONFIG_ARM_EXYNOS_CPUFREQ) += arm-exynos-cpufreq.o arm-exynos-cpufreq-y := exynos-cpufreq.o arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ) += exynos4x12-cpufreq.o -arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS5250_CPUFREQ) += exynos5250-cpufreq.o obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o obj-$(CONFIG_ARM_HISI_ACPU_CPUFREQ) += hisi-acpu-cpufreq.o diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c index ae5b2bd3a978..71d889161218 100644 --- a/drivers/cpufreq/exynos-cpufreq.c +++ b/drivers/cpufreq/exynos-cpufreq.c @@ -175,9 +175,6 @@ static int exynos_cpufreq_probe(struct platform_device *pdev) } else if (of_machine_is_compatible("samsung,exynos4412")) { exynos_info->type = EXYNOS_SOC_4412; ret = exynos4x12_cpufreq_init(exynos_info); - } else if (of_machine_is_compatible("samsung,exynos5250")) { - exynos_info->type = EXYNOS_SOC_5250; - ret = exynos5250_cpufreq_init(exynos_info); } else { pr_err("%s: Unknown SoC type\n", __func__); return -ENODEV; diff --git a/drivers/cpufreq/exynos-cpufreq.h b/drivers/cpufreq/exynos-cpufreq.h index a3855e4d913d..a359db792ac8 100644 --- a/drivers/cpufreq/exynos-cpufreq.h +++ b/drivers/cpufreq/exynos-cpufreq.h @@ -20,7 +20,6 @@ enum cpufreq_level_index { enum exynos_soc_type { EXYNOS_SOC_4212, EXYNOS_SOC_4412, - EXYNOS_SOC_5250, }; #define APLL_FREQ(f, 
a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, m, p, s) \ @@ -60,14 +59,6 @@ static inline int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info) return -EOPNOTSUPP; } #endif -#ifdef CONFIG_ARM_EXYNOS5250_CPUFREQ -extern int exynos5250_cpufreq_init(struct exynos_dvfs_info *); -#else -static inline int exynos5250_cpufreq_init(struct exynos_dvfs_info *info) -{ - return -EOPNOTSUPP; -} -#endif #define EXYNOS4_CLKSRC_CPU 0x14200 #define EXYNOS4_CLKMUX_STATCPU 0x14400 @@ -79,11 +70,3 @@ static inline int exynos5250_cpufreq_init(struct exynos_dvfs_info *info) #define EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT (16) #define EXYNOS4_CLKMUX_STATCPU_MUXCORE_MASK (0x7 << EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT) - -#define EXYNOS5_APLL_LOCK 0x00000 -#define EXYNOS5_APLL_CON0 0x00100 -#define EXYNOS5_CLKMUX_STATCPU 0x00400 -#define EXYNOS5_CLKDIV_CPU0 0x00500 -#define EXYNOS5_CLKDIV_CPU1 0x00504 -#define EXYNOS5_CLKDIV_STATCPU0 0x00600 -#define EXYNOS5_CLKDIV_STATCPU1 0x00604 diff --git a/drivers/cpufreq/exynos5250-cpufreq.c b/drivers/cpufreq/exynos5250-cpufreq.c deleted file mode 100644 index 3eafdc7ba787..000000000000 --- a/drivers/cpufreq/exynos5250-cpufreq.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2010-20122Samsung Electronics Co., Ltd. - * http://www.samsung.com - * - * EXYNOS5250 - CPU frequency scaling support - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "exynos-cpufreq.h" - -static struct clk *cpu_clk; -static struct clk *moutcore; -static struct clk *mout_mpll; -static struct clk *mout_apll; -static struct exynos_dvfs_info *cpufreq; - -static unsigned int exynos5250_volt_table[] = { - 1300000, 1250000, 1225000, 1200000, 1150000, - 1125000, 1100000, 1075000, 1050000, 1025000, - 1012500, 1000000, 975000, 950000, 937500, - 925000 -}; - -static struct cpufreq_frequency_table exynos5250_freq_table[] = { - {0, L0, 1700 * 1000}, - {0, L1, 1600 * 1000}, - {0, L2, 1500 * 1000}, - {0, L3, 1400 * 1000}, - {0, L4, 1300 * 1000}, - {0, L5, 1200 * 1000}, - {0, L6, 1100 * 1000}, - {0, L7, 1000 * 1000}, - {0, L8, 900 * 1000}, - {0, L9, 800 * 1000}, - {0, L10, 700 * 1000}, - {0, L11, 600 * 1000}, - {0, L12, 500 * 1000}, - {0, L13, 400 * 1000}, - {0, L14, 300 * 1000}, - {0, L15, 200 * 1000}, - {0, 0, CPUFREQ_TABLE_END}, -}; - -static struct apll_freq apll_freq_5250[] = { - /* - * values: - * freq - * clock divider for ARM, CPUD, ACP, PERIPH, ATB, PCLK_DBG, APLL, ARM2 - * clock divider for COPY, HPM, RESERVED - * PLL M, P, S - */ - APLL_FREQ(1700, 0, 3, 7, 7, 7, 3, 5, 0, 0, 2, 0, 425, 6, 0), - APLL_FREQ(1600, 0, 3, 7, 7, 7, 1, 4, 0, 0, 2, 0, 200, 3, 0), - APLL_FREQ(1500, 0, 2, 7, 7, 7, 1, 4, 0, 0, 2, 0, 250, 4, 0), - APLL_FREQ(1400, 0, 2, 7, 7, 6, 1, 4, 0, 0, 2, 0, 175, 3, 0), - APLL_FREQ(1300, 0, 2, 7, 7, 6, 1, 3, 0, 0, 2, 0, 325, 6, 0), - APLL_FREQ(1200, 0, 2, 7, 7, 5, 1, 3, 0, 0, 2, 0, 200, 4, 0), - APLL_FREQ(1100, 0, 3, 7, 7, 5, 1, 3, 0, 0, 2, 0, 275, 6, 0), - APLL_FREQ(1000, 0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 125, 3, 0), - APLL_FREQ(900, 0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 150, 4, 0), - APLL_FREQ(800, 0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 100, 3, 0), - APLL_FREQ(700, 0, 1, 7, 7, 3, 1, 1, 0, 0, 2, 0, 175, 3, 1), - APLL_FREQ(600, 0, 1, 7, 7, 3, 1, 1, 0, 0, 2, 0, 200, 4, 1), - APLL_FREQ(500, 0, 1, 7, 7, 2, 1, 1, 0, 0, 2, 0, 125, 3, 
1), - APLL_FREQ(400, 0, 1, 7, 7, 2, 1, 1, 0, 0, 2, 0, 100, 3, 1), - APLL_FREQ(300, 0, 1, 7, 7, 1, 1, 1, 0, 0, 2, 0, 200, 4, 2), - APLL_FREQ(200, 0, 1, 7, 7, 1, 1, 1, 0, 0, 2, 0, 100, 3, 2), -}; - -static void set_clkdiv(unsigned int div_index) -{ - unsigned int tmp; - - /* Change Divider - CPU0 */ - - tmp = apll_freq_5250[div_index].clk_div_cpu0; - - __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS5_CLKDIV_CPU0); - - while (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKDIV_STATCPU0) - & 0x11111111) - cpu_relax(); - - /* Change Divider - CPU1 */ - tmp = apll_freq_5250[div_index].clk_div_cpu1; - - __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS5_CLKDIV_CPU1); - - while (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKDIV_STATCPU1) & 0x11) - cpu_relax(); -} - -static void set_apll(unsigned int index) -{ - unsigned int tmp; - unsigned int freq = apll_freq_5250[index].freq; - - /* MUX_CORE_SEL = MPLL, ARMCLK uses MPLL for lock time */ - clk_set_parent(moutcore, mout_mpll); - - do { - cpu_relax(); - tmp = (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKMUX_STATCPU) - >> 16); - tmp &= 0x7; - } while (tmp != 0x2); - - clk_set_rate(mout_apll, freq * 1000); - - /* MUX_CORE_SEL = APLL */ - clk_set_parent(moutcore, mout_apll); - - do { - cpu_relax(); - tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKMUX_STATCPU); - tmp &= (0x7 << 16); - } while (tmp != (0x1 << 16)); -} - -static void exynos5250_set_frequency(unsigned int old_index, - unsigned int new_index) -{ - if (old_index > new_index) { - set_clkdiv(new_index); - set_apll(new_index); - } else if (old_index < new_index) { - set_apll(new_index); - set_clkdiv(new_index); - } -} - -int exynos5250_cpufreq_init(struct exynos_dvfs_info *info) -{ - struct device_node *np; - unsigned long rate; - - /* - * HACK: This is a temporary workaround to get access to clock - * controller registers directly and remove static mappings and - * dependencies on platform headers. 
It is necessary to enable - * Exynos multi-platform support and will be removed together with - * this whole driver as soon as Exynos gets migrated to use - * cpufreq-dt driver. - */ - np = of_find_compatible_node(NULL, NULL, "samsung,exynos5250-clock"); - if (!np) { - pr_err("%s: failed to find clock controller DT node\n", - __func__); - return -ENODEV; - } - - info->cmu_regs = of_iomap(np, 0); - if (!info->cmu_regs) { - pr_err("%s: failed to map CMU registers\n", __func__); - return -EFAULT; - } - - cpu_clk = clk_get(NULL, "armclk"); - if (IS_ERR(cpu_clk)) - return PTR_ERR(cpu_clk); - - moutcore = clk_get(NULL, "mout_cpu"); - if (IS_ERR(moutcore)) - goto err_moutcore; - - mout_mpll = clk_get(NULL, "mout_mpll"); - if (IS_ERR(mout_mpll)) - goto err_mout_mpll; - - rate = clk_get_rate(mout_mpll) / 1000; - - mout_apll = clk_get(NULL, "mout_apll"); - if (IS_ERR(mout_apll)) - goto err_mout_apll; - - info->mpll_freq_khz = rate; - /* 800Mhz */ - info->pll_safe_idx = L9; - info->cpu_clk = cpu_clk; - info->volt_table = exynos5250_volt_table; - info->freq_table = exynos5250_freq_table; - info->set_freq = exynos5250_set_frequency; - - cpufreq = info; - - return 0; - -err_mout_apll: - clk_put(mout_mpll); -err_mout_mpll: - clk_put(moutcore); -err_moutcore: - clk_put(cpu_clk); - - pr_err("%s: failed initialization\n", __func__); - return -EINVAL; -} From b7446cacfb433f5e89ff94afecbc349e404aee21 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 18 Jun 2015 11:43:37 +0200 Subject: [PATCH 0068/1466] tcm_loop: Remove SAS vestigies tcm_loop is able to emulate several protocols, so remove last vestiges of the SAS protocol.
Signed-off-by: Hannes Reinecke Signed-off-by: Nicholas Bellinger --- drivers/target/loopback/tcm_loop.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index a556bdebd775..b179d934cee1 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -526,7 +526,7 @@ static inline struct tcm_loop_tpg *tl_tpg(struct se_portal_group *se_tpg) static char *tcm_loop_get_endpoint_wwn(struct se_portal_group *se_tpg) { /* - * Return the passed NAA identifier for the SAS Target Port + * Return the passed NAA identifier for the Target Port */ return &tl_tpg(se_tpg)->tl_hba->tl_wwn_address[0]; } @@ -845,7 +845,7 @@ static int tcm_loop_make_nexus( transport_free_session(tl_nexus->se_sess); goto out; } - /* Now, register the SAS I_T Nexus as active. */ + /* Now, register the I_T Nexus as active. */ transport_register_session(se_tpg, tl_nexus->se_sess->se_node_acl, tl_nexus->se_sess, tl_nexus); tl_tpg->tl_nexus = tl_nexus; @@ -884,7 +884,7 @@ static int tcm_loop_drop_nexus( " %s Initiator Port: %s\n", tcm_loop_dump_proto_id(tpg->tl_hba), tl_nexus->se_sess->se_node_acl->initiatorname); /* - * Release the SCSI I_T Nexus to the emulated SAS Target Port + * Release the SCSI I_T Nexus to the emulated Target Port */ transport_deregister_session(tl_nexus->se_sess); tpg->tl_nexus = NULL; @@ -1077,7 +1077,7 @@ static struct se_portal_group *tcm_loop_make_naa_tpg( tl_tpg->tl_hba = tl_hba; tl_tpg->tl_tpgt = tpgt; /* - * Register the tl_tpg as a emulated SAS TCM Target Endpoint + * Register the tl_tpg as a emulated TCM Target Endpoint */ ret = core_tpg_register(wwn, &tl_tpg->tl_se_tpg, tl_hba->tl_proto_id); if (ret < 0) @@ -1102,11 +1102,11 @@ static void tcm_loop_drop_naa_tpg( tl_hba = tl_tpg->tl_hba; tpgt = tl_tpg->tl_tpgt; /* - * Release the I_T Nexus for the Virtual SAS link if present + * Release the I_T Nexus for the Virtual target link if present */ 
tcm_loop_drop_nexus(tl_tpg); /* - * Deregister the tl_tpg as a emulated SAS TCM Target Endpoint + * Deregister the tl_tpg as a emulated TCM Target Endpoint */ core_tpg_deregister(se_tpg); @@ -1199,8 +1199,9 @@ static void tcm_loop_drop_scsi_hba( struct tcm_loop_hba, tl_hba_wwn); pr_debug("TCM_Loop_ConfigFS: Deallocating emulated Target" - " SAS Address: %s at Linux/SCSI Host ID: %d\n", - tl_hba->tl_wwn_address, tl_hba->sh->host_no); + " %s Address: %s at Linux/SCSI Host ID: %d\n", + tcm_loop_dump_proto_id(tl_hba), tl_hba->tl_wwn_address, + tl_hba->sh->host_no); /* * Call device_unregister() on the original tl_hba->dev. * tcm_loop_fabric_scsi.c:tcm_loop_release_adapter() will From e986a35aba67558381d5cec59a14c4d0b20f0d47 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 18 Jun 2015 11:43:38 +0200 Subject: [PATCH 0069/1466] tcm_loop: Send I_T_NEXUS_LOSS_OCCURRED UA If the virtual SAS link is set to 'offline' we should be queueing an I_T_NEXUS_LOSS_OCCURRED UA. Signed-off-by: Hannes Reinecke Signed-off-by: Nicholas Bellinger --- drivers/target/loopback/tcm_loop.c | 5 +++++ drivers/target/target_core_tpg.c | 17 +++++++++++++++++ include/target/target_core_fabric.h | 1 + 3 files changed, 23 insertions(+) diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index b179d934cee1..5bc85ffed720 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -1034,6 +1034,11 @@ static ssize_t tcm_loop_tpg_store_transport_status( } if (!strncmp(page, "offline", 7)) { tl_tpg->tl_transport_status = TCM_TRANSPORT_OFFLINE; + if (tl_tpg->tl_nexus) { + struct se_session *tl_sess = tl_tpg->tl_nexus->se_sess; + + core_allocate_nexus_loss_ua(tl_sess->se_node_acl); + } return count; } return -EINVAL; diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index babde4ad841f..2d0381dd105c 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -41,6 +41,7 @@ 
#include "target_core_internal.h" #include "target_core_alua.h" #include "target_core_pr.h" +#include "target_core_ua.h" extern struct se_device *g_lun0_dev; @@ -83,6 +84,22 @@ struct se_node_acl *core_tpg_get_initiator_node_acl( } EXPORT_SYMBOL(core_tpg_get_initiator_node_acl); +void core_allocate_nexus_loss_ua( + struct se_node_acl *nacl) +{ + struct se_dev_entry *deve; + + if (!nacl) + return; + + rcu_read_lock(); + hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link) + core_scsi3_ua_allocate(deve, 0x29, + ASCQ_29H_NEXUS_LOSS_OCCURRED); + rcu_read_unlock(); +} +EXPORT_SYMBOL(core_allocate_nexus_loss_ua); + /* core_tpg_add_node_to_devs(): * * diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index 18afef91b447..69355feabd1d 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -152,6 +152,7 @@ int transport_generic_handle_tmr(struct se_cmd *); void transport_generic_request_failure(struct se_cmd *, sense_reason_t); void __target_execute_cmd(struct se_cmd *); int transport_lookup_tmr_lun(struct se_cmd *, u64); +void core_allocate_nexus_loss_ua(struct se_node_acl *acl); struct se_node_acl *core_tpg_get_initiator_node_acl(struct se_portal_group *tpg, unsigned char *); From 46d5bd62ef9e3d6e2018963cbb725c91f864922d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 8 Jul 2015 17:58:50 +0300 Subject: [PATCH 0070/1466] target: Inline transport_get_sense_codes() Inline this function in its call site since it performs a trivial task and since it is only called once. Signed-off-by: Bart Van Assche Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ce8574b7220c..b6708d1b69b8 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2615,17 +2615,6 @@ bool transport_wait_for_tasks(struct se_cmd *cmd) } EXPORT_SYMBOL(transport_wait_for_tasks); -static int transport_get_sense_codes( - struct se_cmd *cmd, - u8 *asc, - u8 *ascq) -{ - *asc = cmd->scsi_asc; - *ascq = cmd->scsi_ascq; - - return 0; -} - static void transport_err_sector_info(unsigned char *buffer, sector_t bad_sector) { @@ -2819,9 +2808,8 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd, buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; /* Not Ready */ buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY; - transport_get_sense_codes(cmd, &asc, &ascq); - buffer[SPC_ASC_KEY_OFFSET] = asc; - buffer[SPC_ASCQ_KEY_OFFSET] = ascq; + buffer[SPC_ASC_KEY_OFFSET] = cmd->scsi_asc; + buffer[SPC_ASCQ_KEY_OFFSET] = cmd->scsi_ascq; break; case TCM_MISCOMPARE_VERIFY: /* CURRENT ERROR */ From ab78fef4d5f79134042ae0e1e2c259e1226aa5bd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 8 Jul 2015 17:58:51 +0300 Subject: [PATCH 0071/1466] target: Split transport_send_check_condition_and_sense() Move the code for translating a sense_reason_t code into a SCSI status ASC and ASCQ codes from transport_send_check_condition_and_sense() into the new function translate_sense_reason(). Convert the switch statement that performs the translation into table-driven code. Signed-off-by: Bart Van Assche Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 383 ++++++++++--------------- 1 file changed, 148 insertions(+), 235 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index b6708d1b69b8..6ef44c9db381 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2628,13 +2628,155 @@ void transport_err_sector_info(unsigned char *buffer, sector_t bad_sector) put_unaligned_be64(bad_sector, &buffer[12]); } +struct sense_info { + u8 key; + u8 asc; + u8 ascq; + bool add_sector_info; +}; + +static const struct sense_info sense_info_table[] = { + [TCM_NO_SENSE] = { + .key = NOT_READY + }, + [TCM_NON_EXISTENT_LUN] = { + .key = ILLEGAL_REQUEST, + .asc = 0x25 /* LOGICAL UNIT NOT SUPPORTED */ + }, + [TCM_UNSUPPORTED_SCSI_OPCODE] = { + .key = ILLEGAL_REQUEST, + .asc = 0x20, /* INVALID COMMAND OPERATION CODE */ + }, + [TCM_SECTOR_COUNT_TOO_MANY] = { + .key = ILLEGAL_REQUEST, + .asc = 0x20, /* INVALID COMMAND OPERATION CODE */ + }, + [TCM_UNKNOWN_MODE_PAGE] = { + .key = ILLEGAL_REQUEST, + .asc = 0x24, /* INVALID FIELD IN CDB */ + }, + [TCM_CHECK_CONDITION_ABORT_CMD] = { + .key = ABORTED_COMMAND, + .asc = 0x29, /* BUS DEVICE RESET FUNCTION OCCURRED */ + .ascq = 0x03, + }, + [TCM_INCORRECT_AMOUNT_OF_DATA] = { + .key = ABORTED_COMMAND, + .asc = 0x0c, /* WRITE ERROR */ + .ascq = 0x0d, /* NOT ENOUGH UNSOLICITED DATA */ + }, + [TCM_INVALID_CDB_FIELD] = { + .key = ILLEGAL_REQUEST, + .asc = 0x24, /* INVALID FIELD IN CDB */ + }, + [TCM_INVALID_PARAMETER_LIST] = { + .key = ILLEGAL_REQUEST, + .asc = 0x26, /* INVALID FIELD IN PARAMETER LIST */ + }, + [TCM_PARAMETER_LIST_LENGTH_ERROR] = { + .key = ILLEGAL_REQUEST, + .asc = 0x1a, /* PARAMETER LIST LENGTH ERROR */ + }, + [TCM_UNEXPECTED_UNSOLICITED_DATA] = { + .key = ILLEGAL_REQUEST, + .asc = 0x0c, /* WRITE ERROR */ + .ascq = 0x0c, /* UNEXPECTED_UNSOLICITED_DATA */ + }, + [TCM_SERVICE_CRC_ERROR] = { + .key = 
ABORTED_COMMAND, + .asc = 0x47, /* PROTOCOL SERVICE CRC ERROR */ + .ascq = 0x05, /* N/A */ + }, + [TCM_SNACK_REJECTED] = { + .key = ABORTED_COMMAND, + .asc = 0x11, /* READ ERROR */ + .ascq = 0x13, /* FAILED RETRANSMISSION REQUEST */ + }, + [TCM_WRITE_PROTECTED] = { + .key = DATA_PROTECT, + .asc = 0x27, /* WRITE PROTECTED */ + }, + [TCM_ADDRESS_OUT_OF_RANGE] = { + .key = ILLEGAL_REQUEST, + .asc = 0x21, /* LOGICAL BLOCK ADDRESS OUT OF RANGE */ + }, + [TCM_CHECK_CONDITION_UNIT_ATTENTION] = { + .key = UNIT_ATTENTION, + }, + [TCM_CHECK_CONDITION_NOT_READY] = { + .key = NOT_READY, + }, + [TCM_MISCOMPARE_VERIFY] = { + .key = MISCOMPARE, + .asc = 0x1d, /* MISCOMPARE DURING VERIFY OPERATION */ + .ascq = 0x00, + }, + [TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED] = { + .key = ILLEGAL_REQUEST, + .asc = 0x10, + .ascq = 0x01, /* LOGICAL BLOCK GUARD CHECK FAILED */ + .add_sector_info = true, + }, + [TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED] = { + .key = ILLEGAL_REQUEST, + .asc = 0x10, + .ascq = 0x02, /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */ + .add_sector_info = true, + }, + [TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED] = { + .key = ILLEGAL_REQUEST, + .asc = 0x10, + .ascq = 0x03, /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */ + .add_sector_info = true, + }, + [TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE] = { + /* + * Returning ILLEGAL REQUEST would cause immediate IO errors on + * Solaris initiators. Returning NOT READY instead means the + * operations will be retried a finite number of times and we + * can survive intermittent errors. 
+ */ + .key = NOT_READY, + .asc = 0x08, /* LOGICAL UNIT COMMUNICATION FAILURE */ + }, +}; + +static void translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) +{ + const struct sense_info *si; + u8 *buffer = cmd->sense_buffer; + int r = (__force int)reason; + u8 asc, ascq; + + if (r < ARRAY_SIZE(sense_info_table) && sense_info_table[r].key) + si = &sense_info_table[r]; + else + si = &sense_info_table[(__force int) + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE]; + + buffer[SPC_SENSE_KEY_OFFSET] = si->key; + if (reason == TCM_CHECK_CONDITION_UNIT_ATTENTION) { + core_scsi3_ua_for_check_condition(cmd, &asc, &ascq); + WARN_ON_ONCE(asc == 0); + } else if (si->asc == 0) { + WARN_ON_ONCE(cmd->scsi_asc == 0); + asc = cmd->scsi_asc; + ascq = cmd->scsi_ascq; + } else { + asc = si->asc; + ascq = si->ascq; + } + buffer[SPC_ASC_KEY_OFFSET] = asc; + buffer[SPC_ASCQ_KEY_OFFSET] = ascq; + if (si->add_sector_info) + transport_err_sector_info(cmd->sense_buffer, cmd->bad_sector); +} + int transport_send_check_condition_and_sense(struct se_cmd *cmd, sense_reason_t reason, int from_transport) { - unsigned char *buffer = cmd->sense_buffer; unsigned long flags; - u8 asc = 0, ascq = 0; spin_lock_irqsave(&cmd->t_state_lock, flags); if (cmd->se_cmd_flags & SCF_SENT_CHECK_CONDITION) { @@ -2644,242 +2786,13 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd, cmd->se_cmd_flags |= SCF_SENT_CHECK_CONDITION; spin_unlock_irqrestore(&cmd->t_state_lock, flags); - if (!reason && from_transport) - goto after_reason; - - if (!from_transport) + if (!from_transport) { cmd->se_cmd_flags |= SCF_EMULATED_TASK_SENSE; - - /* - * Actual SENSE DATA, see SPC-3 7.23.2 SPC_SENSE_KEY_OFFSET uses - * SENSE KEY values from include/scsi/scsi.h - */ - switch (reason) { - case TCM_NO_SENSE: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* Not Ready */ - buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY; - /* NO ADDITIONAL SENSE INFORMATION */ - buffer[SPC_ASC_KEY_OFFSET] = 
0; - buffer[SPC_ASCQ_KEY_OFFSET] = 0; - break; - case TCM_NON_EXISTENT_LUN: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* LOGICAL UNIT NOT SUPPORTED */ - buffer[SPC_ASC_KEY_OFFSET] = 0x25; - break; - case TCM_UNSUPPORTED_SCSI_OPCODE: - case TCM_SECTOR_COUNT_TOO_MANY: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* INVALID COMMAND OPERATION CODE */ - buffer[SPC_ASC_KEY_OFFSET] = 0x20; - break; - case TCM_UNKNOWN_MODE_PAGE: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* INVALID FIELD IN CDB */ - buffer[SPC_ASC_KEY_OFFSET] = 0x24; - break; - case TCM_CHECK_CONDITION_ABORT_CMD: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ABORTED COMMAND */ - buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND; - /* BUS DEVICE RESET FUNCTION OCCURRED */ - buffer[SPC_ASC_KEY_OFFSET] = 0x29; - buffer[SPC_ASCQ_KEY_OFFSET] = 0x03; - break; - case TCM_INCORRECT_AMOUNT_OF_DATA: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ABORTED COMMAND */ - buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND; - /* WRITE ERROR */ - buffer[SPC_ASC_KEY_OFFSET] = 0x0c; - /* NOT ENOUGH UNSOLICITED DATA */ - buffer[SPC_ASCQ_KEY_OFFSET] = 0x0d; - break; - case TCM_INVALID_CDB_FIELD: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* INVALID FIELD IN CDB */ - buffer[SPC_ASC_KEY_OFFSET] = 0x24; - break; - case TCM_INVALID_PARAMETER_LIST: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - 
/* INVALID FIELD IN PARAMETER LIST */ - buffer[SPC_ASC_KEY_OFFSET] = 0x26; - break; - case TCM_PARAMETER_LIST_LENGTH_ERROR: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* PARAMETER LIST LENGTH ERROR */ - buffer[SPC_ASC_KEY_OFFSET] = 0x1a; - break; - case TCM_UNEXPECTED_UNSOLICITED_DATA: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ABORTED COMMAND */ - buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND; - /* WRITE ERROR */ - buffer[SPC_ASC_KEY_OFFSET] = 0x0c; - /* UNEXPECTED_UNSOLICITED_DATA */ - buffer[SPC_ASCQ_KEY_OFFSET] = 0x0c; - break; - case TCM_SERVICE_CRC_ERROR: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ABORTED COMMAND */ - buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND; - /* PROTOCOL SERVICE CRC ERROR */ - buffer[SPC_ASC_KEY_OFFSET] = 0x47; - /* N/A */ - buffer[SPC_ASCQ_KEY_OFFSET] = 0x05; - break; - case TCM_SNACK_REJECTED: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ABORTED COMMAND */ - buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND; - /* READ ERROR */ - buffer[SPC_ASC_KEY_OFFSET] = 0x11; - /* FAILED RETRANSMISSION REQUEST */ - buffer[SPC_ASCQ_KEY_OFFSET] = 0x13; - break; - case TCM_WRITE_PROTECTED: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* DATA PROTECT */ - buffer[SPC_SENSE_KEY_OFFSET] = DATA_PROTECT; - /* WRITE PROTECTED */ - buffer[SPC_ASC_KEY_OFFSET] = 0x27; - break; - case TCM_ADDRESS_OUT_OF_RANGE: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* LOGICAL BLOCK ADDRESS OUT OF RANGE */ - buffer[SPC_ASC_KEY_OFFSET] = 0x21; - break; - case TCM_CHECK_CONDITION_UNIT_ATTENTION: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - 
/* UNIT ATTENTION */ - buffer[SPC_SENSE_KEY_OFFSET] = UNIT_ATTENTION; - core_scsi3_ua_for_check_condition(cmd, &asc, &ascq); - buffer[SPC_ASC_KEY_OFFSET] = asc; - buffer[SPC_ASCQ_KEY_OFFSET] = ascq; - break; - case TCM_CHECK_CONDITION_NOT_READY: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* Not Ready */ - buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY; - buffer[SPC_ASC_KEY_OFFSET] = cmd->scsi_asc; - buffer[SPC_ASCQ_KEY_OFFSET] = cmd->scsi_ascq; - break; - case TCM_MISCOMPARE_VERIFY: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - buffer[SPC_SENSE_KEY_OFFSET] = MISCOMPARE; - /* MISCOMPARE DURING VERIFY OPERATION */ - buffer[SPC_ASC_KEY_OFFSET] = 0x1d; - buffer[SPC_ASCQ_KEY_OFFSET] = 0x00; - break; - case TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* LOGICAL BLOCK GUARD CHECK FAILED */ - buffer[SPC_ASC_KEY_OFFSET] = 0x10; - buffer[SPC_ASCQ_KEY_OFFSET] = 0x01; - transport_err_sector_info(buffer, cmd->bad_sector); - break; - case TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */ - buffer[SPC_ASC_KEY_OFFSET] = 0x10; - buffer[SPC_ASCQ_KEY_OFFSET] = 0x02; - transport_err_sector_info(buffer, cmd->bad_sector); - break; - case TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* ILLEGAL REQUEST */ - buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST; - /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */ - buffer[SPC_ASC_KEY_OFFSET] = 0x10; - buffer[SPC_ASCQ_KEY_OFFSET] = 0x03; - transport_err_sector_info(buffer, cmd->bad_sector); - break; - case TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE: - 
default: - /* CURRENT ERROR */ - buffer[0] = 0x70; - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10; - /* - * Returning ILLEGAL REQUEST would cause immediate IO errors on - * Solaris initiators. Returning NOT READY instead means the - * operations will be retried a finite number of times and we - * can survive intermittent errors. - */ - buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY; - /* LOGICAL UNIT COMMUNICATION FAILURE */ - buffer[SPC_ASC_KEY_OFFSET] = 0x08; - break; + translate_sense_reason(cmd, reason); + cmd->scsi_status = SAM_STAT_CHECK_CONDITION; + cmd->scsi_sense_length = TRANSPORT_SENSE_BUFFER; } - /* - * This code uses linux/include/scsi/scsi.h SAM status codes! - */ - cmd->scsi_status = SAM_STAT_CHECK_CONDITION; - /* - * Automatically padded, this value is encoded in the fabric's - * data_length response PDU containing the SCSI defined sense data. - */ - cmd->scsi_sense_length = TRANSPORT_SENSE_BUFFER; -after_reason: trace_target_cmd_complete(cmd); return cmd->se_tfo->queue_status(cmd); } From 7708c1656552ddd60b9b9df3a9ee156acd1801ba Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Jul 2015 17:58:52 +0300 Subject: [PATCH 0072/1466] scsi: Move sense handling routines to scsi_common Sense data handling is also done in the target stack. Hence, move sense handling routines to scsi_common so the target will be able to use them as well. Signed-off-by: Sagi Grimberg Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Nicholas Bellinger --- drivers/scsi/scsi_common.c | 98 +++++++++++++++++++++++++++++++++++++ drivers/scsi/scsi_error.c | 99 +------------------------------------- include/scsi/scsi_common.h | 5 ++ include/scsi/scsi_eh.h | 7 +-- 4 files changed, 105 insertions(+), 104 deletions(-) diff --git a/drivers/scsi/scsi_common.c b/drivers/scsi/scsi_common.c index 2ff092252b76..41432c10dda2 100644 --- a/drivers/scsi/scsi_common.c +++ b/drivers/scsi/scsi_common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include /* NB: These are exposed through /proc/scsi/scsi and form part of the ABI. @@ -176,3 +177,100 @@ bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len, return true; } EXPORT_SYMBOL(scsi_normalize_sense); + +/** + * scsi_sense_desc_find - search for a given descriptor type in descriptor sense data format. + * @sense_buffer: byte array of descriptor format sense data + * @sb_len: number of valid bytes in sense_buffer + * @desc_type: value of descriptor type to find + * (e.g. 0 -> information) + * + * Notes: + * only valid when sense data is in descriptor format + * + * Return value: + * pointer to start of (first) descriptor if found else NULL + */ +const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len, + int desc_type) +{ + int add_sen_len, add_len, desc_len, k; + const u8 * descp; + + if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7]))) + return NULL; + if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73)) + return NULL; + add_sen_len = (add_sen_len < (sb_len - 8)) ? + add_sen_len : (sb_len - 8); + descp = &sense_buffer[8]; + for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) { + descp += desc_len; + add_len = (k < (add_sen_len - 1)) ? descp[1]: -1; + desc_len = add_len + 2; + if (descp[0] == desc_type) + return descp; + if (add_len < 0) // short descriptor ?? 
+ break; + } + return NULL; +} +EXPORT_SYMBOL(scsi_sense_desc_find); + +/** + * scsi_build_sense_buffer - build sense data in a buffer + * @desc: Sense format (non zero == descriptor format, + * 0 == fixed format) + * @buf: Where to build sense data + * @key: Sense key + * @asc: Additional sense code + * @ascq: Additional sense code qualifier + * + **/ +void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq) +{ + if (desc) { + buf[0] = 0x72; /* descriptor, current */ + buf[1] = key; + buf[2] = asc; + buf[3] = ascq; + buf[7] = 0; + } else { + buf[0] = 0x70; /* fixed, current */ + buf[2] = key; + buf[7] = 0xa; + buf[12] = asc; + buf[13] = ascq; + } +} +EXPORT_SYMBOL(scsi_build_sense_buffer); + +/** + * scsi_set_sense_information - set the information field in a + * formatted sense data buffer + * @buf: Where to build sense data + * @info: 64-bit information value to be set + * + **/ +void scsi_set_sense_information(u8 *buf, u64 info) +{ + if ((buf[0] & 0x7f) == 0x72) { + u8 *ucp, len; + + len = buf[7]; + ucp = (char *)scsi_sense_desc_find(buf, len + 8, 0); + if (!ucp) { + buf[7] = len + 0xa; + ucp = buf + 8 + len; + } + ucp[0] = 0; + ucp[1] = 0xa; + ucp[2] = 0x80; /* Valid bit */ + ucp[3] = 0; + put_unaligned_be64(info, &ucp[4]); + } else if ((buf[0] & 0x7f) == 0x70) { + buf[0] |= 0x80; + put_unaligned_be64(info, &buf[3]); + } +} +EXPORT_SYMBOL(scsi_set_sense_information); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 106884a5444e..6e6b2d26d3ce 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include @@ -34,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -2407,45 +2407,6 @@ bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd, } EXPORT_SYMBOL(scsi_command_normalize_sense); -/** - * scsi_sense_desc_find - search for a given descriptor type in descriptor sense data format. 
- * @sense_buffer: byte array of descriptor format sense data - * @sb_len: number of valid bytes in sense_buffer - * @desc_type: value of descriptor type to find - * (e.g. 0 -> information) - * - * Notes: - * only valid when sense data is in descriptor format - * - * Return value: - * pointer to start of (first) descriptor if found else NULL - */ -const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len, - int desc_type) -{ - int add_sen_len, add_len, desc_len, k; - const u8 * descp; - - if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7]))) - return NULL; - if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73)) - return NULL; - add_sen_len = (add_sen_len < (sb_len - 8)) ? - add_sen_len : (sb_len - 8); - descp = &sense_buffer[8]; - for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) { - descp += desc_len; - add_len = (k < (add_sen_len - 1)) ? descp[1]: -1; - desc_len = add_len + 2; - if (descp[0] == desc_type) - return descp; - if (add_len < 0) // short descriptor ?? 
- break; - } - return NULL; -} -EXPORT_SYMBOL(scsi_sense_desc_find); - /** * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format) * @sense_buffer: byte array of sense data @@ -2495,61 +2456,3 @@ int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, } } EXPORT_SYMBOL(scsi_get_sense_info_fld); - -/** - * scsi_build_sense_buffer - build sense data in a buffer - * @desc: Sense format (non zero == descriptor format, - * 0 == fixed format) - * @buf: Where to build sense data - * @key: Sense key - * @asc: Additional sense code - * @ascq: Additional sense code qualifier - * - **/ -void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq) -{ - if (desc) { - buf[0] = 0x72; /* descriptor, current */ - buf[1] = key; - buf[2] = asc; - buf[3] = ascq; - buf[7] = 0; - } else { - buf[0] = 0x70; /* fixed, current */ - buf[2] = key; - buf[7] = 0xa; - buf[12] = asc; - buf[13] = ascq; - } -} -EXPORT_SYMBOL(scsi_build_sense_buffer); - -/** - * scsi_set_sense_information - set the information field in a - * formatted sense data buffer - * @buf: Where to build sense data - * @info: 64-bit information value to be set - * - **/ -void scsi_set_sense_information(u8 *buf, u64 info) -{ - if ((buf[0] & 0x7f) == 0x72) { - u8 *ucp, len; - - len = buf[7]; - ucp = (char *)scsi_sense_desc_find(buf, len + 8, 0); - if (!ucp) { - buf[7] = len + 0xa; - ucp = buf + 8 + len; - } - ucp[0] = 0; - ucp[1] = 0xa; - ucp[2] = 0x80; /* Valid bit */ - ucp[3] = 0; - put_unaligned_be64(info, &ucp[4]); - } else if ((buf[0] & 0x7f) == 0x70) { - buf[0] |= 0x80; - put_unaligned_be64(info, &buf[3]); - } -} -EXPORT_SYMBOL(scsi_set_sense_information); diff --git a/include/scsi/scsi_common.h b/include/scsi/scsi_common.h index 676b03b78e57..156d673db900 100644 --- a/include/scsi/scsi_common.h +++ b/include/scsi/scsi_common.h @@ -61,4 +61,9 @@ static inline bool scsi_sense_valid(const struct scsi_sense_hdr *sshdr) extern bool scsi_normalize_sense(const u8 
*sense_buffer, int sb_len, struct scsi_sense_hdr *sshdr); +extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq); +extern void scsi_set_sense_information(u8 *buf, u64 info); +extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len, + int desc_type); + #endif /* _SCSI_COMMON_H_ */ diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h index 4942710ef720..dbb8c640e26f 100644 --- a/include/scsi/scsi_eh.h +++ b/include/scsi/scsi_eh.h @@ -4,6 +4,7 @@ #include #include +#include struct scsi_device; struct Scsi_Host; @@ -21,15 +22,9 @@ static inline bool scsi_sense_is_deferred(const struct scsi_sense_hdr *sshdr) return ((sshdr->response_code >= 0x70) && (sshdr->response_code & 1)); } -extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len, - int desc_type); - extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, u64 * info_out); -extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq); -extern void scsi_set_sense_information(u8 *buf, u64 info); - extern int scsi_ioctl_reset(struct scsi_device *, int __user *); struct scsi_eh_save { From 9ec1e1ce3a0f854b9150e7888a373392fbbe7442 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Jul 2015 17:58:53 +0300 Subject: [PATCH 0073/1466] target: Use scsi helpers to build the sense data correctly Instead of open coding the sense buffer construction, use scsi scsi_build_sense_buffer() and scsi_set_sense_information() helpers which moved to scsi_common. Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_spc.c | 31 +++++--------------------- drivers/target/target_core_transport.c | 21 ++++------------- 2 files changed, 9 insertions(+), 43 deletions(-) diff --git a/drivers/target/target_core_spc.c b/drivers/target/target_core_spc.c index b0744433315a..c43dcbf2d48e 100644 --- a/drivers/target/target_core_spc.c +++ b/drivers/target/target_core_spc.c @@ -1157,32 +1157,11 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd) if (!rbuf) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq)) { - /* - * CURRENT ERROR, UNIT ATTENTION - */ - buf[0] = 0x70; - buf[SPC_SENSE_KEY_OFFSET] = UNIT_ATTENTION; - - /* - * The Additional Sense Code (ASC) from the UNIT ATTENTION - */ - buf[SPC_ASC_KEY_OFFSET] = ua_asc; - buf[SPC_ASCQ_KEY_OFFSET] = ua_ascq; - buf[7] = 0x0A; - } else { - /* - * CURRENT ERROR, NO SENSE - */ - buf[0] = 0x70; - buf[SPC_SENSE_KEY_OFFSET] = NO_SENSE; - - /* - * NO ADDITIONAL SENSE INFORMATION - */ - buf[SPC_ASC_KEY_OFFSET] = 0x00; - buf[7] = 0x0A; - } + if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq)) + scsi_build_sense_buffer(0, buf, UNIT_ATTENTION, + ua_asc, ua_ascq); + else + scsi_build_sense_buffer(0, buf, NO_SENSE, 0x0, 0x0); memcpy(rbuf, buf, min_t(u32, sizeof(buf), cmd->data_length)); transport_kunmap_data_sg(cmd); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 6ef44c9db381..f528a9def65a 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2615,19 +2616,6 @@ bool transport_wait_for_tasks(struct se_cmd *cmd) } EXPORT_SYMBOL(transport_wait_for_tasks); -static -void transport_err_sector_info(unsigned char *buffer, sector_t bad_sector) -{ - /* Place failed LBA in sense data information descriptor 0. 
*/ - buffer[SPC_ADD_SENSE_LEN_OFFSET] = 0xc; - buffer[SPC_DESC_TYPE_OFFSET] = 0; /* Information */ - buffer[SPC_ADDITIONAL_DESC_LEN_OFFSET] = 0xa; - buffer[SPC_VALIDITY_OFFSET] = 0x80; - - /* Descriptor Information: failing sector */ - put_unaligned_be64(bad_sector, &buffer[12]); -} - struct sense_info { u8 key; u8 asc; @@ -2754,7 +2742,6 @@ static void translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) si = &sense_info_table[(__force int) TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE]; - buffer[SPC_SENSE_KEY_OFFSET] = si->key; if (reason == TCM_CHECK_CONDITION_UNIT_ATTENTION) { core_scsi3_ua_for_check_condition(cmd, &asc, &ascq); WARN_ON_ONCE(asc == 0); @@ -2766,10 +2753,10 @@ static void translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) asc = si->asc; ascq = si->ascq; } - buffer[SPC_ASC_KEY_OFFSET] = asc; - buffer[SPC_ASCQ_KEY_OFFSET] = ascq; + + scsi_build_sense_buffer(0, buffer, si->key, asc, ascq); if (si->add_sector_info) - transport_err_sector_info(cmd->sense_buffer, cmd->bad_sector); + scsi_set_sense_information(buffer, cmd->bad_sector); } int From 734ca5c467842186bf836d0b33379a51cfe259da Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Jul 2015 17:58:54 +0300 Subject: [PATCH 0074/1466] target: Return ABORTED_COMMAND sense key for PI errors PI errors were reported with ILLEGAL_REQUEST sense key but there was actually no problem with the request. Target detected PI errors should be reported with aborted command sense key. Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index f528a9def65a..2bece607ca0f 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2700,19 +2700,19 @@ static const struct sense_info sense_info_table[] = { .ascq = 0x00, }, [TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED] = { - .key = ILLEGAL_REQUEST, + .key = ABORTED_COMMAND, .asc = 0x10, .ascq = 0x01, /* LOGICAL BLOCK GUARD CHECK FAILED */ .add_sector_info = true, }, [TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED] = { - .key = ILLEGAL_REQUEST, + .key = ABORTED_COMMAND, .asc = 0x10, .ascq = 0x02, /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */ .add_sector_info = true, }, [TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED] = { - .key = ILLEGAL_REQUEST, + .key = ABORTED_COMMAND, .asc = 0x10, .ascq = 0x03, /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */ .add_sector_info = true, From 3e963b2d3c93e0546e911d681f37d35f0f79b54f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Jul 2015 07:33:07 -0700 Subject: [PATCH 0075/1466] tcm_qla2xxx: Remove set-but-not-used variables Detected these by building with W=1. This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Acked-by: Himanshu Madhani Cc: Quinn Tran Cc: Saurav Kashyap Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/tcm_qla2xxx.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index d9a8c6084346..474d0c0e6dba 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -1367,9 +1367,7 @@ static void tcm_qla2xxx_free_session(struct qla_tgt_sess *sess) struct qla_hw_data *ha = tgt->ha; scsi_qla_host_t *vha = pci_get_drvdata(ha->pdev); struct se_session *se_sess; - struct se_node_acl *se_nacl; struct tcm_qla2xxx_lport *lport; - struct tcm_qla2xxx_nacl *nacl; BUG_ON(in_interrupt()); @@ -1379,8 +1377,6 @@ static void tcm_qla2xxx_free_session(struct qla_tgt_sess *sess) dump_stack(); return; } - se_nacl = se_sess->se_node_acl; - nacl = container_of(se_nacl, struct tcm_qla2xxx_nacl, se_node_acl); lport = vha->vha_tgt.target_lport_ptr; if (!lport) { @@ -1684,7 +1680,6 @@ static int tcm_qla2xxx_lport_register_npiv_cb(struct scsi_qla_host *base_vha, (struct tcm_qla2xxx_lport *)target_lport_ptr; struct tcm_qla2xxx_lport *base_lport = (struct tcm_qla2xxx_lport *)base_vha->vha_tgt.target_lport_ptr; - struct tcm_qla2xxx_tpg *base_tpg; struct fc_vport_identifiers vport_id; if (!qla_tgt_mode_enabled(base_vha)) { @@ -1697,7 +1692,6 @@ static int tcm_qla2xxx_lport_register_npiv_cb(struct scsi_qla_host *base_vha, pr_err("qla2xxx base_lport or tpg_1 not available\n"); return -EPERM; } - base_tpg = base_lport->tpg_1; memset(&vport_id, 0, sizeof(vport_id)); vport_id.port_name = npiv_wwpn; From 12306b425d0dbab7b60f54e02d67cf3dfae494d1 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 15 Jul 2015 10:55:36 +0300 Subject: [PATCH 0076/1466] scsi: Fix wrong additional sense length in descriptor format The sense header additional sense length should be the accumulated size of all the descriptors. Information descriptor size is 12 bytes. 
When setting the additional sense length we should add 0xc instead of 0xa. Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Nicholas Bellinger --- drivers/scsi/scsi_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_common.c b/drivers/scsi/scsi_common.c index 41432c10dda2..ee6bdf43a8ea 100644 --- a/drivers/scsi/scsi_common.c +++ b/drivers/scsi/scsi_common.c @@ -260,7 +260,7 @@ void scsi_set_sense_information(u8 *buf, u64 info) len = buf[7]; ucp = (char *)scsi_sense_desc_find(buf, len + 8, 0); if (!ucp) { - buf[7] = len + 0xa; + buf[7] = len + 0xc; ucp = buf + 8 + len; } ucp[0] = 0; From f5a8b3a796db01b639435515b3adc003b9f27387 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 15 Jul 2015 10:55:37 +0300 Subject: [PATCH 0077/1466] scsi: Protect against buffer possible overflow in scsi_set_sense_information Make sure that the input sense buffer has sufficient length to fit the information descriptor (12 additional bytes). Modify scsi_set_sense_information to receive the sense buffer length and adjust its callers scsi target and libata. (Fix patch fuzz in scsi_set_sense_information - nab) Reported-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Cc: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Nicholas Bellinger --- drivers/ata/libata-scsi.c | 4 +++- drivers/scsi/scsi_common.c | 13 ++++++++++++- drivers/target/target_core_transport.c | 14 +++++++++++--- include/scsi/scsi_common.h | 2 +- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 3131adcc1f87..2fb7c79e727f 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -289,7 +289,9 @@ void ata_scsi_set_sense_information(struct scsi_cmnd *cmd, return; information = ata_tf_read_block(tf, NULL); - scsi_set_sense_information(cmd->sense_buffer, information); + scsi_set_sense_information(cmd->sense_buffer, + SCSI_SENSE_BUFFERSIZE, + information); } static ssize_t diff --git a/drivers/scsi/scsi_common.c b/drivers/scsi/scsi_common.c index ee6bdf43a8ea..c126966130ab 100644 --- a/drivers/scsi/scsi_common.c +++ b/drivers/scsi/scsi_common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -249,10 +250,13 @@ EXPORT_SYMBOL(scsi_build_sense_buffer); * scsi_set_sense_information - set the information field in a * formatted sense data buffer * @buf: Where to build sense data + * @buf_len: buffer length * @info: 64-bit information value to be set * + * Return value: + * 0 on success or EINVAL for invalid sense buffer length **/ -void scsi_set_sense_information(u8 *buf, u64 info) +int scsi_set_sense_information(u8 *buf, int buf_len, u64 info) { if ((buf[0] & 0x7f) == 0x72) { u8 *ucp, len; @@ -263,6 +267,11 @@ void scsi_set_sense_information(u8 *buf, u64 info) buf[7] = len + 0xc; ucp = buf + 8 + len; } + + if (buf_len < len + 0xc) + /* Not enough room for info */ + return -EINVAL; + ucp[0] = 0; ucp[1] = 0xa; ucp[2] = 0x80; /* Valid bit */ @@ -272,5 +281,7 @@ void scsi_set_sense_information(u8 *buf, u64 info) buf[0] |= 0x80; put_unaligned_be64(info, &buf[3]); } + + return 0; } EXPORT_SYMBOL(scsi_set_sense_information); diff --git 
a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 2bece607ca0f..7fb031bbcc8d 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2729,7 +2729,7 @@ static const struct sense_info sense_info_table[] = { }, }; -static void translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) +static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) { const struct sense_info *si; u8 *buffer = cmd->sense_buffer; @@ -2756,7 +2756,11 @@ static void translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) scsi_build_sense_buffer(0, buffer, si->key, asc, ascq); if (si->add_sector_info) - scsi_set_sense_information(buffer, cmd->bad_sector); + return scsi_set_sense_information(buffer, + cmd->scsi_sense_length, + cmd->bad_sector); + + return 0; } int @@ -2774,10 +2778,14 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd, spin_unlock_irqrestore(&cmd->t_state_lock, flags); if (!from_transport) { + int rc; + cmd->se_cmd_flags |= SCF_EMULATED_TASK_SENSE; - translate_sense_reason(cmd, reason); cmd->scsi_status = SAM_STAT_CHECK_CONDITION; cmd->scsi_sense_length = TRANSPORT_SENSE_BUFFER; + rc = translate_sense_reason(cmd, reason); + if (rc) + return rc; } trace_target_cmd_complete(cmd); diff --git a/include/scsi/scsi_common.h b/include/scsi/scsi_common.h index 156d673db900..11571b2a831e 100644 --- a/include/scsi/scsi_common.h +++ b/include/scsi/scsi_common.h @@ -62,7 +62,7 @@ extern bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len, struct scsi_sense_hdr *sshdr); extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq); -extern void scsi_set_sense_information(u8 *buf, u64 info); +int scsi_set_sense_information(u8 *buf, int buf_len, u64 info); extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len, int desc_type); From 4e4937e8aefde8d49340e803ebbedcdf4b43e5f0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: 
Thu, 16 Jul 2015 10:28:05 +0300 Subject: [PATCH 0078/1466] target: Return descriptor format sense data in case the LU spans 64bit sectors In case a LU spans 64bit sectors, fixed size sense data information field is only 32 bits which means the sector information will be truncated. Thus, if the LU spans 64bit sectors, use descriptor format sense data to correctly report sector information. Reported-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Sagi Grimberg Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_hba.c | 5 +++++ drivers/target/target_core_spc.c | 12 +++++++++--- drivers/target/target_core_transport.c | 3 ++- include/target/target_core_backend.h | 2 ++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/target/target_core_hba.c b/drivers/target/target_core_hba.c index 62ea4e8e70a8..d746a3a4a623 100644 --- a/drivers/target/target_core_hba.c +++ b/drivers/target/target_core_hba.c @@ -176,3 +176,8 @@ core_delete_hba(struct se_hba *hba) kfree(hba); return 0; } + +bool target_sense_desc_format(struct se_device *dev) +{ + return dev->transport->get_blocks(dev) > U32_MAX; +} diff --git a/drivers/target/target_core_spc.c b/drivers/target/target_core_spc.c index c43dcbf2d48e..b949d335a6ba 100644 --- a/drivers/target/target_core_spc.c +++ b/drivers/target/target_core_spc.c @@ -761,7 +761,12 @@ static int spc_modesense_control(struct se_cmd *cmd, u8 pc, u8 *p) if (pc == 1) goto out; - p[2] = 2; + /* GLTSD: No implicit save of log parameters */ + p[2] = (1 << 1); + if (target_sense_desc_format(dev)) + /* D_SENSE: Descriptor format sense data for 64bit sectors */ + p[2] |= (1 << 2); + /* * From spc4r23, 7.4.7 Control mode page * @@ -1144,6 +1149,7 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd) unsigned char *rbuf; u8 ua_asc = 0, ua_ascq = 0; unsigned char buf[SE_SENSE_BUF]; + bool desc_format = 
target_sense_desc_format(cmd->se_dev); memset(buf, 0, SE_SENSE_BUF); @@ -1158,10 +1164,10 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq)) - scsi_build_sense_buffer(0, buf, UNIT_ATTENTION, + scsi_build_sense_buffer(desc_format, buf, UNIT_ATTENTION, ua_asc, ua_ascq); else - scsi_build_sense_buffer(0, buf, NO_SENSE, 0x0, 0x0); + scsi_build_sense_buffer(desc_format, buf, NO_SENSE, 0x0, 0x0); memcpy(rbuf, buf, min_t(u32, sizeof(buf), cmd->data_length)); transport_kunmap_data_sg(cmd); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7fb031bbcc8d..98155db28365 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2735,6 +2735,7 @@ static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) u8 *buffer = cmd->sense_buffer; int r = (__force int)reason; u8 asc, ascq; + bool desc_format = target_sense_desc_format(cmd->se_dev); if (r < ARRAY_SIZE(sense_info_table) && sense_info_table[r].key) si = &sense_info_table[r]; @@ -2754,7 +2755,7 @@ static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason) ascq = si->ascq; } - scsi_build_sense_buffer(0, buffer, si->key, asc, ascq); + scsi_build_sense_buffer(desc_format, buffer, si->key, asc, ascq); if (si->add_sector_info) return scsi_set_sense_information(buffer, cmd->scsi_sense_length, diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 1e5c8f949bae..56cf8e485ef2 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -93,4 +93,6 @@ bool target_lun_is_rdonly(struct se_cmd *); sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd, sense_reason_t (*exec_cmd)(struct se_cmd *cmd)); +bool target_sense_desc_format(struct se_device *dev); + #endif /* TARGET_CORE_BACKEND_H */ From 
a73c2a2f9123605022bedbd2b59ca7e76036f0b3 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 15 Jul 2015 10:55:39 +0300 Subject: [PATCH 0079/1466] libiscsi: Use scsi helper to set information descriptor In case encountered a PI error, use scsi_set_sense_information instead of open coding information descriptor format. Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Nicholas Bellinger --- drivers/scsi/libiscsi.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 8053f24f0349..bb5ca7f3d16d 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -853,12 +853,9 @@ static void iscsi_scsi_cmd_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr, SAM_STAT_CHECK_CONDITION; scsi_build_sense_buffer(1, sc->sense_buffer, ILLEGAL_REQUEST, 0x10, ascq); - sc->sense_buffer[7] = 0xc; /* Additional sense length */ - sc->sense_buffer[8] = 0; /* Information desc type */ - sc->sense_buffer[9] = 0xa; /* Additional desc length */ - sc->sense_buffer[10] = 0x80; /* Validity bit */ - - put_unaligned_be64(sector, &sc->sense_buffer[12]); + scsi_set_sense_information(sc->sense_buffer, + SCSI_SENSE_BUFFERSIZE, + sector); goto out; } } From 7deef550f3a7d44c1d52a6d54f824e7e180c08ae Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:10 -0600 Subject: [PATCH 0080/1466] toshiba_acpi: Adapt /proc/acpi/toshiba/keys to TOS1900 devices Since the introduction of TOS1900 devices support to the driver, the "keys" entry under the proc directory was broken, given that it only handled TOS620X devices accordingly. This patch adapts the code to show the hotkey values of TOS1900 devices too, and in case some programs are still using that interface, hotkeys reporting should now work on these devices. 
Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 56 +++++++++++------------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 3ad7b1fa24ce..c3a0c4d0c1dc 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1499,32 +1499,10 @@ static const struct file_operations fan_proc_fops = { static int keys_proc_show(struct seq_file *m, void *v) { struct toshiba_acpi_dev *dev = m->private; - u32 hci_result; - u32 value; - - if (!dev->key_event_valid && dev->system_event_supported) { - hci_result = hci_read(dev, HCI_SYSTEM_EVENT, &value); - if (hci_result == TOS_SUCCESS) { - dev->key_event_valid = 1; - dev->last_key_event = value; - } else if (hci_result == TOS_FIFO_EMPTY) { - /* Better luck next time */ - } else if (hci_result == TOS_NOT_SUPPORTED) { - /* - * This is a workaround for an unresolved issue on - * some machines where system events sporadically - * become disabled. 
- */ - hci_result = hci_write(dev, HCI_SYSTEM_EVENT, 1); - pr_notice("Re-enabled hotkeys\n"); - } else { - pr_err("Error reading hotkey status\n"); - return -EIO; - } - } seq_printf(m, "hotkey_ready: %d\n", dev->key_event_valid); seq_printf(m, "hotkey: 0x%04x\n", dev->last_key_event); + return 0; } @@ -2361,22 +2339,28 @@ static void toshiba_acpi_report_hotkey(struct toshiba_acpi_dev *dev, static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev) { - u32 hci_result, value; - int retries = 3; - int scancode; - if (dev->info_supported) { - scancode = toshiba_acpi_query_hotkey(dev); - if (scancode < 0) + int scancode = toshiba_acpi_query_hotkey(dev); + + if (scancode < 0) { pr_err("Failed to query hotkey event\n"); - else if (scancode != 0) + } else if (scancode != 0) { toshiba_acpi_report_hotkey(dev, scancode); + dev->key_event_valid = 1; + dev->last_key_event = scancode; + } } else if (dev->system_event_supported) { + u32 result; + u32 value; + int retries = 3; + do { - hci_result = hci_read(dev, HCI_SYSTEM_EVENT, &value); - switch (hci_result) { + result = hci_read(dev, HCI_SYSTEM_EVENT, &value); + switch (result) { case TOS_SUCCESS: toshiba_acpi_report_hotkey(dev, (int)value); + dev->key_event_valid = 1; + dev->last_key_event = value; break; case TOS_NOT_SUPPORTED: /* @@ -2384,15 +2368,15 @@ static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev) * issue on some machines where system events * sporadically become disabled. 
*/ - hci_result = - hci_write(dev, HCI_SYSTEM_EVENT, 1); - pr_notice("Re-enabled hotkeys\n"); + result = hci_write(dev, HCI_SYSTEM_EVENT, 1); + if (result == TOS_SUCCESS) + pr_notice("Re-enabled hotkeys\n"); /* Fall through */ default: retries--; break; } - } while (retries && hci_result != TOS_FIFO_EMPTY); + } while (retries && result != TOS_FIFO_EMPTY); } } From fc5462f8525b47fa219452289ecb22c921c16823 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:11 -0600 Subject: [PATCH 0081/1466] toshiba_acpi: Add /dev/toshiba_acpi device There were previous attempts to "merge" the toshiba SMM module to the toshiba_acpi one, they were trying to imitate what the old toshiba module does, however, some models (TOS1900 devices) come with a "crippled" implementation and do not provide all the "features" a "genuine" Toshiba BIOS does. This patch adds a new device called toshiba_acpi, which aim is to enable userspace to access the SMM on Toshiba laptops via ACPI calls. Creating a new convenience _IOWR command to access the SCI functions by opening/closing the SCI internally to avoid buggy BIOS, while at the same time providing backwards compatibility. Older programs (and new) who wish to access the SMM on newer models can do it by pointing their path to /dev/toshiba_acpi (instead of /dev/toshiba) as the toshiba.h header was modified to reflect these changes as well as adds all the toshiba_acpi paths and command, however, it is strongly recommended to use the new IOCTL for any SCI command to avoid any buggy BIOS. 
Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- Documentation/ioctl/ioctl-number.txt | 2 +- drivers/platform/x86/toshiba_acpi.c | 91 ++++++++++++++++++++++++++++ include/uapi/linux/toshiba.h | 32 +++++++++- 3 files changed, 121 insertions(+), 4 deletions(-) diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 611c52267d24..21d2f27c886b 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -263,7 +263,7 @@ Code Seq#(hex) Include File Comments 's' all linux/cdk.h 't' 00-7F linux/ppp-ioctl.h 't' 80-8F linux/isdn_ppp.h -'t' 90 linux/toshiba.h +'t' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM 'u' 00-1F linux/smb_fs.h gone 'u' 20-3F linux/uvcvideo.h USB video class host driver 'v' 00-1F linux/ext2_fs.h conflict! diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index c3a0c4d0c1dc..802577f43a23 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -50,6 +50,8 @@ #include #include #include +#include +#include #include MODULE_AUTHOR("John Belmonte"); @@ -170,6 +172,7 @@ struct toshiba_acpi_dev { struct led_classdev led_dev; struct led_classdev kbd_led; struct led_classdev eco_led; + struct miscdevice miscdev; int force_fan; int last_key_event; @@ -2239,6 +2242,81 @@ static struct attribute_group toshiba_attr_group = { .attrs = toshiba_attributes, }; +/* + * Misc device + */ +static int toshiba_acpi_smm_bridge(SMMRegisters *regs) +{ + u32 in[TCI_WORDS] = { regs->eax, regs->ebx, regs->ecx, + regs->edx, regs->esi, regs->edi }; + u32 out[TCI_WORDS]; + acpi_status status; + + status = tci_raw(toshiba_acpi, in, out); + if (ACPI_FAILURE(status)) { + pr_err("ACPI call to query SMM registers failed\n"); + return -EIO; + } + + /* Fillout the SMM struct with the TCI call results */ + regs->eax = out[0]; + regs->ebx = out[1]; + regs->ecx = out[2]; + regs->edx = out[3]; + regs->esi = out[4]; + regs->edi = 
out[5]; + + return 0; +} + +static long toshiba_acpi_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + SMMRegisters __user *argp = (SMMRegisters __user *)arg; + SMMRegisters regs; + int ret; + + if (!argp) + return -EINVAL; + + switch (cmd) { + case TOSH_SMM: + if (copy_from_user(&regs, argp, sizeof(SMMRegisters))) + return -EFAULT; + ret = toshiba_acpi_smm_bridge(&regs); + if (ret) + return ret; + if (copy_to_user(argp, &regs, sizeof(SMMRegisters))) + return -EFAULT; + break; + case TOSHIBA_ACPI_SCI: + if (copy_from_user(&regs, argp, sizeof(SMMRegisters))) + return -EFAULT; + /* Ensure we are being called with a SCI_{GET, SET} register */ + if (regs.eax != SCI_GET && regs.eax != SCI_SET) + return -EINVAL; + if (!sci_open(toshiba_acpi)) + return -EIO; + ret = toshiba_acpi_smm_bridge(&regs); + sci_close(toshiba_acpi); + if (ret) + return ret; + if (copy_to_user(argp, &regs, sizeof(SMMRegisters))) + return -EFAULT; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct file_operations toshiba_acpi_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = toshiba_acpi_ioctl, + .llseek = noop_llseek, +}; + /* * Hotkeys */ @@ -2540,6 +2618,8 @@ static int toshiba_acpi_remove(struct acpi_device *acpi_dev) { struct toshiba_acpi_dev *dev = acpi_driver_data(acpi_dev); + misc_deregister(&dev->miscdev); + remove_toshiba_proc_entries(dev); if (dev->sysfs_created) @@ -2611,6 +2691,17 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) return -ENOMEM; dev->acpi_dev = acpi_dev; dev->method_hci = hci_method; + dev->miscdev.minor = MISC_DYNAMIC_MINOR; + dev->miscdev.name = "toshiba_acpi"; + dev->miscdev.fops = &toshiba_acpi_fops; + + ret = misc_register(&dev->miscdev); + if (ret) { + pr_err("Failed to register miscdevice\n"); + kfree(dev); + return ret; + } + acpi_dev->driver_data = dev; dev_set_drvdata(&acpi_dev->dev, dev); diff --git a/include/uapi/linux/toshiba.h b/include/uapi/linux/toshiba.h index e9bef5b2f91e..c58bf4b5bb26 100644 --- 
a/include/uapi/linux/toshiba.h +++ b/include/uapi/linux/toshiba.h @@ -1,6 +1,7 @@ /* toshiba.h -- Linux driver for accessing the SMM on Toshiba laptops * * Copyright (c) 1996-2000 Jonathan A. Buzzard (jonathan@buzzard.org.uk) + * Copyright (c) 2015 Azael Avalos * * Thanks to Juergen Heinzl for the pointers * on making sure the structure is aligned and packed. @@ -20,9 +21,18 @@ #ifndef _UAPI_LINUX_TOSHIBA_H #define _UAPI_LINUX_TOSHIBA_H -#define TOSH_PROC "/proc/toshiba" -#define TOSH_DEVICE "/dev/toshiba" -#define TOSH_SMM _IOWR('t', 0x90, int) /* broken: meant 24 bytes */ +/* + * Toshiba modules paths + */ + +#define TOSH_PROC "/proc/toshiba" +#define TOSH_DEVICE "/dev/toshiba" +#define TOSHIBA_ACPI_PROC "/proc/acpi/toshiba" +#define TOSHIBA_ACPI_DEVICE "/dev/toshiba_acpi" + +/* + * Toshiba SMM structure + */ typedef struct { unsigned int eax; @@ -33,5 +43,21 @@ typedef struct { unsigned int edi __attribute__ ((packed)); } SMMRegisters; +/* + * IOCTLs (0x90 - 0x91) + */ + +#define TOSH_SMM _IOWR('t', 0x90, SMMRegisters) +/* + * Convenience toshiba_acpi command. + * + * The System Configuration Interface (SCI) is opened/closed internally + * to avoid userspace of buggy BIOSes. + * + * The toshiba_acpi module checks whether the eax register is set with + * SCI_GET (0xf300) or SCI_SET (0xf400), returning -EINVAL if not. 
+ */ +#define TOSHIBA_ACPI_SCI _IOWR('t', 0x91, SMMRegisters) + #endif /* _UAPI_LINUX_TOSHIBA_H */ From a88bc06e5aec4984f5bf01c6d410a0939134f737 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:12 -0600 Subject: [PATCH 0082/1466] toshiba_acpi: Avoid registering input device on WMI event laptops Commit f11f999e9890 ("toshiba_acpi: Refuse to load on machines with buggy INFO implementations") denied loading on laptops with a WMI Event GUID given that such laptops manage the hotkeys via that interface, however, such laptops have a working Toshiba Configuration Interface (TCI), and thus, such commit denied several supported features. This patch avoids registering the input device and ignores all hotkey events on laptops with such WMI Event GUID, making the supported features found in those laptops to work. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 802577f43a23..48b16b323c89 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -2466,6 +2466,11 @@ static int toshiba_acpi_setup_keyboard(struct toshiba_acpi_dev *dev) u32 hci_result; int error; + if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) { + pr_info("WMI event detected, hotkeys will not be monitored\n"); + return 0; + } + error = toshiba_acpi_enable_hotkeys(dev); if (error) return error; @@ -2813,6 +2818,14 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) switch (event) { case 0x80: /* Hotkeys and some system events */ + /* + * Machines with this WMI GUID aren't supported due to bugs in + * their AML. + * + * Return silently to avoid triggering a netlink event. 
+ */ + if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) + return; toshiba_acpi_process_hotkeys(dev); break; case 0x81: /* Dock events */ @@ -2899,14 +2912,6 @@ static int __init toshiba_acpi_init(void) { int ret; - /* - * Machines with this WMI guid aren't supported due to bugs in - * their AML. This check relies on wmi initializing before - * toshiba_acpi to guarantee guids have been identified. - */ - if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) - return -ENODEV; - toshiba_proc_dir = proc_mkdir(PROC_TOSHIBA, acpi_root_dir); if (!toshiba_proc_dir) { pr_err("Unable to create proc dir " PROC_TOSHIBA "\n"); From 695f6060903cefa08ffb78433136f51ac0f94488 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 18:09:13 -0600 Subject: [PATCH 0083/1466] toshiba_acpi: Transflective backlight updates This patch changes the tr function second parameter from bool to u32, to be on par with the rest of the TCI functions of the driver, and the code was updated accordingly. Also, the check for transflective support was moved to the *add function, as the {__get, set}_lcd_brightness functions make use of it. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 30 +++++++++++------------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 48b16b323c89..649786de4a79 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1187,22 +1187,17 @@ static int toshiba_hotkey_event_type_get(struct toshiba_acpi_dev *dev, } /* Transflective Backlight */ -static int get_tr_backlight_status(struct toshiba_acpi_dev *dev, bool *enabled) +static int get_tr_backlight_status(struct toshiba_acpi_dev *dev, u32 *status) { - u32 hci_result; - u32 status; + u32 hci_result = hci_read(dev, HCI_TR_BACKLIGHT, status); - hci_result = hci_read(dev, HCI_TR_BACKLIGHT, &status); - *enabled = !status; return hci_result == TOS_SUCCESS ?
0 : -EIO; } -static int set_tr_backlight_status(struct toshiba_acpi_dev *dev, bool enable) +static int set_tr_backlight_status(struct toshiba_acpi_dev *dev, u32 status) { - u32 hci_result; - u32 value = !enable; + u32 hci_result = hci_write(dev, HCI_TR_BACKLIGHT, !status); - hci_result = hci_write(dev, HCI_TR_BACKLIGHT, value); return hci_result == TOS_SUCCESS ? 0 : -EIO; } @@ -1216,12 +1211,11 @@ static int __get_lcd_brightness(struct toshiba_acpi_dev *dev) int brightness = 0; if (dev->tr_backlight_supported) { - bool enabled; - int ret = get_tr_backlight_status(dev, &enabled); + int ret = get_tr_backlight_status(dev, &value); if (ret) return ret; - if (enabled) + if (value) return 0; brightness++; } @@ -1271,8 +1265,7 @@ static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value) u32 hci_result; if (dev->tr_backlight_supported) { - bool enable = !value; - int ret = set_tr_backlight_status(dev, enable); + int ret = set_tr_backlight_status(dev, !value); if (ret) return ret; @@ -2563,7 +2556,6 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev) struct backlight_properties props; int brightness; int ret; - bool enabled; /* * Some machines don't support the backlight methods at all, and @@ -2580,10 +2572,6 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev) return 0; } - /* Determine whether or not BIOS supports transflective backlight */ - ret = get_tr_backlight_status(dev, &enabled); - dev->tr_backlight_supported = !ret; - /* * Tell acpi-video-detect code to prefer vendor backlight on all * systems with transflective backlight and on dmi matched systems. 
@@ -2723,6 +2711,10 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) if (toshiba_acpi_setup_keyboard(dev)) pr_info("Unable to activate hotkeys\n"); + /* Determine whether or not BIOS supports transflective backlight */ + ret = get_tr_backlight_status(dev, &dummy); + dev->tr_backlight_supported = !ret; + ret = toshiba_acpi_setup_backlight(dev); if (ret) goto error; From d7e4f2e2ca392bce468718bcbba808108d81d501 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 19:37:47 -0600 Subject: [PATCH 0084/1466] toshiba_acpi: Remove unused wireless defines Commit 2b74103547b4 ("toshiba_acpi: Remove bluetooth rfkill code") removed bluetooth related code, however, the wireless defines were not removed and are unused. This patch simply removes those defines as there is no code using them. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 649786de4a79..90d8cb1c2e27 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -113,7 +113,6 @@ MODULE_LICENSE("GPL"); #define HCI_VIDEO_OUT 0x001c #define HCI_HOTKEY_EVENT 0x001e #define HCI_LCD_BRIGHTNESS 0x002a -#define HCI_WIRELESS 0x0056 #define HCI_ACCELEROMETER 0x006d #define HCI_KBD_ILLUMINATION 0x0095 #define HCI_ECO_MODE 0x0097 @@ -142,10 +141,6 @@ MODULE_LICENSE("GPL"); #define HCI_VIDEO_OUT_LCD 0x1 #define HCI_VIDEO_OUT_CRT 0x2 #define HCI_VIDEO_OUT_TV 0x4 -#define HCI_WIRELESS_KILL_SWITCH 0x01 -#define HCI_WIRELESS_BT_PRESENT 0x0f -#define HCI_WIRELESS_BT_ATTACH 0x40 -#define HCI_WIRELESS_BT_POWER 0x80 #define SCI_KBD_MODE_MASK 0x1f #define SCI_KBD_MODE_FNZ 0x1 #define SCI_KBD_MODE_AUTO 0x2 From d50c9005d32b4eda6e11f7ec4f1b00a93088e0ca Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 19:37:46 -0600 Subject: [PATCH 0085/1466] toshiba_acpi: Reorder toshiba_acpi_alt_keymap 
entries This patch simply reorders the entries found in the new keymap by ascending order, this is simply a cosmetic change, no functionality was modified. Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 90d8cb1c2e27..6013a11caeea 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -246,16 +246,16 @@ static const struct key_entry toshiba_acpi_keymap[] = { }; static const struct key_entry toshiba_acpi_alt_keymap[] = { - { KE_KEY, 0x157, { KEY_MUTE } }, { KE_KEY, 0x102, { KEY_ZOOMOUT } }, { KE_KEY, 0x103, { KEY_ZOOMIN } }, { KE_KEY, 0x12c, { KEY_KBDILLUMTOGGLE } }, { KE_KEY, 0x139, { KEY_ZOOMRESET } }, - { KE_KEY, 0x13e, { KEY_SWITCHVIDEOMODE } }, { KE_KEY, 0x13c, { KEY_BRIGHTNESSDOWN } }, { KE_KEY, 0x13d, { KEY_BRIGHTNESSUP } }, - { KE_KEY, 0x158, { KEY_WLAN } }, + { KE_KEY, 0x13e, { KEY_SWITCHVIDEOMODE } }, { KE_KEY, 0x13f, { KEY_TOUCHPAD_TOGGLE } }, + { KE_KEY, 0x157, { KEY_MUTE } }, + { KE_KEY, 0x158, { KEY_WLAN } }, { KE_END, 0 }, }; From 1e574dbfadafd9fd1f2a414efb731d7538277e71 Mon Sep 17 00:00:00 2001 From: Azael Avalos Date: Wed, 22 Jul 2015 19:37:49 -0600 Subject: [PATCH 0086/1466] toshiba_acpi: Change some variables to avoid warnings from ninja-check This patch changes some variables to avoid warnings from ninja-check. We are basically moving some variables inside the conditionals where such variables are being used, and we are checking the returned values of some others. 
Signed-off-by: Azael Avalos Signed-off-by: Darren Hart --- drivers/platform/x86/toshiba_acpi.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 6013a11caeea..3bfdfddc38ac 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1651,7 +1651,6 @@ static ssize_t kbd_backlight_mode_store(struct device *dev, { struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev); int mode; - int time; int ret; @@ -1682,7 +1681,7 @@ static ssize_t kbd_backlight_mode_store(struct device *dev, /* Only make a change if the actual mode has changed */ if (toshiba->kbd_mode != mode) { /* Shift the time to "base time" (0x3c0000 == 60 seconds) */ - time = toshiba->kbd_time << HCI_MISC_SHIFT; + int time = toshiba->kbd_time << HCI_MISC_SHIFT; /* OR the "base time" to the actual method format */ if (toshiba->kbd_type == 1) { @@ -2856,10 +2855,14 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) static int toshiba_acpi_suspend(struct device *device) { struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device)); - u32 result; - if (dev->hotkey_dev) + if (dev->hotkey_dev) { + u32 result; + result = hci_write(dev, HCI_HOTKEY_EVENT, HCI_HOTKEY_DISABLE); + if (result != TOS_SUCCESS) + pr_info("Unable to disable hotkeys\n"); + } return 0; } @@ -2867,10 +2870,10 @@ static int toshiba_acpi_suspend(struct device *device) static int toshiba_acpi_resume(struct device *device) { struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device)); - int error; if (dev->hotkey_dev) { - error = toshiba_acpi_enable_hotkeys(dev); + int error = toshiba_acpi_enable_hotkeys(dev); + if (error) pr_info("Unable to re-enable hotkeys\n"); } From 5a063d87e97df28ca0b00807bc4d6fa11c5a5107 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 20 Jul 2015 09:56:05 +0200 Subject: [PATCH 0087/1466] pwm: sysfs: Properly convert from 
enum to string The current code will check for polarity in a boolean way. While it is correct that polarity is either normal or inversed, make it more obvious that it's an enumeration by using a switch statement and explicit matches on the enumeration values. Signed-off-by: Thierry Reding --- drivers/pwm/sysfs.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c index ac0abecfbaa0..fbfc9e903230 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -133,9 +133,19 @@ static ssize_t pwm_polarity_show(struct device *child, char *buf) { const struct pwm_device *pwm = child_to_pwm_device(child); + const char *polarity = "unknown"; - return sprintf(buf, "%s\n", - pwm_get_polarity(pwm) ? "inversed" : "normal"); + switch (pwm_get_polarity(pwm)) { + case PWM_POLARITY_NORMAL: + polarity = "normal"; + break; + + case PWM_POLARITY_INVERSED: + polarity = "inversed"; + break; + } + + return sprintf(buf, "%s\n", polarity); } static ssize_t pwm_polarity_store(struct device *child, From 412820dd536fe2d01a35f0d68982ea225ec255b3 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 20 Jul 2015 09:58:09 +0200 Subject: [PATCH 0088/1466] pwm: sysfs: Remove unnecessary padding Padding initializers so that assignment operators align is bound to lead to inconsistencies or churn. Single spaces around the assignment is just fine. 
Signed-off-by: Thierry Reding --- drivers/pwm/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c index fbfc9e903230..c472772f00a7 100644 --- a/drivers/pwm/sysfs.c +++ b/drivers/pwm/sysfs.c @@ -312,9 +312,9 @@ static struct attribute *pwm_chip_attrs[] = { ATTRIBUTE_GROUPS(pwm_chip); static struct class pwm_class = { - .name = "pwm", - .owner = THIS_MODULE, - .dev_groups = pwm_chip_groups, + .name = "pwm", + .owner = THIS_MODULE, + .dev_groups = pwm_chip_groups, }; static int pwmchip_sysfs_match(struct device *parent, const void *data) From 5e32940621eb62064d98f42c9889db71b0368bde Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 11 Jul 2015 10:02:46 -0400 Subject: [PATCH 0089/1466] libnvdimm, btt: sparse fix Fix: drivers/nvdimm/btt.c:635:29: warning: restricted __le64 degrades to integer Signed-off-by: Dan Williams --- drivers/nvdimm/btt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 411c7b2bb37a..552f1c4f4dc6 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -632,8 +632,9 @@ static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, arena->logoff = arena_off + le64_to_cpu(super->logoff); arena->info2off = arena_off + le64_to_cpu(super->info2off); - arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) : - (arena->info2off - arena->infooff + BTT_PG_SIZE); + arena->size = (le64_to_cpu(super->nextoff) > 0) + ? (le64_to_cpu(super->nextoff)) + : (arena->info2off - arena->infooff + BTT_PG_SIZE); arena->flags = le32_to_cpu(super->flags); } From ec92777f2ba93c00387b8fe53780c25adc57c744 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:35 -0600 Subject: [PATCH 0090/1466] libnvdimm: Update name of the ars_status_record mask field The spec suggests that this is a simple 'length' field, not a mask. Update the name accordingly. 
Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 2b94ea2287bb..e94bc20016b2 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -87,7 +87,7 @@ struct nd_cmd_ars_status { __u32 handle; __u32 flags; __u64 err_address; - __u64 mask; + __u64 length; } __packed records[0]; } __packed; From 39c686b862cdb2049b90e095b6c6c727b2a7ab60 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 9 Jul 2015 13:25:36 -0600 Subject: [PATCH 0091/1466] libnvdimm: Add DSM support for Address Range Scrub commands Add support for the three ARS DSM commands: - Query ARS Capabilities - Queries the firmware to check if a given range supports scrub, and if so, which type (persistent vs. volatile) - Start ARS - Starts a scrub for a given range/type - Query ARS Status - Checks status of a previously started scrub, and provides the error logs if any. The commands are described by the example DSM spec at: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Also add these commands to the nfit_test test framework, and return canned data. 
Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 1 + drivers/acpi/nfit.h | 1 + include/uapi/linux/ndctl.h | 10 ++ tools/testing/nvdimm/test/nfit.c | 199 ++++++++++++++++++++++--------- 4 files changed, 152 insertions(+), 59 deletions(-) diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 628a42c41ab1..ef8a664db254 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -868,6 +868,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) struct acpi_device *adev; int i; + nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en; adev = to_acpi_dev(acpi_desc); if (!adev) return; diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index 79b6d83875c1..f2c2bb751882 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -107,6 +107,7 @@ struct acpi_nfit_desc { struct nvdimm_bus *nvdimm_bus; struct device *dev; unsigned long dimm_dsm_force_en; + unsigned long bus_dsm_force_en; int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, void *iobuf, u64 len, int rw); }; diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index e94bc20016b2..5b4a4be06e2b 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -111,6 +111,11 @@ enum { ND_CMD_VENDOR = 9, }; +enum { + ND_ARS_VOLATILE = 1, + ND_ARS_PERSISTENT = 2, +}; + static inline const char *nvdimm_bus_cmd_name(unsigned cmd) { static const char * const names[] = { @@ -194,4 +199,9 @@ enum nd_driver_flags { enum { ND_MIN_NAMESPACE_SIZE = 0x00400000, }; + +enum ars_masks { + ARS_STATUS_MASK = 0x0000FFFF, + ARS_EXT_STATUS_SHIFT = 16, +}; #endif /* __NDCTL_H__ */ diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index d0bdae40ccc9..28dba918524e 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -147,75 +147,153 @@ static struct nfit_test *to_nfit_test(struct device *dev) return container_of(pdev, struct nfit_test, pdev); } +static int 
nfit_test_cmd_get_config_size(struct nd_cmd_get_config_size *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->status = 0; + nd_cmd->config_size = LABEL_SIZE; + nd_cmd->max_xfer = SZ_4K; + + return 0; +} + +static int nfit_test_cmd_get_config_data(struct nd_cmd_get_config_data_hdr + *nd_cmd, unsigned int buf_len, void *label) +{ + unsigned int len, offset = nd_cmd->in_offset; + int rc; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) + return -EINVAL; + + nd_cmd->status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(nd_cmd->out_buf, label + offset, len); + rc = buf_len - sizeof(*nd_cmd) - len; + + return rc; +} + +static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd, + unsigned int buf_len, void *label) +{ + unsigned int len, offset = nd_cmd->in_offset; + u32 *status; + int rc; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) + return -EINVAL; + + status = (void *)nd_cmd + nd_cmd->in_length + sizeof(*nd_cmd); + *status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(label + offset, nd_cmd->in_buf, len); + rc = buf_len - sizeof(*nd_cmd) - (len + 4); + + return rc; +} + +static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->max_ars_out = 256; + nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16; + + return 0; +} + +static int nfit_test_cmd_ars_start(struct nd_cmd_ars_start *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + + nd_cmd->status = 0; + + return 0; +} + +static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd, + unsigned int buf_len) +{ + if (buf_len < sizeof(*nd_cmd)) + return 
-EINVAL; + + nd_cmd->out_length = 256; + nd_cmd->num_records = 0; + nd_cmd->status = 0; + + return 0; +} + static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc); - struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); - int i, rc; + int i, rc = 0; - if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) - return -ENOTTY; + if (nvdimm) { + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); - /* lookup label space for the given dimm */ - for (i = 0; i < ARRAY_SIZE(handle); i++) - if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i]) + if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) + return -ENOTTY; + + /* lookup label space for the given dimm */ + for (i = 0; i < ARRAY_SIZE(handle); i++) + if (__to_nfit_memdev(nfit_mem)->device_handle == + handle[i]) + break; + if (i >= ARRAY_SIZE(handle)) + return -ENXIO; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + rc = nfit_test_cmd_get_config_size(buf, buf_len); break; - if (i >= ARRAY_SIZE(handle)) - return -ENXIO; + case ND_CMD_GET_CONFIG_DATA: + rc = nfit_test_cmd_get_config_data(buf, buf_len, + t->label[i]); + break; + case ND_CMD_SET_CONFIG_DATA: + rc = nfit_test_cmd_set_config_data(buf, buf_len, + t->label[i]); + break; + default: + return -ENOTTY; + } + } else { + if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask)) + return -ENOTTY; - switch (cmd) { - case ND_CMD_GET_CONFIG_SIZE: { - struct nd_cmd_get_config_size *nd_cmd = buf; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - nd_cmd->status = 0; - nd_cmd->config_size = LABEL_SIZE; - nd_cmd->max_xfer = SZ_4K; - rc = 0; - break; - } - case ND_CMD_GET_CONFIG_DATA: { - struct nd_cmd_get_config_data_hdr *nd_cmd = buf; - unsigned int len, offset = nd_cmd->in_offset; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - if (offset >= 
LABEL_SIZE) - return -EINVAL; - if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) - return -EINVAL; - - nd_cmd->status = 0; - len = min(nd_cmd->in_length, LABEL_SIZE - offset); - memcpy(nd_cmd->out_buf, t->label[i] + offset, len); - rc = buf_len - sizeof(*nd_cmd) - len; - break; - } - case ND_CMD_SET_CONFIG_DATA: { - struct nd_cmd_set_config_hdr *nd_cmd = buf; - unsigned int len, offset = nd_cmd->in_offset; - u32 *status; - - if (buf_len < sizeof(*nd_cmd)) - return -EINVAL; - if (offset >= LABEL_SIZE) - return -EINVAL; - if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) - return -EINVAL; - - status = buf + nd_cmd->in_length + sizeof(*nd_cmd); - *status = 0; - len = min(nd_cmd->in_length, LABEL_SIZE - offset); - memcpy(t->label[i] + offset, nd_cmd->in_buf, len); - rc = buf_len - sizeof(*nd_cmd) - (len + 4); - break; - } - default: - return -ENOTTY; + switch (cmd) { + case ND_CMD_ARS_CAP: + rc = nfit_test_cmd_ars_cap(buf, buf_len); + break; + case ND_CMD_ARS_START: + rc = nfit_test_cmd_ars_start(buf, buf_len); + break; + case ND_CMD_ARS_STATUS: + rc = nfit_test_cmd_ars_status(buf, buf_len); + break; + default: + return -ENOTTY; + } } return rc; @@ -876,6 +954,9 @@ static void nfit_test0_setup(struct nfit_test *t) set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en); set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en); + set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en); nd_desc = &acpi_desc->nd_desc; nd_desc->ndctl = nfit_test_ctl; } From 6b47496a6fc81816e7edaf8224dfb88e402a05f5 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 23 Jul 2015 11:58:48 -0600 Subject: [PATCH 0092/1466] libnvdimm, pmem: Change pmem physical sector size to PAGE_SIZE Based on a patch: c8fa317 brd: Request from fdisk 4k alignment by Boaz Harrosh, allow fdisk to create properly 
aligned partitions for DAX. This will also cause mkfs.ext4 to emit a warning if using a file system block size of less than PAGE_SIZE. Cc: Dan Williams Cc: Ross Zwisler Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Elliott, Robert Signed-off-by: Vishal Verma Acked-by: Boaz Harrosh Acked-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ade9eb917a4d..bcf48f133443 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -162,6 +162,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns, return -ENOMEM; blk_queue_make_request(pmem->pmem_queue, pmem_make_request); + blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue); From 60e95f43fc8573e81f54b0c1e0bc542c2260d956 Mon Sep 17 00:00:00 2001 From: Linda Knippers Date: Wed, 22 Jul 2015 16:17:22 -0400 Subject: [PATCH 0093/1466] nfit: Don't check _STA on NVDIMM devices The _STA only applies to the root device, not the individual NVDIMMS, so don't check here. NVDIMM device state flags are checked elsewhere. 
Signed-off-by: Linda Knippers Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index ef8a664db254..7c2638f914a9 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -764,9 +764,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct acpi_device *adev, *adev_dimm; struct device *dev = acpi_desc->dev; const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM); - unsigned long long sta; - int i, rc = -ENODEV; - acpi_status status; + int i; nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en; adev = to_acpi_dev(acpi_desc); @@ -781,25 +779,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return force_enable_dimms ? 0 : -ENODEV; } - status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta); - if (status == AE_NOT_FOUND) { - dev_dbg(dev, "%s missing _STA, assuming enabled...\n", - dev_name(&adev_dimm->dev)); - rc = 0; - } else if (ACPI_FAILURE(status)) - dev_err(dev, "%s failed to retrieve_STA, disabling...\n", - dev_name(&adev_dimm->dev)); - else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0) - dev_info(dev, "%s disabled by firmware\n", - dev_name(&adev_dimm->dev)); - else - rc = 0; - for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++) if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i)) set_bit(i, &nfit_mem->dsm_mask); - return force_enable_dimms ? 0 : rc; + return 0; } static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) From 730daa164e7c7e31c08fab940549f4acc3329432 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Jul 2015 18:02:48 -0700 Subject: [PATCH 0094/1466] Yama: remove needless CONFIG_SECURITY_YAMA_STACKED Now that minor LSMs can cleanly stack with major LSMs, remove the unneeded config for Yama to be made to explicitly stack. Just selecting the main Yama CONFIG will allow it to work, regardless of the major LSM. 
Since distros using Yama are already forcing it to stack, this is effectively a no-op change. Additionally add MAINTAINERS entry. Signed-off-by: Kees Cook Signed-off-by: James Morris --- Documentation/security/Yama.txt | 10 ++++----- MAINTAINERS | 6 +++++ arch/mips/configs/pistachio_defconfig | 1 - include/linux/lsm_hooks.h | 6 +++-- security/Kconfig | 5 ----- security/security.c | 11 +++------ security/yama/Kconfig | 9 +------- security/yama/yama_lsm.c | 32 +++++++++------------------ 8 files changed, 28 insertions(+), 52 deletions(-) diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt index 227a63f018a2..d9ee7d7a6c7f 100644 --- a/Documentation/security/Yama.txt +++ b/Documentation/security/Yama.txt @@ -1,9 +1,7 @@ -Yama is a Linux Security Module that collects a number of system-wide DAC -security protections that are not handled by the core kernel itself. To -select it at boot time, specify "security=yama" (though this will disable -any other LSM). - -Yama is controlled through sysctl in /proc/sys/kernel/yama: +Yama is a Linux Security Module that collects system-wide DAC security +protections that are not handled by the core kernel itself. 
This is +selectable at build-time with CONFIG_SECURITY_YAMA, and can be controlled +at run-time through sysctls in /proc/sys/kernel/yama: - ptrace_scope diff --git a/MAINTAINERS b/MAINTAINERS index a2264167791a..f8be2f797197 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9102,6 +9102,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git S: Supported F: security/apparmor/ +YAMA SECURITY MODULE +M: Kees Cook +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git yama/tip +S: Supported +F: security/yama/ + SENSABLE PHANTOM M: Jiri Slaby S: Maintained diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig index 1646cce032c3..642b50946943 100644 --- a/arch/mips/configs/pistachio_defconfig +++ b/arch/mips/configs/pistachio_defconfig @@ -320,7 +320,6 @@ CONFIG_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_YAMA_STACKED=y CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_CRYPTO_AUTHENC=y CONFIG_CRYPTO_HMAC=y diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 9429f054c323..ec3a6bab29de 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1881,8 +1881,10 @@ static inline void security_delete_hooks(struct security_hook_list *hooks, extern int __init security_module_enable(const char *module); extern void __init capability_add_hooks(void); -#ifdef CONFIG_SECURITY_YAMA_STACKED -void __init yama_add_hooks(void); +#ifdef CONFIG_SECURITY_YAMA +extern void __init yama_add_hooks(void); +#else +static inline void __init yama_add_hooks(void) { } #endif #endif /* ! 
__LINUX_LSM_HOOKS_H */ diff --git a/security/Kconfig b/security/Kconfig index bf4ec46474b6..e45237897b43 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -132,7 +132,6 @@ choice default DEFAULT_SECURITY_SMACK if SECURITY_SMACK default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO default DEFAULT_SECURITY_APPARMOR if SECURITY_APPARMOR - default DEFAULT_SECURITY_YAMA if SECURITY_YAMA default DEFAULT_SECURITY_DAC help @@ -151,9 +150,6 @@ choice config DEFAULT_SECURITY_APPARMOR bool "AppArmor" if SECURITY_APPARMOR=y - config DEFAULT_SECURITY_YAMA - bool "Yama" if SECURITY_YAMA=y - config DEFAULT_SECURITY_DAC bool "Unix Discretionary Access Controls" @@ -165,7 +161,6 @@ config DEFAULT_SECURITY default "smack" if DEFAULT_SECURITY_SMACK default "tomoyo" if DEFAULT_SECURITY_TOMOYO default "apparmor" if DEFAULT_SECURITY_APPARMOR - default "yama" if DEFAULT_SECURITY_YAMA default "" if DEFAULT_SECURITY_DAC endmenu diff --git a/security/security.c b/security/security.c index 595fffab48b0..e693ffcf9266 100644 --- a/security/security.c +++ b/security/security.c @@ -56,18 +56,13 @@ int __init security_init(void) pr_info("Security Framework initialized\n"); /* - * Always load the capability module. + * Load minor LSMs, with the capability module always first. */ capability_add_hooks(); -#ifdef CONFIG_SECURITY_YAMA_STACKED - /* - * If Yama is configured for stacking load it next. - */ yama_add_hooks(); -#endif + /* - * Load the chosen module if there is one. - * This will also find yama if it is stacking + * Load all the remaining security modules. */ do_security_initcalls(); diff --git a/security/yama/Kconfig b/security/yama/Kconfig index 3123e1da2fed..90c605eea892 100644 --- a/security/yama/Kconfig +++ b/security/yama/Kconfig @@ -6,14 +6,7 @@ config SECURITY_YAMA This selects Yama, which extends DAC support with additional system-wide security settings beyond regular Linux discretionary access controls. Currently available is ptrace scope restriction. 
+ Like capabilities, this security module stacks with other LSMs. Further information can be found in Documentation/security/Yama.txt. If you are unsure how to answer this question, answer N. - -config SECURITY_YAMA_STACKED - bool "Yama stacked with other LSMs" - depends on SECURITY_YAMA - default n - help - When Yama is built into the kernel, force it to stack with the - selected primary LSM. diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 9ed32502470e..d3c19c970a06 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -353,11 +353,6 @@ static struct security_hook_list yama_hooks[] = { LSM_HOOK_INIT(task_free, yama_task_free), }; -void __init yama_add_hooks(void) -{ - security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks)); -} - #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -396,25 +391,18 @@ static struct ctl_table yama_sysctl_table[] = { }, { } }; -#endif /* CONFIG_SYSCTL */ - -static __init int yama_init(void) +static void __init yama_init_sysctl(void) { -#ifndef CONFIG_SECURITY_YAMA_STACKED - /* - * If yama is being stacked this is already taken care of. 
- */ - if (!security_module_enable("yama")) - return 0; -#endif - pr_info("Yama: becoming mindful.\n"); - -#ifdef CONFIG_SYSCTL if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table)) panic("Yama: sysctl registration failed.\n"); -#endif - - return 0; } +#else +static inline void yama_init_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ -security_initcall(yama_init); +void __init yama_add_hooks(void) +{ + pr_info("Yama: becoming mindful.\n"); + security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks)); + yama_init_sysctl(); +} From 21abb1ec414c75abe32c3854848ff30e2b4a6113 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 22 Jul 2015 14:25:31 -0700 Subject: [PATCH 0095/1466] Smack: IPv6 host labeling IPv6 appears to be (finally) coming of age with the influx of autonomous devices. In support of this, add the ability to associate a Smack label with IPv6 addresses. This patch also cleans up some of the conditional compilation associated with the introduction of secmark processing. It's now more obvious which bit of code goes with which feature. Signed-off-by: Casey Schaufler --- Documentation/security/Smack.txt | 27 +- security/smack/smack.h | 48 +++- security/smack/smack_lsm.c | 262 +++++++++++++------ security/smack/smackfs.c | 428 +++++++++++++++++++++++++------ 4 files changed, 604 insertions(+), 161 deletions(-) diff --git a/Documentation/security/Smack.txt b/Documentation/security/Smack.txt index de5e1aeca7fb..5e6d07fbed07 100644 --- a/Documentation/security/Smack.txt +++ b/Documentation/security/Smack.txt @@ -28,6 +28,10 @@ Smack kernels use the CIPSO IP option. Some network configurations are intolerant of IP options and can impede access to systems that use them as Smack does. +Smack is used in the Tizen operating system. Please +go to http://wiki.tizen.org for information about how +Smack is used in Tizen. + The current git repository for Smack user space is: git://github.com/smack-team/smack.git @@ -108,6 +112,8 @@ in the smackfs filesystem. 
This pseudo-filesystem is mounted on /sys/fs/smackfs. access + Provided for backward compatibility. The access2 interface + is preferred and should be used instead. This interface reports whether a subject with the specified Smack label has a particular access to an object with a specified Smack label. Write a fixed format access rule to @@ -136,6 +142,8 @@ change-rule those in the fourth string. If there is no such rule it will be created using the access specified in the third and the fourth strings. cipso + Provided for backward compatibility. The cipso2 interface + is preferred and should be used instead. This interface allows a specific CIPSO header to be assigned to a Smack label. The format accepted on write is: "%24s%4d%4d"["%4d"]... @@ -157,7 +165,19 @@ direct doi This contains the CIPSO domain of interpretation used in network packets. +ipv6host + This interface allows specific IPv6 internet addresses to be + treated as single label hosts. Packets are sent to single + label hosts only from processes that have Smack write access + to the host label. All packets received from single label hosts + are given the specified label. The format accepted on write is: + "%h:%h:%h:%h:%h:%h:%h:%h label" or + "%h:%h:%h:%h:%h:%h:%h:%h/%d label". + The "::" address shortcut is not supported. + If label is "-DELETE" a matched entry will be deleted. load + Provided for backward compatibility. The load2 interface + is preferred and should be used instead. This interface allows access control rules in addition to the system defined rules to be specified. The format accepted on write is: @@ -181,6 +201,8 @@ load2 permissions that are not allowed. The string "r-x--" would specify read and execute access. load-self + Provided for backward compatibility. The load-self2 interface + is preferred and should be used instead. This interface allows process specific access rules to be defined. 
These rules are only consulted if access would otherwise be permitted, and are intended to provide additional @@ -205,6 +227,8 @@ netlabel received from single label hosts are given the specified label. The format accepted on write is: "%d.%d.%d.%d label" or "%d.%d.%d.%d/%d label". + If the label specified is "-CIPSO" the address is treated + as a host that supports CIPSO headers. onlycap This contains labels processes must have for CAP_MAC_ADMIN and CAP_MAC_OVERRIDE to be effective. If this file is empty @@ -232,7 +256,8 @@ unconfined is dangerous and can ruin the proper labeling of your system. It should never be used in production. -You can add access rules in /etc/smack/accesses. They take the form: +If you are using the smackload utility +you can add access rules in /etc/smack/accesses. They take the form: subjectlabel objectlabel access diff --git a/security/smack/smack.h b/security/smack/smack.h index 69ab9eb7d6d9..fff0c612bbb7 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -17,11 +17,26 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif /* CONFIG_IPV6 */ #include #include #include #include +/* + * Use IPv6 port labeling if IPv6 is enabled and secmarks + * are not being used. + */ +#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#define SMACK_IPV6_PORT_LABELING 1 +#endif + +#if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_SECURITY_SMACK_NETFILTER) +#define SMACK_IPV6_SECMARK_LABELING 1 +#endif + /* * Smack labels were limited to 23 characters for a long time. */ @@ -118,15 +133,30 @@ struct smack_rule { }; /* - * An entry in the table identifying hosts. + * An entry in the table identifying IPv4 hosts. 
*/ -struct smk_netlbladdr { +struct smk_net4addr { struct list_head list; - struct sockaddr_in smk_host; /* network address */ + struct in_addr smk_host; /* network address */ struct in_addr smk_mask; /* network mask */ + int smk_masks; /* mask size */ struct smack_known *smk_label; /* label */ }; +#if IS_ENABLED(CONFIG_IPV6) +/* + * An entry in the table identifying IPv6 hosts. + */ +struct smk_net6addr { + struct list_head list; + struct in6_addr smk_host; /* network address */ + struct in6_addr smk_mask; /* network mask */ + int smk_masks; /* mask size */ + struct smack_known *smk_label; /* label */ +}; +#endif /* CONFIG_IPV6 */ + +#ifdef SMACK_IPV6_PORT_LABELING /* * An entry in the table identifying ports. */ @@ -137,6 +167,7 @@ struct smk_port_label { struct smack_known *smk_in; /* inbound label */ struct smack_known *smk_out; /* outgoing label */ }; +#endif /* SMACK_IPV6_PORT_LABELING */ struct smack_onlycap { struct list_head list; @@ -170,6 +201,7 @@ enum { #define SMK_FSROOT "smackfsroot=" #define SMK_FSTRANS "smackfstransmute=" +#define SMACK_DELETE_OPTION "-DELETE" #define SMACK_CIPSO_OPTION "-CIPSO" /* @@ -252,10 +284,6 @@ struct smk_audit_info { struct smack_audit_data sad; #endif }; -/* - * These functions are in smack_lsm.c - */ -struct inode_smack *new_inode_smack(struct smack_known *); /* * These functions are in smack_access.c @@ -285,7 +313,6 @@ extern struct smack_known *smack_syslog_label; #ifdef CONFIG_SECURITY_SMACK_BRINGUP extern struct smack_known *smack_unconfined; #endif -extern struct smack_known smack_cipso_option; extern int smack_ptrace_rule; extern struct smack_known smack_known_floor; @@ -297,7 +324,10 @@ extern struct smack_known smack_known_web; extern struct mutex smack_known_lock; extern struct list_head smack_known_list; -extern struct list_head smk_netlbladdr_list; +extern struct list_head smk_net4addr_list; +#if IS_ENABLED(CONFIG_IPV6) +extern struct list_head smk_net6addr_list; +#endif /* CONFIG_IPV6 */ extern struct mutex 
smack_onlycap_lock; extern struct list_head smack_onlycap_list; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index d962f887d3f4..cc390bccecd7 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -51,9 +51,9 @@ #define SMK_RECEIVING 1 #define SMK_SENDING 2 -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#ifdef SMACK_IPV6_PORT_LABELING LIST_HEAD(smk_ipv6_port_list); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif static struct kmem_cache *smack_inode_cache; int smack_enabled; @@ -2272,7 +2272,7 @@ static void smack_sk_free_security(struct sock *sk) } /** -* smack_host_label - check host based restrictions +* smack_ipv4host_label - check host based restrictions * @sip: the object end * * looks for host based access restrictions @@ -2283,30 +2283,96 @@ static void smack_sk_free_security(struct sock *sk) * * Returns the label of the far end or NULL if it's not special. */ -static struct smack_known *smack_host_label(struct sockaddr_in *sip) +static struct smack_known *smack_ipv4host_label(struct sockaddr_in *sip) { - struct smk_netlbladdr *snp; + struct smk_net4addr *snp; struct in_addr *siap = &sip->sin_addr; if (siap->s_addr == 0) return NULL; - list_for_each_entry_rcu(snp, &smk_netlbladdr_list, list) + list_for_each_entry_rcu(snp, &smk_net4addr_list, list) + /* + * we break after finding the first match because + * the list is sorted from longest to shortest mask + * so we have found the most specific match + */ + if (snp->smk_host.s_addr == + (siap->s_addr & snp->smk_mask.s_addr)) + return snp->smk_label; + + return NULL; +} + +#if IS_ENABLED(CONFIG_IPV6) +/* + * smk_ipv6_localhost - Check for local ipv6 host address + * @sip: the address + * + * Returns boolean true if this is the localhost address + */ +static bool smk_ipv6_localhost(struct sockaddr_in6 *sip) +{ + __be16 *be16p = (__be16 *)&sip->sin6_addr; + __be32 *be32p = (__be32 *)&sip->sin6_addr; + + if (be32p[0] == 
0 && be32p[1] == 0 && be32p[2] == 0 && be16p[6] == 0 && + ntohs(be16p[7]) == 1) + return true; + return false; +} + +/** +* smack_ipv6host_label - check host based restrictions +* @sip: the object end +* +* looks for host based access restrictions +* +* This version will only be appropriate for really small sets of single label +* hosts. The caller is responsible for ensuring that the RCU read lock is +* taken before calling this function. +* +* Returns the label of the far end or NULL if it's not special. +*/ +static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip) +{ + struct smk_net6addr *snp; + struct in6_addr *sap = &sip->sin6_addr; + int i; + int found = 0; + + /* + * It's local. Don't look for a host label. + */ + if (smk_ipv6_localhost(sip)) + return NULL; + + list_for_each_entry_rcu(snp, &smk_net6addr_list, list) { /* * we break after finding the first match because * the list is sorted from longest to shortest mask * so we have found the most specific match */ - if ((&snp->smk_host.sin_addr)->s_addr == - (siap->s_addr & (&snp->smk_mask)->s_addr)) { - /* we have found the special CIPSO option */ - if (snp->smk_label == &smack_cipso_option) - return NULL; - return snp->smk_label; + for (found = 1, i = 0; i < 8; i++) { + /* + * If the label is NULL the entry has + * been renounced. Ignore it. 
+ */ + if (snp->smk_label == NULL) + continue; + if ((sap->s6_addr16[i] & snp->smk_mask.s6_addr16[i]) != + snp->smk_host.s6_addr16[i]) { + found = 0; + break; + } } + if (found) + return snp->smk_label; + } return NULL; } +#endif /* CONFIG_IPV6 */ /** * smack_netlabel - Set the secattr on a socket @@ -2370,7 +2436,7 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) struct smk_audit_info ad; rcu_read_lock(); - hkp = smack_host_label(sap); + hkp = smack_ipv4host_label(sap); if (hkp != NULL) { #ifdef CONFIG_AUDIT struct lsm_network_audit net; @@ -2395,7 +2461,42 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) return smack_netlabel(sk, sk_lbl); } -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#if IS_ENABLED(CONFIG_IPV6) +/** + * smk_ipv6_check - check Smack access + * @subject: subject Smack label + * @object: object Smack label + * @address: address + * @act: the action being taken + * + * Check an IPv6 access + */ +static int smk_ipv6_check(struct smack_known *subject, + struct smack_known *object, + struct sockaddr_in6 *address, int act) +{ +#ifdef CONFIG_AUDIT + struct lsm_network_audit net; +#endif + struct smk_audit_info ad; + int rc; + +#ifdef CONFIG_AUDIT + smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); + ad.a.u.net->family = PF_INET6; + ad.a.u.net->dport = ntohs(address->sin6_port); + if (act == SMK_RECEIVING) + ad.a.u.net->v6info.saddr = address->sin6_addr; + else + ad.a.u.net->v6info.daddr = address->sin6_addr; +#endif + rc = smk_access(subject, object, MAY_WRITE, &ad); + rc = smk_bu_note("IPv6 check", subject, object, MAY_WRITE, rc); + return rc; +} +#endif /* CONFIG_IPV6 */ + +#ifdef SMACK_IPV6_PORT_LABELING /** * smk_ipv6_port_label - Smack port access table management * @sock: socket @@ -2479,48 +2580,43 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, 
int act) { - __be16 *bep; - __be32 *be32p; struct smk_port_label *spp; struct socket_smack *ssp = sk->sk_security; - struct smack_known *skp; - unsigned short port = 0; + struct smack_known *skp = NULL; + unsigned short port; struct smack_known *object; - struct smk_audit_info ad; - int rc; -#ifdef CONFIG_AUDIT - struct lsm_network_audit net; -#endif if (act == SMK_RECEIVING) { - skp = smack_net_ambient; + skp = smack_ipv6host_label(address); object = ssp->smk_in; } else { skp = ssp->smk_out; - object = smack_net_ambient; + object = smack_ipv6host_label(address); } /* - * Get the IP address and port from the address. + * The other end is a single label host. */ - port = ntohs(address->sin6_port); - bep = (__be16 *)(&address->sin6_addr); - be32p = (__be32 *)(&address->sin6_addr); + if (skp != NULL && object != NULL) + return smk_ipv6_check(skp, object, address, act); + if (skp == NULL) + skp = smack_net_ambient; + if (object == NULL) + object = smack_net_ambient; /* * It's remote, so port lookup does no good. */ - if (be32p[0] || be32p[1] || be32p[2] || bep[6] || ntohs(bep[7]) != 1) - goto auditout; + if (!smk_ipv6_localhost(address)) + return smk_ipv6_check(skp, object, address, act); /* * It's local so the send check has to have passed. 
*/ - if (act == SMK_RECEIVING) { - skp = &smack_known_web; - goto auditout; - } + if (act == SMK_RECEIVING) + return 0; + port = ntohs(address->sin6_port); list_for_each_entry(spp, &smk_ipv6_port_list, list) { if (spp->smk_port != port) continue; @@ -2530,22 +2626,9 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, break; } -auditout: - -#ifdef CONFIG_AUDIT - smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); - ad.a.u.net->family = sk->sk_family; - ad.a.u.net->dport = port; - if (act == SMK_RECEIVING) - ad.a.u.net->v6info.saddr = address->sin6_addr; - else - ad.a.u.net->v6info.daddr = address->sin6_addr; -#endif - rc = smk_access(skp, object, MAY_WRITE, &ad); - rc = smk_bu_note("IPv6 port check", skp, object, MAY_WRITE, rc); - return rc; + return smk_ipv6_check(skp, object, address, act); } -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_PORT_LABELING */ /** * smack_inode_setsecurity - set smack xattrs @@ -2606,10 +2689,10 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name, } else return -EOPNOTSUPP; -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#ifdef SMACK_IPV6_PORT_LABELING if (sock->sk->sk_family == PF_INET6) smk_ipv6_port_label(sock, NULL); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif return 0; } @@ -2651,7 +2734,7 @@ static int smack_socket_post_create(struct socket *sock, int family, return smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET); } -#ifndef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_PORT_LABELING /** * smack_socket_bind - record port binding information. 
* @sock: the socket @@ -2665,14 +2748,11 @@ static int smack_socket_post_create(struct socket *sock, int family, static int smack_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { -#if IS_ENABLED(CONFIG_IPV6) if (sock->sk != NULL && sock->sk->sk_family == PF_INET6) smk_ipv6_port_label(sock, address); -#endif - return 0; } -#endif /* !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_PORT_LABELING */ /** * smack_socket_connect - connect access check @@ -2688,6 +2768,13 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, int addrlen) { int rc = 0; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *sip = (struct sockaddr_in6 *)sap; +#endif +#ifdef SMACK_IPV6_SECMARK_LABELING + struct smack_known *rsp; + struct socket_smack *ssp = sock->sk->sk_security; +#endif if (sock->sk == NULL) return 0; @@ -2701,10 +2788,15 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, case PF_INET6: if (addrlen < sizeof(struct sockaddr_in6)) return -EINVAL; -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) - rc = smk_ipv6_port_check(sock->sk, (struct sockaddr_in6 *)sap, +#ifdef SMACK_IPV6_SECMARK_LABELING + rsp = smack_ipv6host_label(sip); + if (rsp != NULL) + rc = smk_ipv6_check(ssp->smk_out, rsp, sip, SMK_CONNECTING); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif +#ifdef SMACK_IPV6_PORT_LABELING + rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); +#endif break; } return rc; @@ -3590,9 +3682,13 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { struct sockaddr_in *sip = (struct sockaddr_in *) msg->msg_name; -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sap = (struct sockaddr_in6 *) msg->msg_name; -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif +#ifdef SMACK_IPV6_SECMARK_LABELING + struct socket_smack *ssp = 
sock->sk->sk_security; + struct smack_known *rsp; +#endif int rc = 0; /* @@ -3606,9 +3702,15 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg, rc = smack_netlabel_send(sock->sk, sip); break; case AF_INET6: -#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER) +#ifdef SMACK_IPV6_SECMARK_LABELING + rsp = smack_ipv6host_label(sap); + if (rsp != NULL) + rc = smk_ipv6_check(ssp->smk_out, rsp, sap, + SMK_CONNECTING); +#endif +#ifdef SMACK_IPV6_PORT_LABELING rc = smk_ipv6_port_check(sock->sk, sap, SMK_SENDING); -#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */ +#endif break; } return rc; @@ -3822,10 +3924,12 @@ access_check: proto = smk_skb_to_addr_ipv6(skb, &sadd); if (proto != IPPROTO_UDP && proto != IPPROTO_TCP) break; -#ifdef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_SECMARK_LABELING if (skb && skb->secmark != 0) skp = smack_from_secid(skb->secmark); else + skp = smack_ipv6host_label(&sadd); + if (skp == NULL) skp = smack_net_ambient; #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); @@ -3836,9 +3940,10 @@ access_check: rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad); rc = smk_bu_note("IPv6 delivery", skp, ssp->smk_in, MAY_WRITE, rc); -#else /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_SECMARK_LABELING */ +#ifdef SMACK_IPV6_PORT_LABELING rc = smk_ipv6_port_check(sk, &sadd, SMK_RECEIVING); -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif /* SMACK_IPV6_PORT_LABELING */ break; #endif /* CONFIG_IPV6 */ } @@ -3936,13 +4041,11 @@ static int smack_socket_getpeersec_dgram(struct socket *sock, } netlbl_secattr_destroy(&secattr); break; -#if IS_ENABLED(CONFIG_IPV6) case PF_INET6: -#ifdef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_SECMARK_LABELING s = skb->secmark; -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif break; -#endif /* CONFIG_IPV6 */ } *secid = s; if (s == 0) @@ -4065,7 +4168,7 @@ access_check: hdr = ip_hdr(skb); addr.sin_addr.s_addr = 
hdr->saddr; rcu_read_lock(); - hskp = smack_host_label(&addr); + hskp = smack_ipv4host_label(&addr); rcu_read_unlock(); if (hskp == NULL) @@ -4517,9 +4620,9 @@ struct security_hook_list smack_hooks[] = { LSM_HOOK_INIT(unix_may_send, smack_unix_may_send), LSM_HOOK_INIT(socket_post_create, smack_socket_post_create), -#ifndef CONFIG_SECURITY_SMACK_NETFILTER +#ifdef SMACK_IPV6_PORT_LABELING LSM_HOOK_INIT(socket_bind, smack_socket_bind), -#endif /* CONFIG_SECURITY_SMACK_NETFILTER */ +#endif LSM_HOOK_INIT(socket_connect, smack_socket_connect), LSM_HOOK_INIT(socket_sendmsg, smack_socket_sendmsg), LSM_HOOK_INIT(socket_sock_rcv_skb, smack_socket_sock_rcv_skb), @@ -4614,7 +4717,16 @@ static __init int smack_init(void) return -ENOMEM; } - printk(KERN_INFO "Smack: Initializing.\n"); + pr_info("Smack: Initializing.\n"); +#ifdef CONFIG_SECURITY_SMACK_NETFILTER + pr_info("Smack: Netfilter enabled.\n"); +#endif +#ifdef SMACK_IPV6_PORT_LABELING + pr_info("Smack: IPv6 port labeling enabled.\n"); +#endif +#ifdef SMACK_IPV6_SECMARK_LABELING + pr_info("Smack: IPv6 Netfilter enabled.\n"); +#endif /* * Set the security state for the initial task. diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 81a2888a9908..11b752b366ea 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -29,6 +29,7 @@ #include #include "smack.h" +#define BEBITS (sizeof(__be32) * 8) /* * smackfs pseudo filesystem. 
*/ @@ -40,7 +41,7 @@ enum smk_inos { SMK_DOI = 5, /* CIPSO DOI */ SMK_DIRECT = 6, /* CIPSO level indicating direct label */ SMK_AMBIENT = 7, /* internet ambient label */ - SMK_NETLBLADDR = 8, /* single label hosts */ + SMK_NET4ADDR = 8, /* single label hosts */ SMK_ONLYCAP = 9, /* the only "capable" label */ SMK_LOGGING = 10, /* logging */ SMK_LOAD_SELF = 11, /* task specific rules */ @@ -57,6 +58,9 @@ enum smk_inos { #ifdef CONFIG_SECURITY_SMACK_BRINGUP SMK_UNCONFINED = 22, /* define an unconfined label */ #endif +#if IS_ENABLED(CONFIG_IPV6) + SMK_NET6ADDR = 23, /* single label IPv6 hosts */ +#endif /* CONFIG_IPV6 */ }; /* @@ -64,7 +68,10 @@ enum smk_inos { */ static DEFINE_MUTEX(smack_cipso_lock); static DEFINE_MUTEX(smack_ambient_lock); -static DEFINE_MUTEX(smk_netlbladdr_lock); +static DEFINE_MUTEX(smk_net4addr_lock); +#if IS_ENABLED(CONFIG_IPV6) +static DEFINE_MUTEX(smk_net6addr_lock); +#endif /* CONFIG_IPV6 */ /* * This is the "ambient" label for network traffic. @@ -118,7 +125,10 @@ int smack_ptrace_rule = SMACK_PTRACE_DEFAULT; * can write to the specified label. */ -LIST_HEAD(smk_netlbladdr_list); +LIST_HEAD(smk_net4addr_list); +#if IS_ENABLED(CONFIG_IPV6) +LIST_HEAD(smk_net6addr_list); +#endif /* CONFIG_IPV6 */ /* * Rule lists are maintained for each label. @@ -140,11 +150,6 @@ struct smack_parsed_rule { static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT; -struct smack_known smack_cipso_option = { - .smk_known = SMACK_CIPSO_OPTION, - .smk_secid = 0, -}; - /* * Values for parsing cipso rules * SMK_DIGITLEN: Length of a digit field in a rule. 
@@ -1047,92 +1052,90 @@ static const struct file_operations smk_cipso2_ops = { * Seq_file read operations for /smack/netlabel */ -static void *netlbladdr_seq_start(struct seq_file *s, loff_t *pos) +static void *net4addr_seq_start(struct seq_file *s, loff_t *pos) { - return smk_seq_start(s, pos, &smk_netlbladdr_list); + return smk_seq_start(s, pos, &smk_net4addr_list); } -static void *netlbladdr_seq_next(struct seq_file *s, void *v, loff_t *pos) +static void *net4addr_seq_next(struct seq_file *s, void *v, loff_t *pos) { - return smk_seq_next(s, v, pos, &smk_netlbladdr_list); + return smk_seq_next(s, v, pos, &smk_net4addr_list); } -#define BEBITS (sizeof(__be32) * 8) /* * Print host/label pairs */ -static int netlbladdr_seq_show(struct seq_file *s, void *v) +static int net4addr_seq_show(struct seq_file *s, void *v) { struct list_head *list = v; - struct smk_netlbladdr *skp = - list_entry_rcu(list, struct smk_netlbladdr, list); - unsigned char *hp = (char *) &skp->smk_host.sin_addr.s_addr; - int maskn; - u32 temp_mask = be32_to_cpu(skp->smk_mask.s_addr); + struct smk_net4addr *skp = + list_entry_rcu(list, struct smk_net4addr, list); + char *kp = SMACK_CIPSO_OPTION; - for (maskn = 0; temp_mask; temp_mask <<= 1, maskn++); - - seq_printf(s, "%u.%u.%u.%u/%d %s\n", - hp[0], hp[1], hp[2], hp[3], maskn, skp->smk_label->smk_known); + if (skp->smk_label != NULL) + kp = skp->smk_label->smk_known; + seq_printf(s, "%pI4/%d %s\n", &skp->smk_host.s_addr, + skp->smk_masks, kp); return 0; } -static const struct seq_operations netlbladdr_seq_ops = { - .start = netlbladdr_seq_start, - .next = netlbladdr_seq_next, - .show = netlbladdr_seq_show, +static const struct seq_operations net4addr_seq_ops = { + .start = net4addr_seq_start, + .next = net4addr_seq_next, + .show = net4addr_seq_show, .stop = smk_seq_stop, }; /** - * smk_open_netlbladdr - open() for /smack/netlabel + * smk_open_net4addr - open() for /smack/netlabel * @inode: inode structure representing file * @file: "netlabel" file 
pointer * - * Connect our netlbladdr_seq_* operations with /smack/netlabel + * Connect our net4addr_seq_* operations with /smack/netlabel * file_operations */ -static int smk_open_netlbladdr(struct inode *inode, struct file *file) +static int smk_open_net4addr(struct inode *inode, struct file *file) { - return seq_open(file, &netlbladdr_seq_ops); + return seq_open(file, &net4addr_seq_ops); } /** - * smk_netlbladdr_insert + * smk_net4addr_insert * @new : netlabel to insert * - * This helper insert netlabel in the smack_netlbladdrs list + * This helper insert netlabel in the smack_net4addrs list * sorted by netmask length (longest to smallest) - * locked by &smk_netlbladdr_lock in smk_write_netlbladdr + * locked by &smk_net4addr_lock in smk_write_net4addr * */ -static void smk_netlbladdr_insert(struct smk_netlbladdr *new) +static void smk_net4addr_insert(struct smk_net4addr *new) { - struct smk_netlbladdr *m, *m_next; + struct smk_net4addr *m; + struct smk_net4addr *m_next; - if (list_empty(&smk_netlbladdr_list)) { - list_add_rcu(&new->list, &smk_netlbladdr_list); + if (list_empty(&smk_net4addr_list)) { + list_add_rcu(&new->list, &smk_net4addr_list); return; } - m = list_entry_rcu(smk_netlbladdr_list.next, - struct smk_netlbladdr, list); + m = list_entry_rcu(smk_net4addr_list.next, + struct smk_net4addr, list); /* the comparison '>' is a bit hacky, but works */ - if (new->smk_mask.s_addr > m->smk_mask.s_addr) { - list_add_rcu(&new->list, &smk_netlbladdr_list); + if (new->smk_masks > m->smk_masks) { + list_add_rcu(&new->list, &smk_net4addr_list); return; } - list_for_each_entry_rcu(m, &smk_netlbladdr_list, list) { - if (list_is_last(&m->list, &smk_netlbladdr_list)) { + list_for_each_entry_rcu(m, &smk_net4addr_list, list) { + if (list_is_last(&m->list, &smk_net4addr_list)) { list_add_rcu(&new->list, &m->list); return; } m_next = list_entry_rcu(m->list.next, - struct smk_netlbladdr, list); - if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) { + struct smk_net4addr, 
list); + if (new->smk_masks > m_next->smk_masks) { list_add_rcu(&new->list, &m->list); return; } @@ -1141,28 +1144,29 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) /** - * smk_write_netlbladdr - write() for /smack/netlabel + * smk_write_net4addr - write() for /smack/netlabel * @file: file pointer, not actually used * @buf: where to get the data from * @count: bytes sent * @ppos: where to start * - * Accepts only one netlbladdr per write call. + * Accepts only one net4addr per write call. * Returns number of bytes written or error code, as appropriate */ -static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, +static ssize_t smk_write_net4addr(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct smk_netlbladdr *snp; + struct smk_net4addr *snp; struct sockaddr_in newname; char *smack; - struct smack_known *skp; + struct smack_known *skp = NULL; char *data; char *host = (char *)&newname.sin_addr.s_addr; int rc; struct netlbl_audit audit_info; struct in_addr mask; unsigned int m; + unsigned int masks; int found; u32 mask_bits = (1<<31); __be32 nsa; @@ -1200,7 +1204,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, data[count] = '\0'; rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd/%u %s", - &host[0], &host[1], &host[2], &host[3], &m, smack); + &host[0], &host[1], &host[2], &host[3], &masks, smack); if (rc != 6) { rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd %s", &host[0], &host[1], &host[2], &host[3], smack); @@ -1209,8 +1213,9 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, goto free_out; } m = BEBITS; + masks = 32; } - if (m > BEBITS) { + if (masks > BEBITS) { rc = -EINVAL; goto free_out; } @@ -1225,16 +1230,16 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, goto free_out; } } else { - /* check known options */ - if (strcmp(smack, smack_cipso_option.smk_known) == 0) - skp = &smack_cipso_option; - else { + /* + 
* Only the -CIPSO option is supported for IPv4 + */ + if (strcmp(smack, SMACK_CIPSO_OPTION) != 0) { rc = -EINVAL; goto free_out; } } - for (temp_mask = 0; m > 0; m--) { + for (m = masks, temp_mask = 0; m > 0; m--) { temp_mask |= mask_bits; mask_bits >>= 1; } @@ -1245,14 +1250,13 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, * Only allow one writer at a time. Writes should be * quite rare and small in any case. */ - mutex_lock(&smk_netlbladdr_lock); + mutex_lock(&smk_net4addr_lock); nsa = newname.sin_addr.s_addr; /* try to find if the prefix is already in the list */ found = 0; - list_for_each_entry_rcu(snp, &smk_netlbladdr_list, list) { - if (snp->smk_host.sin_addr.s_addr == nsa && - snp->smk_mask.s_addr == mask.s_addr) { + list_for_each_entry_rcu(snp, &smk_net4addr_list, list) { + if (snp->smk_host.s_addr == nsa && snp->smk_masks == masks) { found = 1; break; } @@ -1265,17 +1269,20 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, rc = -ENOMEM; else { rc = 0; - snp->smk_host.sin_addr.s_addr = newname.sin_addr.s_addr; + snp->smk_host.s_addr = newname.sin_addr.s_addr; snp->smk_mask.s_addr = mask.s_addr; snp->smk_label = skp; - smk_netlbladdr_insert(snp); + snp->smk_masks = masks; + smk_net4addr_insert(snp); } } else { - /* we delete the unlabeled entry, only if the previous label - * wasn't the special CIPSO option */ - if (snp->smk_label != &smack_cipso_option) + /* + * Delete the unlabeled entry, only if the previous label + * wasn't the special CIPSO option + */ + if (snp->smk_label != NULL) rc = netlbl_cfg_unlbl_static_del(&init_net, NULL, - &snp->smk_host.sin_addr, &snp->smk_mask, + &snp->smk_host, &snp->smk_mask, PF_INET, &audit_info); else rc = 0; @@ -1287,15 +1294,15 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, * this host so that incoming packets get labeled. 
* but only if we didn't get the special CIPSO option */ - if (rc == 0 && skp != &smack_cipso_option) + if (rc == 0 && skp != NULL) rc = netlbl_cfg_unlbl_static_add(&init_net, NULL, - &snp->smk_host.sin_addr, &snp->smk_mask, PF_INET, + &snp->smk_host, &snp->smk_mask, PF_INET, snp->smk_label->smk_secid, &audit_info); if (rc == 0) rc = count; - mutex_unlock(&smk_netlbladdr_lock); + mutex_unlock(&smk_net4addr_lock); free_out: kfree(smack); @@ -1305,14 +1312,279 @@ free_data_out: return rc; } -static const struct file_operations smk_netlbladdr_ops = { - .open = smk_open_netlbladdr, +static const struct file_operations smk_net4addr_ops = { + .open = smk_open_net4addr, .read = seq_read, .llseek = seq_lseek, - .write = smk_write_netlbladdr, + .write = smk_write_net4addr, .release = seq_release, }; +#if IS_ENABLED(CONFIG_IPV6) +/* + * Seq_file read operations for /smack/ipv6host + */ + +static void *net6addr_seq_start(struct seq_file *s, loff_t *pos) +{ + return smk_seq_start(s, pos, &smk_net6addr_list); +} + +static void *net6addr_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + return smk_seq_next(s, v, pos, &smk_net6addr_list); +} + +/* + * Print host/label pairs + */ +static int net6addr_seq_show(struct seq_file *s, void *v) +{ + struct list_head *list = v; + struct smk_net6addr *skp = + list_entry(list, struct smk_net6addr, list); + + if (skp->smk_label != NULL) + seq_printf(s, "%pI6/%d %s\n", &skp->smk_host, skp->smk_masks, + skp->smk_label->smk_known); + + return 0; +} + +static const struct seq_operations net6addr_seq_ops = { + .start = net6addr_seq_start, + .next = net6addr_seq_next, + .show = net6addr_seq_show, + .stop = smk_seq_stop, +}; + +/** + * smk_open_net6addr - open() for /smack/ipv6host + * @inode: inode structure representing file + * @file: "ipv6host" file pointer + * + * Connect our net6addr_seq_* operations with /smack/ipv6host + * file_operations + */ +static int smk_open_net6addr(struct inode *inode, struct file *file) +{ + return
seq_open(file, &net6addr_seq_ops); +} + +/** + * smk_net6addr_insert + * @new : entry to insert + * + * This inserts an entry in the smack_net6addrs list + * sorted by netmask length (longest to smallest) + * locked by &smk_net6addr_lock in smk_write_net6addr + * + */ +static void smk_net6addr_insert(struct smk_net6addr *new) +{ + struct smk_net6addr *m_next; + struct smk_net6addr *m; + + if (list_empty(&smk_net6addr_list)) { + list_add_rcu(&new->list, &smk_net6addr_list); + return; + } + + m = list_entry_rcu(smk_net6addr_list.next, + struct smk_net6addr, list); + + if (new->smk_masks > m->smk_masks) { + list_add_rcu(&new->list, &smk_net6addr_list); + return; + } + + list_for_each_entry_rcu(m, &smk_net6addr_list, list) { + if (list_is_last(&m->list, &smk_net6addr_list)) { + list_add_rcu(&new->list, &m->list); + return; + } + m_next = list_entry_rcu(m->list.next, + struct smk_net6addr, list); + if (new->smk_masks > m_next->smk_masks) { + list_add_rcu(&new->list, &m->list); + return; + } + } +} + + +/** + * smk_write_net6addr - write() for /smack/netlabel + * @file: file pointer, not actually used + * @buf: where to get the data from + * @count: bytes sent + * @ppos: where to start + * + * Accepts only one net6addr per write call. + * Returns number of bytes written or error code, as appropriate + */ +static ssize_t smk_write_net6addr(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct smk_net6addr *snp; + struct in6_addr newname; + struct in6_addr fullmask; + struct smack_known *skp = NULL; + char *smack; + char *data; + int rc = 0; + int found = 0; + int i; + unsigned int scanned[8]; + unsigned int m; + unsigned int mask = 128; + + /* + * Must have privilege. + * No partial writes. + * Enough data must be present. + * "