diff --git a/Documentation/crypto/api-digest.rst b/Documentation/crypto/api-digest.rst
index 07356fa99200..7a1e670d6ce1 100644
--- a/Documentation/crypto/api-digest.rst
+++ b/Documentation/crypto/api-digest.rst
@@ -14,7 +14,7 @@ Asynchronous Message Digest API
    :doc: Asynchronous Message Digest API
 
 .. kernel-doc:: include/crypto/hash.h
-   :functions: crypto_alloc_ahash crypto_free_ahash crypto_ahash_init crypto_ahash_digestsize crypto_ahash_reqtfm crypto_ahash_reqsize crypto_ahash_setkey crypto_ahash_finup crypto_ahash_final crypto_ahash_digest crypto_ahash_export crypto_ahash_import
+   :functions: crypto_alloc_ahash crypto_free_ahash crypto_ahash_init crypto_ahash_digestsize crypto_ahash_reqtfm crypto_ahash_reqsize crypto_ahash_statesize crypto_ahash_setkey crypto_ahash_finup crypto_ahash_final crypto_ahash_digest crypto_ahash_export crypto_ahash_import
 
 Asynchronous Hash Request Handle
 --------------------------------
diff --git a/Documentation/crypto/api-skcipher.rst b/Documentation/crypto/api-skcipher.rst
index b20028a361a9..4eec4a93f7e3 100644
--- a/Documentation/crypto/api-skcipher.rst
+++ b/Documentation/crypto/api-skcipher.rst
@@ -59,4 +59,4 @@ Synchronous Block Cipher API - Deprecated
    :doc: Synchronous Block Cipher API
 
 .. kernel-doc:: include/linux/crypto.h
-   :functions: crypto_alloc_blkcipher rypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv
+   :functions: crypto_alloc_blkcipher crypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv
diff --git a/Documentation/devicetree/bindings/crypto/brcm,spu-crypto.txt b/Documentation/devicetree/bindings/crypto/brcm,spu-crypto.txt
new file mode 100644
index 000000000000..29b6007568eb
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/brcm,spu-crypto.txt
@@ -0,0 +1,22 @@
+The Broadcom Secure Processing Unit (SPU) hardware supports symmetric
+cryptographic offload for Broadcom SoCs. A SoC may have multiple SPU hardware
+blocks.
+
+Required properties:
+- compatible: Should be one of the following:
+  brcm,spum-crypto - for devices with SPU-M hardware
+  brcm,spu2-crypto - for devices with SPU2 hardware
+  brcm,spu2-v2-crypto - for devices with enhanced SPU2 hardware features like SHA3
+  and Rabin Fingerprint support
+  brcm,spum-nsp-crypto - for the Northstar Plus variant of the SPU-M hardware
+
+- reg: Should contain SPU registers location and length.
+- mboxes: The mailbox channel to be used to communicate with the SPU.
+  Mailbox channels correspond to DMA rings on the device.
+
+Example:
+	crypto@612d0000 {
+		compatible = "brcm,spum-crypto";
+		reg = <0 0x612d0000 0 0x900>;
+		mboxes = <&pdc0 0>;
+	};
diff --git a/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt b/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
new file mode 100644
index 000000000000..c204725e5873
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
@@ -0,0 +1,27 @@
+MediaTek cryptographic accelerators
+
+Required properties:
+- compatible: Should be "mediatek,eip97-crypto"
+- reg: Address and length of the register set for the device
+- interrupts: Should contain the five crypto engines interrupts in numeric
+	order. These are global system and four descriptor rings.
+- clocks: the clock used by the core
+- clock-names: the names of the clock listed in the clocks property. These are
+	"ethif", "cryp"
+- power-domains: Must contain a reference to the PM domain.
+
+
+Example:
+	crypto: crypto@1b240000 {
+		compatible = "mediatek,eip97-crypto";
+		reg = <0 0x1b240000 0 0x20000>;
+		interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 83 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 84 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 91 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 97 IRQ_TYPE_LEVEL_LOW>;
+		clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
+			 <&ethsys CLK_ETHSYS_CRYPTO>;
+		clock-names = "ethif","cryp";
+		power-domains = <&scpsys MT2701_POWER_DOMAIN_ETH>;
+	};
diff --git a/MAINTAINERS b/MAINTAINERS
index 5a5fa41ac961..24fef3773d67 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3031,6 +3031,13 @@ W:     http://www.cavium.com
 S:     Supported
 F:     drivers/net/ethernet/cavium/liquidio/
 
+CAVIUM OCTEON-TX CRYPTO DRIVER
+M:	George Cherian <george.cherian@cavium.com>
+L:	linux-crypto@vger.kernel.org
+W:	http://www.cavium.com
+S:	Supported
+F:	drivers/crypto/cavium/cpt/
+
 CC2520 IEEE-802.15.4 RADIO DRIVER
 M:	Varka Bhadram <varkabhadram@gmail.com>
 L:	linux-wpan@vger.kernel.org
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 13f1b4c289d4..a8fce93137fb 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -62,35 +62,18 @@ config CRYPTO_SHA512_ARM
 	  using optimized ARM assembler and NEON, when available.
 
 config CRYPTO_AES_ARM
-	tristate "AES cipher algorithms (ARM-asm)"
-	depends on ARM
+	tristate "Scalar AES cipher for ARM"
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES
 	help
 	  Use optimized AES assembler routines for ARM platforms.
 
-	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
-	  algorithm.
-
-	  Rijndael appears to be consistently a very good performer in
-	  both hardware and software across a wide range of computing
-	  environments regardless of its use in feedback or non-feedback
-	  modes. Its key setup time is excellent, and its key agility is
-	  good. Rijndael's very low memory requirements make it very well
-	  suited for restricted-space environments, in which it also
-	  demonstrates excellent performance. Rijndael's operations are
-	  among the easiest to defend against power and timing attacks.
-
-	  The AES specifies three key sizes: 128, 192 and 256 bits
-
-	  See <http://csrc.nist.gov/encryption/aes/> for more information.
-
 config CRYPTO_AES_ARM_BS
 	tristate "Bit sliced AES using NEON instructions"
 	depends on KERNEL_MODE_NEON
-	select CRYPTO_AES_ARM
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_SIMD
+	select CRYPTO_AES_ARM
 	help
 	  Use a faster and more secure NEON based implementation of AES in CBC,
 	  CTR and XTS modes
@@ -130,4 +113,10 @@ config CRYPTO_CRC32_ARM_CE
 	depends on KERNEL_MODE_NEON && CRC32
 	select CRYPTO_HASH
 
+config CRYPTO_CHACHA20_NEON
+	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CHACHA20
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b578a1820ab1..1822c4697278 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -26,8 +27,8 @@ $(warning $(ce-obj-y) $(ce-obj-m))
 endif
 endif
 
-aes-arm-y	:= aes-armv4.o aes_glue.o
-aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
+aes-arm-y	:= aes-cipher-core.o aes-cipher-glue.o
+aes-arm-bs-y	:= aes-neonbs-core.o aes-neonbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
 sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
@@ -40,17 +41,15 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
 
-$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
-	$(call cmd,perl)
-
 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 	$(call cmd,perl)
 
 $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S $(obj)/sha512-core.S
+.PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S
diff --git a/arch/arm/crypto/aes-armv4.S b/arch/arm/crypto/aes-armv4.S
deleted file mode 100644
index ebb9761fb572..000000000000
--- a/arch/arm/crypto/aes-armv4.S
+++ /dev/null
@@ -1,1089 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ AES for ARMv4
-
-@ January 2007.
-@
-@ Code uses single 1K S-box and is >2 times faster than code generated
-@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
-@ allows to merge logical or arithmetic operation with shift or rotate
-@ in one instruction and emit combined result every cycle. The module
-@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit
-@ key [on single-issue Xscale PXA250 core].
-
-@ May 2007.
-@
-@ AES_set_[en|de]crypt_key is added.
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 12% improvement on
-@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~21.5 cycles per byte.
-
-@ A little glue here to select the correct code below for the ARM CPU
-@ that is being targetted.
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-.text
-
-.type	AES_Te,%object
-.align	5
-AES_Te:
-.word	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
-.word	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
-.word	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
-.word	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
-.word	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
-.word	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
-.word	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
-.word	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
-.word	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
-.word	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
-.word	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
-.word	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
-.word	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
-.word	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
-.word	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
-.word	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
-.word	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
-.word	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
-.word	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
-.word	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
-.word	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
-.word	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
-.word	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
-.word	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
-.word	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
-.word	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
-.word	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
-.word	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
-.word	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
-.word	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
-.word	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
-.word	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
-.word	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
-.word	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
-.word	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
-.word	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
-.word	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
-.word	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
-.word	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
-.word	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
-.word	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
-.word	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
-.word	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
-.word	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
-.word	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
-.word	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
-.word	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
-.word	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
-.word	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
-.word	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
-.word	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
-.word	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
-.word	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
-.word	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
-.word	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
-.word	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
-.word	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
-.word	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
-.word	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
-.word	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
-.word	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
-.word	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
-.word	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
-.word	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
-@ Te4[256]
-.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
-.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
-.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
-.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
-.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
-.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
-.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
-.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
-.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
-.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
-.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
-.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
-.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
-.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
-.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
-.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
-.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
-.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
-.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
-.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
-.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
-.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
-.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
-.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
-.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
-.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
-.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
-.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
-.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
-.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
-.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
-.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-@ rcon[]
-.word	0x01000000, 0x02000000, 0x04000000, 0x08000000
-.word	0x10000000, 0x20000000, 0x40000000, 0x80000000
-.word	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
-.size	AES_Te,.-AES_Te
-
-@ void AES_encrypt(const unsigned char *in, unsigned char *out,
-@ 		 const AES_KEY *key) {
-.align	5
-ENTRY(AES_encrypt)
-	adr	r3,AES_encrypt
-	stmdb   sp!,{r1,r4-r12,lr}
-	mov	r12,r0		@ inp
-	mov	r11,r2
-	sub	r10,r3,#AES_encrypt-AES_Te	@ Te
-#if __ARM_ARCH__<7
-	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
-	ldrb	r4,[r12,#2]	@ manner...
-	ldrb	r5,[r12,#1]
-	ldrb	r6,[r12,#0]
-	orr	r0,r0,r4,lsl#8
-	ldrb	r1,[r12,#7]
-	orr	r0,r0,r5,lsl#16
-	ldrb	r4,[r12,#6]
-	orr	r0,r0,r6,lsl#24
-	ldrb	r5,[r12,#5]
-	ldrb	r6,[r12,#4]
-	orr	r1,r1,r4,lsl#8
-	ldrb	r2,[r12,#11]
-	orr	r1,r1,r5,lsl#16
-	ldrb	r4,[r12,#10]
-	orr	r1,r1,r6,lsl#24
-	ldrb	r5,[r12,#9]
-	ldrb	r6,[r12,#8]
-	orr	r2,r2,r4,lsl#8
-	ldrb	r3,[r12,#15]
-	orr	r2,r2,r5,lsl#16
-	ldrb	r4,[r12,#14]
-	orr	r2,r2,r6,lsl#24
-	ldrb	r5,[r12,#13]
-	ldrb	r6,[r12,#12]
-	orr	r3,r3,r4,lsl#8
-	orr	r3,r3,r5,lsl#16
-	orr	r3,r3,r6,lsl#24
-#else
-	ldr	r0,[r12,#0]
-	ldr	r1,[r12,#4]
-	ldr	r2,[r12,#8]
-	ldr	r3,[r12,#12]
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-#endif
-	bl	_armv4_AES_encrypt
-
-	ldr	r12,[sp],#4		@ pop out
-#if __ARM_ARCH__>=7
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-	str	r0,[r12,#0]
-	str	r1,[r12,#4]
-	str	r2,[r12,#8]
-	str	r3,[r12,#12]
-#else
-	mov	r4,r0,lsr#24		@ write output in endian-neutral
-	mov	r5,r0,lsr#16		@ manner...
-	mov	r6,r0,lsr#8
-	strb	r4,[r12,#0]
-	strb	r5,[r12,#1]
-	mov	r4,r1,lsr#24
-	strb	r6,[r12,#2]
-	mov	r5,r1,lsr#16
-	strb	r0,[r12,#3]
-	mov	r6,r1,lsr#8
-	strb	r4,[r12,#4]
-	strb	r5,[r12,#5]
-	mov	r4,r2,lsr#24
-	strb	r6,[r12,#6]
-	mov	r5,r2,lsr#16
-	strb	r1,[r12,#7]
-	mov	r6,r2,lsr#8
-	strb	r4,[r12,#8]
-	strb	r5,[r12,#9]
-	mov	r4,r3,lsr#24
-	strb	r6,[r12,#10]
-	mov	r5,r3,lsr#16
-	strb	r2,[r12,#11]
-	mov	r6,r3,lsr#8
-	strb	r4,[r12,#12]
-	strb	r5,[r12,#13]
-	strb	r6,[r12,#14]
-	strb	r3,[r12,#15]
-#endif
-	ldmia	sp!,{r4-r12,pc}
-ENDPROC(AES_encrypt)
-
-.type   _armv4_AES_encrypt,%function
-.align	2
-_armv4_AES_encrypt:
-	str	lr,[sp,#-4]!		@ push lr
-	ldmia	r11!,{r4-r7}
-	eor	r0,r0,r4
-	ldr	r12,[r11,#240-16]
-	eor	r1,r1,r5
-	eor	r2,r2,r6
-	eor	r3,r3,r7
-	sub	r12,r12,#1
-	mov	lr,#255
-
-	and	r7,lr,r0
-	and	r8,lr,r0,lsr#8
-	and	r9,lr,r0,lsr#16
-	mov	r0,r0,lsr#24
-.Lenc_loop:
-	ldr	r4,[r10,r7,lsl#2]	@ Te3[s0>>0]
-	and	r7,lr,r1,lsr#16	@ i0
-	ldr	r5,[r10,r8,lsl#2]	@ Te2[s0>>8]
-	and	r8,lr,r1
-	ldr	r6,[r10,r9,lsl#2]	@ Te1[s0>>16]
-	and	r9,lr,r1,lsr#8
-	ldr	r0,[r10,r0,lsl#2]	@ Te0[s0>>24]
-	mov	r1,r1,lsr#24
-
-	ldr	r7,[r10,r7,lsl#2]	@ Te1[s1>>16]
-	ldr	r8,[r10,r8,lsl#2]	@ Te3[s1>>0]
-	ldr	r9,[r10,r9,lsl#2]	@ Te2[s1>>8]
-	eor	r0,r0,r7,ror#8
-	ldr	r1,[r10,r1,lsl#2]	@ Te0[s1>>24]
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r5,r8,ror#8
-	and	r8,lr,r2,lsr#16	@ i1
-	eor	r6,r6,r9,ror#8
-	and	r9,lr,r2
-	ldr	r7,[r10,r7,lsl#2]	@ Te2[s2>>8]
-	eor	r1,r1,r4,ror#24
-	ldr	r8,[r10,r8,lsl#2]	@ Te1[s2>>16]
-	mov	r2,r2,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Te3[s2>>0]
-	eor	r0,r0,r7,ror#16
-	ldr	r2,[r10,r2,lsl#2]	@ Te0[s2>>24]
-	and	r7,lr,r3		@ i0
-	eor	r1,r1,r8,ror#8
-	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r6,r9,ror#16
-	and	r9,lr,r3,lsr#16	@ i2
-	ldr	r7,[r10,r7,lsl#2]	@ Te3[s3>>0]
-	eor	r2,r2,r5,ror#16
-	ldr	r8,[r10,r8,lsl#2]	@ Te2[s3>>8]
-	mov	r3,r3,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Te1[s3>>16]
-	eor	r0,r0,r7,ror#24
-	ldr	r7,[r11],#16
-	eor	r1,r1,r8,ror#16
-	ldr	r3,[r10,r3,lsl#2]	@ Te0[s3>>24]
-	eor	r2,r2,r9,ror#8
-	ldr	r4,[r11,#-12]
-	eor	r3,r3,r6,ror#8
-
-	ldr	r5,[r11,#-8]
-	eor	r0,r0,r7
-	ldr	r6,[r11,#-4]
-	and	r7,lr,r0
-	eor	r1,r1,r4
-	and	r8,lr,r0,lsr#8
-	eor	r2,r2,r5
-	and	r9,lr,r0,lsr#16
-	eor	r3,r3,r6
-	mov	r0,r0,lsr#24
-
-	subs	r12,r12,#1
-	bne	.Lenc_loop
-
-	add	r10,r10,#2
-
-	ldrb	r4,[r10,r7,lsl#2]	@ Te4[s0>>0]
-	and	r7,lr,r1,lsr#16	@ i0
-	ldrb	r5,[r10,r8,lsl#2]	@ Te4[s0>>8]
-	and	r8,lr,r1
-	ldrb	r6,[r10,r9,lsl#2]	@ Te4[s0>>16]
-	and	r9,lr,r1,lsr#8
-	ldrb	r0,[r10,r0,lsl#2]	@ Te4[s0>>24]
-	mov	r1,r1,lsr#24
-
-	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s1>>16]
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s1>>0]
-	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s1>>8]
-	eor	r0,r7,r0,lsl#8
-	ldrb	r1,[r10,r1,lsl#2]	@ Te4[s1>>24]
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r8,r5,lsl#8
-	and	r8,lr,r2,lsr#16	@ i1
-	eor	r6,r9,r6,lsl#8
-	and	r9,lr,r2
-	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s2>>8]
-	eor	r1,r4,r1,lsl#24
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s2>>16]
-	mov	r2,r2,lsr#24
-
-	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s2>>0]
-	eor	r0,r7,r0,lsl#8
-	ldrb	r2,[r10,r2,lsl#2]	@ Te4[s2>>24]
-	and	r7,lr,r3		@ i0
-	eor	r1,r1,r8,lsl#16
-	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r9,r6,lsl#8
-	and	r9,lr,r3,lsr#16	@ i2
-	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s3>>0]
-	eor	r2,r5,r2,lsl#24
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s3>>8]
-	mov	r3,r3,lsr#24
-
-	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s3>>16]
-	eor	r0,r7,r0,lsl#8
-	ldr	r7,[r11,#0]
-	ldrb	r3,[r10,r3,lsl#2]	@ Te4[s3>>24]
-	eor	r1,r1,r8,lsl#8
-	ldr	r4,[r11,#4]
-	eor	r2,r2,r9,lsl#16
-	ldr	r5,[r11,#8]
-	eor	r3,r6,r3,lsl#24
-	ldr	r6,[r11,#12]
-
-	eor	r0,r0,r7
-	eor	r1,r1,r4
-	eor	r2,r2,r5
-	eor	r3,r3,r6
-
-	sub	r10,r10,#2
-	ldr	pc,[sp],#4		@ pop and return
-.size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
-
-.align	5
-ENTRY(private_AES_set_encrypt_key)
-_armv4_AES_set_encrypt_key:
-	adr	r3,_armv4_AES_set_encrypt_key
-	teq	r0,#0
-	moveq	r0,#-1
-	beq	.Labrt
-	teq	r2,#0
-	moveq	r0,#-1
-	beq	.Labrt
-
-	teq	r1,#128
-	beq	.Lok
-	teq	r1,#192
-	beq	.Lok
-	teq	r1,#256
-	movne	r0,#-1
-	bne	.Labrt
-
-.Lok:	stmdb   sp!,{r4-r12,lr}
-	sub	r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
-
-	mov	r12,r0		@ inp
-	mov	lr,r1			@ bits
-	mov	r11,r2			@ key
-
-#if __ARM_ARCH__<7
-	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
-	ldrb	r4,[r12,#2]	@ manner...
-	ldrb	r5,[r12,#1]
-	ldrb	r6,[r12,#0]
-	orr	r0,r0,r4,lsl#8
-	ldrb	r1,[r12,#7]
-	orr	r0,r0,r5,lsl#16
-	ldrb	r4,[r12,#6]
-	orr	r0,r0,r6,lsl#24
-	ldrb	r5,[r12,#5]
-	ldrb	r6,[r12,#4]
-	orr	r1,r1,r4,lsl#8
-	ldrb	r2,[r12,#11]
-	orr	r1,r1,r5,lsl#16
-	ldrb	r4,[r12,#10]
-	orr	r1,r1,r6,lsl#24
-	ldrb	r5,[r12,#9]
-	ldrb	r6,[r12,#8]
-	orr	r2,r2,r4,lsl#8
-	ldrb	r3,[r12,#15]
-	orr	r2,r2,r5,lsl#16
-	ldrb	r4,[r12,#14]
-	orr	r2,r2,r6,lsl#24
-	ldrb	r5,[r12,#13]
-	ldrb	r6,[r12,#12]
-	orr	r3,r3,r4,lsl#8
-	str	r0,[r11],#16
-	orr	r3,r3,r5,lsl#16
-	str	r1,[r11,#-12]
-	orr	r3,r3,r6,lsl#24
-	str	r2,[r11,#-8]
-	str	r3,[r11,#-4]
-#else
-	ldr	r0,[r12,#0]
-	ldr	r1,[r12,#4]
-	ldr	r2,[r12,#8]
-	ldr	r3,[r12,#12]
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-	str	r0,[r11],#16
-	str	r1,[r11,#-12]
-	str	r2,[r11,#-8]
-	str	r3,[r11,#-4]
-#endif
-
-	teq	lr,#128
-	bne	.Lnot128
-	mov	r12,#10
-	str	r12,[r11,#240-16]
-	add	r6,r10,#256			@ rcon
-	mov	lr,#255
-
-.L128_loop:
-	and	r5,lr,r3,lsr#24
-	and	r7,lr,r3,lsr#16
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r3,lsr#8
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r3
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#24
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r6],#4			@ rcon[i++]
-	orr	r5,r5,r9,lsl#8
-	eor	r5,r5,r4
-	eor	r0,r0,r5			@ rk[4]=rk[0]^...
-	eor	r1,r1,r0			@ rk[5]=rk[1]^rk[4]
-	str	r0,[r11],#16
-	eor	r2,r2,r1			@ rk[6]=rk[2]^rk[5]
-	str	r1,[r11,#-12]
-	eor	r3,r3,r2			@ rk[7]=rk[3]^rk[6]
-	str	r2,[r11,#-8]
-	subs	r12,r12,#1
-	str	r3,[r11,#-4]
-	bne	.L128_loop
-	sub	r2,r11,#176
-	b	.Ldone
-
-.Lnot128:
-#if __ARM_ARCH__<7
-	ldrb	r8,[r12,#19]
-	ldrb	r4,[r12,#18]
-	ldrb	r5,[r12,#17]
-	ldrb	r6,[r12,#16]
-	orr	r8,r8,r4,lsl#8
-	ldrb	r9,[r12,#23]
-	orr	r8,r8,r5,lsl#16
-	ldrb	r4,[r12,#22]
-	orr	r8,r8,r6,lsl#24
-	ldrb	r5,[r12,#21]
-	ldrb	r6,[r12,#20]
-	orr	r9,r9,r4,lsl#8
-	orr	r9,r9,r5,lsl#16
-	str	r8,[r11],#8
-	orr	r9,r9,r6,lsl#24
-	str	r9,[r11,#-4]
-#else
-	ldr	r8,[r12,#16]
-	ldr	r9,[r12,#20]
-#ifdef __ARMEL__
-	rev	r8,r8
-	rev	r9,r9
-#endif
-	str	r8,[r11],#8
-	str	r9,[r11,#-4]
-#endif
-
-	teq	lr,#192
-	bne	.Lnot192
-	mov	r12,#12
-	str	r12,[r11,#240-24]
-	add	r6,r10,#256			@ rcon
-	mov	lr,#255
-	mov	r12,#8
-
-.L192_loop:
-	and	r5,lr,r9,lsr#24
-	and	r7,lr,r9,lsr#16
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r9,lsr#8
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r9
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#24
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r6],#4			@ rcon[i++]
-	orr	r5,r5,r9,lsl#8
-	eor	r9,r5,r4
-	eor	r0,r0,r9			@ rk[6]=rk[0]^...
-	eor	r1,r1,r0			@ rk[7]=rk[1]^rk[6]
-	str	r0,[r11],#24
-	eor	r2,r2,r1			@ rk[8]=rk[2]^rk[7]
-	str	r1,[r11,#-20]
-	eor	r3,r3,r2			@ rk[9]=rk[3]^rk[8]
-	str	r2,[r11,#-16]
-	subs	r12,r12,#1
-	str	r3,[r11,#-12]
-	subeq	r2,r11,#216
-	beq	.Ldone
-
-	ldr	r7,[r11,#-32]
-	ldr	r8,[r11,#-28]
-	eor	r7,r7,r3			@ rk[10]=rk[4]^rk[9]
-	eor	r9,r8,r7			@ rk[11]=rk[5]^rk[10]
-	str	r7,[r11,#-8]
-	str	r9,[r11,#-4]
-	b	.L192_loop
-
-.Lnot192:
-#if __ARM_ARCH__<7
-	ldrb	r8,[r12,#27]
-	ldrb	r4,[r12,#26]
-	ldrb	r5,[r12,#25]
-	ldrb	r6,[r12,#24]
-	orr	r8,r8,r4,lsl#8
-	ldrb	r9,[r12,#31]
-	orr	r8,r8,r5,lsl#16
-	ldrb	r4,[r12,#30]
-	orr	r8,r8,r6,lsl#24
-	ldrb	r5,[r12,#29]
-	ldrb	r6,[r12,#28]
-	orr	r9,r9,r4,lsl#8
-	orr	r9,r9,r5,lsl#16
-	str	r8,[r11],#8
-	orr	r9,r9,r6,lsl#24
-	str	r9,[r11,#-4]
-#else
-	ldr	r8,[r12,#24]
-	ldr	r9,[r12,#28]
-#ifdef __ARMEL__
-	rev	r8,r8
-	rev	r9,r9
-#endif
-	str	r8,[r11],#8
-	str	r9,[r11,#-4]
-#endif
-
-	mov	r12,#14
-	str	r12,[r11,#240-32]
-	add	r6,r10,#256			@ rcon
-	mov	lr,#255
-	mov	r12,#7
-
-.L256_loop:
-	and	r5,lr,r9,lsr#24
-	and	r7,lr,r9,lsr#16
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r9,lsr#8
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r9
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#24
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r6],#4			@ rcon[i++]
-	orr	r5,r5,r9,lsl#8
-	eor	r9,r5,r4
-	eor	r0,r0,r9			@ rk[8]=rk[0]^...
-	eor	r1,r1,r0			@ rk[9]=rk[1]^rk[8]
-	str	r0,[r11],#32
-	eor	r2,r2,r1			@ rk[10]=rk[2]^rk[9]
-	str	r1,[r11,#-28]
-	eor	r3,r3,r2			@ rk[11]=rk[3]^rk[10]
-	str	r2,[r11,#-24]
-	subs	r12,r12,#1
-	str	r3,[r11,#-20]
-	subeq	r2,r11,#256
-	beq	.Ldone
-
-	and	r5,lr,r3
-	and	r7,lr,r3,lsr#8
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r3,lsr#16
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r3,lsr#24
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#8
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r11,#-48]
-	orr	r5,r5,r9,lsl#24
-
-	ldr	r7,[r11,#-44]
-	ldr	r8,[r11,#-40]
-	eor	r4,r4,r5			@ rk[12]=rk[4]^...
-	ldr	r9,[r11,#-36]
-	eor	r7,r7,r4			@ rk[13]=rk[5]^rk[12]
-	str	r4,[r11,#-16]
-	eor	r8,r8,r7			@ rk[14]=rk[6]^rk[13]
-	str	r7,[r11,#-12]
-	eor	r9,r9,r8			@ rk[15]=rk[7]^rk[14]
-	str	r8,[r11,#-8]
-	str	r9,[r11,#-4]
-	b	.L256_loop
-
-.Ldone:	mov	r0,#0
-	ldmia   sp!,{r4-r12,lr}
-.Labrt:	ret	lr
-ENDPROC(private_AES_set_encrypt_key)
-
-.align	5
-ENTRY(private_AES_set_decrypt_key)
-	str	lr,[sp,#-4]!            @ push lr
-#if 0
-	@ kernel does both of these in setkey so optimise this bit out by
-	@ expecting the key to already have the enc_key work done (see aes_glue.c)
-	bl	_armv4_AES_set_encrypt_key
-#else
-	mov	r0,#0
-#endif
-	teq	r0,#0
-	ldrne	lr,[sp],#4              @ pop lr
-	bne	.Labrt
-
-	stmdb   sp!,{r4-r12}
-
-	ldr	r12,[r2,#240]	@ AES_set_encrypt_key preserves r2,
-	mov	r11,r2			@ which is AES_KEY *key
-	mov	r7,r2
-	add	r8,r2,r12,lsl#4
-
-.Linv:	ldr	r0,[r7]
-	ldr	r1,[r7,#4]
-	ldr	r2,[r7,#8]
-	ldr	r3,[r7,#12]
-	ldr	r4,[r8]
-	ldr	r5,[r8,#4]
-	ldr	r6,[r8,#8]
-	ldr	r9,[r8,#12]
-	str	r0,[r8],#-16
-	str	r1,[r8,#16+4]
-	str	r2,[r8,#16+8]
-	str	r3,[r8,#16+12]
-	str	r4,[r7],#16
-	str	r5,[r7,#-12]
-	str	r6,[r7,#-8]
-	str	r9,[r7,#-4]
-	teq	r7,r8
-	bne	.Linv
-	ldr	r0,[r11,#16]!		@ prefetch tp1
-	mov	r7,#0x80
-	mov	r8,#0x1b
-	orr	r7,r7,#0x8000
-	orr	r8,r8,#0x1b00
-	orr	r7,r7,r7,lsl#16
-	orr	r8,r8,r8,lsl#16
-	sub	r12,r12,#1
-	mvn	r9,r7
-	mov	r12,r12,lsl#2	@ (rounds-1)*4
-
-.Lmix:	and	r4,r0,r7
-	and	r1,r0,r9
-	sub	r4,r4,r4,lsr#7
-	and	r4,r4,r8
-	eor	r1,r4,r1,lsl#1	@ tp2
-
-	and	r4,r1,r7
-	and	r2,r1,r9
-	sub	r4,r4,r4,lsr#7
-	and	r4,r4,r8
-	eor	r2,r4,r2,lsl#1	@ tp4
-
-	and	r4,r2,r7
-	and	r3,r2,r9
-	sub	r4,r4,r4,lsr#7
-	and	r4,r4,r8
-	eor	r3,r4,r3,lsl#1	@ tp8
-
-	eor	r4,r1,r2
-	eor	r5,r0,r3		@ tp9
-	eor	r4,r4,r3		@ tpe
-	eor	r4,r4,r1,ror#24
-	eor	r4,r4,r5,ror#24	@ ^= ROTATE(tpb=tp9^tp2,8)
-	eor	r4,r4,r2,ror#16
-	eor	r4,r4,r5,ror#16	@ ^= ROTATE(tpd=tp9^tp4,16)
-	eor	r4,r4,r5,ror#8	@ ^= ROTATE(tp9,24)
-
-	ldr	r0,[r11,#4]		@ prefetch tp1
-	str	r4,[r11],#4
-	subs	r12,r12,#1
-	bne	.Lmix
-
-	mov	r0,#0
-	ldmia	sp!,{r4-r12,pc}
-ENDPROC(private_AES_set_decrypt_key)
-
-.type	AES_Td,%object
-.align	5
-AES_Td:
-.word	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
-.word	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
-.word	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
-.word	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
-.word	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
-.word	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
-.word	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
-.word	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
-.word	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
-.word	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
-.word	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
-.word	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
-.word	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
-.word	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
-.word	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
-.word	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
-.word	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
-.word	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
-.word	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
-.word	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
-.word	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
-.word	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
-.word	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
-.word	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
-.word	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
-.word	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
-.word	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
-.word	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
-.word	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
-.word	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
-.word	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
-.word	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
-.word	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
-.word	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
-.word	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
-.word	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
-.word	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
-.word	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
-.word	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
-.word	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
-.word	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
-.word	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
-.word	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
-.word	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
-.word	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
-.word	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
-.word	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
-.word	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
-.word	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
-.word	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
-.word	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
-.word	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
-.word	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
-.word	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
-.word	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
-.word	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
-.word	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
-.word	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
-.word	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
-.word	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
-.word	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
-.word	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
-.word	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
-.word	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
-@ Td4[256]
-.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
-.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
-.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
-.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
-.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
-.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
-.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
-.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
-.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
-.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
-.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
-.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
-.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
-.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
-.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
-.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
-.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
-.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
-.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
-.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
-.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
-.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
-.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
-.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
-.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
-.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
-.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
-.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
-.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
-.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
-.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
-.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-.size	AES_Td,.-AES_Td
-
-@ void AES_decrypt(const unsigned char *in, unsigned char *out,
-@ 		 const AES_KEY *key) {
-.align	5
-ENTRY(AES_decrypt)
-	adr	r3,AES_decrypt
-	stmdb   sp!,{r1,r4-r12,lr}
-	mov	r12,r0		@ inp
-	mov	r11,r2
-	sub	r10,r3,#AES_decrypt-AES_Td		@ Td
-#if __ARM_ARCH__<7
-	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
-	ldrb	r4,[r12,#2]	@ manner...
-	ldrb	r5,[r12,#1]
-	ldrb	r6,[r12,#0]
-	orr	r0,r0,r4,lsl#8
-	ldrb	r1,[r12,#7]
-	orr	r0,r0,r5,lsl#16
-	ldrb	r4,[r12,#6]
-	orr	r0,r0,r6,lsl#24
-	ldrb	r5,[r12,#5]
-	ldrb	r6,[r12,#4]
-	orr	r1,r1,r4,lsl#8
-	ldrb	r2,[r12,#11]
-	orr	r1,r1,r5,lsl#16
-	ldrb	r4,[r12,#10]
-	orr	r1,r1,r6,lsl#24
-	ldrb	r5,[r12,#9]
-	ldrb	r6,[r12,#8]
-	orr	r2,r2,r4,lsl#8
-	ldrb	r3,[r12,#15]
-	orr	r2,r2,r5,lsl#16
-	ldrb	r4,[r12,#14]
-	orr	r2,r2,r6,lsl#24
-	ldrb	r5,[r12,#13]
-	ldrb	r6,[r12,#12]
-	orr	r3,r3,r4,lsl#8
-	orr	r3,r3,r5,lsl#16
-	orr	r3,r3,r6,lsl#24
-#else
-	ldr	r0,[r12,#0]
-	ldr	r1,[r12,#4]
-	ldr	r2,[r12,#8]
-	ldr	r3,[r12,#12]
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-#endif
-	bl	_armv4_AES_decrypt
-
-	ldr	r12,[sp],#4		@ pop out
-#if __ARM_ARCH__>=7
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-	str	r0,[r12,#0]
-	str	r1,[r12,#4]
-	str	r2,[r12,#8]
-	str	r3,[r12,#12]
-#else
-	mov	r4,r0,lsr#24		@ write output in endian-neutral
-	mov	r5,r0,lsr#16		@ manner...
-	mov	r6,r0,lsr#8
-	strb	r4,[r12,#0]
-	strb	r5,[r12,#1]
-	mov	r4,r1,lsr#24
-	strb	r6,[r12,#2]
-	mov	r5,r1,lsr#16
-	strb	r0,[r12,#3]
-	mov	r6,r1,lsr#8
-	strb	r4,[r12,#4]
-	strb	r5,[r12,#5]
-	mov	r4,r2,lsr#24
-	strb	r6,[r12,#6]
-	mov	r5,r2,lsr#16
-	strb	r1,[r12,#7]
-	mov	r6,r2,lsr#8
-	strb	r4,[r12,#8]
-	strb	r5,[r12,#9]
-	mov	r4,r3,lsr#24
-	strb	r6,[r12,#10]
-	mov	r5,r3,lsr#16
-	strb	r2,[r12,#11]
-	mov	r6,r3,lsr#8
-	strb	r4,[r12,#12]
-	strb	r5,[r12,#13]
-	strb	r6,[r12,#14]
-	strb	r3,[r12,#15]
-#endif
-	ldmia	sp!,{r4-r12,pc}
-ENDPROC(AES_decrypt)
-
-.type   _armv4_AES_decrypt,%function
-.align	2
-_armv4_AES_decrypt:
-	str	lr,[sp,#-4]!		@ push lr
-	ldmia	r11!,{r4-r7}
-	eor	r0,r0,r4
-	ldr	r12,[r11,#240-16]
-	eor	r1,r1,r5
-	eor	r2,r2,r6
-	eor	r3,r3,r7
-	sub	r12,r12,#1
-	mov	lr,#255
-
-	and	r7,lr,r0,lsr#16
-	and	r8,lr,r0,lsr#8
-	and	r9,lr,r0
-	mov	r0,r0,lsr#24
-.Ldec_loop:
-	ldr	r4,[r10,r7,lsl#2]	@ Td1[s0>>16]
-	and	r7,lr,r1		@ i0
-	ldr	r5,[r10,r8,lsl#2]	@ Td2[s0>>8]
-	and	r8,lr,r1,lsr#16
-	ldr	r6,[r10,r9,lsl#2]	@ Td3[s0>>0]
-	and	r9,lr,r1,lsr#8
-	ldr	r0,[r10,r0,lsl#2]	@ Td0[s0>>24]
-	mov	r1,r1,lsr#24
-
-	ldr	r7,[r10,r7,lsl#2]	@ Td3[s1>>0]
-	ldr	r8,[r10,r8,lsl#2]	@ Td1[s1>>16]
-	ldr	r9,[r10,r9,lsl#2]	@ Td2[s1>>8]
-	eor	r0,r0,r7,ror#24
-	ldr	r1,[r10,r1,lsl#2]	@ Td0[s1>>24]
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r8,r5,ror#8
-	and	r8,lr,r2		@ i1
-	eor	r6,r9,r6,ror#8
-	and	r9,lr,r2,lsr#16
-	ldr	r7,[r10,r7,lsl#2]	@ Td2[s2>>8]
-	eor	r1,r1,r4,ror#8
-	ldr	r8,[r10,r8,lsl#2]	@ Td3[s2>>0]
-	mov	r2,r2,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Td1[s2>>16]
-	eor	r0,r0,r7,ror#16
-	ldr	r2,[r10,r2,lsl#2]	@ Td0[s2>>24]
-	and	r7,lr,r3,lsr#16	@ i0
-	eor	r1,r1,r8,ror#24
-	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r9,r6,ror#8
-	and	r9,lr,r3		@ i2
-	ldr	r7,[r10,r7,lsl#2]	@ Td1[s3>>16]
-	eor	r2,r2,r5,ror#8
-	ldr	r8,[r10,r8,lsl#2]	@ Td2[s3>>8]
-	mov	r3,r3,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Td3[s3>>0]
-	eor	r0,r0,r7,ror#8
-	ldr	r7,[r11],#16
-	eor	r1,r1,r8,ror#16
-	ldr	r3,[r10,r3,lsl#2]	@ Td0[s3>>24]
-	eor	r2,r2,r9,ror#24
-
-	ldr	r4,[r11,#-12]
-	eor	r0,r0,r7
-	ldr	r5,[r11,#-8]
-	eor	r3,r3,r6,ror#8
-	ldr	r6,[r11,#-4]
-	and	r7,lr,r0,lsr#16
-	eor	r1,r1,r4
-	and	r8,lr,r0,lsr#8
-	eor	r2,r2,r5
-	and	r9,lr,r0
-	eor	r3,r3,r6
-	mov	r0,r0,lsr#24
-
-	subs	r12,r12,#1
-	bne	.Ldec_loop
-
-	add	r10,r10,#1024
-
-	ldr	r5,[r10,#0]		@ prefetch Td4
-	ldr	r6,[r10,#32]
-	ldr	r4,[r10,#64]
-	ldr	r5,[r10,#96]
-	ldr	r6,[r10,#128]
-	ldr	r4,[r10,#160]
-	ldr	r5,[r10,#192]
-	ldr	r6,[r10,#224]
-
-	ldrb	r0,[r10,r0]		@ Td4[s0>>24]
-	ldrb	r4,[r10,r7]		@ Td4[s0>>16]
-	and	r7,lr,r1		@ i0
-	ldrb	r5,[r10,r8]		@ Td4[s0>>8]
-	and	r8,lr,r1,lsr#16
-	ldrb	r6,[r10,r9]		@ Td4[s0>>0]
-	and	r9,lr,r1,lsr#8
-
-	ldrb	r7,[r10,r7]		@ Td4[s1>>0]
- ARM(	ldrb	r1,[r10,r1,lsr#24]  )	@ Td4[s1>>24]
- THUMB(	add	r1,r10,r1,lsr#24    ) 	@ Td4[s1>>24]
- THUMB(	ldrb	r1,[r1]		    )
-	ldrb	r8,[r10,r8]		@ Td4[s1>>16]
-	eor	r0,r7,r0,lsl#24
-	ldrb	r9,[r10,r9]		@ Td4[s1>>8]
-	eor	r1,r4,r1,lsl#8
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r5,r8,lsl#8
-	and	r8,lr,r2		@ i1
-	ldrb	r7,[r10,r7]		@ Td4[s2>>8]
-	eor	r6,r6,r9,lsl#8
-	ldrb	r8,[r10,r8]		@ Td4[s2>>0]
-	and	r9,lr,r2,lsr#16
-
- ARM(	ldrb	r2,[r10,r2,lsr#24]  )	@ Td4[s2>>24]
- THUMB(	add	r2,r10,r2,lsr#24    )	@ Td4[s2>>24]
- THUMB(	ldrb	r2,[r2]		    )
-	eor	r0,r0,r7,lsl#8
-	ldrb	r9,[r10,r9]		@ Td4[s2>>16]
-	eor	r1,r8,r1,lsl#16
-	and	r7,lr,r3,lsr#16	@ i0
-	eor	r2,r5,r2,lsl#16
-	and	r8,lr,r3,lsr#8	@ i1
-	ldrb	r7,[r10,r7]		@ Td4[s3>>16]
-	eor	r6,r6,r9,lsl#16
-	ldrb	r8,[r10,r8]		@ Td4[s3>>8]
-	and	r9,lr,r3		@ i2
-
-	ldrb	r9,[r10,r9]		@ Td4[s3>>0]
- ARM(	ldrb	r3,[r10,r3,lsr#24]  )	@ Td4[s3>>24]
- THUMB(	add	r3,r10,r3,lsr#24    )	@ Td4[s3>>24]
- THUMB(	ldrb	r3,[r3]		    )
-	eor	r0,r0,r7,lsl#16
-	ldr	r7,[r11,#0]
-	eor	r1,r1,r8,lsl#8
-	ldr	r4,[r11,#4]
-	eor	r2,r9,r2,lsl#8
-	ldr	r5,[r11,#8]
-	eor	r3,r6,r3,lsl#24
-	ldr	r6,[r11,#12]
-
-	eor	r0,r0,r7
-	eor	r1,r1,r4
-	eor	r2,r2,r5
-	eor	r3,r3,r6
-
-	sub	r10,r10,#1024
-	ldr	pc,[sp],#4		@ pop and return
-.size	_armv4_AES_decrypt,.-_armv4_AES_decrypt
-.asciz	"AES for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align	2
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S
index 987aa632c9f0..ba8e6a32fdc9 100644
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -169,19 +169,19 @@ ENTRY(ce_aes_ecb_encrypt)
 .Lecbencloop3x:
 	subs		r4, r4, #3
 	bmi		.Lecbenc1x
-	vld1.8		{q0-q1}, [r1, :64]!
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!
+	vld1.8		{q2}, [r1]!
 	bl		aes_encrypt_3x
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	b		.Lecbencloop3x
 .Lecbenc1x:
 	adds		r4, r4, #3
 	beq		.Lecbencout
 .Lecbencloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	bl		aes_encrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lecbencloop
 .Lecbencout:
@@ -195,19 +195,19 @@ ENTRY(ce_aes_ecb_decrypt)
 .Lecbdecloop3x:
 	subs		r4, r4, #3
 	bmi		.Lecbdec1x
-	vld1.8		{q0-q1}, [r1, :64]!
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!
+	vld1.8		{q2}, [r1]!
 	bl		aes_decrypt_3x
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	b		.Lecbdecloop3x
 .Lecbdec1x:
 	adds		r4, r4, #3
 	beq		.Lecbdecout
 .Lecbdecloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	bl		aes_decrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lecbdecloop
 .Lecbdecout:
@@ -226,10 +226,10 @@ ENTRY(ce_aes_cbc_encrypt)
 	vld1.8		{q0}, [r5]
 	prepare_key	r2, r3
 .Lcbcencloop:
-	vld1.8		{q1}, [r1, :64]!	@ get next pt block
+	vld1.8		{q1}, [r1]!		@ get next pt block
 	veor		q0, q0, q1		@ ..and xor with iv
 	bl		aes_encrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lcbcencloop
 	vst1.8		{q0}, [r5]
@@ -244,8 +244,8 @@ ENTRY(ce_aes_cbc_decrypt)
 .Lcbcdecloop3x:
 	subs		r4, r4, #3
 	bmi		.Lcbcdec1x
-	vld1.8		{q0-q1}, [r1, :64]!
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!
+	vld1.8		{q2}, [r1]!
 	vmov		q3, q0
 	vmov		q4, q1
 	vmov		q5, q2
@@ -254,19 +254,19 @@ ENTRY(ce_aes_cbc_decrypt)
 	veor		q1, q1, q3
 	veor		q2, q2, q4
 	vmov		q6, q5
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	b		.Lcbcdecloop3x
 .Lcbcdec1x:
 	adds		r4, r4, #3
 	beq		.Lcbcdecout
 	vmov		q15, q14		@ preserve last round key
 .Lcbcdecloop:
-	vld1.8		{q0}, [r1, :64]!	@ get next ct block
+	vld1.8		{q0}, [r1]!		@ get next ct block
 	veor		q14, q15, q6		@ combine prev ct with last key
 	vmov		q6, q0
 	bl		aes_decrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
@@ -300,15 +300,15 @@ ENTRY(ce_aes_ctr_encrypt)
 	rev		ip, r6
 	add		r6, r6, #1
 	vmov		s11, ip
-	vld1.8		{q3-q4}, [r1, :64]!
-	vld1.8		{q5}, [r1, :64]!
+	vld1.8		{q3-q4}, [r1]!
+	vld1.8		{q5}, [r1]!
 	bl		aes_encrypt_3x
 	veor		q0, q0, q3
 	veor		q1, q1, q4
 	veor		q2, q2, q5
 	rev		ip, r6
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	vmov		s27, ip
 	b		.Lctrloop3x
 .Lctr1x:
@@ -318,10 +318,10 @@ ENTRY(ce_aes_ctr_encrypt)
 	vmov		q0, q6
 	bl		aes_encrypt
 	subs		r4, r4, #1
-	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
-	vld1.8		{q3}, [r1, :64]!
+	bmi		.Lctrtailblock		@ blocks < 0 means tail block
+	vld1.8		{q3}, [r1]!
 	veor		q3, q0, q3
-	vst1.8		{q3}, [r0, :64]!
+	vst1.8		{q3}, [r0]!
 
 	adds		r6, r6, #1		@ increment BE ctr
 	rev		ip, r6
@@ -333,10 +333,8 @@ ENTRY(ce_aes_ctr_encrypt)
 	vst1.8		{q6}, [r5]
 	pop		{r4-r6, pc}
 
-.Lctrhalfblock:
-	vld1.8		{d1}, [r1, :64]
-	veor		d0, d0, d1
-	vst1.8		{d0}, [r0, :64]
+.Lctrtailblock:
+	vst1.8		{q0}, [r0, :64]		@ return just the key stream
 	pop		{r4-r6, pc}
 
 .Lctrcarry:
@@ -405,8 +403,8 @@ ENTRY(ce_aes_xts_encrypt)
 .Lxtsenc3x:
 	subs		r4, r4, #3
 	bmi		.Lxtsenc1x
-	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
+	vld1.8		{q2}, [r1]!
 	next_tweak	q4, q3, q7, q6
 	veor		q0, q0, q3
 	next_tweak	q5, q4, q7, q6
@@ -416,8 +414,8 @@ ENTRY(ce_aes_xts_encrypt)
 	veor		q0, q0, q3
 	veor		q1, q1, q4
 	veor		q2, q2, q5
-	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
+	vst1.8		{q2}, [r0]!
 	vmov		q3, q5
 	teq		r4, #0
 	beq		.Lxtsencout
@@ -426,11 +424,11 @@ ENTRY(ce_aes_xts_encrypt)
 	adds		r4, r4, #3
 	beq		.Lxtsencout
 .Lxtsencloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	veor		q0, q0, q3
 	bl		aes_encrypt
 	veor		q0, q0, q3
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	beq		.Lxtsencout
 	next_tweak	q3, q3, q7, q6
@@ -456,8 +454,8 @@ ENTRY(ce_aes_xts_decrypt)
 .Lxtsdec3x:
 	subs		r4, r4, #3
 	bmi		.Lxtsdec1x
-	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
+	vld1.8		{q2}, [r1]!
 	next_tweak	q4, q3, q7, q6
 	veor		q0, q0, q3
 	next_tweak	q5, q4, q7, q6
@@ -467,8 +465,8 @@ ENTRY(ce_aes_xts_decrypt)
 	veor		q0, q0, q3
 	veor		q1, q1, q4
 	veor		q2, q2, q5
-	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
+	vst1.8		{q2}, [r0]!
 	vmov		q3, q5
 	teq		r4, #0
 	beq		.Lxtsdecout
@@ -477,12 +475,12 @@ ENTRY(ce_aes_xts_decrypt)
 	adds		r4, r4, #3
 	beq		.Lxtsdecout
 .Lxtsdecloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	veor		q0, q0, q3
 	add		ip, r2, #32		@ 3rd round key
 	bl		aes_decrypt
 	veor		q0, q0, q3
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	beq		.Lxtsdecout
 	next_tweak	q3, q3, q7, q6
diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c
index 8857531915bf..883b84d828c5 100644
--- a/arch/arm/crypto/aes-ce-glue.c
+++ b/arch/arm/crypto/aes-ce-glue.c
@@ -278,14 +278,15 @@ static int ctr_encrypt(struct skcipher_request *req)
 		u8 *tsrc = walk.src.virt.addr;
 
 		/*
-		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
-		 * to tell aes_ctr_encrypt() to only read half a block.
+		 * Tell aes_ctr_encrypt() to process a tail block.
 		 */
-		blocks = (nbytes <= 8) ? -1 : 1;
+		blocks = -1;
 
-		ce_aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc,
+		ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc,
 				   num_rounds(ctx), blocks, walk.iv);
-		memcpy(tdst, tail, nbytes);
+		if (tdst != tsrc)
+			memcpy(tdst, tsrc, nbytes);
+		crypto_xor(tdst, tail, nbytes);
 		err = skcipher_walk_done(&walk, 0);
 	}
 	kernel_neon_end();
@@ -345,7 +346,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -361,7 +361,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -378,7 +377,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -396,7 +394,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
new file mode 100644
index 000000000000..c817a86c4ca8
--- /dev/null
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -0,0 +1,179 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.align		5
+
+	rk		.req	r0
+	rounds		.req	r1
+	in		.req	r2
+	out		.req	r3
+	ttab		.req	ip
+
+	t0		.req	lr
+	t1		.req	r2
+	t2		.req	r3
+
+	.macro		__select, out, in, idx
+	.if		__LINUX_ARM_ARCH__ < 7
+	and		\out, \in, #0xff << (8 * \idx)
+	.else
+	ubfx		\out, \in, #(8 * \idx), #8
+	.endif
+	.endm
+
+	.macro		__load, out, in, idx
+	.if		__LINUX_ARM_ARCH__ < 7 && \idx > 0
+	ldr		\out, [ttab, \in, lsr #(8 * \idx) - 2]
+	.else
+	ldr		\out, [ttab, \in, lsl #2]
+	.endif
+	.endm
+
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc
+	__select	\out0, \in0, 0
+	__select	t0, \in1, 1
+	__load		\out0, \out0, 0
+	__load		t0, t0, 1
+
+	.if		\enc
+	__select	\out1, \in1, 0
+	__select	t1, \in2, 1
+	.else
+	__select	\out1, \in3, 0
+	__select	t1, \in0, 1
+	.endif
+	__load		\out1, \out1, 0
+	__select	t2, \in2, 2
+	__load		t1, t1, 1
+	__load		t2, t2, 2
+
+	eor		\out0, \out0, t0, ror #24
+
+	__select	t0, \in3, 3
+	.if		\enc
+	__select	\t3, \in3, 2
+	__select	\t4, \in0, 3
+	.else
+	__select	\t3, \in1, 2
+	__select	\t4, \in2, 3
+	.endif
+	__load		\t3, \t3, 2
+	__load		t0, t0, 3
+	__load		\t4, \t4, 3
+
+	eor		\out1, \out1, t1, ror #24
+	eor		\out0, \out0, t2, ror #16
+	ldm		rk!, {t1, t2}
+	eor		\out1, \out1, \t3, ror #16
+	eor		\out0, \out0, t0, ror #8
+	eor		\out1, \out1, \t4, ror #8
+	eor		\out0, \out0, t1
+	eor		\out1, \out1, t2
+	.endm
+
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+	.endm
+
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+	.endm
+
+	.macro		__rev, out, in
+	.if		__LINUX_ARM_ARCH__ < 6
+	lsl		t0, \in, #24
+	and		t1, \in, #0xff00
+	and		t2, \in, #0xff0000
+	orr		\out, t0, \in, lsr #24
+	orr		\out, \out, t1, lsl #8
+	orr		\out, \out, t2, lsr #8
+	.else
+	rev		\out, \in
+	.endif
+	.endm
+
+	.macro		__adrl, out, sym, c
+	.if		__LINUX_ARM_ARCH__ < 7
+	ldr\c		\out, =\sym
+	.else
+	movw\c		\out, #:lower16:\sym
+	movt\c		\out, #:upper16:\sym
+	.endif
+	.endm
+
+	.macro		do_crypt, round, ttab, ltab
+	push		{r3-r11, lr}
+
+	ldr		r4, [in]
+	ldr		r5, [in, #4]
+	ldr		r6, [in, #8]
+	ldr		r7, [in, #12]
+
+	ldm		rk!, {r8-r11}
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	__rev		r4, r4
+	__rev		r5, r5
+	__rev		r6, r6
+	__rev		r7, r7
+#endif
+
+	eor		r4, r4, r8
+	eor		r5, r5, r9
+	eor		r6, r6, r10
+	eor		r7, r7, r11
+
+	__adrl		ttab, \ttab
+
+	tst		rounds, #2
+	bne		1f
+
+0:	\round		r8, r9, r10, r11, r4, r5, r6, r7
+	\round		r4, r5, r6, r7, r8, r9, r10, r11
+
+1:	subs		rounds, rounds, #4
+	\round		r8, r9, r10, r11, r4, r5, r6, r7
+	__adrl		ttab, \ltab, ls
+	\round		r4, r5, r6, r7, r8, r9, r10, r11
+	bhi		0b
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	__rev		r4, r4
+	__rev		r5, r5
+	__rev		r6, r6
+	__rev		r7, r7
+#endif
+
+	ldr		out, [sp]
+
+	str		r4, [out]
+	str		r5, [out, #4]
+	str		r6, [out, #8]
+	str		r7, [out, #12]
+
+	pop		{r3-r11, pc}
+
+	.align		3
+	.ltorg
+	.endm
+
+ENTRY(__aes_arm_encrypt)
+	do_crypt	fround, crypto_ft_tab, crypto_fl_tab
+ENDPROC(__aes_arm_encrypt)
+
+ENTRY(__aes_arm_decrypt)
+	do_crypt	iround, crypto_it_tab, crypto_il_tab
+ENDPROC(__aes_arm_decrypt)
diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
new file mode 100644
index 000000000000..c222f6e072ad
--- /dev/null
+++ b/arch/arm/crypto/aes-cipher-glue.c
@@ -0,0 +1,74 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+EXPORT_SYMBOL(__aes_arm_encrypt);
+
+asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+EXPORT_SYMBOL(__aes_arm_decrypt);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm_decrypt(ctx->key_dec, rounds, in, out);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name			= "aes",
+	.cra_driver_name		= "aes-arm",
+	.cra_priority			= 200,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= AES_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
+	.cra_module			= THIS_MODULE,
+
+	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
+	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
+	.cra_cipher.cia_setkey		= crypto_aes_set_key,
+	.cra_cipher.cia_encrypt		= aes_encrypt,
+	.cra_cipher.cia_decrypt		= aes_decrypt,
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	.cra_alignmask			= 3,
+#endif
+};
+
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Scalar AES cipher for ARM");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/arm/crypto/aes-neonbs-core.S b/arch/arm/crypto/aes-neonbs-core.S
new file mode 100644
index 000000000000..2b625c6d4712
--- /dev/null
+++ b/arch/arm/crypto/aes-neonbs-core.S
@@ -0,0 +1,1023 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.fpu		neon
+
+	rounds		.req	ip
+	bskey		.req	r4
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+	q8l		.req	d16
+	q8h		.req	d17
+	q9l		.req	d18
+	q9h		.req	d19
+	q10l		.req	d20
+	q10h		.req	d21
+	q11l		.req	d22
+	q11h		.req	d23
+	q12l		.req	d24
+	q12h		.req	d25
+	q13l		.req	d26
+	q13h		.req	d27
+	q14l		.req	d28
+	q14h		.req	d29
+	q15l		.req	d30
+	q15h		.req	d31
+
+	.macro		__tbl, out, tbl, in, tmp
+	.ifc		\out, \tbl
+	.ifb		\tmp
+	.error		__tbl needs temp register if out == tbl
+	.endif
+	vmov		\tmp, \out
+	.endif
+	vtbl.8		\out\()l, {\tbl}, \in\()l
+	.ifc		\out, \tbl
+	vtbl.8		\out\()h, {\tmp}, \in\()h
+	.else
+	vtbl.8		\out\()h, {\tbl}, \in\()h
+	.endif
+	.endm
+
+	.macro		__ldr, out, sym
+	vldr		\out\()l, \sym
+	vldr		\out\()h, \sym + 8
+	.endm
+
+	.macro		__adr, reg, lbl
+	adr		\reg, \lbl
+THUMB(	orr		\reg, \reg, #1		)
+	.endm
+
+	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	veor		\b2, \b2, \b1
+	veor		\b5, \b5, \b6
+	veor		\b3, \b3, \b0
+	veor		\b6, \b6, \b2
+	veor		\b5, \b5, \b0
+	veor		\b6, \b6, \b3
+	veor		\b3, \b3, \b7
+	veor		\b7, \b7, \b5
+	veor		\b3, \b3, \b4
+	veor		\b4, \b4, \b5
+	veor		\b2, \b2, \b7
+	veor		\b3, \b3, \b1
+	veor		\b1, \b1, \b5
+	.endm
+
+	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	veor		\b0, \b0, \b6
+	veor		\b1, \b1, \b4
+	veor		\b4, \b4, \b6
+	veor		\b2, \b2, \b0
+	veor		\b6, \b6, \b1
+	veor		\b1, \b1, \b5
+	veor		\b5, \b5, \b3
+	veor		\b3, \b3, \b7
+	veor		\b7, \b7, \b5
+	veor		\b2, \b2, \b5
+	veor		\b4, \b4, \b7
+	.endm
+
+	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+	veor		\b1, \b1, \b7
+	veor		\b4, \b4, \b7
+	veor		\b7, \b7, \b5
+	veor		\b1, \b1, \b3
+	veor		\b2, \b2, \b5
+	veor		\b3, \b3, \b7
+	veor		\b6, \b6, \b1
+	veor		\b2, \b2, \b0
+	veor		\b5, \b5, \b3
+	veor		\b4, \b4, \b6
+	veor		\b0, \b0, \b6
+	veor		\b1, \b1, \b4
+	.endm
+
+	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+	veor		\b1, \b1, \b5
+	veor		\b2, \b2, \b7
+	veor		\b3, \b3, \b1
+	veor		\b4, \b4, \b5
+	veor		\b7, \b7, \b5
+	veor		\b3, \b3, \b4
+	veor 		\b5, \b5, \b0
+	veor		\b3, \b3, \b7
+	veor		\b6, \b6, \b2
+	veor		\b2, \b2, \b1
+	veor		\b6, \b6, \b3
+	veor		\b3, \b3, \b0
+	veor		\b5, \b5, \b6
+	.endm
+
+	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
+	veor 		\t0, \y0, \y1
+	vand		\t0, \t0, \x0
+	veor		\x0, \x0, \x1
+	vand		\t1, \x1, \y0
+	vand		\x0, \x0, \y1
+	veor		\x1, \t1, \t0
+	veor		\x0, \x0, \t1
+	.endm
+
+	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+	veor		\t0, \y0, \y1
+	veor 		\t1, \y2, \y3
+	vand		\t0, \t0, \x0
+	vand		\t1, \t1, \x2
+	veor		\x0, \x0, \x1
+	veor		\x2, \x2, \x3
+	vand		\x1, \x1, \y0
+	vand		\x3, \x3, \y2
+	vand		\x0, \x0, \y1
+	vand		\x2, \x2, \y3
+	veor		\x1, \x1, \x0
+	veor		\x2, \x2, \x3
+	veor		\x0, \x0, \t0
+	veor		\x3, \x3, \t1
+	.endm
+
+	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    y0, y1, y2, y3, t0, t1, t2, t3
+	veor		\t0, \x0, \x2
+	veor		\t1, \x1, \x3
+	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
+	veor		\y0, \y0, \y2
+	veor		\y1, \y1, \y3
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+	veor		\x0, \x0, \t0
+	veor		\x2, \x2, \t0
+	veor		\x1, \x1, \t1
+	veor		\x3, \x3, \t1
+	veor		\t0, \x4, \x6
+	veor		\t1, \x5, \x7
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+	veor		\y0, \y0, \y2
+	veor		\y1, \y1, \y3
+	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
+	veor		\x4, \x4, \t0
+	veor		\x6, \x6, \t0
+	veor		\x5, \x5, \t1
+	veor		\x7, \x7, \t1
+	.endm
+
+	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+				   t0, t1, t2, t3, s0, s1, s2, s3
+	veor		\t3, \x4, \x6
+	veor		\t0, \x5, \x7
+	veor		\t1, \x1, \x3
+	veor		\s1, \x7, \x6
+	veor		\s0, \x0, \x2
+	veor		\s3, \t3, \t0
+	vorr		\t2, \t0, \t1
+	vand		\s2, \t3, \s0
+	vorr		\t3, \t3, \s0
+	veor		\s0, \s0, \t1
+	vand		\t0, \t0, \t1
+	veor		\t1, \x3, \x2
+	vand		\s3, \s3, \s0
+	vand		\s1, \s1, \t1
+	veor		\t1, \x4, \x5
+	veor		\s0, \x1, \x0
+	veor		\t3, \t3, \s1
+	veor		\t2, \t2, \s1
+	vand		\s1, \t1, \s0
+	vorr		\t1, \t1, \s0
+	veor		\t3, \t3, \s3
+	veor		\t0, \t0, \s1
+	veor		\t2, \t2, \s2
+	veor		\t1, \t1, \s3
+	veor		\t0, \t0, \s2
+	vand		\s0, \x7, \x3
+	veor		\t1, \t1, \s2
+	vand		\s1, \x6, \x2
+	vand		\s2, \x5, \x1
+	vorr		\s3, \x4, \x0
+	veor		\t3, \t3, \s0
+	veor		\t1, \t1, \s2
+	veor		\s0, \t0, \s3
+	veor		\t2, \t2, \s1
+	vand		\s2, \t3, \t1
+	veor		\s1, \t2, \s2
+	veor		\s3, \s0, \s2
+	vbsl		\s1, \t1, \s0
+	vmvn		\t0, \s0
+	vbsl		\s0, \s1, \s3
+	vbsl		\t0, \s1, \s3
+	vbsl		\s3, \t3, \t2
+	veor		\t3, \t3, \t2
+	vand		\s2, \s0, \s3
+	veor		\t1, \t1, \t0
+	veor		\s2, \s2, \t3
+	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	.endm
+
+	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+			      t0, t1, t2, t3, s0, s1, s2, s3
+	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
+	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
+			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
+	.endm
+
+	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+				  t0, t1, t2, t3, s0, s1, s2, s3
+	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
+	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
+			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
+	.endm
+
+	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    t0, t1, t2, t3, mask
+	vld1.8		{\t0-\t1}, [bskey, :256]!
+	veor		\t0, \t0, \x0
+	vld1.8		{\t2-\t3}, [bskey, :256]!
+	veor		\t1, \t1, \x1
+	__tbl		\x0, \t0, \mask
+	veor		\t2, \t2, \x2
+	__tbl		\x1, \t1, \mask
+	vld1.8		{\t0-\t1}, [bskey, :256]!
+	veor		\t3, \t3, \x3
+	__tbl		\x2, \t2, \mask
+	__tbl		\x3, \t3, \mask
+	vld1.8		{\t2-\t3}, [bskey, :256]!
+	veor		\t0, \t0, \x4
+	veor		\t1, \t1, \x5
+	__tbl		\x4, \t0, \mask
+	veor		\t2, \t2, \x6
+	__tbl		\x5, \t1, \mask
+	veor		\t3, \t3, \x7
+	__tbl		\x6, \t2, \mask
+	__tbl		\x7, \t3, \mask
+	.endm
+
+	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
+					t0, t1, t2, t3, mask
+	__tbl		\x0, \x0, \mask, \t0
+	__tbl		\x1, \x1, \mask, \t1
+	__tbl		\x2, \x2, \mask, \t2
+	__tbl		\x3, \x3, \mask, \t3
+	__tbl		\x4, \x4, \mask, \t0
+	__tbl		\x5, \x5, \mask, \t1
+	__tbl		\x6, \x6, \mask, \t2
+	__tbl		\x7, \x7, \mask, \t3
+	.endm
+
+	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				  t0, t1, t2, t3, t4, t5, t6, t7, inv
+	vext.8		\t0, \x0, \x0, #12
+	vext.8		\t1, \x1, \x1, #12
+	veor		\x0, \x0, \t0
+	vext.8		\t2, \x2, \x2, #12
+	veor		\x1, \x1, \t1
+	vext.8		\t3, \x3, \x3, #12
+	veor		\x2, \x2, \t2
+	vext.8		\t4, \x4, \x4, #12
+	veor		\x3, \x3, \t3
+	vext.8		\t5, \x5, \x5, #12
+	veor		\x4, \x4, \t4
+	vext.8		\t6, \x6, \x6, #12
+	veor		\x5, \x5, \t5
+	vext.8		\t7, \x7, \x7, #12
+	veor		\x6, \x6, \t6
+	veor		\t1, \t1, \x0
+	veor.8		\x7, \x7, \t7
+	vext.8		\x0, \x0, \x0, #8
+	veor		\t2, \t2, \x1
+	veor		\t0, \t0, \x7
+	veor		\t1, \t1, \x7
+	vext.8		\x1, \x1, \x1, #8
+	veor		\t5, \t5, \x4
+	veor		\x0, \x0, \t0
+	veor		\t6, \t6, \x5
+	veor		\x1, \x1, \t1
+	vext.8		\t0, \x4, \x4, #8
+	veor		\t4, \t4, \x3
+	vext.8		\t1, \x5, \x5, #8
+	veor		\t7, \t7, \x6
+	vext.8		\x4, \x3, \x3, #8
+	veor		\t3, \t3, \x2
+	vext.8		\x5, \x7, \x7, #8
+	veor		\t4, \t4, \x7
+	vext.8		\x3, \x6, \x6, #8
+	veor		\t3, \t3, \x7
+	vext.8		\x6, \x2, \x2, #8
+	veor		\x7, \t1, \t5
+	.ifb		\inv
+	veor		\x2, \t0, \t4
+	veor		\x4, \x4, \t3
+	veor		\x5, \x5, \t7
+	veor		\x3, \x3, \t6
+	veor		\x6, \x6, \t2
+	.else
+	veor		\t3, \t3, \x4
+	veor		\x5, \x5, \t7
+	veor		\x2, \x3, \t6
+	veor		\x3, \t0, \t4
+	veor		\x4, \x6, \t2
+	vmov		\x6, \t3
+	.endif
+	.endm
+
+	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				      t0, t1, t2, t3, t4, t5, t6, t7
+	vld1.8		{\t0-\t1}, [bskey, :256]!
+	veor		\x0, \x0, \t0
+	vld1.8		{\t2-\t3}, [bskey, :256]!
+	veor		\x1, \x1, \t1
+	vld1.8		{\t4-\t5}, [bskey, :256]!
+	veor		\x2, \x2, \t2
+	vld1.8		{\t6-\t7}, [bskey, :256]
+	sub		bskey, bskey, #224
+	veor		\x3, \x3, \t3
+	veor		\x4, \x4, \t4
+	veor		\x5, \x5, \t5
+	veor		\x6, \x6, \t6
+	veor		\x7, \x7, \t7
+	vext.8		\t0, \x0, \x0, #8
+	vext.8		\t6, \x6, \x6, #8
+	vext.8		\t7, \x7, \x7, #8
+	veor		\t0, \t0, \x0
+	vext.8		\t1, \x1, \x1, #8
+	veor		\t6, \t6, \x6
+	vext.8		\t2, \x2, \x2, #8
+	veor		\t7, \t7, \x7
+	vext.8		\t3, \x3, \x3, #8
+	veor		\t1, \t1, \x1
+	vext.8		\t4, \x4, \x4, #8
+	veor		\t2, \t2, \x2
+	vext.8		\t5, \x5, \x5, #8
+	veor		\t3, \t3, \x3
+	veor		\t4, \t4, \x4
+	veor		\t5, \t5, \x5
+	veor		\x0, \x0, \t6
+	veor		\x1, \x1, \t6
+	veor		\x2, \x2, \t0
+	veor		\x4, \x4, \t2
+	veor		\x3, \x3, \t1
+	veor		\x1, \x1, \t7
+	veor		\x2, \x2, \t7
+	veor		\x4, \x4, \t6
+	veor		\x5, \x5, \t3
+	veor		\x3, \x3, \t6
+	veor		\x6, \x6, \t4
+	veor		\x4, \x4, \t7
+	veor		\x5, \x5, \t7
+	veor		\x7, \x7, \t5
+	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+	.endm
+
+	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+	vshr.u64	\t0, \b0, #\n
+	vshr.u64	\t1, \b1, #\n
+	veor		\t0, \t0, \a0
+	veor		\t1, \t1, \a1
+	vand		\t0, \t0, \mask
+	vand		\t1, \t1, \mask
+	veor		\a0, \a0, \t0
+	vshl.s64	\t0, \t0, #\n
+	veor		\a1, \a1, \t1
+	vshl.s64	\t1, \t1, #\n
+	veor		\b0, \b0, \t0
+	veor		\b1, \b1, \t1
+	.endm
+
+	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+	vmov.i8		\t0, #0x55
+	vmov.i8		\t1, #0x33
+	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+	vmov.i8		\t0, #0x0f
+	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+	.endm
+
+	.align		4
+M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
+
+	/*
+	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
+	 */
+ENTRY(aesbs_convert_key)
+	vld1.32		{q7}, [r1]!		// load round 0 key
+	vld1.32		{q15}, [r1]!		// load round 1 key
+
+	vmov.i8		q8,  #0x01		// bit masks
+	vmov.i8		q9,  #0x02
+	vmov.i8		q10, #0x04
+	vmov.i8		q11, #0x08
+	vmov.i8		q12, #0x10
+	vmov.i8		q13, #0x20
+	__ldr		q14, M0
+
+	sub		r2, r2, #1
+	vst1.8		{q7}, [r0, :128]!	// save round 0 key
+
+.Lkey_loop:
+	__tbl		q7, q15, q14
+	vmov.i8		q6, #0x40
+	vmov.i8		q15, #0x80
+
+	vtst.8		q0, q7, q8
+	vtst.8		q1, q7, q9
+	vtst.8		q2, q7, q10
+	vtst.8		q3, q7, q11
+	vtst.8		q4, q7, q12
+	vtst.8		q5, q7, q13
+	vtst.8		q6, q7, q6
+	vtst.8		q7, q7, q15
+	vld1.32		{q15}, [r1]!		// load next round key
+	vmvn		q0, q0
+	vmvn		q1, q1
+	vmvn		q5, q5
+	vmvn		q6, q6
+
+	subs		r2, r2, #1
+	vst1.8		{q0-q1}, [r0, :256]!
+	vst1.8		{q2-q3}, [r0, :256]!
+	vst1.8		{q4-q5}, [r0, :256]!
+	vst1.8		{q6-q7}, [r0, :256]!
+	bne		.Lkey_loop
+
+	vmov.i8		q7, #0x63		// compose .L63
+	veor		q15, q15, q7
+	vst1.8		{q15}, [r0, :128]
+	bx		lr
+ENDPROC(aesbs_convert_key)
+
+	.align		4
+M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01
+
+aesbs_encrypt8:
+	vld1.8		{q9}, [bskey, :128]!	// round 0 key
+	__ldr		q8, M0SR
+
+	veor		q10, q0, q9		// xor with round0 key
+	veor		q11, q1, q9
+	__tbl		q0, q10, q8
+	veor		q12, q2, q9
+	__tbl		q1, q11, q8
+	veor		q13, q3, q9
+	__tbl		q2, q12, q8
+	veor		q14, q4, q9
+	__tbl		q3, q13, q8
+	veor		q15, q5, q9
+	__tbl		q4, q14, q8
+	veor		q10, q6, q9
+	__tbl		q5, q15, q8
+	veor		q11, q7, q9
+	__tbl		q6, q10, q8
+	__tbl		q7, q11, q8
+
+	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
+
+	sub		rounds, rounds, #1
+	b		.Lenc_sbox
+
+	.align		5
+SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
+SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d
+
+.Lenc_last:
+	__ldr		q12, SRM0
+.Lenc_loop:
+	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
+.Lenc_sbox:
+	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+	subs		rounds, rounds, #1
+	bcc		.Lenc_done
+
+	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+
+	beq		.Lenc_last
+	__ldr		q12, SR
+	b		.Lenc_loop
+
+.Lenc_done:
+	vld1.8		{q12}, [bskey, :128]	// last round key
+
+	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
+
+	veor		q0, q0, q12
+	veor		q1, q1, q12
+	veor		q4, q4, q12
+	veor		q6, q6, q12
+	veor		q3, q3, q12
+	veor		q7, q7, q12
+	veor		q2, q2, q12
+	veor		q5, q5, q12
+	bx		lr
+ENDPROC(aesbs_encrypt8)
+
+	.align		4
+M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509
+
+aesbs_decrypt8:
+	add		bskey, bskey, rounds, lsl #7
+	sub		bskey, bskey, #112
+	vld1.8		{q9}, [bskey, :128]	// round 0 key
+	sub		bskey, bskey, #128
+	__ldr		q8, M0ISR
+
+	veor		q10, q0, q9		// xor with round0 key
+	veor		q11, q1, q9
+	__tbl		q0, q10, q8
+	veor		q12, q2, q9
+	__tbl		q1, q11, q8
+	veor		q13, q3, q9
+	__tbl		q2, q12, q8
+	veor		q14, q4, q9
+	__tbl		q3, q13, q8
+	veor		q15, q5, q9
+	__tbl		q4, q14, q8
+	veor		q10, q6, q9
+	__tbl		q5, q15, q8
+	veor		q11, q7, q9
+	__tbl		q6, q10, q8
+	__tbl		q7, q11, q8
+
+	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
+
+	sub		rounds, rounds, #1
+	b		.Ldec_sbox
+
+	.align		5
+ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
+ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d
+
+.Ldec_last:
+	__ldr		q12, ISRM0
+.Ldec_loop:
+	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
+.Ldec_sbox:
+	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+	subs		rounds, rounds, #1
+	bcc		.Ldec_done
+
+	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+
+	beq		.Ldec_last
+	__ldr		q12, ISR
+	b		.Ldec_loop
+
+.Ldec_done:
+	add		bskey, bskey, #112
+	vld1.8		{q12}, [bskey, :128]	// last round key
+
+	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
+
+	veor		q0, q0, q12
+	veor		q1, q1, q12
+	veor		q6, q6, q12
+	veor		q4, q4, q12
+	veor		q2, q2, q12
+	veor		q7, q7, q12
+	veor		q3, q3, q12
+	veor		q5, q5, q12
+	bx		lr
+ENDPROC(aesbs_decrypt8)
+
+	/*
+	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 */
+	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	push		{r4-r6, lr}
+	ldr		r5, [sp, #16]		// number of blocks
+
+99:	__adr		ip, 0f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q0}, [r1]!
+	vld1.8		{q1}, [r1]!
+	vld1.8		{q2}, [r1]!
+	vld1.8		{q3}, [r1]!
+	vld1.8		{q4}, [r1]!
+	vld1.8		{q5}, [r1]!
+	vld1.8		{q6}, [r1]!
+	vld1.8		{q7}, [r1]!
+
+0:	mov		bskey, r2
+	mov		rounds, r3
+	bl		\do8
+
+	__adr		ip, 1f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vst1.8		{\o0}, [r0]!
+	vst1.8		{\o1}, [r0]!
+	vst1.8		{\o2}, [r0]!
+	vst1.8		{\o3}, [r0]!
+	vst1.8		{\o4}, [r0]!
+	vst1.8		{\o5}, [r0]!
+	vst1.8		{\o6}, [r0]!
+	vst1.8		{\o7}, [r0]!
+
+1:	subs		r5, r5, #8
+	bgt		99b
+
+	pop		{r4-r6, pc}
+	.endm
+
+	.align		4
+ENTRY(aesbs_ecb_encrypt)
+	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
+ENDPROC(aesbs_ecb_encrypt)
+
+	.align		4
+ENTRY(aesbs_ecb_decrypt)
+	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
+ENDPROC(aesbs_ecb_decrypt)
+
+	/*
+	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 iv[])
+	 */
+	.align		4
+ENTRY(aesbs_cbc_decrypt)
+	mov		ip, sp
+	push		{r4-r6, lr}
+	ldm		ip, {r5-r6}		// load args 4-5
+
+99:	__adr		ip, 0f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	mov		lr, r1
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q0}, [lr]!
+	vld1.8		{q1}, [lr]!
+	vld1.8		{q2}, [lr]!
+	vld1.8		{q3}, [lr]!
+	vld1.8		{q4}, [lr]!
+	vld1.8		{q5}, [lr]!
+	vld1.8		{q6}, [lr]!
+	vld1.8		{q7}, [lr]
+
+0:	mov		bskey, r2
+	mov		rounds, r3
+	bl		aesbs_decrypt8
+
+	vld1.8		{q8}, [r6]
+	vmov		q9, q8
+	vmov		q10, q8
+	vmov		q11, q8
+	vmov		q12, q8
+	vmov		q13, q8
+	vmov		q14, q8
+	vmov		q15, q8
+
+	__adr		ip, 1f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q9}, [r1]!
+	vld1.8		{q10}, [r1]!
+	vld1.8		{q11}, [r1]!
+	vld1.8		{q12}, [r1]!
+	vld1.8		{q13}, [r1]!
+	vld1.8		{q14}, [r1]!
+	vld1.8		{q15}, [r1]!
+	W(nop)
+
+1:	__adr		ip, 2f
+	sub		ip, ip, lr, lsl #3
+	bxlt		ip			// computed goto if blocks < 8
+
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r0]!
+	veor		q1, q1, q9
+	vst1.8		{q1}, [r0]!
+	veor		q6, q6, q10
+	vst1.8		{q6}, [r0]!
+	veor		q4, q4, q11
+	vst1.8		{q4}, [r0]!
+	veor		q2, q2, q12
+	vst1.8		{q2}, [r0]!
+	veor		q7, q7, q13
+	vst1.8		{q7}, [r0]!
+	veor		q3, q3, q14
+	vst1.8		{q3}, [r0]!
+	veor		q5, q5, q15
+	vld1.8		{q8}, [r1]!		// load next round's iv
+2:	vst1.8		{q5}, [r0]!
+
+	subs		r5, r5, #8
+	vst1.8		{q8}, [r6]		// store next round's iv
+	bgt		99b
+
+	pop		{r4-r6, pc}
+ENDPROC(aesbs_cbc_decrypt)
+
+	.macro		next_ctr, q
+	vmov.32		\q\()h[1], r10
+	adds		r10, r10, #1
+	vmov.32		\q\()h[0], r9
+	adcs		r9, r9, #0
+	vmov.32		\q\()l[1], r8
+	adcs		r8, r8, #0
+	vmov.32		\q\()l[0], r7
+	adc		r7, r7, #0
+	vrev32.8	\q, \q
+	.endm
+
+	/*
+	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
+	 */
+ENTRY(aesbs_ctr_encrypt)
+	mov		ip, sp
+	push		{r4-r10, lr}
+
+	ldm		ip, {r5-r7}		// load args 4-6
+	teq		r7, #0
+	addne		r5, r5, #1		// one extra block if final != 0
+
+	vld1.8		{q0}, [r6]		// load counter
+	vrev32.8	q1, q0
+	vmov		r9, r10, d3
+	vmov		r7, r8, d2
+
+	adds		r10, r10, #1
+	adcs		r9, r9, #0
+	adcs		r8, r8, #0
+	adc		r7, r7, #0
+
+99:	vmov		q1, q0
+	vmov		q2, q0
+	vmov		q3, q0
+	vmov		q4, q0
+	vmov		q5, q0
+	vmov		q6, q0
+	vmov		q7, q0
+
+	__adr		ip, 0f
+	sub		lr, r5, #1
+	and		lr, lr, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #5
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	next_ctr	q1
+	next_ctr	q2
+	next_ctr	q3
+	next_ctr	q4
+	next_ctr	q5
+	next_ctr	q6
+	next_ctr	q7
+
+0:	mov		bskey, r2
+	mov		rounds, r3
+	bl		aesbs_encrypt8
+
+	__adr		ip, 1f
+	and		lr, r5, #7
+	cmp		r5, #8
+	movgt		r4, #0
+	ldrle		r4, [sp, #40]		// load final in the last round
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q8}, [r1]!
+	vld1.8		{q9}, [r1]!
+	vld1.8		{q10}, [r1]!
+	vld1.8		{q11}, [r1]!
+	vld1.8		{q12}, [r1]!
+	vld1.8		{q13}, [r1]!
+	vld1.8		{q14}, [r1]!
+	teq		r4, #0			// skip last block if 'final'
+1:	bne		2f
+	vld1.8		{q15}, [r1]!
+
+2:	__adr		ip, 3f
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #3
+	bxlt		ip			// computed goto if blocks < 8
+
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r0]!
+	veor		q1, q1, q9
+	vst1.8		{q1}, [r0]!
+	veor		q4, q4, q10
+	vst1.8		{q4}, [r0]!
+	veor		q6, q6, q11
+	vst1.8		{q6}, [r0]!
+	veor		q3, q3, q12
+	vst1.8		{q3}, [r0]!
+	veor		q7, q7, q13
+	vst1.8		{q7}, [r0]!
+	veor		q2, q2, q14
+	vst1.8		{q2}, [r0]!
+	teq		r4, #0			// skip last block if 'final'
+	W(bne)		5f
+3:	veor		q5, q5, q15
+	vst1.8		{q5}, [r0]!
+
+4:	next_ctr	q0
+
+	subs		r5, r5, #8
+	bgt		99b
+
+	vst1.8		{q0}, [r6]
+	pop		{r4-r10, pc}
+
+5:	vst1.8		{q5}, [r4]
+	b		4b
+ENDPROC(aesbs_ctr_encrypt)
+
+	.macro		next_tweak, out, in, const, tmp
+	vshr.s64	\tmp, \in, #63
+	vand		\tmp, \tmp, \const
+	vadd.u64	\out, \in, \in
+	vext.8		\tmp, \tmp, \tmp, #8
+	veor		\out, \out, \tmp
+	.endm
+
+	.align		4
+.Lxts_mul_x:
+	.quad		1, 0x87
+
+	/*
+	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+__xts_prepare8:
+	vld1.8		{q14}, [r7]		// load iv
+	__ldr		q15, .Lxts_mul_x	// load tweak mask
+	vmov		q12, q14
+
+	__adr		ip, 0f
+	and		r4, r6, #7
+	cmp		r6, #8
+	sub		ip, ip, r4, lsl #5
+	mov		r4, sp
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q0}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q0, q0, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q1}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q1, q1, q12
+	vst1.8		{q12}, [r4, :128]!
+
+	vld1.8		{q2}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q2, q2, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q3}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q3, q3, q12
+	vst1.8		{q12}, [r4, :128]!
+
+	vld1.8		{q4}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q4, q4, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q5}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q5, q5, q12
+	vst1.8		{q12}, [r4, :128]!
+
+	vld1.8		{q6}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q6, q6, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q7}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q7, q7, q12
+	vst1.8		{q12}, [r4, :128]
+
+0:	vst1.8		{q14}, [r7]		// store next iv
+	bx		lr
+ENDPROC(__xts_prepare8)
+
+	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	push		{r4-r8, lr}
+	mov		r5, sp			// preserve sp
+	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
+	sub		ip, sp, #128		// make room for 8x tweak
+	bic		ip, ip, #0xf		// align sp to 16 bytes
+	mov		sp, ip
+
+99:	bl		__xts_prepare8
+
+	mov		bskey, r2
+	mov		rounds, r3
+	bl		\do8
+
+	__adr		ip, 0f
+	and		lr, r6, #7
+	cmp		r6, #8
+	sub		ip, ip, lr, lsl #2
+	mov		r4, sp
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q8}, [r4, :128]!
+	vld1.8		{q9}, [r4, :128]!
+	vld1.8		{q10}, [r4, :128]!
+	vld1.8		{q11}, [r4, :128]!
+	vld1.8		{q12}, [r4, :128]!
+	vld1.8		{q13}, [r4, :128]!
+	vld1.8		{q14}, [r4, :128]!
+	vld1.8		{q15}, [r4, :128]
+
+0:	__adr		ip, 1f
+	sub		ip, ip, lr, lsl #3
+	bxlt		ip			// computed goto if blocks < 8
+
+	veor		\o0, \o0, q8
+	vst1.8		{\o0}, [r0]!
+	veor		\o1, \o1, q9
+	vst1.8		{\o1}, [r0]!
+	veor		\o2, \o2, q10
+	vst1.8		{\o2}, [r0]!
+	veor		\o3, \o3, q11
+	vst1.8		{\o3}, [r0]!
+	veor		\o4, \o4, q12
+	vst1.8		{\o4}, [r0]!
+	veor		\o5, \o5, q13
+	vst1.8		{\o5}, [r0]!
+	veor		\o6, \o6, q14
+	vst1.8		{\o6}, [r0]!
+	veor		\o7, \o7, q15
+	vst1.8		{\o7}, [r0]!
+
+1:	subs		r6, r6, #8
+	bgt		99b
+
+	mov		sp, r5
+	pop		{r4-r8, pc}
+	.endm
+
+ENTRY(aesbs_xts_encrypt)
+	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
+ENDPROC(aesbs_xts_decrypt)
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
new file mode 100644
index 000000000000..2920b96dbd36
--- /dev/null
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -0,0 +1,406 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/cbc.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+
+asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 ctr[], u8 final[]);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
+				  u8 out[]);
+
+struct aesbs_ctx {
+	int	rounds;
+	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
+};
+
+struct aesbs_cbc_ctx {
+	struct aesbs_ctx	key;
+	u32			enc[AES_MAX_KEYLENGTH_U32];
+};
+
+struct aesbs_xts_ctx {
+	struct aesbs_ctx	key;
+	u32			twkey[AES_MAX_KEYLENGTH_U32];
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->rounds = 6 + key_len / 4;
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int __ecb_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
+		   ctx->rounds, blocks);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->key.rounds = 6 + key_len / 4;
+
+	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+	return crypto_cbc_encrypt_walk(req, cbc_encrypt_one);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->key.rk, ctx->key.rounds, blocks,
+				  walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u8 buf[AES_BLOCK_SIZE];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+		u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+
+		if (walk.nbytes < walk.total) {
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+			final = NULL;
+		}
+
+		aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->rk, ctx->rounds, blocks, walk.iv, final);
+
+		if (final) {
+			u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+			u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+			if (dst != src)
+				memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+			crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+
+			err = skcipher_walk_done(&walk, 0);
+			break;
+		}
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = xts_verify_key(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	key_len /= 2;
+	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	if (err)
+		return err;
+
+	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
+
+	return aesbs_setkey(tfm, in_key, key_len);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
+		   ctx->key.rounds, blocks, walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static struct skcipher_alg aes_algs[] = { {
+	.base.cra_name		= "__ecb(aes)",
+	.base.cra_driver_name	= "__ecb-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ecb_encrypt,
+	.decrypt		= ecb_decrypt,
+}, {
+	.base.cra_name		= "__cbc(aes)",
+	.base.cra_driver_name	= "__cbc-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_cbc_setkey,
+	.encrypt		= cbc_encrypt,
+	.decrypt		= cbc_decrypt,
+}, {
+	.base.cra_name		= "__ctr(aes)",
+	.base.cra_driver_name	= "__ctr-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+}, {
+	.base.cra_name		= "__xts(aes)",
+	.base.cra_driver_name	= "__xts-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
+	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_xts_setkey,
+	.encrypt		= xts_encrypt,
+	.decrypt		= xts_decrypt,
+} };
+
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+static void aes_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+		if (aes_simd_algs[i])
+			simd_skcipher_free(aes_simd_algs[i]);
+
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static int __init aes_init(void)
+{
+	struct simd_skcipher_alg *simd;
+	const char *basename;
+	const char *algname;
+	const char *drvname;
+	int err;
+	int i;
+
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
+		algname = aes_algs[i].base.cra_name + 2;
+		drvname = aes_algs[i].base.cra_driver_name + 2;
+		basename = aes_algs[i].base.cra_driver_name;
+		simd = simd_skcipher_create_compat(algname, drvname, basename);
+		err = PTR_ERR(simd);
+		if (IS_ERR(simd))
+			goto unregister_simds;
+
+		aes_simd_algs[i] = simd;
+	}
+	return 0;
+
+unregister_simds:
+	aes_exit();
+	return err;
+}
+
+module_init(aes_init);
+module_exit(aes_exit);
diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c
deleted file mode 100644
index 0409b8f89782..000000000000
--- a/arch/arm/crypto/aes_glue.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Glue Code for the asm optimized version of the AES Cipher Algorithm
- */
-
-#include <linux/module.h>
-#include <linux/crypto.h>
-#include <crypto/aes.h>
-
-#include "aes_glue.h"
-
-EXPORT_SYMBOL(AES_encrypt);
-EXPORT_SYMBOL(AES_decrypt);
-EXPORT_SYMBOL(private_AES_set_encrypt_key);
-EXPORT_SYMBOL(private_AES_set_decrypt_key);
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-	AES_encrypt(src, dst, &ctx->enc_key);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-	AES_decrypt(src, dst, &ctx->dec_key);
-}
-
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
-		unsigned int key_len)
-{
-	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-
-	switch (key_len) {
-	case AES_KEYSIZE_128:
-		key_len = 128;
-		break;
-	case AES_KEYSIZE_192:
-		key_len = 192;
-		break;
-	case AES_KEYSIZE_256:
-		key_len = 256;
-		break;
-	default:
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-
-	if (private_AES_set_encrypt_key(in_key, key_len, &ctx->enc_key) == -1) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-	/* private_AES_set_decrypt_key expects an encryption key as input */
-	ctx->dec_key = ctx->enc_key;
-	if (private_AES_set_decrypt_key(in_key, key_len, &ctx->dec_key) == -1) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static struct crypto_alg aes_alg = {
-	.cra_name		= "aes",
-	.cra_driver_name	= "aes-asm",
-	.cra_priority		= 200,
-	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct AES_CTX),
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(aes_alg.cra_list),
-	.cra_u	= {
-		.cipher	= {
-			.cia_min_keysize	= AES_MIN_KEY_SIZE,
-			.cia_max_keysize	= AES_MAX_KEY_SIZE,
-			.cia_setkey		= aes_set_key,
-			.cia_encrypt		= aes_encrypt,
-			.cia_decrypt		= aes_decrypt
-		}
-	}
-};
-
-static int __init aes_init(void)
-{
-	return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
-	crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm (ASM)");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("aes");
-MODULE_ALIAS_CRYPTO("aes-asm");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h
deleted file mode 100644
index cca3e51eb606..000000000000
--- a/arch/arm/crypto/aes_glue.h
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#define AES_MAXNR 14
-
-struct AES_KEY {
-	unsigned int rd_key[4 * (AES_MAXNR + 1)];
-	int rounds;
-};
-
-struct AES_CTX {
-	struct AES_KEY enc_key;
-	struct AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
-					   const int bits, struct AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
-					   const int bits, struct AES_KEY *key);
diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped
deleted file mode 100644
index 1d1800f71c5b..000000000000
--- a/arch/arm/crypto/aesbs-core.S_shipped
+++ /dev/null
@@ -1,2548 +0,0 @@
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
-@ granted.
-@ ====================================================================
-
-@ Bit-sliced AES for ARM NEON
-@
-@ February 2012.
-@
-@ This implementation is direct adaptation of bsaes-x86_64 module for
-@ ARM NEON. Except that this module is endian-neutral [in sense that
-@ it can be compiled for either endianness] by courtesy of vld1.8's
-@ neutrality. Initial version doesn't implement interface to OpenSSL,
-@ only low-level primitives and unsupported entry points, just enough
-@ to collect performance results, which for Cortex-A8 core are:
-@
-@ encrypt	19.5 cycles per byte processed with 128-bit key
-@ decrypt	22.1 cycles per byte processed with 128-bit key
-@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
-@
-@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
-@ which is [much] worse than anticipated (for further details see
-@ http://www.openssl.org/~appro/Snapdragon-S4.html).
-@
-@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
-@ manages in 20.0 cycles].
-@
-@ When comparing to x86_64 results keep in mind that NEON unit is
-@ [mostly] single-issue and thus can't [fully] benefit from
-@ instruction-level parallelism. And when comparing to aes-armv4
-@ results keep in mind key schedule conversion overhead (see
-@ bsaes-x86_64.pl for further details)...
-@
-@						<appro@openssl.org>
-
-@ April-August 2013
-@
-@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
-@
-@					<ard.biesheuvel@linaro.org>
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-
-# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
-# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
-# define VFP_ABI_FRAME	0x40
-#else
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-# define VFP_ABI_FRAME	0
-# define BSAES_ASM_EXTENDED_KEY
-# define XTS_CHAIN_TWEAK
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-#ifdef __thumb__
-# define adrl adr
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-.arch	armv7-a
-.fpu	neon
-
-.text
-.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
-#ifdef __thumb2__
-.thumb
-#else
-.code   32
-#endif
-
-.type	_bsaes_decrypt8,%function
-.align	4
-_bsaes_decrypt8:
-	adr	r6,_bsaes_decrypt8
-	vldmia	r4!, {q9}		@ round 0 key
-	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
-
-	vldmia	r6!, {q8}		@ .LM0ISR
-	veor	q10, q0, q9	@ xor with round0 key
-	veor	q11, q1, q9
-	 vtbl.8	d0, {q10}, d16
-	 vtbl.8	d1, {q10}, d17
-	veor	q12, q2, q9
-	 vtbl.8	d2, {q11}, d16
-	 vtbl.8	d3, {q11}, d17
-	veor	q13, q3, q9
-	 vtbl.8	d4, {q12}, d16
-	 vtbl.8	d5, {q12}, d17
-	veor	q14, q4, q9
-	 vtbl.8	d6, {q13}, d16
-	 vtbl.8	d7, {q13}, d17
-	veor	q15, q5, q9
-	 vtbl.8	d8, {q14}, d16
-	 vtbl.8	d9, {q14}, d17
-	veor	q10, q6, q9
-	 vtbl.8	d10, {q15}, d16
-	 vtbl.8	d11, {q15}, d17
-	veor	q11, q7, q9
-	 vtbl.8	d12, {q10}, d16
-	 vtbl.8	d13, {q10}, d17
-	 vtbl.8	d14, {q11}, d16
-	 vtbl.8	d15, {q11}, d17
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q6, #1
-	 vshr.u64	q11, q4, #1
-	veor		q10, q10, q7
-	 veor		q11, q11, q5
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #1
-	 veor		q5, q5, q11
-	 vshl.u64	q11, q11, #1
-	veor		q6, q6, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q2, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q3
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q2, q2, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q5, #2
-	 vshr.u64	q11, q4, #2
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #2
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #2
-	veor		q5, q5, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q3
-	 veor		q11, q11, q2
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #2
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q3, #4
-	 vshr.u64	q11, q2, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #4
-	veor		q3, q3, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q4
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q4, q4, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	sub	r5,r5,#1
-	b	.Ldec_sbox
-.align	4
-.Ldec_loop:
-	vldmia	r4!, {q8-q11}
-	veor	q8, q8, q0
-	veor	q9, q9, q1
-	vtbl.8	d0, {q8}, d24
-	vtbl.8	d1, {q8}, d25
-	vldmia	r4!, {q8}
-	veor	q10, q10, q2
-	vtbl.8	d2, {q9}, d24
-	vtbl.8	d3, {q9}, d25
-	vldmia	r4!, {q9}
-	veor	q11, q11, q3
-	vtbl.8	d4, {q10}, d24
-	vtbl.8	d5, {q10}, d25
-	vldmia	r4!, {q10}
-	vtbl.8	d6, {q11}, d24
-	vtbl.8	d7, {q11}, d25
-	vldmia	r4!, {q11}
-	veor	q8, q8, q4
-	veor	q9, q9, q5
-	vtbl.8	d8, {q8}, d24
-	vtbl.8	d9, {q8}, d25
-	veor	q10, q10, q6
-	vtbl.8	d10, {q9}, d24
-	vtbl.8	d11, {q9}, d25
-	veor	q11, q11, q7
-	vtbl.8	d12, {q10}, d24
-	vtbl.8	d13, {q10}, d25
-	vtbl.8	d14, {q11}, d24
-	vtbl.8	d15, {q11}, d25
-.Ldec_sbox:
-	 veor	q1, q1, q4
-	veor	q3, q3, q4
-
-	veor	q4, q4, q7
-	 veor	q1, q1, q6
-	veor	q2, q2, q7
-	veor	q6, q6, q4
-
-	veor	q0, q0, q1
-	veor	q2, q2, q5
-	 veor	q7, q7, q6
-	veor	q3, q3, q0
-	veor	q5, q5, q0
-	veor	q1, q1, q3
-	veor	q11, q3, q0
-	veor	q10, q7, q4
-	veor	q9, q1, q6
-	veor	q13, q4, q0
-	 vmov	q8, q10
-	veor	q12, q5, q2
-
-	vorr	q10, q10, q9
-	veor	q15, q11, q8
-	vand	q14, q11, q12
-	vorr	q11, q11, q12
-	veor	q12, q12, q9
-	vand	q8, q8, q9
-	veor	q9, q6, q2
-	vand	q15, q15, q12
-	vand	q13, q13, q9
-	veor	q9, q3, q7
-	veor	q12, q1, q5
-	veor	q11, q11, q13
-	veor	q10, q10, q13
-	vand	q13, q9, q12
-	vorr	q9, q9, q12
-	veor	q11, q11, q15
-	veor	q8, q8, q13
-	veor	q10, q10, q14
-	veor	q9, q9, q15
-	veor	q8, q8, q14
-	vand	q12, q4, q6
-	veor	q9, q9, q14
-	vand	q13, q0, q2
-	vand	q14, q7, q1
-	vorr	q15, q3, q5
-	veor	q11, q11, q12
-	veor	q9, q9, q14
-	veor	q8, q8, q15
-	veor	q10, q10, q13
-
-	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
-
-	@ new smaller inversion
-
-	vand	q14, q11, q9
-	vmov	q12, q8
-
-	veor	q13, q10, q14
-	veor	q15, q8, q14
-	veor	q14, q8, q14	@ q14=q15
-
-	vbsl	q13, q9, q8
-	vbsl	q15, q11, q10
-	veor	q11, q11, q10
-
-	vbsl	q12, q13, q14
-	vbsl	q8, q14, q13
-
-	vand	q14, q12, q15
-	veor	q9, q9, q8
-
-	veor	q14, q14, q11
-	veor	q12, q5, q2
-	veor	q8, q1, q6
-	veor 	q10, q15, q14
-	vand	q10, q10, q5
-	veor	q5, q5, q1
-	vand	q11, q1, q15
-	vand	q5, q5, q14
-	veor	q1, q11, q10
-	veor	q5, q5, q11
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q2
-	veor	q12, q12, q8
-	 veor	q2, q2, q6
-	vand	q8, q8, q15
-	 vand	q6, q6, q13
-	vand	q12, q12, q14
-	 vand	q2, q2, q9
-	veor	q8, q8, q12
-	 veor	q2, q2, q6
-	veor	q12, q12, q11
-	 veor	q6, q6, q10
-	veor	q5, q5, q12
-	veor	q2, q2, q12
-	veor	q1, q1, q8
-	veor	q6, q6, q8
-
-	veor	q12, q3, q0
-	veor	q8, q7, q4
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q0
-	veor	q12, q12, q8
-	 veor	q0, q0, q4
-	vand	q8, q8, q15
-	 vand	q4, q4, q13
-	vand	q12, q12, q14
-	 vand	q0, q0, q9
-	veor	q8, q8, q12
-	 veor	q0, q0, q4
-	veor	q12, q12, q11
-	 veor	q4, q4, q10
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor 	q10, q15, q14
-	vand	q10, q10, q3
-	veor	q3, q3, q7
-	vand	q11, q7, q15
-	vand	q3, q3, q14
-	veor	q7, q11, q10
-	veor	q3, q3, q11
-	veor	q3, q3, q12
-	veor	q0, q0, q12
-	veor	q7, q7, q8
-	veor	q4, q4, q8
-	veor	q1, q1, q7
-	veor	q6, q6, q5
-
-	veor	q4, q4, q1
-	veor	q2, q2, q7
-	veor	q5, q5, q7
-	veor	q4, q4, q2
-	 veor 	q7, q7, q0
-	veor	q4, q4, q5
-	 veor	q3, q3, q6
-	 veor	q6, q6, q1
-	veor	q3, q3, q4
-
-	veor	q4, q4, q0
-	veor	q7, q7, q3
-	subs	r5,r5,#1
-	bcc	.Ldec_done
-	@ multiplication by 0x05-0x00-0x04-0x00
-	vext.8	q8, q0, q0, #8
-	vext.8	q14, q3, q3, #8
-	vext.8	q15, q5, q5, #8
-	veor	q8, q8, q0
-	vext.8	q9, q1, q1, #8
-	veor	q14, q14, q3
-	vext.8	q10, q6, q6, #8
-	veor	q15, q15, q5
-	vext.8	q11, q4, q4, #8
-	veor	q9, q9, q1
-	vext.8	q12, q2, q2, #8
-	veor	q10, q10, q6
-	vext.8	q13, q7, q7, #8
-	veor	q11, q11, q4
-	veor	q12, q12, q2
-	veor	q13, q13, q7
-
-	 veor	q0, q0, q14
-	 veor	q1, q1, q14
-	 veor	q6, q6, q8
-	 veor	q2, q2, q10
-	 veor	q4, q4, q9
-	 veor	q1, q1, q15
-	 veor	q6, q6, q15
-	 veor	q2, q2, q14
-	 veor	q7, q7, q11
-	 veor	q4, q4, q14
-	 veor	q3, q3, q12
-	 veor	q2, q2, q15
-	 veor	q7, q7, q15
-	 veor	q5, q5, q13
-	vext.8	q8, q0, q0, #12	@ x0 <<< 32
-	vext.8	q9, q1, q1, #12
-	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
-	vext.8	q10, q6, q6, #12
-	 veor	q1, q1, q9
-	vext.8	q11, q4, q4, #12
-	 veor	q6, q6, q10
-	vext.8	q12, q2, q2, #12
-	 veor	q4, q4, q11
-	vext.8	q13, q7, q7, #12
-	 veor	q2, q2, q12
-	vext.8	q14, q3, q3, #12
-	 veor	q7, q7, q13
-	vext.8	q15, q5, q5, #12
-	 veor	q3, q3, q14
-
-	veor	q9, q9, q0
-	 veor	q5, q5, q15
-	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
-	veor	q10, q10, q1
-	veor	q8, q8, q5
-	veor	q9, q9, q5
-	 vext.8	q1, q1, q1, #8
-	veor	q13, q13, q2
-	 veor	q0, q0, q8
-	veor	q14, q14, q7
-	 veor	q1, q1, q9
-	 vext.8	q8, q2, q2, #8
-	veor	q12, q12, q4
-	 vext.8	q9, q7, q7, #8
-	veor	q15, q15, q3
-	 vext.8	q2, q4, q4, #8
-	veor	q11, q11, q6
-	 vext.8	q7, q5, q5, #8
-	veor	q12, q12, q5
-	 vext.8	q4, q3, q3, #8
-	veor	q11, q11, q5
-	 vext.8	q3, q6, q6, #8
-	veor	q5, q9, q13
-	veor	q11, q11, q2
-	veor	q7, q7, q15
-	veor	q6, q4, q14
-	veor	q4, q8, q12
-	veor	q2, q3, q10
-	vmov	q3, q11
-	 @ vmov	q5, q9
-	vldmia	r6, {q12}		@ .LISR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	r6,r6,#0x10
-	bne	.Ldec_loop
-	vldmia	r6, {q12}		@ .LISRM0
-	b	.Ldec_loop
-.align	4
-.Ldec_done:
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q3, #1
-	 vshr.u64	q11, q2, #1
-	veor		q10, q10, q5
-	 veor		q11, q11, q7
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #1
-	 veor		q7, q7, q11
-	 vshl.u64	q11, q11, #1
-	veor		q3, q3, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q6, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q4
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q4, q4, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q6, q6, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q7, #2
-	 vshr.u64	q11, q2, #2
-	veor		q10, q10, q5
-	 veor		q11, q11, q3
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #2
-	 veor		q3, q3, q11
-	 vshl.u64	q11, q11, #2
-	veor		q7, q7, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q4
-	 veor		q11, q11, q6
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q4, q4, q10
-	vshl.u64	q10, q10, #2
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q4, #4
-	 vshr.u64	q11, q6, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q3
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q3, q3, q11
-	 vshl.u64	q11, q11, #4
-	veor		q4, q4, q10
-	 veor		q6, q6, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q2
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vldmia	r4, {q8}			@ last round key
-	veor	q6, q6, q8
-	veor	q4, q4, q8
-	veor	q2, q2, q8
-	veor	q7, q7, q8
-	veor	q3, q3, q8
-	veor	q5, q5, q8
-	veor	q0, q0, q8
-	veor	q1, q1, q8
-	bx	lr
-.size	_bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type	_bsaes_const,%object
-.align	6
-_bsaes_const:
-.LM0ISR:	@ InvShiftRows constants
-	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
-.LM0SR:		@ ShiftRows constants
-	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0:
-	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
-.LREVM0SR:
-	.quad	0x090d01050c000408, 0x03070b0f060a0e02
-.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
-.align	6
-.size	_bsaes_const,.-_bsaes_const
-
-.type	_bsaes_encrypt8,%function
-.align	4
-_bsaes_encrypt8:
-	adr	r6,_bsaes_encrypt8
-	vldmia	r4!, {q9}		@ round 0 key
-	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
-
-	vldmia	r6!, {q8}		@ .LM0SR
-_bsaes_encrypt8_alt:
-	veor	q10, q0, q9	@ xor with round0 key
-	veor	q11, q1, q9
-	 vtbl.8	d0, {q10}, d16
-	 vtbl.8	d1, {q10}, d17
-	veor	q12, q2, q9
-	 vtbl.8	d2, {q11}, d16
-	 vtbl.8	d3, {q11}, d17
-	veor	q13, q3, q9
-	 vtbl.8	d4, {q12}, d16
-	 vtbl.8	d5, {q12}, d17
-	veor	q14, q4, q9
-	 vtbl.8	d6, {q13}, d16
-	 vtbl.8	d7, {q13}, d17
-	veor	q15, q5, q9
-	 vtbl.8	d8, {q14}, d16
-	 vtbl.8	d9, {q14}, d17
-	veor	q10, q6, q9
-	 vtbl.8	d10, {q15}, d16
-	 vtbl.8	d11, {q15}, d17
-	veor	q11, q7, q9
-	 vtbl.8	d12, {q10}, d16
-	 vtbl.8	d13, {q10}, d17
-	 vtbl.8	d14, {q11}, d16
-	 vtbl.8	d15, {q11}, d17
-_bsaes_encrypt8_bitslice:
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q6, #1
-	 vshr.u64	q11, q4, #1
-	veor		q10, q10, q7
-	 veor		q11, q11, q5
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #1
-	 veor		q5, q5, q11
-	 vshl.u64	q11, q11, #1
-	veor		q6, q6, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q2, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q3
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q2, q2, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q5, #2
-	 vshr.u64	q11, q4, #2
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #2
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #2
-	veor		q5, q5, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q3
-	 veor		q11, q11, q2
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #2
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q3, #4
-	 vshr.u64	q11, q2, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #4
-	veor		q3, q3, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q4
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q4, q4, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	sub	r5,r5,#1
-	b	.Lenc_sbox
-.align	4
-.Lenc_loop:
-	vldmia	r4!, {q8-q11}
-	veor	q8, q8, q0
-	veor	q9, q9, q1
-	vtbl.8	d0, {q8}, d24
-	vtbl.8	d1, {q8}, d25
-	vldmia	r4!, {q8}
-	veor	q10, q10, q2
-	vtbl.8	d2, {q9}, d24
-	vtbl.8	d3, {q9}, d25
-	vldmia	r4!, {q9}
-	veor	q11, q11, q3
-	vtbl.8	d4, {q10}, d24
-	vtbl.8	d5, {q10}, d25
-	vldmia	r4!, {q10}
-	vtbl.8	d6, {q11}, d24
-	vtbl.8	d7, {q11}, d25
-	vldmia	r4!, {q11}
-	veor	q8, q8, q4
-	veor	q9, q9, q5
-	vtbl.8	d8, {q8}, d24
-	vtbl.8	d9, {q8}, d25
-	veor	q10, q10, q6
-	vtbl.8	d10, {q9}, d24
-	vtbl.8	d11, {q9}, d25
-	veor	q11, q11, q7
-	vtbl.8	d12, {q10}, d24
-	vtbl.8	d13, {q10}, d25
-	vtbl.8	d14, {q11}, d24
-	vtbl.8	d15, {q11}, d25
-.Lenc_sbox:
-	veor	q2, q2, q1
-	veor	q5, q5, q6
-	veor	q3, q3, q0
-	veor	q6, q6, q2
-	veor	q5, q5, q0
-
-	veor	q6, q6, q3
-	veor	q3, q3, q7
-	veor	q7, q7, q5
-	veor	q3, q3, q4
-	veor	q4, q4, q5
-
-	veor	q2, q2, q7
-	veor	q3, q3, q1
-	veor	q1, q1, q5
-	veor	q11, q7, q4
-	veor	q10, q1, q2
-	veor	q9, q5, q3
-	veor	q13, q2, q4
-	 vmov	q8, q10
-	veor	q12, q6, q0
-
-	vorr	q10, q10, q9
-	veor	q15, q11, q8
-	vand	q14, q11, q12
-	vorr	q11, q11, q12
-	veor	q12, q12, q9
-	vand	q8, q8, q9
-	veor	q9, q3, q0
-	vand	q15, q15, q12
-	vand	q13, q13, q9
-	veor	q9, q7, q1
-	veor	q12, q5, q6
-	veor	q11, q11, q13
-	veor	q10, q10, q13
-	vand	q13, q9, q12
-	vorr	q9, q9, q12
-	veor	q11, q11, q15
-	veor	q8, q8, q13
-	veor	q10, q10, q14
-	veor	q9, q9, q15
-	veor	q8, q8, q14
-	vand	q12, q2, q3
-	veor	q9, q9, q14
-	vand	q13, q4, q0
-	vand	q14, q1, q5
-	vorr	q15, q7, q6
-	veor	q11, q11, q12
-	veor	q9, q9, q14
-	veor	q8, q8, q15
-	veor	q10, q10, q13
-
-	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
-
-	@ new smaller inversion
-
-	vand	q14, q11, q9
-	vmov	q12, q8
-
-	veor	q13, q10, q14
-	veor	q15, q8, q14
-	veor	q14, q8, q14	@ q14=q15
-
-	vbsl	q13, q9, q8
-	vbsl	q15, q11, q10
-	veor	q11, q11, q10
-
-	vbsl	q12, q13, q14
-	vbsl	q8, q14, q13
-
-	vand	q14, q12, q15
-	veor	q9, q9, q8
-
-	veor	q14, q14, q11
-	veor	q12, q6, q0
-	veor	q8, q5, q3
-	veor 	q10, q15, q14
-	vand	q10, q10, q6
-	veor	q6, q6, q5
-	vand	q11, q5, q15
-	vand	q6, q6, q14
-	veor	q5, q11, q10
-	veor	q6, q6, q11
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q0
-	veor	q12, q12, q8
-	 veor	q0, q0, q3
-	vand	q8, q8, q15
-	 vand	q3, q3, q13
-	vand	q12, q12, q14
-	 vand	q0, q0, q9
-	veor	q8, q8, q12
-	 veor	q0, q0, q3
-	veor	q12, q12, q11
-	 veor	q3, q3, q10
-	veor	q6, q6, q12
-	veor	q0, q0, q12
-	veor	q5, q5, q8
-	veor	q3, q3, q8
-
-	veor	q12, q7, q4
-	veor	q8, q1, q2
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q4
-	veor	q12, q12, q8
-	 veor	q4, q4, q2
-	vand	q8, q8, q15
-	 vand	q2, q2, q13
-	vand	q12, q12, q14
-	 vand	q4, q4, q9
-	veor	q8, q8, q12
-	 veor	q4, q4, q2
-	veor	q12, q12, q11
-	 veor	q2, q2, q10
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor 	q10, q15, q14
-	vand	q10, q10, q7
-	veor	q7, q7, q1
-	vand	q11, q1, q15
-	vand	q7, q7, q14
-	veor	q1, q11, q10
-	veor	q7, q7, q11
-	veor	q7, q7, q12
-	veor	q4, q4, q12
-	veor	q1, q1, q8
-	veor	q2, q2, q8
-	veor	q7, q7, q0
-	veor	q1, q1, q6
-	veor	q6, q6, q0
-	veor	q4, q4, q7
-	veor	q0, q0, q1
-
-	veor	q1, q1, q5
-	veor	q5, q5, q2
-	veor	q2, q2, q3
-	veor	q3, q3, q5
-	veor	q4, q4, q5
-
-	veor	q6, q6, q3
-	subs	r5,r5,#1
-	bcc	.Lenc_done
-	vext.8	q8, q0, q0, #12	@ x0 <<< 32
-	vext.8	q9, q1, q1, #12
-	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
-	vext.8	q10, q4, q4, #12
-	 veor	q1, q1, q9
-	vext.8	q11, q6, q6, #12
-	 veor	q4, q4, q10
-	vext.8	q12, q3, q3, #12
-	 veor	q6, q6, q11
-	vext.8	q13, q7, q7, #12
-	 veor	q3, q3, q12
-	vext.8	q14, q2, q2, #12
-	 veor	q7, q7, q13
-	vext.8	q15, q5, q5, #12
-	 veor	q2, q2, q14
-
-	veor	q9, q9, q0
-	 veor	q5, q5, q15
-	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
-	veor	q10, q10, q1
-	veor	q8, q8, q5
-	veor	q9, q9, q5
-	 vext.8	q1, q1, q1, #8
-	veor	q13, q13, q3
-	 veor	q0, q0, q8
-	veor	q14, q14, q7
-	 veor	q1, q1, q9
-	 vext.8	q8, q3, q3, #8
-	veor	q12, q12, q6
-	 vext.8	q9, q7, q7, #8
-	veor	q15, q15, q2
-	 vext.8	q3, q6, q6, #8
-	veor	q11, q11, q4
-	 vext.8	q7, q5, q5, #8
-	veor	q12, q12, q5
-	 vext.8	q6, q2, q2, #8
-	veor	q11, q11, q5
-	 vext.8	q2, q4, q4, #8
-	veor	q5, q9, q13
-	veor	q4, q8, q12
-	veor	q3, q3, q11
-	veor	q7, q7, q15
-	veor	q6, q6, q14
-	 @ vmov	q4, q8
-	veor	q2, q2, q10
-	 @ vmov	q5, q9
-	vldmia	r6, {q12}		@ .LSR
-	ite	eq				@ Thumb2 thing, samity check in ARM
-	addeq	r6,r6,#0x10
-	bne	.Lenc_loop
-	vldmia	r6, {q12}		@ .LSRM0
-	b	.Lenc_loop
-.align	4
-.Lenc_done:
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q2, #1
-	 vshr.u64	q11, q3, #1
-	veor		q10, q10, q5
-	 veor		q11, q11, q7
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #1
-	 veor		q7, q7, q11
-	 vshl.u64	q11, q11, #1
-	veor		q2, q2, q10
-	 veor		q3, q3, q11
-	vshr.u64	q10, q4, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q6
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q6, q6, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q4, q4, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q7, #2
-	 vshr.u64	q11, q3, #2
-	veor		q10, q10, q5
-	 veor		q11, q11, q2
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #2
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #2
-	veor		q7, q7, q10
-	 veor		q3, q3, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q6
-	 veor		q11, q11, q4
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q6, q6, q10
-	vshl.u64	q10, q10, #2
-	 veor		q4, q4, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q6, #4
-	 vshr.u64	q11, q4, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q2
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #4
-	veor		q6, q6, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q3
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q3, q3, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vldmia	r4, {q8}			@ last round key
-	veor	q4, q4, q8
-	veor	q6, q6, q8
-	veor	q3, q3, q8
-	veor	q7, q7, q8
-	veor	q2, q2, q8
-	veor	q5, q5, q8
-	veor	q0, q0, q8
-	veor	q1, q1, q8
-	bx	lr
-.size	_bsaes_encrypt8,.-_bsaes_encrypt8
-.type	_bsaes_key_convert,%function
-.align	4
-_bsaes_key_convert:
-	adr	r6,_bsaes_key_convert
-	vld1.8	{q7},  [r4]!		@ load round 0 key
-	sub	r6,r6,#_bsaes_key_convert-.LM0
-	vld1.8	{q15}, [r4]!		@ load round 1 key
-
-	vmov.i8	q8,  #0x01			@ bit masks
-	vmov.i8	q9,  #0x02
-	vmov.i8	q10, #0x04
-	vmov.i8	q11, #0x08
-	vmov.i8	q12, #0x10
-	vmov.i8	q13, #0x20
-	vldmia	r6, {q14}		@ .LM0
-
-#ifdef __ARMEL__
-	vrev32.8	q7,  q7
-	vrev32.8	q15, q15
-#endif
-	sub	r5,r5,#1
-	vstmia	r12!, {q7}		@ save round 0 key
-	b	.Lkey_loop
-
-.align	4
-.Lkey_loop:
-	vtbl.8	d14,{q15},d28
-	vtbl.8	d15,{q15},d29
-	vmov.i8	q6,  #0x40
-	vmov.i8	q15, #0x80
-
-	vtst.8	q0, q7, q8
-	vtst.8	q1, q7, q9
-	vtst.8	q2, q7, q10
-	vtst.8	q3, q7, q11
-	vtst.8	q4, q7, q12
-	vtst.8	q5, q7, q13
-	vtst.8	q6, q7, q6
-	vtst.8	q7, q7, q15
-	vld1.8	{q15}, [r4]!		@ load next round key
-	vmvn	q0, q0		@ "pnot"
-	vmvn	q1, q1
-	vmvn	q5, q5
-	vmvn	q6, q6
-#ifdef __ARMEL__
-	vrev32.8	q15, q15
-#endif
-	subs	r5,r5,#1
-	vstmia	r12!,{q0-q7}		@ write bit-sliced round key
-	bne	.Lkey_loop
-
-	vmov.i8	q7,#0x63			@ compose .L63
-	@ don't save last round key
-	bx	lr
-.size	_bsaes_key_convert,.-_bsaes_key_convert
-.extern AES_cbc_encrypt
-.extern AES_decrypt
-
-.global	bsaes_cbc_encrypt
-.type	bsaes_cbc_encrypt,%function
-.align	5
-bsaes_cbc_encrypt:
-#ifndef	__KERNEL__
-	cmp	r2, #128
-#ifndef	__thumb__
-	blo	AES_cbc_encrypt
-#else
-	bhs	1f
-	b	AES_cbc_encrypt
-1:
-#endif
-#endif
-
-	@ it is up to the caller to make sure we are called with enc == 0
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	r8, [ip]			@ IV is 1st arg on the stack
-	mov	r2, r2, lsr#4		@ len in 16 byte blocks
-	sub	sp, #0x10			@ scratch space to carry over the IV
-	mov	r9, sp				@ save sp
-
-	ldr	r10, [r3, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
-	add	r12, #96			@ sifze of bit-slices key schedule
-
-	@ populate the key schedule
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	mov	sp, r12				@ sp is sp
-	bl	_bsaes_key_convert
-	vldmia	sp, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	sp, {q7}
-#else
-	ldr	r12, [r3, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [r3, #244]
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	add	r12, r3, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, r3, #248
-	vldmia	r4, {q6}
-	vstmia	r12, {q15}			@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-
-.align	2
-0:
-#endif
-
-	vld1.8	{q15}, [r8]		@ load IV
-	b	.Lcbc_dec_loop
-
-.align	4
-.Lcbc_dec_loop:
-	subs	r2, r2, #0x8
-	bmi	.Lcbc_dec_loop_finish
-
-	vld1.8	{q0-q1}, [r0]!	@ load input
-	vld1.8	{q2-q3}, [r0]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, sp			@ pass the key
-#else
-	add	r4, r3, #248
-#endif
-	vld1.8	{q4-q5}, [r0]!
-	mov	r5, r10
-	vld1.8	{q6-q7}, [r0]
-	sub	r0, r0, #0x60
-	vstmia	r9, {q15}			@ put aside IV
-
-	bl	_bsaes_decrypt8
-
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12-q13}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q14-q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	veor	q3, q3, q13
-	vst1.8	{q6}, [r1]!
-	veor	q5, q5, q14
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	vst1.8	{q3}, [r1]!
-	vst1.8	{q5}, [r1]!
-
-	b	.Lcbc_dec_loop
-
-.Lcbc_dec_loop_finish:
-	adds	r2, r2, #8
-	beq	.Lcbc_dec_done
-
-	vld1.8	{q0}, [r0]!		@ load input
-	cmp	r2, #2
-	blo	.Lcbc_dec_one
-	vld1.8	{q1}, [r0]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, sp			@ pass the key
-#else
-	add	r4, r3, #248
-#endif
-	mov	r5, r10
-	vstmia	r9, {q15}			@ put aside IV
-	beq	.Lcbc_dec_two
-	vld1.8	{q2}, [r0]!
-	cmp	r2, #4
-	blo	.Lcbc_dec_three
-	vld1.8	{q3}, [r0]!
-	beq	.Lcbc_dec_four
-	vld1.8	{q4}, [r0]!
-	cmp	r2, #6
-	blo	.Lcbc_dec_five
-	vld1.8	{q5}, [r0]!
-	beq	.Lcbc_dec_six
-	vld1.8	{q6}, [r0]!
-	sub	r0, r0, #0x70
-
-	bl	_bsaes_decrypt8
-
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12-q13}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	veor	q3, q3, q13
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	vst1.8	{q3}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_six:
-	sub	r0, r0, #0x60
-	bl	_bsaes_decrypt8
-	vldmia	r9,{q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_five:
-	sub	r0, r0, #0x50
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q15}, [r0]!
-	veor	q4, q4, q10
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	veor	q2, q2, q11
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_four:
-	sub	r0, r0, #0x40
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q15}, [r0]!
-	veor	q4, q4, q10
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_three:
-	sub	r0, r0, #0x30
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q15}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_two:
-	sub	r0, r0, #0x20
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8}, [r0]!		@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q15}, [r0]!		@ reload input
-	veor	q1, q1, q8
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_one:
-	sub	r0, r0, #0x10
-	mov	r10, r1			@ save original out pointer
-	mov	r1, r9			@ use the iv scratch space as out buffer
-	mov	r2, r3
-	vmov	q4,q15		@ just in case ensure that IV
-	vmov	q5,q0			@ and input are preserved
-	bl	AES_decrypt
-	vld1.8	{q0}, [r9,:64]		@ load result
-	veor	q0, q0, q4	@ ^= IV
-	vmov	q15, q5		@ q5 holds input
-	vst1.8	{q0}, [r10]		@ write output
-
-.Lcbc_dec_done:
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-.Lcbc_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r9
-	bne		.Lcbc_dec_bzero
-#endif
-
-	mov	sp, r9
-	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
-	vst1.8	{q15}, [r8]		@ return IV
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}
-.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-.extern	AES_encrypt
-.global	bsaes_ctr32_encrypt_blocks
-.type	bsaes_ctr32_encrypt_blocks,%function
-.align	5
-bsaes_ctr32_encrypt_blocks:
-	cmp	r2, #8			@ use plain AES for
-	blo	.Lctr_enc_short			@ small sizes
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	r8, [ip]			@ ctr is 1st arg on the stack
-	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
-	mov	r9, sp				@ save sp
-
-	ldr	r10, [r3, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
-	add	r12, #96			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	mov	sp, r12				@ sp is sp
-	bl	_bsaes_key_convert
-	veor	q7,q7,q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-
-	vld1.8	{q0}, [r8]		@ load counter
-	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
-	vldmia	sp, {q4}		@ load round0 key
-#else
-	ldr	r12, [r3, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [r3, #244]
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	add	r12, r3, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7,q7,q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-
-.align	2
-0:	add	r12, r3, #248
-	vld1.8	{q0}, [r8]		@ load counter
-	adrl	r8, .LREVM0SR			@ borrow r8
-	vldmia	r12, {q4}			@ load round0 key
-	sub	sp, #0x10			@ place for adjusted round0 key
-#endif
-
-	vmov.i32	q8,#1		@ compose 1<<96
-	veor		q9,q9,q9
-	vrev32.8	q0,q0
-	vext.8		q8,q9,q8,#4
-	vrev32.8	q4,q4
-	vadd.u32	q9,q8,q8	@ compose 2<<96
-	vstmia	sp, {q4}		@ save adjusted round0 key
-	b	.Lctr_enc_loop
-
-.align	4
-.Lctr_enc_loop:
-	vadd.u32	q10, q8, q9	@ compose 3<<96
-	vadd.u32	q1, q0, q8	@ +1
-	vadd.u32	q2, q0, q9	@ +2
-	vadd.u32	q3, q0, q10	@ +3
-	vadd.u32	q4, q1, q10
-	vadd.u32	q5, q2, q10
-	vadd.u32	q6, q3, q10
-	vadd.u32	q7, q4, q10
-	vadd.u32	q10, q5, q10	@ next counter
-
-	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-	@ to flip byte order in 32-bit counter
-
-	vldmia		sp, {q9}		@ load round0 key
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x10		@ pass next round key
-#else
-	add		r4, r3, #264
-#endif
-	vldmia		r8, {q8}			@ .LREVM0SR
-	mov		r5, r10			@ pass rounds
-	vstmia		r9, {q10}			@ save next counter
-	sub		r6, r8, #.LREVM0SR-.LSR	@ pass constants
-
-	bl		_bsaes_encrypt8_alt
-
-	subs		r2, r2, #8
-	blo		.Lctr_enc_loop_done
-
-	vld1.8		{q8-q9}, [r0]!	@ load input
-	vld1.8		{q10-q11}, [r0]!
-	veor		q0, q8
-	veor		q1, q9
-	vld1.8		{q12-q13}, [r0]!
-	veor		q4, q10
-	veor		q6, q11
-	vld1.8		{q14-q15}, [r0]!
-	veor		q3, q12
-	vst1.8		{q0-q1}, [r1]!	@ write output
-	veor		q7, q13
-	veor		q2, q14
-	vst1.8		{q4}, [r1]!
-	veor		q5, q15
-	vst1.8		{q6}, [r1]!
-	vmov.i32	q8, #1			@ compose 1<<96
-	vst1.8		{q3}, [r1]!
-	veor		q9, q9, q9
-	vst1.8		{q7}, [r1]!
-	vext.8		q8, q9, q8, #4
-	vst1.8		{q2}, [r1]!
-	vadd.u32	q9,q8,q8		@ compose 2<<96
-	vst1.8		{q5}, [r1]!
-	vldmia		r9, {q0}			@ load counter
-
-	bne		.Lctr_enc_loop
-	b		.Lctr_enc_done
-
-.align	4
-.Lctr_enc_loop_done:
-	add		r2, r2, #8
-	vld1.8		{q8}, [r0]!	@ load input
-	veor		q0, q8
-	vst1.8		{q0}, [r1]!	@ write output
-	cmp		r2, #2
-	blo		.Lctr_enc_done
-	vld1.8		{q9}, [r0]!
-	veor		q1, q9
-	vst1.8		{q1}, [r1]!
-	beq		.Lctr_enc_done
-	vld1.8		{q10}, [r0]!
-	veor		q4, q10
-	vst1.8		{q4}, [r1]!
-	cmp		r2, #4
-	blo		.Lctr_enc_done
-	vld1.8		{q11}, [r0]!
-	veor		q6, q11
-	vst1.8		{q6}, [r1]!
-	beq		.Lctr_enc_done
-	vld1.8		{q12}, [r0]!
-	veor		q3, q12
-	vst1.8		{q3}, [r1]!
-	cmp		r2, #6
-	blo		.Lctr_enc_done
-	vld1.8		{q13}, [r0]!
-	veor		q7, q13
-	vst1.8		{q7}, [r1]!
-	beq		.Lctr_enc_done
-	vld1.8		{q14}, [r0]
-	veor		q2, q14
-	vst1.8		{q2}, [r1]!
-
-.Lctr_enc_done:
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifndef	BSAES_ASM_EXTENDED_KEY
-.Lctr_enc_bzero:			@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r9
-	bne		.Lctr_enc_bzero
-#else
-	vstmia		sp, {q0-q1}
-#endif
-
-	mov	sp, r9
-	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}	@ return
-
-.align	4
-.Lctr_enc_short:
-	ldr	ip, [sp]		@ ctr pointer is passed on stack
-	stmdb	sp!, {r4-r8, lr}
-
-	mov	r4, r0		@ copy arguments
-	mov	r5, r1
-	mov	r6, r2
-	mov	r7, r3
-	ldr	r8, [ip, #12]		@ load counter LSW
-	vld1.8	{q1}, [ip]		@ load whole counter value
-#ifdef __ARMEL__
-	rev	r8, r8
-#endif
-	sub	sp, sp, #0x10
-	vst1.8	{q1}, [sp,:64]	@ copy counter value
-	sub	sp, sp, #0x10
-
-.Lctr_enc_short_loop:
-	add	r0, sp, #0x10		@ input counter value
-	mov	r1, sp			@ output on the stack
-	mov	r2, r7			@ key
-
-	bl	AES_encrypt
-
-	vld1.8	{q0}, [r4]!	@ load input
-	vld1.8	{q1}, [sp,:64]	@ load encrypted counter
-	add	r8, r8, #1
-#ifdef __ARMEL__
-	rev	r0, r8
-	str	r0, [sp, #0x1c]		@ next counter value
-#else
-	str	r8, [sp, #0x1c]		@ next counter value
-#endif
-	veor	q0,q0,q1
-	vst1.8	{q0}, [r5]!	@ store output
-	subs	r6, r6, #1
-	bne	.Lctr_enc_short_loop
-
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-	vstmia		sp!, {q0-q1}
-
-	ldmia	sp!, {r4-r8, pc}
-.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-.globl	bsaes_xts_encrypt
-.type	bsaes_xts_encrypt,%function
-.align	4
-bsaes_xts_encrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future r3
-
-	mov	r7, r0
-	mov	r8, r1
-	mov	r9, r2
-	mov	r10, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0,sp				@ pointer to initial tweak
-#endif
-
-	ldr	r1, [r10, #240]		@ get # of rounds
-	mov	r3, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #96			@ size of bit-sliced key schedule
-	sub	r12, #48			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7, q7, q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-#else
-	ldr	r12, [r10, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [r10, #244]
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	add	r12, r10, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7, q7, q15	@ fix up last round key
-	vstmia	r12, {q7}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-
-	vld1.8	{q8}, [r0]			@ initial tweak
-	adr	r2, .Lxts_magic
-
-	subs	r9, #0x80
-	blo	.Lxts_enc_short
-	b	.Lxts_enc_loop
-
-.align	4
-.Lxts_enc_loop:
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q6, q8, #63
-	mov		r0, sp
-	vand		q6, q6, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q9, #63
-	veor		q9, q9, q6
-	vand		q7, q7, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q10, #63
-	veor		q10, q10, q7
-	vand		q6, q6, q5
-	vld1.8		{q0}, [r7]!
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q11, #63
-	veor		q11, q11, q6
-	vand		q7, q7, q5
-	vld1.8		{q1}, [r7]!
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q12, #63
-	veor		q12, q12, q7
-	vand		q6, q6, q5
-	vld1.8		{q2}, [r7]!
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q13, #63
-	veor		q13, q13, q6
-	vand		q7, q7, q5
-	vld1.8		{q3}, [r7]!
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q14, #63
-	veor		q14, q14, q7
-	vand		q6, q6, q5
-	vld1.8		{q4}, [r7]!
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q15, #63
-	veor		q15, q15, q6
-	vand		q7, q7, q5
-	vld1.8		{q5}, [r7]!
-	veor		q4, q4, q12
-	vadd.u64	q8, q15, q15
-	vst1.64		{q15}, [r0,:128]!
-	vswp		d15,d14
-	veor		q8, q8, q7
-	vst1.64		{q8}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6-q7}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	veor		q7, q7, q15
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	vld1.64		{q14-q15}, [r0,:128]!
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q2, q14
-	vst1.8		{q10-q11}, [r8]!
-	veor		q13, q5, q15
-	vst1.8		{q12-q13}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-
-	subs		r9, #0x80
-	bpl		.Lxts_enc_loop
-
-.Lxts_enc_short:
-	adds		r9, #0x70
-	bmi		.Lxts_enc_done
-
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q7, q8, #63
-	mov		r0, sp
-	vand		q7, q7, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q9, #63
-	veor		q9, q9, q7
-	vand		q6, q6, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q10, #63
-	veor		q10, q10, q6
-	vand		q7, q7, q5
-	vld1.8		{q0}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_1
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q11, #63
-	veor		q11, q11, q7
-	vand		q6, q6, q5
-	vld1.8		{q1}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_2
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q12, #63
-	veor		q12, q12, q6
-	vand		q7, q7, q5
-	vld1.8		{q2}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_3
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q13, #63
-	veor		q13, q13, q7
-	vand		q6, q6, q5
-	vld1.8		{q3}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_4
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q14, #63
-	veor		q14, q14, q6
-	vand		q7, q7, q5
-	vld1.8		{q4}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_5
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q15, #63
-	veor		q15, q15, q7
-	vand		q6, q6, q5
-	vld1.8		{q5}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_6
-	veor		q4, q4, q12
-	sub		r9, #0x10
-	vst1.64		{q15}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	vld1.64		{q14}, [r0,:128]!
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q2, q14
-	vst1.8		{q10-q11}, [r8]!
-	vst1.8		{q12}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_6:
-	vst1.64		{q14}, [r0,:128]		@ next round tweak
-
-	veor		q4, q4, q12
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q5, q5, q13
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	vst1.8		{q10-q11}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-
-@ put this in range for both ARM and Thumb mode adr instructions
-.align	5
-.Lxts_magic:
-	.quad	1, 0x87
-
-.align	5
-.Lxts_enc_5:
-	vst1.64		{q13}, [r0,:128]		@ next round tweak
-
-	veor		q3, q3, q11
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q4, q4, q12
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	vst1.8		{q10}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_4:
-	vst1.64		{q12}, [r0,:128]		@ next round tweak
-
-	veor		q2, q2, q10
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q3, q3, q11
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	vst1.8		{q8-q9}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_3:
-	vst1.64		{q11}, [r0,:128]		@ next round tweak
-
-	veor		q1, q1, q9
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q2, q2, q10
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	vst1.8		{q8}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_2:
-	vst1.64		{q10}, [r0,:128]		@ next round tweak
-
-	veor		q0, q0, q8
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q1, q1, q9
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	vst1.8		{q0-q1}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_1:
-	mov		r0, sp
-	veor		q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3				@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r8]!
-	mov		r3, r4
-
-	vmov		q8, q9		@ next round tweak
-
-.Lxts_enc_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		r9, #0x10
-	beq		.Lxts_enc_ret
-	sub		r6, r8, #0x10
-
-.Lxts_enc_steal:
-	ldrb		r0, [r7], #1
-	ldrb		r1, [r8, #-0x10]
-	strb		r0, [r8, #-0x10]
-	strb		r1, [r8], #1
-
-	subs		r9, #1
-	bhi		.Lxts_enc_steal
-
-	vld1.8		{q0}, [r6]
-	mov		r0, sp
-	veor		q0, q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3			@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r6]
-	mov		r3, r4
-#endif
-
-.Lxts_enc_ret:
-	bic		r0, r3, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_enc_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_enc_bzero
-
-	mov		sp, r3
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{q8}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl	bsaes_xts_decrypt
-.type	bsaes_xts_decrypt,%function
-.align	4
-bsaes_xts_decrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future r3
-
-	mov	r7, r0
-	mov	r8, r1
-	mov	r9, r2
-	mov	r10, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0, sp				@ pointer to initial tweak
-#endif
-
-	ldr	r1, [r10, #240]		@ get # of rounds
-	mov	r3, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #96			@ size of bit-sliced key schedule
-	sub	r12, #48			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, sp, #0x90
-	vldmia	r4, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-#else
-	ldr	r12, [r10, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [r10, #244]
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	add	r12, r10, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, r10, #248
-	vldmia	r4, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-	vld1.8	{q8}, [r0]			@ initial tweak
-	adr	r2, .Lxts_magic
-
-#ifndef	XTS_CHAIN_TWEAK
-	tst	r9, #0xf			@ if not multiple of 16
-	it	ne				@ Thumb2 thing, sanity check in ARM
-	subne	r9, #0x10			@ subtract another 16 bytes
-#endif
-	subs	r9, #0x80
-
-	blo	.Lxts_dec_short
-	b	.Lxts_dec_loop
-
-.align	4
-.Lxts_dec_loop:
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q6, q8, #63
-	mov		r0, sp
-	vand		q6, q6, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q9, #63
-	veor		q9, q9, q6
-	vand		q7, q7, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q10, #63
-	veor		q10, q10, q7
-	vand		q6, q6, q5
-	vld1.8		{q0}, [r7]!
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q11, #63
-	veor		q11, q11, q6
-	vand		q7, q7, q5
-	vld1.8		{q1}, [r7]!
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q12, #63
-	veor		q12, q12, q7
-	vand		q6, q6, q5
-	vld1.8		{q2}, [r7]!
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q13, #63
-	veor		q13, q13, q6
-	vand		q7, q7, q5
-	vld1.8		{q3}, [r7]!
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q14, #63
-	veor		q14, q14, q7
-	vand		q6, q6, q5
-	vld1.8		{q4}, [r7]!
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q15, #63
-	veor		q15, q15, q6
-	vand		q7, q7, q5
-	vld1.8		{q5}, [r7]!
-	veor		q4, q4, q12
-	vadd.u64	q8, q15, q15
-	vst1.64		{q15}, [r0,:128]!
-	vswp		d15,d14
-	veor		q8, q8, q7
-	vst1.64		{q8}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6-q7}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	veor		q7, q7, q15
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	vld1.64		{q14-q15}, [r0,:128]!
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q3, q14
-	vst1.8		{q10-q11}, [r8]!
-	veor		q13, q5, q15
-	vst1.8		{q12-q13}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-
-	subs		r9, #0x80
-	bpl		.Lxts_dec_loop
-
-.Lxts_dec_short:
-	adds		r9, #0x70
-	bmi		.Lxts_dec_done
-
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q7, q8, #63
-	mov		r0, sp
-	vand		q7, q7, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q9, #63
-	veor		q9, q9, q7
-	vand		q6, q6, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q10, #63
-	veor		q10, q10, q6
-	vand		q7, q7, q5
-	vld1.8		{q0}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_1
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q11, #63
-	veor		q11, q11, q7
-	vand		q6, q6, q5
-	vld1.8		{q1}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_2
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q12, #63
-	veor		q12, q12, q6
-	vand		q7, q7, q5
-	vld1.8		{q2}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_3
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q13, #63
-	veor		q13, q13, q7
-	vand		q6, q6, q5
-	vld1.8		{q3}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_4
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q14, #63
-	veor		q14, q14, q6
-	vand		q7, q7, q5
-	vld1.8		{q4}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_5
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q15, #63
-	veor		q15, q15, q7
-	vand		q6, q6, q5
-	vld1.8		{q5}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_6
-	veor		q4, q4, q12
-	sub		r9, #0x10
-	vst1.64		{q15}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	vld1.64		{q14}, [r0,:128]!
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q3, q14
-	vst1.8		{q10-q11}, [r8]!
-	vst1.8		{q12}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_6:
-	vst1.64		{q14}, [r0,:128]		@ next round tweak
-
-	veor		q4, q4, q12
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q5, q5, q13
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	vst1.8		{q10-q11}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_5:
-	vst1.64		{q13}, [r0,:128]		@ next round tweak
-
-	veor		q3, q3, q11
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q4, q4, q12
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	vst1.8		{q10}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_4:
-	vst1.64		{q12}, [r0,:128]		@ next round tweak
-
-	veor		q2, q2, q10
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q3, q3, q11
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	vst1.8		{q8-q9}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_3:
-	vst1.64		{q11}, [r0,:128]		@ next round tweak
-
-	veor		q1, q1, q9
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q2, q2, q10
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	vst1.8		{q8}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_2:
-	vst1.64		{q10}, [r0,:128]		@ next round tweak
-
-	veor		q0, q0, q8
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q1, q1, q9
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	vst1.8		{q0-q1}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_1:
-	mov		r0, sp
-	veor		q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3				@ preserve fp
-	mov		r5, r2			@ preserve magic
-
-	bl		AES_decrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r8]!
-	mov		r3, r4
-	mov		r2, r5
-
-	vmov		q8, q9		@ next round tweak
-
-.Lxts_dec_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		r9, #0x10
-	beq		.Lxts_dec_ret
-
-	@ calculate one round of extra tweak for the stolen ciphertext
-	vldmia		r2, {q5}
-	vshr.s64	q6, q8, #63
-	vand		q6, q6, q5
-	vadd.u64	q9, q8, q8
-	vswp		d13,d12
-	veor		q9, q9, q6
-
-	@ perform the final decryption with the last tweak value
-	vld1.8		{q0}, [r7]!
-	mov		r0, sp
-	veor		q0, q0, q9
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3			@ preserve fp
-
-	bl		AES_decrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q9
-	vst1.8		{q0}, [r8]
-
-	mov		r6, r8
-.Lxts_dec_steal:
-	ldrb		r1, [r8]
-	ldrb		r0, [r7], #1
-	strb		r1, [r8, #0x10]
-	strb		r0, [r8], #1
-
-	subs		r9, #1
-	bhi		.Lxts_dec_steal
-
-	vld1.8		{q0}, [r6]
-	mov		r0, sp
-	veor		q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-
-	bl		AES_decrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r6]
-	mov		r3, r4
-#endif
-
-.Lxts_dec_ret:
-	bic		r0, r3, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_dec_bzero
-
-	mov		sp, r3
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{q8}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
-#endif
diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
deleted file mode 100644
index d8e06de72ef3..000000000000
--- a/arch/arm/crypto/aesbs-glue.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
- *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/neon.h>
-#include <crypto/aes.h>
-#include <crypto/cbc.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/module.h>
-#include <crypto/xts.h>
-
-#include "aes_glue.h"
-
-#define BIT_SLICED_KEY_MAXSIZE	(128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
-
-struct BS_KEY {
-	struct AES_KEY	rk;
-	int		converted;
-	u8 __aligned(8)	bs[BIT_SLICED_KEY_MAXSIZE];
-} __aligned(8);
-
-asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
-asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
-
-asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
-				  struct BS_KEY *key, u8 iv[]);
-
-asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
-					   struct BS_KEY *key, u8 const iv[]);
-
-asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
-				  struct BS_KEY *key, u8 tweak[]);
-
-asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
-				  struct BS_KEY *key, u8 tweak[]);
-
-struct aesbs_cbc_ctx {
-	struct AES_KEY	enc;
-	struct BS_KEY	dec;
-};
-
-struct aesbs_ctr_ctx {
-	struct BS_KEY	enc;
-};
-
-struct aesbs_xts_ctx {
-	struct BS_KEY	enc;
-	struct BS_KEY	dec;
-	struct AES_KEY	twkey;
-};
-
-static int aesbs_cbc_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			     unsigned int key_len)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int bits = key_len * 8;
-
-	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
-		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	ctx->dec.rk = ctx->enc;
-	private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
-	ctx->dec.converted = 0;
-	return 0;
-}
-
-static int aesbs_ctr_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			     unsigned int key_len)
-{
-	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int bits = key_len * 8;
-
-	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
-		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	ctx->enc.converted = 0;
-	return 0;
-}
-
-static int aesbs_xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			     unsigned int key_len)
-{
-	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int bits = key_len * 4;
-	int err;
-
-	err = xts_verify_key(tfm, in_key, key_len);
-	if (err)
-		return err;
-
-	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
-		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	ctx->dec.rk = ctx->enc.rk;
-	private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
-	private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
-	ctx->enc.converted = ctx->dec.converted = 0;
-	return 0;
-}
-
-static inline void aesbs_encrypt_one(struct crypto_skcipher *tfm,
-				     const u8 *src, u8 *dst)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	AES_encrypt(src, dst, &ctx->enc);
-}
-
-static int aesbs_cbc_encrypt(struct skcipher_request *req)
-{
-	return crypto_cbc_encrypt_walk(req, aesbs_encrypt_one);
-}
-
-static inline void aesbs_decrypt_one(struct crypto_skcipher *tfm,
-				     const u8 *src, u8 *dst)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	AES_decrypt(src, dst, &ctx->dec.rk);
-}
-
-static int aesbs_cbc_decrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	unsigned int nbytes;
-	int err;
-
-	for (err = skcipher_walk_virt(&walk, req, false);
-	     (nbytes = walk.nbytes); err = skcipher_walk_done(&walk, nbytes)) {
-		u32 blocks = nbytes / AES_BLOCK_SIZE;
-		u8 *dst = walk.dst.virt.addr;
-		u8 *src = walk.src.virt.addr;
-		u8 *iv = walk.iv;
-
-		if (blocks >= 8) {
-			kernel_neon_begin();
-			bsaes_cbc_encrypt(src, dst, nbytes, &ctx->dec, iv);
-			kernel_neon_end();
-			nbytes %= AES_BLOCK_SIZE;
-			continue;
-		}
-
-		nbytes = crypto_cbc_decrypt_blocks(&walk, tfm,
-						   aesbs_decrypt_one);
-	}
-	return err;
-}
-
-static void inc_be128_ctr(__be32 ctr[], u32 addend)
-{
-	int i;
-
-	for (i = 3; i >= 0; i--, addend = 1) {
-		u32 n = be32_to_cpu(ctr[i]) + addend;
-
-		ctr[i] = cpu_to_be32(n);
-		if (n >= addend)
-			break;
-	}
-}
-
-static int aesbs_ctr_encrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 blocks;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
-		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
-		__be32 *ctr = (__be32 *)walk.iv;
-		u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
-
-		/* avoid 32 bit counter overflow in the NEON code */
-		if (unlikely(headroom < blocks)) {
-			blocks = headroom + 1;
-			tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
-		}
-		kernel_neon_begin();
-		bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
-					   walk.dst.virt.addr, blocks,
-					   &ctx->enc, walk.iv);
-		kernel_neon_end();
-		inc_be128_ctr(ctr, blocks);
-
-		err = skcipher_walk_done(&walk, tail);
-	}
-	if (walk.nbytes) {
-		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
-		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
-		u8 ks[AES_BLOCK_SIZE];
-
-		AES_encrypt(walk.iv, ks, &ctx->enc.rk);
-		if (tdst != tsrc)
-			memcpy(tdst, tsrc, walk.nbytes);
-		crypto_xor(tdst, ks, walk.nbytes);
-		err = skcipher_walk_done(&walk, 0);
-	}
-	return err;
-}
-
-static int aesbs_xts_encrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	/* generate the initial tweak */
-	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
-
-	while (walk.nbytes) {
-		kernel_neon_begin();
-		bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->enc, walk.iv);
-		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
-	}
-	return err;
-}
-
-static int aesbs_xts_decrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	/* generate the initial tweak */
-	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
-
-	while (walk.nbytes) {
-		kernel_neon_begin();
-		bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->dec, walk.iv);
-		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
-	}
-	return err;
-}
-
-static struct skcipher_alg aesbs_algs[] = { {
-	.base = {
-		.cra_name		= "__cbc(aes)",
-		.cra_driver_name	= "__cbc-aes-neonbs",
-		.cra_priority		= 300,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= AES_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct aesbs_cbc_ctx),
-		.cra_alignmask		= 7,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= AES_MIN_KEY_SIZE,
-	.max_keysize	= AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.setkey		= aesbs_cbc_set_key,
-	.encrypt	= aesbs_cbc_encrypt,
-	.decrypt	= aesbs_cbc_decrypt,
-}, {
-	.base = {
-		.cra_name		= "__ctr(aes)",
-		.cra_driver_name	= "__ctr-aes-neonbs",
-		.cra_priority		= 300,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= 1,
-		.cra_ctxsize		= sizeof(struct aesbs_ctr_ctx),
-		.cra_alignmask		= 7,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= AES_MIN_KEY_SIZE,
-	.max_keysize	= AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.chunksize	= AES_BLOCK_SIZE,
-	.setkey		= aesbs_ctr_set_key,
-	.encrypt	= aesbs_ctr_encrypt,
-	.decrypt	= aesbs_ctr_encrypt,
-}, {
-	.base = {
-		.cra_name		= "__xts(aes)",
-		.cra_driver_name	= "__xts-aes-neonbs",
-		.cra_priority		= 300,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= AES_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct aesbs_xts_ctx),
-		.cra_alignmask		= 7,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
-	.max_keysize	= 2 * AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.setkey		= aesbs_xts_set_key,
-	.encrypt	= aesbs_xts_encrypt,
-	.decrypt	= aesbs_xts_decrypt,
-} };
-
-struct simd_skcipher_alg *aesbs_simd_algs[ARRAY_SIZE(aesbs_algs)];
-
-static void aesbs_mod_exit(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(aesbs_simd_algs) && aesbs_simd_algs[i]; i++)
-		simd_skcipher_free(aesbs_simd_algs[i]);
-
-	crypto_unregister_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
-}
-
-static int __init aesbs_mod_init(void)
-{
-	struct simd_skcipher_alg *simd;
-	const char *basename;
-	const char *algname;
-	const char *drvname;
-	int err;
-	int i;
-
-	if (!cpu_has_neon())
-		return -ENODEV;
-
-	err = crypto_register_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
-	if (err)
-		return err;
-
-	for (i = 0; i < ARRAY_SIZE(aesbs_algs); i++) {
-		algname = aesbs_algs[i].base.cra_name + 2;
-		drvname = aesbs_algs[i].base.cra_driver_name + 2;
-		basename = aesbs_algs[i].base.cra_driver_name;
-		simd = simd_skcipher_create_compat(algname, drvname, basename);
-		err = PTR_ERR(simd);
-		if (IS_ERR(simd))
-			goto unregister_simds;
-
-		aesbs_simd_algs[i] = simd;
-	}
-
-	return 0;
-
-unregister_simds:
-	aesbs_mod_exit();
-	return err;
-}
-
-module_init(aesbs_mod_init);
-module_exit(aesbs_mod_exit);
-
-MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL");
diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl
deleted file mode 100644
index a4d3856e7d24..000000000000
--- a/arch/arm/crypto/bsaes-armv7.pl
+++ /dev/null
@@ -1,2471 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-#
-# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
-# granted.
-# ====================================================================
-
-# Bit-sliced AES for ARM NEON
-#
-# February 2012.
-#
-# This implementation is direct adaptation of bsaes-x86_64 module for
-# ARM NEON. Except that this module is endian-neutral [in sense that
-# it can be compiled for either endianness] by courtesy of vld1.8's
-# neutrality. Initial version doesn't implement interface to OpenSSL,
-# only low-level primitives and unsupported entry points, just enough
-# to collect performance results, which for Cortex-A8 core are:
-#
-# encrypt	19.5 cycles per byte processed with 128-bit key
-# decrypt	22.1 cycles per byte processed with 128-bit key
-# key conv.	440  cycles per 128-bit key/0.18 of 8x block
-#
-# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
-# which is [much] worse than anticipated (for further details see
-# http://www.openssl.org/~appro/Snapdragon-S4.html).
-#
-# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
-# manages in 20.0 cycles].
-#
-# When comparing to x86_64 results keep in mind that NEON unit is
-# [mostly] single-issue and thus can't [fully] benefit from
-# instruction-level parallelism. And when comparing to aes-armv4
-# results keep in mind key schedule conversion overhead (see
-# bsaes-x86_64.pl for further details)...
-#
-#						<appro@openssl.org>
-
-# April-August 2013
-#
-# Add CBC, CTR and XTS subroutines, adapt for kernel use.
-#
-#					<ard.biesheuvel@linaro.org>
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
-my @XMM=map("q$_",(0..15));
-
-{
-my ($key,$rounds,$const)=("r4","r5","r6");
-
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
-sub Sbox {
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
-my @b=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-	&InBasisChange	(@b);
-	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
-	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
-}
-
-sub InBasisChange {
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
-my @b=@_[0..7];
-$code.=<<___;
-	veor	@b[2], @b[2], @b[1]
-	veor	@b[5], @b[5], @b[6]
-	veor	@b[3], @b[3], @b[0]
-	veor	@b[6], @b[6], @b[2]
-	veor	@b[5], @b[5], @b[0]
-
-	veor	@b[6], @b[6], @b[3]
-	veor	@b[3], @b[3], @b[7]
-	veor	@b[7], @b[7], @b[5]
-	veor	@b[3], @b[3], @b[4]
-	veor	@b[4], @b[4], @b[5]
-
-	veor	@b[2], @b[2], @b[7]
-	veor	@b[3], @b[3], @b[1]
-	veor	@b[1], @b[1], @b[5]
-___
-}
-
-sub OutBasisChange {
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
-my @b=@_[0..7];
-$code.=<<___;
-	veor	@b[0], @b[0], @b[6]
-	veor	@b[1], @b[1], @b[4]
-	veor	@b[4], @b[4], @b[6]
-	veor	@b[2], @b[2], @b[0]
-	veor	@b[6], @b[6], @b[1]
-
-	veor	@b[1], @b[1], @b[5]
-	veor	@b[5], @b[5], @b[3]
-	veor	@b[3], @b[3], @b[7]
-	veor	@b[7], @b[7], @b[5]
-	veor	@b[2], @b[2], @b[5]
-
-	veor	@b[4], @b[4], @b[7]
-___
-}
-
-sub InvSbox {
-# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
-my @b=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-	&InvInBasisChange	(@b);
-	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
-	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
-}
-
-sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
-my @b=@_[5,1,2,6,3,7,0,4];
-$code.=<<___
-	 veor	@b[1], @b[1], @b[7]
-	veor	@b[4], @b[4], @b[7]
-
-	veor	@b[7], @b[7], @b[5]
-	 veor	@b[1], @b[1], @b[3]
-	veor	@b[2], @b[2], @b[5]
-	veor	@b[3], @b[3], @b[7]
-
-	veor	@b[6], @b[6], @b[1]
-	veor	@b[2], @b[2], @b[0]
-	 veor	@b[5], @b[5], @b[3]
-	veor	@b[4], @b[4], @b[6]
-	veor	@b[0], @b[0], @b[6]
-	veor	@b[1], @b[1], @b[4]
-___
-}
-
-sub InvOutBasisChange {		# InBasisChange in reverse
-my @b=@_[2,5,7,3,6,1,0,4];
-$code.=<<___;
-	veor	@b[1], @b[1], @b[5]
-	veor	@b[2], @b[2], @b[7]
-
-	veor	@b[3], @b[3], @b[1]
-	veor	@b[4], @b[4], @b[5]
-	veor	@b[7], @b[7], @b[5]
-	veor	@b[3], @b[3], @b[4]
-	 veor 	@b[5], @b[5], @b[0]
-	veor	@b[3], @b[3], @b[7]
-	 veor	@b[6], @b[6], @b[2]
-	 veor	@b[2], @b[2], @b[1]
-	veor	@b[6], @b[6], @b[3]
-
-	veor	@b[3], @b[3], @b[0]
-	veor	@b[5], @b[5], @b[6]
-___
-}
-
-sub Mul_GF4 {
-#;*************************************************************
-#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
-#;*************************************************************
-my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
-$code.=<<___;
-	veor 	$t0, $y0, $y1
-	vand	$t0, $t0, $x0
-	veor	$x0, $x0, $x1
-	vand	$t1, $x1, $y0
-	vand	$x0, $x0, $y1
-	veor	$x1, $t1, $t0
-	veor	$x0, $x0, $t1
-___
-}
-
-sub Mul_GF4_N {				# not used, see next subroutine
-# multiply and scale by N
-my ($x0,$x1,$y0,$y1,$t0)=@_;
-$code.=<<___;
-	veor	$t0, $y0, $y1
-	vand	$t0, $t0, $x0
-	veor	$x0, $x0, $x1
-	vand	$x1, $x1, $y0
-	vand	$x0, $x0, $y1
-	veor	$x1, $x1, $x0
-	veor	$x0, $x0, $t0
-___
-}
-
-sub Mul_GF4_N_GF4 {
-# interleaved Mul_GF4_N and Mul_GF4
-my ($x0,$x1,$y0,$y1,$t0,
-    $x2,$x3,$y2,$y3,$t1)=@_;
-$code.=<<___;
-	veor	$t0, $y0, $y1
-	 veor 	$t1, $y2, $y3
-	vand	$t0, $t0, $x0
-	 vand	$t1, $t1, $x2
-	veor	$x0, $x0, $x1
-	 veor	$x2, $x2, $x3
-	vand	$x1, $x1, $y0
-	 vand	$x3, $x3, $y2
-	vand	$x0, $x0, $y1
-	 vand	$x2, $x2, $y3
-	veor	$x1, $x1, $x0
-	 veor	$x2, $x2, $x3
-	veor	$x0, $x0, $t0
-	 veor	$x3, $x3, $t1
-___
-}
-sub Mul_GF16_2 {
-my @x=@_[0..7];
-my @y=@_[8..11];
-my @t=@_[12..15];
-$code.=<<___;
-	veor	@t[0], @x[0], @x[2]
-	veor	@t[1], @x[1], @x[3]
-___
-	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2..3]);
-$code.=<<___;
-	veor	@y[0], @y[0], @y[2]
-	veor	@y[1], @y[1], @y[3]
-___
-	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
-			 @x[2], @x[3], @y[2], @y[3], @t[2]);
-$code.=<<___;
-	veor	@x[0], @x[0], @t[0]
-	veor	@x[2], @x[2], @t[0]
-	veor	@x[1], @x[1], @t[1]
-	veor	@x[3], @x[3], @t[1]
-
-	veor	@t[0], @x[4], @x[6]
-	veor	@t[1], @x[5], @x[7]
-___
-	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
-			 @x[6], @x[7], @y[2], @y[3], @t[2]);
-$code.=<<___;
-	veor	@y[0], @y[0], @y[2]
-	veor	@y[1], @y[1], @y[3]
-___
-	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[2..3]);
-$code.=<<___;
-	veor	@x[4], @x[4], @t[0]
-	veor	@x[6], @x[6], @t[0]
-	veor	@x[5], @x[5], @t[1]
-	veor	@x[7], @x[7], @t[1]
-___
-}
-sub Inv_GF256 {
-#;********************************************************************
-#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
-#;********************************************************************
-my @x=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-# direct optimizations from hardware
-$code.=<<___;
-	veor	@t[3], @x[4], @x[6]
-	veor	@t[2], @x[5], @x[7]
-	veor	@t[1], @x[1], @x[3]
-	veor	@s[1], @x[7], @x[6]
-	 vmov	@t[0], @t[2]
-	veor	@s[0], @x[0], @x[2]
-
-	vorr	@t[2], @t[2], @t[1]
-	veor	@s[3], @t[3], @t[0]
-	vand	@s[2], @t[3], @s[0]
-	vorr	@t[3], @t[3], @s[0]
-	veor	@s[0], @s[0], @t[1]
-	vand	@t[0], @t[0], @t[1]
-	veor	@t[1], @x[3], @x[2]
-	vand	@s[3], @s[3], @s[0]
-	vand	@s[1], @s[1], @t[1]
-	veor	@t[1], @x[4], @x[5]
-	veor	@s[0], @x[1], @x[0]
-	veor	@t[3], @t[3], @s[1]
-	veor	@t[2], @t[2], @s[1]
-	vand	@s[1], @t[1], @s[0]
-	vorr	@t[1], @t[1], @s[0]
-	veor	@t[3], @t[3], @s[3]
-	veor	@t[0], @t[0], @s[1]
-	veor	@t[2], @t[2], @s[2]
-	veor	@t[1], @t[1], @s[3]
-	veor	@t[0], @t[0], @s[2]
-	vand	@s[0], @x[7], @x[3]
-	veor	@t[1], @t[1], @s[2]
-	vand	@s[1], @x[6], @x[2]
-	vand	@s[2], @x[5], @x[1]
-	vorr	@s[3], @x[4], @x[0]
-	veor	@t[3], @t[3], @s[0]
-	veor	@t[1], @t[1], @s[2]
-	veor	@t[0], @t[0], @s[3]
-	veor	@t[2], @t[2], @s[1]
-
-	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
-
-	@ new smaller inversion
-
-	vand	@s[2], @t[3], @t[1]
-	vmov	@s[0], @t[0]
-
-	veor	@s[1], @t[2], @s[2]
-	veor	@s[3], @t[0], @s[2]
-	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]
-
-	vbsl	@s[1], @t[1], @t[0]
-	vbsl	@s[3], @t[3], @t[2]
-	veor	@t[3], @t[3], @t[2]
-
-	vbsl	@s[0], @s[1], @s[2]
-	vbsl	@t[0], @s[2], @s[1]
-
-	vand	@s[2], @s[0], @s[3]
-	veor	@t[1], @t[1], @t[0]
-
-	veor	@s[2], @s[2], @t[3]
-___
-# output in s3, s2, s1, t1
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
-	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
-
-### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
-}
-
-# AES linear components
-
-sub ShiftRows {
-my @x=@_[0..7];
-my @t=@_[8..11];
-my $mask=pop;
-$code.=<<___;
-	vldmia	$key!, {@t[0]-@t[3]}
-	veor	@t[0], @t[0], @x[0]
-	veor	@t[1], @t[1], @x[1]
-	vtbl.8	`&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[0]}
-	veor	@t[2], @t[2], @x[2]
-	vtbl.8	`&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[1]}
-	veor	@t[3], @t[3], @x[3]
-	vtbl.8	`&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[2]}
-	vtbl.8	`&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[3]}
-	veor	@t[0], @t[0], @x[4]
-	veor	@t[1], @t[1], @x[5]
-	vtbl.8	`&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
-	veor	@t[2], @t[2], @x[6]
-	vtbl.8	`&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
-	veor	@t[3], @t[3], @x[7]
-	vtbl.8	`&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
-	vtbl.8	`&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
-___
-}
-
-sub MixColumns {
-# modified to emit output in order suitable for feeding back to aesenc[last]
-my @x=@_[0..7];
-my @t=@_[8..15];
-my $inv=@_[16];	# optional
-$code.=<<___;
-	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
-	vext.8	@t[1], @x[1], @x[1], #12
-	 veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
-	vext.8	@t[2], @x[2], @x[2], #12
-	 veor	@x[1], @x[1], @t[1]
-	vext.8	@t[3], @x[3], @x[3], #12
-	 veor	@x[2], @x[2], @t[2]
-	vext.8	@t[4], @x[4], @x[4], #12
-	 veor	@x[3], @x[3], @t[3]
-	vext.8	@t[5], @x[5], @x[5], #12
-	 veor	@x[4], @x[4], @t[4]
-	vext.8	@t[6], @x[6], @x[6], #12
-	 veor	@x[5], @x[5], @t[5]
-	vext.8	@t[7], @x[7], @x[7], #12
-	 veor	@x[6], @x[6], @t[6]
-
-	veor	@t[1], @t[1], @x[0]
-	 veor	@x[7], @x[7], @t[7]
-	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
-	veor	@t[2], @t[2], @x[1]
-	veor	@t[0], @t[0], @x[7]
-	veor	@t[1], @t[1], @x[7]
-	 vext.8	@x[1], @x[1], @x[1], #8
-	veor	@t[5], @t[5], @x[4]
-	 veor	@x[0], @x[0], @t[0]
-	veor	@t[6], @t[6], @x[5]
-	 veor	@x[1], @x[1], @t[1]
-	 vext.8	@t[0], @x[4], @x[4], #8
-	veor	@t[4], @t[4], @x[3]
-	 vext.8	@t[1], @x[5], @x[5], #8
-	veor	@t[7], @t[7], @x[6]
-	 vext.8	@x[4], @x[3], @x[3], #8
-	veor	@t[3], @t[3], @x[2]
-	 vext.8	@x[5], @x[7], @x[7], #8
-	veor	@t[4], @t[4], @x[7]
-	 vext.8	@x[3], @x[6], @x[6], #8
-	veor	@t[3], @t[3], @x[7]
-	 vext.8	@x[6], @x[2], @x[2], #8
-	veor	@x[7], @t[1], @t[5]
-___
-$code.=<<___ if (!$inv);
-	veor	@x[2], @t[0], @t[4]
-	veor	@x[4], @x[4], @t[3]
-	veor	@x[5], @x[5], @t[7]
-	veor	@x[3], @x[3], @t[6]
-	 @ vmov	@x[2], @t[0]
-	veor	@x[6], @x[6], @t[2]
-	 @ vmov	@x[7], @t[1]
-___
-$code.=<<___ if ($inv);
-	veor	@t[3], @t[3], @x[4]
-	veor	@x[5], @x[5], @t[7]
-	veor	@x[2], @x[3], @t[6]
-	veor	@x[3], @t[0], @t[4]
-	veor	@x[4], @x[6], @t[2]
-	vmov	@x[6], @t[3]
-	 @ vmov	@x[7], @t[1]
-___
-}
-
-sub InvMixColumns_orig {
-my @x=@_[0..7];
-my @t=@_[8..15];
-
-$code.=<<___;
-	@ multiplication by 0x0e
-	vext.8	@t[7], @x[7], @x[7], #12
-	vmov	@t[2], @x[2]
-	veor	@x[2], @x[2], @x[5]		@ 2 5
-	veor	@x[7], @x[7], @x[5]		@ 7 5
-	vext.8	@t[0], @x[0], @x[0], #12
-	vmov	@t[5], @x[5]
-	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
-	veor	@x[0], @x[0], @x[1]		@ 0 1
-	vext.8	@t[1], @x[1], @x[1], #12
-	veor	@x[1], @x[1], @x[2]		@ 1 25
-	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
-	vext.8	@t[3], @x[3], @x[3], #12
-	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
-	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
-	veor	@x[3], @x[3], @x[7]		@ 3 75
-	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
-	vext.8	@t[6], @x[6], @x[6], #12
-	vmov	@t[4], @x[4]
-	veor	@x[6], @x[6], @x[4]		@ 6 4
-	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
-	veor	@x[3], @x[3], @x[7]		@ 375 756=36
-	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
-	veor	@x[3], @x[3], @t[2]		@ 36 2
-	vext.8	@t[5], @t[5], @t[5], #12
-	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
-___
-					my @y = @x[7,5,0,2,1,3,4,6];
-$code.=<<___;
-	@ multiplication by 0x0b
-	veor	@y[1], @y[1], @y[0]
-	veor	@y[0], @y[0], @t[0]
-	vext.8	@t[2], @t[2], @t[2], #12
-	veor	@y[1], @y[1], @t[1]
-	veor	@y[0], @y[0], @t[5]
-	vext.8	@t[4], @t[4], @t[4], #12
-	veor	@y[1], @y[1], @t[6]
-	veor	@y[0], @y[0], @t[7]
-	veor	@t[7], @t[7], @t[6]		@ clobber t[7]
-
-	veor	@y[3], @y[3], @t[0]
-	 veor	@y[1], @y[1], @y[0]
-	vext.8	@t[0], @t[0], @t[0], #12
-	veor	@y[2], @y[2], @t[1]
-	veor	@y[4], @y[4], @t[1]
-	vext.8	@t[1], @t[1], @t[1], #12
-	veor	@y[2], @y[2], @t[2]
-	veor	@y[3], @y[3], @t[2]
-	veor	@y[5], @y[5], @t[2]
-	veor	@y[2], @y[2], @t[7]
-	vext.8	@t[2], @t[2], @t[2], #12
-	veor	@y[3], @y[3], @t[3]
-	veor	@y[6], @y[6], @t[3]
-	veor	@y[4], @y[4], @t[3]
-	veor	@y[7], @y[7], @t[4]
-	vext.8	@t[3], @t[3], @t[3], #12
-	veor	@y[5], @y[5], @t[4]
-	veor	@y[7], @y[7], @t[7]
-	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
-	veor	@y[3], @y[3], @t[5]
-	veor	@y[4], @y[4], @t[4]
-
-	veor	@y[5], @y[5], @t[7]
-	vext.8	@t[4], @t[4], @t[4], #12
-	veor	@y[6], @y[6], @t[7]
-	veor	@y[4], @y[4], @t[7]
-
-	veor	@t[7], @t[7], @t[5]
-	vext.8	@t[5], @t[5], @t[5], #12
-
-	@ multiplication by 0x0d
-	veor	@y[4], @y[4], @y[7]
-	 veor	@t[7], @t[7], @t[6]		@ restore t[7]
-	veor	@y[7], @y[7], @t[4]
-	vext.8	@t[6], @t[6], @t[6], #12
-	veor	@y[2], @y[2], @t[0]
-	veor	@y[7], @y[7], @t[5]
-	vext.8	@t[7], @t[7], @t[7], #12
-	veor	@y[2], @y[2], @t[2]
-
-	veor	@y[3], @y[3], @y[1]
-	veor	@y[1], @y[1], @t[1]
-	veor	@y[0], @y[0], @t[0]
-	veor	@y[3], @y[3], @t[0]
-	veor	@y[1], @y[1], @t[5]
-	veor	@y[0], @y[0], @t[5]
-	vext.8	@t[0], @t[0], @t[0], #12
-	veor	@y[1], @y[1], @t[7]
-	veor	@y[0], @y[0], @t[6]
-	veor	@y[3], @y[3], @y[1]
-	veor	@y[4], @y[4], @t[1]
-	vext.8	@t[1], @t[1], @t[1], #12
-
-	veor	@y[7], @y[7], @t[7]
-	veor	@y[4], @y[4], @t[2]
-	veor	@y[5], @y[5], @t[2]
-	veor	@y[2], @y[2], @t[6]
-	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
-	vext.8	@t[2], @t[2], @t[2], #12
-	veor	@y[4], @y[4], @y[7]
-	veor	@y[3], @y[3], @t[6]
-
-	veor	@y[6], @y[6], @t[6]
-	veor	@y[5], @y[5], @t[5]
-	vext.8	@t[5], @t[5], @t[5], #12
-	veor	@y[6], @y[6], @t[4]
-	vext.8	@t[4], @t[4], @t[4], #12
-	veor	@y[5], @y[5], @t[6]
-	veor	@y[6], @y[6], @t[7]
-	vext.8	@t[7], @t[7], @t[7], #12
-	veor	@t[6], @t[6], @t[3]		@ restore t[6]
-	vext.8	@t[3], @t[3], @t[3], #12
-
-	@ multiplication by 0x09
-	veor	@y[4], @y[4], @y[1]
-	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
-	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
-	vext.8	@t[6], @t[6], @t[6], #12
-	veor	@t[1], @t[1], @t[5]
-	veor	@y[3], @y[3], @t[0]
-	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
-	veor	@t[1], @t[1], @t[6]
-	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
-	veor	@y[4], @y[4], @t[1]
-	veor	@y[7], @y[7], @t[4]
-	veor	@y[6], @y[6], @t[3]
-	veor	@y[5], @y[5], @t[2]
-	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
-	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
-	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
-	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
-	veor	@t[3], @t[3], @t[7]
-	veor	@XMM[5], @t[5], @t[6]
-	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
-	veor	@XMM[2], @t[2], @t[6]
-	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]
-
-	vmov	@XMM[0], @t[0]
-	vmov	@XMM[1], @t[1]
-	@ vmov	@XMM[2], @t[2]
-	vmov	@XMM[3], @t[3]
-	vmov	@XMM[4], @t[4]
-	@ vmov	@XMM[5], @t[5]
-	@ vmov	@XMM[6], @t[6]
-	@ vmov	@XMM[7], @t[7]
-___
-}
-
-sub InvMixColumns {
-my @x=@_[0..7];
-my @t=@_[8..15];
-
-# Thanks to Jussi Kivilinna for providing pointer to
-#
-# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
-# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
-# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
-# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
-
-$code.=<<___;
-	@ multiplication by 0x05-0x00-0x04-0x00
-	vext.8	@t[0], @x[0], @x[0], #8
-	vext.8	@t[6], @x[6], @x[6], #8
-	vext.8	@t[7], @x[7], @x[7], #8
-	veor	@t[0], @t[0], @x[0]
-	vext.8	@t[1], @x[1], @x[1], #8
-	veor	@t[6], @t[6], @x[6]
-	vext.8	@t[2], @x[2], @x[2], #8
-	veor	@t[7], @t[7], @x[7]
-	vext.8	@t[3], @x[3], @x[3], #8
-	veor	@t[1], @t[1], @x[1]
-	vext.8	@t[4], @x[4], @x[4], #8
-	veor	@t[2], @t[2], @x[2]
-	vext.8	@t[5], @x[5], @x[5], #8
-	veor	@t[3], @t[3], @x[3]
-	veor	@t[4], @t[4], @x[4]
-	veor	@t[5], @t[5], @x[5]
-
-	 veor	@x[0], @x[0], @t[6]
-	 veor	@x[1], @x[1], @t[6]
-	 veor	@x[2], @x[2], @t[0]
-	 veor	@x[4], @x[4], @t[2]
-	 veor	@x[3], @x[3], @t[1]
-	 veor	@x[1], @x[1], @t[7]
-	 veor	@x[2], @x[2], @t[7]
-	 veor	@x[4], @x[4], @t[6]
-	 veor	@x[5], @x[5], @t[3]
-	 veor	@x[3], @x[3], @t[6]
-	 veor	@x[6], @x[6], @t[4]
-	 veor	@x[4], @x[4], @t[7]
-	 veor	@x[5], @x[5], @t[7]
-	 veor	@x[7], @x[7], @t[5]
-___
-	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
-}
-
-sub swapmove {
-my ($a,$b,$n,$mask,$t)=@_;
-$code.=<<___;
-	vshr.u64	$t, $b, #$n
-	veor		$t, $t, $a
-	vand		$t, $t, $mask
-	veor		$a, $a, $t
-	vshl.u64	$t, $t, #$n
-	veor		$b, $b, $t
-___
-}
-sub swapmove2x {
-my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
-$code.=<<___;
-	vshr.u64	$t0, $b0, #$n
-	 vshr.u64	$t1, $b1, #$n
-	veor		$t0, $t0, $a0
-	 veor		$t1, $t1, $a1
-	vand		$t0, $t0, $mask
-	 vand		$t1, $t1, $mask
-	veor		$a0, $a0, $t0
-	vshl.u64	$t0, $t0, #$n
-	 veor		$a1, $a1, $t1
-	 vshl.u64	$t1, $t1, #$n
-	veor		$b0, $b0, $t0
-	 veor		$b1, $b1, $t1
-___
-}
-
-sub bitslice {
-my @x=reverse(@_[0..7]);
-my ($t0,$t1,$t2,$t3)=@_[8..11];
-$code.=<<___;
-	vmov.i8	$t0,#0x55			@ compose .LBS0
-	vmov.i8	$t1,#0x33			@ compose .LBS1
-___
-	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
-	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-$code.=<<___;
-	vmov.i8	$t0,#0x0f			@ compose .LBS2
-___
-	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
-	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-
-	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
-	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-
-# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
-# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
-# define VFP_ABI_FRAME	0x40
-#else
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-# define VFP_ABI_FRAME	0
-# define BSAES_ASM_EXTENDED_KEY
-# define XTS_CHAIN_TWEAK
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-#ifdef __thumb__
-# define adrl adr
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-.arch	armv7-a
-.fpu	neon
-
-.text
-.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
-#ifdef __thumb2__
-.thumb
-#else
-.code   32
-#endif
-
-.type	_bsaes_decrypt8,%function
-.align	4
-_bsaes_decrypt8:
-	adr	$const,_bsaes_decrypt8
-	vldmia	$key!, {@XMM[9]}		@ round 0 key
-	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
-
-	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
-	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
-	veor	@XMM[11], @XMM[1], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	veor	@XMM[12], @XMM[2], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-	veor	@XMM[13], @XMM[3], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-	veor	@XMM[14], @XMM[4], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-	veor	@XMM[15], @XMM[5], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-	veor	@XMM[10], @XMM[6], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-	veor	@XMM[11], @XMM[7], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-___
-	&bitslice	(@XMM[0..7, 8..11]);
-$code.=<<___;
-	sub	$rounds,$rounds,#1
-	b	.Ldec_sbox
-.align	4
-.Ldec_loop:
-___
-	&ShiftRows	(@XMM[0..7, 8..12]);
-$code.=".Ldec_sbox:\n";
-	&InvSbox	(@XMM[0..7, 8..15]);
-$code.=<<___;
-	subs	$rounds,$rounds,#1
-	bcc	.Ldec_done
-___
-	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
-$code.=<<___;
-	vldmia	$const, {@XMM[12]}		@ .LISR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	$const,$const,#0x10
-	bne	.Ldec_loop
-	vldmia	$const, {@XMM[12]}		@ .LISRM0
-	b	.Ldec_loop
-.align	4
-.Ldec_done:
-___
-	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
-$code.=<<___;
-	vldmia	$key, {@XMM[8]}			@ last round key
-	veor	@XMM[6], @XMM[6], @XMM[8]
-	veor	@XMM[4], @XMM[4], @XMM[8]
-	veor	@XMM[2], @XMM[2], @XMM[8]
-	veor	@XMM[7], @XMM[7], @XMM[8]
-	veor	@XMM[3], @XMM[3], @XMM[8]
-	veor	@XMM[5], @XMM[5], @XMM[8]
-	veor	@XMM[0], @XMM[0], @XMM[8]
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	bx	lr
-.size	_bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type	_bsaes_const,%object
-.align	6
-_bsaes_const:
-.LM0ISR:	@ InvShiftRows constants
-	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
-.LM0SR:		@ ShiftRows constants
-	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0:
-	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
-.LREVM0SR:
-	.quad	0x090d01050c000408, 0x03070b0f060a0e02
-.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align	6
-.size	_bsaes_const,.-_bsaes_const
-
-.type	_bsaes_encrypt8,%function
-.align	4
-_bsaes_encrypt8:
-	adr	$const,_bsaes_encrypt8
-	vldmia	$key!, {@XMM[9]}		@ round 0 key
-	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
-
-	vldmia	$const!, {@XMM[8]}		@ .LM0SR
-_bsaes_encrypt8_alt:
-	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
-	veor	@XMM[11], @XMM[1], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	veor	@XMM[12], @XMM[2], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-	veor	@XMM[13], @XMM[3], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-	veor	@XMM[14], @XMM[4], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-	veor	@XMM[15], @XMM[5], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-	veor	@XMM[10], @XMM[6], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-	veor	@XMM[11], @XMM[7], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-_bsaes_encrypt8_bitslice:
-___
-	&bitslice	(@XMM[0..7, 8..11]);
-$code.=<<___;
-	sub	$rounds,$rounds,#1
-	b	.Lenc_sbox
-.align	4
-.Lenc_loop:
-___
-	&ShiftRows	(@XMM[0..7, 8..12]);
-$code.=".Lenc_sbox:\n";
-	&Sbox		(@XMM[0..7, 8..15]);
-$code.=<<___;
-	subs	$rounds,$rounds,#1
-	bcc	.Lenc_done
-___
-	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
-$code.=<<___;
-	vldmia	$const, {@XMM[12]}		@ .LSR
-	ite	eq				@ Thumb2 thing, samity check in ARM
-	addeq	$const,$const,#0x10
-	bne	.Lenc_loop
-	vldmia	$const, {@XMM[12]}		@ .LSRM0
-	b	.Lenc_loop
-.align	4
-.Lenc_done:
-___
-	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
-	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
-$code.=<<___;
-	vldmia	$key, {@XMM[8]}			@ last round key
-	veor	@XMM[4], @XMM[4], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[8]
-	veor	@XMM[3], @XMM[3], @XMM[8]
-	veor	@XMM[7], @XMM[7], @XMM[8]
-	veor	@XMM[2], @XMM[2], @XMM[8]
-	veor	@XMM[5], @XMM[5], @XMM[8]
-	veor	@XMM[0], @XMM[0], @XMM[8]
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	bx	lr
-.size	_bsaes_encrypt8,.-_bsaes_encrypt8
-___
-}
-{
-my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
-
-sub bitslice_key {
-my @x=reverse(@_[0..7]);
-my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
-
-	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
-$code.=<<___;
-	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
-	vmov	@x[2], @x[0]
-	vmov	@x[3], @x[1]
-___
-	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-
-	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
-$code.=<<___;
-	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-	vmov	@x[4], @x[0]
-	vmov	@x[6], @x[2]
-	vmov	@x[5], @x[1]
-	vmov	@x[7], @x[3]
-___
-	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
-	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
-}
-
-$code.=<<___;
-.type	_bsaes_key_convert,%function
-.align	4
-_bsaes_key_convert:
-	adr	$const,_bsaes_key_convert
-	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
-	sub	$const,$const,#_bsaes_key_convert-.LM0
-	vld1.8	{@XMM[15]}, [$inp]!		@ load round 1 key
-
-	vmov.i8	@XMM[8],  #0x01			@ bit masks
-	vmov.i8	@XMM[9],  #0x02
-	vmov.i8	@XMM[10], #0x04
-	vmov.i8	@XMM[11], #0x08
-	vmov.i8	@XMM[12], #0x10
-	vmov.i8	@XMM[13], #0x20
-	vldmia	$const, {@XMM[14]}		@ .LM0
-
-#ifdef __ARMEL__
-	vrev32.8	@XMM[7],  @XMM[7]
-	vrev32.8	@XMM[15], @XMM[15]
-#endif
-	sub	$rounds,$rounds,#1
-	vstmia	$out!, {@XMM[7]}		@ save round 0 key
-	b	.Lkey_loop
-
-.align	4
-.Lkey_loop:
-	vtbl.8	`&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
-	vtbl.8	`&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
-	vmov.i8	@XMM[6],  #0x40
-	vmov.i8	@XMM[15], #0x80
-
-	vtst.8	@XMM[0], @XMM[7], @XMM[8]
-	vtst.8	@XMM[1], @XMM[7], @XMM[9]
-	vtst.8	@XMM[2], @XMM[7], @XMM[10]
-	vtst.8	@XMM[3], @XMM[7], @XMM[11]
-	vtst.8	@XMM[4], @XMM[7], @XMM[12]
-	vtst.8	@XMM[5], @XMM[7], @XMM[13]
-	vtst.8	@XMM[6], @XMM[7], @XMM[6]
-	vtst.8	@XMM[7], @XMM[7], @XMM[15]
-	vld1.8	{@XMM[15]}, [$inp]!		@ load next round key
-	vmvn	@XMM[0], @XMM[0]		@ "pnot"
-	vmvn	@XMM[1], @XMM[1]
-	vmvn	@XMM[5], @XMM[5]
-	vmvn	@XMM[6], @XMM[6]
-#ifdef __ARMEL__
-	vrev32.8	@XMM[15], @XMM[15]
-#endif
-	subs	$rounds,$rounds,#1
-	vstmia	$out!,{@XMM[0]-@XMM[7]}		@ write bit-sliced round key
-	bne	.Lkey_loop
-
-	vmov.i8	@XMM[7],#0x63			@ compose .L63
-	@ don't save last round key
-	bx	lr
-.size	_bsaes_key_convert,.-_bsaes_key_convert
-___
-}
-
-if (0) {		# following four functions are unsupported interface
-			# used for benchmarking...
-$code.=<<___;
-.globl	bsaes_enc_key_convert
-.type	bsaes_enc_key_convert,%function
-.align	4
-bsaes_enc_key_convert:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-
-	ldr	r5,[$inp,#240]			@ pass rounds
-	mov	r4,$inp				@ pass key
-	mov	r12,$out			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
-
-.globl	bsaes_encrypt_128
-.type	bsaes_encrypt_128,%function
-.align	4
-bsaes_encrypt_128:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-.Lenc128_loop:
-	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
-	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
-	mov	r4,$key				@ pass the key
-	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
-	mov	r5,#10				@ pass rounds
-	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
-
-	bl	_bsaes_encrypt8
-
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	subs	$len,$len,#0x80
-	vst1.8	{@XMM[5]}, [$out]!
-	bhi	.Lenc128_loop
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_encrypt_128,.-bsaes_encrypt_128
-
-.globl	bsaes_dec_key_convert
-.type	bsaes_dec_key_convert,%function
-.align	4
-bsaes_dec_key_convert:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-
-	ldr	r5,[$inp,#240]			@ pass rounds
-	mov	r4,$inp				@ pass key
-	mov	r12,$out			@ pass key schedule
-	bl	_bsaes_key_convert
-	vldmia	$out, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	$out, {@XMM[7]}
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
-
-.globl	bsaes_decrypt_128
-.type	bsaes_decrypt_128,%function
-.align	4
-bsaes_decrypt_128:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-.Ldec128_loop:
-	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
-	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
-	mov	r4,$key				@ pass the key
-	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
-	mov	r5,#10				@ pass rounds
-	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
-
-	bl	_bsaes_decrypt8
-
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	subs	$len,$len,#0x80
-	vst1.8	{@XMM[5]}, [$out]!
-	bhi	.Ldec128_loop
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_decrypt_128,.-bsaes_decrypt_128
-___
-}
-{
-my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
-my ($keysched)=("sp");
-
-$code.=<<___;
-.extern AES_cbc_encrypt
-.extern AES_decrypt
-
-.global	bsaes_cbc_encrypt
-.type	bsaes_cbc_encrypt,%function
-.align	5
-bsaes_cbc_encrypt:
-#ifndef	__KERNEL__
-	cmp	$len, #128
-#ifndef	__thumb__
-	blo	AES_cbc_encrypt
-#else
-	bhs	1f
-	b	AES_cbc_encrypt
-1:
-#endif
-#endif
-
-	@ it is up to the caller to make sure we are called with enc == 0
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	$ivp, [ip]			@ IV is 1st arg on the stack
-	mov	$len, $len, lsr#4		@ len in 16 byte blocks
-	sub	sp, #0x10			@ scratch space to carry over the IV
-	mov	$fp, sp				@ save sp
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	add	r12, #`128-32`			@ sifze of bit-slices key schedule
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12				@ sp is $keysched
-	bl	_bsaes_key_convert
-	vldmia	$keysched, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	$keysched, {@XMM[7]}
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, $key, #248
-	vldmia	r4, {@XMM[6]}
-	vstmia	r12, {@XMM[15]}			@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	r4, {@XMM[7]}
-
-.align	2
-0:
-#endif
-
-	vld1.8	{@XMM[15]}, [$ivp]		@ load IV
-	b	.Lcbc_dec_loop
-
-.align	4
-.Lcbc_dec_loop:
-	subs	$len, $len, #0x8
-	bmi	.Lcbc_dec_loop_finish
-
-	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
-	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, $keysched			@ pass the key
-#else
-	add	r4, $key, #248
-#endif
-	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
-	mov	r5, $rounds
-	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]
-	sub	$inp, $inp, #0x60
-	vstmia	$fp, {@XMM[15]}			@ put aside IV
-
-	bl	_bsaes_decrypt8
-
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vld1.8	{@XMM[14]-@XMM[15]}, [$inp]!
-	veor	@XMM[7], @XMM[7], @XMM[12]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor	@XMM[3], @XMM[3], @XMM[13]
-	vst1.8	{@XMM[6]}, [$out]!
-	veor	@XMM[5], @XMM[5], @XMM[14]
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	vst1.8	{@XMM[5]}, [$out]!
-
-	b	.Lcbc_dec_loop
-
-.Lcbc_dec_loop_finish:
-	adds	$len, $len, #8
-	beq	.Lcbc_dec_done
-
-	vld1.8	{@XMM[0]}, [$inp]!		@ load input
-	cmp	$len, #2
-	blo	.Lcbc_dec_one
-	vld1.8	{@XMM[1]}, [$inp]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, $keysched			@ pass the key
-#else
-	add	r4, $key, #248
-#endif
-	mov	r5, $rounds
-	vstmia	$fp, {@XMM[15]}			@ put aside IV
-	beq	.Lcbc_dec_two
-	vld1.8	{@XMM[2]}, [$inp]!
-	cmp	$len, #4
-	blo	.Lcbc_dec_three
-	vld1.8	{@XMM[3]}, [$inp]!
-	beq	.Lcbc_dec_four
-	vld1.8	{@XMM[4]}, [$inp]!
-	cmp	$len, #6
-	blo	.Lcbc_dec_five
-	vld1.8	{@XMM[5]}, [$inp]!
-	beq	.Lcbc_dec_six
-	vld1.8	{@XMM[6]}, [$inp]!
-	sub	$inp, $inp, #0x70
-
-	bl	_bsaes_decrypt8
-
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[7], @XMM[7], @XMM[12]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor	@XMM[3], @XMM[3], @XMM[13]
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_six:
-	sub	$inp, $inp, #0x60
-	bl	_bsaes_decrypt8
-	vldmia	$fp,{@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[12]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[7], @XMM[7], @XMM[12]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_five:
-	sub	$inp, $inp, #0x50
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_four:
-	sub	$inp, $inp, #0x40
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_three:
-	sub	$inp, $inp, #0x30
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_two:
-	sub	$inp, $inp, #0x20
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]}, [$inp]!		@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[15]}, [$inp]!		@ reload input
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_one:
-	sub	$inp, $inp, #0x10
-	mov	$rounds, $out			@ save original out pointer
-	mov	$out, $fp			@ use the iv scratch space as out buffer
-	mov	r2, $key
-	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
-	vmov	@XMM[5],@XMM[0]			@ and input are preserved
-	bl	AES_decrypt
-	vld1.8	{@XMM[0]}, [$fp,:64]		@ load result
-	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
-	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
-	vst1.8	{@XMM[0]}, [$rounds]		@ write output
-
-.Lcbc_dec_done:
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-.Lcbc_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		$keysched!, {q0-q1}
-	cmp		$keysched, $fp
-	bne		.Lcbc_dec_bzero
-#endif
-
-	mov	sp, $fp
-	add	sp, #0x10			@ add sp,$fp,#0x10 is no good for thumb
-	vst1.8	{@XMM[15]}, [$ivp]		@ return IV
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}
-.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-___
-}
-{
-my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
-my $const = "r6";	# shared with _bsaes_encrypt8_alt
-my $keysched = "sp";
-
-$code.=<<___;
-.extern	AES_encrypt
-.global	bsaes_ctr32_encrypt_blocks
-.type	bsaes_ctr32_encrypt_blocks,%function
-.align	5
-bsaes_ctr32_encrypt_blocks:
-	cmp	$len, #8			@ use plain AES for
-	blo	.Lctr_enc_short			@ small sizes
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	$ctr, [ip]			@ ctr is 1st arg on the stack
-	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
-	mov	$fp, sp				@ save sp
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	add	r12, #`128-32`			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12				@ sp is $keysched
-	bl	_bsaes_key_convert
-	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-
-	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
-	add	$ctr, $const, #.LREVM0SR-.LM0	@ borrow $ctr
-	vldmia	$keysched, {@XMM[4]}		@ load round0 key
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-
-.align	2
-0:	add	r12, $key, #248
-	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
-	adrl	$ctr, .LREVM0SR			@ borrow $ctr
-	vldmia	r12, {@XMM[4]}			@ load round0 key
-	sub	sp, #0x10			@ place for adjusted round0 key
-#endif
-
-	vmov.i32	@XMM[8],#1		@ compose 1<<96
-	veor		@XMM[9],@XMM[9],@XMM[9]
-	vrev32.8	@XMM[0],@XMM[0]
-	vext.8		@XMM[8],@XMM[9],@XMM[8],#4
-	vrev32.8	@XMM[4],@XMM[4]
-	vadd.u32	@XMM[9],@XMM[8],@XMM[8]	@ compose 2<<96
-	vstmia	$keysched, {@XMM[4]}		@ save adjusted round0 key
-	b	.Lctr_enc_loop
-
-.align	4
-.Lctr_enc_loop:
-	vadd.u32	@XMM[10], @XMM[8], @XMM[9]	@ compose 3<<96
-	vadd.u32	@XMM[1], @XMM[0], @XMM[8]	@ +1
-	vadd.u32	@XMM[2], @XMM[0], @XMM[9]	@ +2
-	vadd.u32	@XMM[3], @XMM[0], @XMM[10]	@ +3
-	vadd.u32	@XMM[4], @XMM[1], @XMM[10]
-	vadd.u32	@XMM[5], @XMM[2], @XMM[10]
-	vadd.u32	@XMM[6], @XMM[3], @XMM[10]
-	vadd.u32	@XMM[7], @XMM[4], @XMM[10]
-	vadd.u32	@XMM[10], @XMM[5], @XMM[10]	@ next counter
-
-	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-	@ to flip byte order in 32-bit counter
-
-	vldmia		$keysched, {@XMM[9]}		@ load round0 key
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, $keysched, #0x10		@ pass next round key
-#else
-	add		r4, $key, #`248+16`
-#endif
-	vldmia		$ctr, {@XMM[8]}			@ .LREVM0SR
-	mov		r5, $rounds			@ pass rounds
-	vstmia		$fp, {@XMM[10]}			@ save next counter
-	sub		$const, $ctr, #.LREVM0SR-.LSR	@ pass constants
-
-	bl		_bsaes_encrypt8_alt
-
-	subs		$len, $len, #8
-	blo		.Lctr_enc_loop_done
-
-	vld1.8		{@XMM[8]-@XMM[9]}, [$inp]!	@ load input
-	vld1.8		{@XMM[10]-@XMM[11]}, [$inp]!
-	veor		@XMM[0], @XMM[8]
-	veor		@XMM[1], @XMM[9]
-	vld1.8		{@XMM[12]-@XMM[13]}, [$inp]!
-	veor		@XMM[4], @XMM[10]
-	veor		@XMM[6], @XMM[11]
-	vld1.8		{@XMM[14]-@XMM[15]}, [$inp]!
-	veor		@XMM[3], @XMM[12]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor		@XMM[7], @XMM[13]
-	veor		@XMM[2], @XMM[14]
-	vst1.8		{@XMM[4]}, [$out]!
-	veor		@XMM[5], @XMM[15]
-	vst1.8		{@XMM[6]}, [$out]!
-	vmov.i32	@XMM[8], #1			@ compose 1<<96
-	vst1.8		{@XMM[3]}, [$out]!
-	veor		@XMM[9], @XMM[9], @XMM[9]
-	vst1.8		{@XMM[7]}, [$out]!
-	vext.8		@XMM[8], @XMM[9], @XMM[8], #4
-	vst1.8		{@XMM[2]}, [$out]!
-	vadd.u32	@XMM[9],@XMM[8],@XMM[8]		@ compose 2<<96
-	vst1.8		{@XMM[5]}, [$out]!
-	vldmia		$fp, {@XMM[0]}			@ load counter
-
-	bne		.Lctr_enc_loop
-	b		.Lctr_enc_done
-
-.align	4
-.Lctr_enc_loop_done:
-	add		$len, $len, #8
-	vld1.8		{@XMM[8]}, [$inp]!	@ load input
-	veor		@XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [$out]!	@ write output
-	cmp		$len, #2
-	blo		.Lctr_enc_done
-	vld1.8		{@XMM[9]}, [$inp]!
-	veor		@XMM[1], @XMM[9]
-	vst1.8		{@XMM[1]}, [$out]!
-	beq		.Lctr_enc_done
-	vld1.8		{@XMM[10]}, [$inp]!
-	veor		@XMM[4], @XMM[10]
-	vst1.8		{@XMM[4]}, [$out]!
-	cmp		$len, #4
-	blo		.Lctr_enc_done
-	vld1.8		{@XMM[11]}, [$inp]!
-	veor		@XMM[6], @XMM[11]
-	vst1.8		{@XMM[6]}, [$out]!
-	beq		.Lctr_enc_done
-	vld1.8		{@XMM[12]}, [$inp]!
-	veor		@XMM[3], @XMM[12]
-	vst1.8		{@XMM[3]}, [$out]!
-	cmp		$len, #6
-	blo		.Lctr_enc_done
-	vld1.8		{@XMM[13]}, [$inp]!
-	veor		@XMM[7], @XMM[13]
-	vst1.8		{@XMM[7]}, [$out]!
-	beq		.Lctr_enc_done
-	vld1.8		{@XMM[14]}, [$inp]
-	veor		@XMM[2], @XMM[14]
-	vst1.8		{@XMM[2]}, [$out]!
-
-.Lctr_enc_done:
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifndef	BSAES_ASM_EXTENDED_KEY
-.Lctr_enc_bzero:			@ wipe key schedule [if any]
-	vstmia		$keysched!, {q0-q1}
-	cmp		$keysched, $fp
-	bne		.Lctr_enc_bzero
-#else
-	vstmia		$keysched, {q0-q1}
-#endif
-
-	mov	sp, $fp
-	add	sp, #0x10		@ add sp,$fp,#0x10 is no good for thumb
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}	@ return
-
-.align	4
-.Lctr_enc_short:
-	ldr	ip, [sp]		@ ctr pointer is passed on stack
-	stmdb	sp!, {r4-r8, lr}
-
-	mov	r4, $inp		@ copy arguments
-	mov	r5, $out
-	mov	r6, $len
-	mov	r7, $key
-	ldr	r8, [ip, #12]		@ load counter LSW
-	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
-#ifdef __ARMEL__
-	rev	r8, r8
-#endif
-	sub	sp, sp, #0x10
-	vst1.8	{@XMM[1]}, [sp,:64]	@ copy counter value
-	sub	sp, sp, #0x10
-
-.Lctr_enc_short_loop:
-	add	r0, sp, #0x10		@ input counter value
-	mov	r1, sp			@ output on the stack
-	mov	r2, r7			@ key
-
-	bl	AES_encrypt
-
-	vld1.8	{@XMM[0]}, [r4]!	@ load input
-	vld1.8	{@XMM[1]}, [sp,:64]	@ load encrypted counter
-	add	r8, r8, #1
-#ifdef __ARMEL__
-	rev	r0, r8
-	str	r0, [sp, #0x1c]		@ next counter value
-#else
-	str	r8, [sp, #0x1c]		@ next counter value
-#endif
-	veor	@XMM[0],@XMM[0],@XMM[1]
-	vst1.8	{@XMM[0]}, [r5]!	@ store output
-	subs	r6, r6, #1
-	bne	.Lctr_enc_short_loop
-
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-	vstmia		sp!, {q0-q1}
-
-	ldmia	sp!, {r4-r8, pc}
-.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-___
-}
-{
-######################################################################
-# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
-#	const AES_KEY *key1, const AES_KEY *key2,
-#	const unsigned char iv[16]);
-#
-my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
-my $const="r6";		# returned by _bsaes_key_convert
-my $twmask=@XMM[5];
-my @T=@XMM[6..7];
-
-$code.=<<___;
-.globl	bsaes_xts_encrypt
-.type	bsaes_xts_encrypt,%function
-.align	4
-bsaes_xts_encrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future $fp
-
-	mov	$inp, r0
-	mov	$out, r1
-	mov	$len, r2
-	mov	$key, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0,sp				@ pointer to initial tweak
-#endif
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-	mov	$fp, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
-	sub	r12, #`32+16`			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-
-	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
-	adr	$magic, .Lxts_magic
-
-	subs	$len, #0x80
-	blo	.Lxts_enc_short
-	b	.Lxts_enc_loop
-
-.align	4
-.Lxts_enc_loop:
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
-	vst1.64		{@XMM[15]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	veor		@XMM[8], @XMM[8], @T[0]
-	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	veor		@XMM[7], @XMM[7], @XMM[15]
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[2], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	veor		@XMM[13], @XMM[5], @XMM[15]
-	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	subs		$len, #0x80
-	bpl		.Lxts_enc_loop
-
-.Lxts_enc_short:
-	adds		$len, #0x70
-	bmi		.Lxts_enc_done
-
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-	subs		$len, #0x10
-	bmi		.Lxts_enc_`$i-9`
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	sub		$len, #0x10
-	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	vld1.64		{@XMM[14]}, [r0,:128]!
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[2], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	vst1.8		{@XMM[12]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_6:
-	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[4], @XMM[4], @XMM[12]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[5], @XMM[5], @XMM[13]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-
-@ put this in range for both ARM and Thumb mode adr instructions
-.align	5
-.Lxts_magic:
-	.quad	1, 0x87
-
-.align	5
-.Lxts_enc_5:
-	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[3], @XMM[3], @XMM[11]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[4], @XMM[4], @XMM[12]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	vst1.8		{@XMM[10]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_4:
-	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[2], @XMM[2], @XMM[10]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[3], @XMM[3], @XMM[11]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_3:
-	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[1], @XMM[1], @XMM[9]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[2], @XMM[2], @XMM[10]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	vld1.64		{@XMM[10]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	vst1.8		{@XMM[8]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_2:
-	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[0], @XMM[0], @XMM[8]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[1], @XMM[1], @XMM[9]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_1:
-	mov		r0, sp
-	veor		@XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp				@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [$out]!
-	mov		$fp, r4
-
-	vmov		@XMM[8], @XMM[9]		@ next round tweak
-
-.Lxts_enc_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		$len, #0x10
-	beq		.Lxts_enc_ret
-	sub		r6, $out, #0x10
-
-.Lxts_enc_steal:
-	ldrb		r0, [$inp], #1
-	ldrb		r1, [$out, #-0x10]
-	strb		r0, [$out, #-0x10]
-	strb		r1, [$out], #1
-
-	subs		$len, #1
-	bhi		.Lxts_enc_steal
-
-	vld1.8		{@XMM[0]}, [r6]
-	mov		r0, sp
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp			@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [r6]
-	mov		$fp, r4
-#endif
-
-.Lxts_enc_ret:
-	bic		r0, $fp, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_enc_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_enc_bzero
-
-	mov		sp, $fp
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{@XMM[8]}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl	bsaes_xts_decrypt
-.type	bsaes_xts_decrypt,%function
-.align	4
-bsaes_xts_decrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future $fp
-
-	mov	$inp, r0
-	mov	$out, r1
-	mov	$len, r2
-	mov	$key, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0, sp				@ pointer to initial tweak
-#endif
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-	mov	$fp, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
-	sub	r12, #`32+16`			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, sp, #0x90
-	vldmia	r4, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	r4, {@XMM[7]}
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, $key, #248
-	vldmia	r4, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	r4, {@XMM[7]}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
-	adr	$magic, .Lxts_magic
-
-#ifndef	XTS_CHAIN_TWEAK
-	tst	$len, #0xf			@ if not multiple of 16
-	it	ne				@ Thumb2 thing, sanity check in ARM
-	subne	$len, #0x10			@ subtract another 16 bytes
-#endif
-	subs	$len, #0x80
-
-	blo	.Lxts_dec_short
-	b	.Lxts_dec_loop
-
-.align	4
-.Lxts_dec_loop:
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
-	vst1.64		{@XMM[15]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	veor		@XMM[8], @XMM[8], @T[0]
-	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	veor		@XMM[7], @XMM[7], @XMM[15]
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[3], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	veor		@XMM[13], @XMM[5], @XMM[15]
-	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	subs		$len, #0x80
-	bpl		.Lxts_dec_loop
-
-.Lxts_dec_short:
-	adds		$len, #0x70
-	bmi		.Lxts_dec_done
-
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-	subs		$len, #0x10
-	bmi		.Lxts_dec_`$i-9`
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	sub		$len, #0x10
-	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	vld1.64		{@XMM[14]}, [r0,:128]!
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[3], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	vst1.8		{@XMM[12]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_6:
-	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[4], @XMM[4], @XMM[12]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[5], @XMM[5], @XMM[13]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_5:
-	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[3], @XMM[3], @XMM[11]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[4], @XMM[4], @XMM[12]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	vst1.8		{@XMM[10]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_4:
-	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[2], @XMM[2], @XMM[10]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[3], @XMM[3], @XMM[11]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_3:
-	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[1], @XMM[1], @XMM[9]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[2], @XMM[2], @XMM[10]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	vld1.64		{@XMM[10]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	vst1.8		{@XMM[8]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_2:
-	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[0], @XMM[0], @XMM[8]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[1], @XMM[1], @XMM[9]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_1:
-	mov		r0, sp
-	veor		@XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp				@ preserve fp
-	mov		r5, $magic			@ preserve magic
-
-	bl		AES_decrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [$out]!
-	mov		$fp, r4
-	mov		$magic, r5
-
-	vmov		@XMM[8], @XMM[9]		@ next round tweak
-
-.Lxts_dec_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		$len, #0x10
-	beq		.Lxts_dec_ret
-
-	@ calculate one round of extra tweak for the stolen ciphertext
-	vldmia		$magic, {$twmask}
-	vshr.s64	@XMM[6], @XMM[8], #63
-	vand		@XMM[6], @XMM[6], $twmask
-	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
-	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
-	veor		@XMM[9], @XMM[9], @XMM[6]
-
-	@ perform the final decryption with the last tweak value
-	vld1.8		{@XMM[0]}, [$inp]!
-	mov		r0, sp
-	veor		@XMM[0], @XMM[0], @XMM[9]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp			@ preserve fp
-
-	bl		AES_decrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[9]
-	vst1.8		{@XMM[0]}, [$out]
-
-	mov		r6, $out
-.Lxts_dec_steal:
-	ldrb		r1, [$out]
-	ldrb		r0, [$inp], #1
-	strb		r1, [$out, #0x10]
-	strb		r0, [$out], #1
-
-	subs		$len, #1
-	bhi		.Lxts_dec_steal
-
-	vld1.8		{@XMM[0]}, [r6]
-	mov		r0, sp
-	veor		@XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-
-	bl		AES_decrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [r6]
-	mov		$fp, r4
-#endif
-
-.Lxts_dec_ret:
-	bic		r0, $fp, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_dec_bzero
-
-	mov		sp, $fp
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{@XMM[8]}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
-___
-}
-$code.=<<___;
-#endif
-___
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-open SELF,$0;
-while(<SELF>) {
-	next if (/^#!/);
-        last if (!s/^#/@/ and !/^$/);
-        print;
-}
-close SELF;
-
-print $code;
-
-close STDOUT;
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..3fecb2124c35
--- /dev/null
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -0,0 +1,523 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+	.align		5
+
+ENTRY(chacha20_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+
+	//
+	// This function encrypts one ChaCha20 block by loading the state matrix
+	// in four NEON registers. It performs matrix operation on four words in
+	// parallel, but requireds shuffling to rearrange the words after each
+	// round.
+	//
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	mov		r3, #10
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #16
+	vsri.u32	q3, q4, #16
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #8
+	vsri.u32	q3, q4, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vext.8		q1, q1, q1, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vext.8		q3, q3, q3, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #16
+	vsri.u32	q3, q4, #16
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #8
+	vsri.u32	q3, q4, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vext.8		q1, q1, q1, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vext.8		q3, q3, q3, #4
+
+	subs		r3, r3, #1
+	bne		.Ldoubleround
+
+	add		ip, r2, #0x20
+	vld1.8		{q4-q5}, [r2]
+	vld1.8		{q6-q7}, [ip]
+
+	// o0 = i0 ^ (x0 + s0)
+	vadd.i32	q0, q0, q8
+	veor		q0, q0, q4
+
+	// o1 = i1 ^ (x1 + s1)
+	vadd.i32	q1, q1, q9
+	veor		q1, q1, q5
+
+	// o2 = i2 ^ (x2 + s2)
+	vadd.i32	q2, q2, q10
+	veor		q2, q2, q6
+
+	// o3 = i3 ^ (x3 + s3)
+	vadd.i32	q3, q3, q11
+	veor		q3, q3, q7
+
+	add		ip, r1, #0x20
+	vst1.8		{q0-q1}, [r1]
+	vst1.8		{q2-q3}, [ip]
+
+	bx		lr
+ENDPROC(chacha20_block_xor_neon)
+
+	.align		5
+ENTRY(chacha20_4block_xor_neon)
+	push		{r4-r6, lr}
+	mov		ip, sp			// preserve the stack pointer
+	sub		r3, sp, #0x20		// allocate a 32 byte buffer
+	bic		r3, r3, #0x1f		// aligned to 32 bytes
+	mov		sp, r3
+
+	// r0: Input state matrix, s
+	// r1: 4 data blocks output, o
+	// r2: 4 data blocks input, i
+
+	//
+	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+
+	// x0..15[0-3] = s0..3[0..3]
+	add		r3, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [r3]
+
+	adr		r3, CTRINC
+	vdup.32		q15, d7[1]
+	vdup.32		q14, d7[0]
+	vld1.32		{q11}, [r3, :128]
+	vdup.32		q13, d6[1]
+	vdup.32		q12, d6[0]
+	vadd.i32	q12, q12, q11		// x12 += counter values 0-3
+	vdup.32		q11, d5[1]
+	vdup.32		q10, d5[0]
+	vdup.32		q9, d4[1]
+	vdup.32		q8, d4[0]
+	vdup.32		q7, d3[1]
+	vdup.32		q6, d3[0]
+	vdup.32		q5, d2[1]
+	vdup.32		q4, d2[0]
+	vdup.32		q3, d1[1]
+	vdup.32		q2, d1[0]
+	vdup.32		q1, d0[1]
+	vdup.32		q0, d0[0]
+
+	mov		r3, #10
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+	vrev32.16	q15, q15
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #12
+	vshl.u32	q5, q9, #12
+	vsri.u32	q4, q8, #20
+	vsri.u32	q5, q9, #20
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #12
+	vshl.u32	q7, q9, #12
+	vsri.u32	q6, q8, #20
+	vsri.u32	q7, q9, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q8, q12, q0
+	veor		q9, q13, q1
+	vshl.u32	q12, q8, #8
+	vshl.u32	q13, q9, #8
+	vsri.u32	q12, q8, #24
+	vsri.u32	q13, q9, #24
+
+	veor		q8, q14, q2
+	veor		q9, q15, q3
+	vshl.u32	q14, q8, #8
+	vshl.u32	q15, q9, #8
+	vsri.u32	q14, q8, #24
+	vsri.u32	q15, q9, #24
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #7
+	vshl.u32	q5, q9, #7
+	vsri.u32	q4, q8, #25
+	vsri.u32	q5, q9, #25
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #7
+	vshl.u32	q7, q9, #7
+	vsri.u32	q6, q8, #25
+	vsri.u32	q7, q9, #25
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vrev32.16	q15, q15
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #12
+	vshl.u32	q4, q9, #12
+	vsri.u32	q7, q8, #20
+	vsri.u32	q4, q9, #20
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #12
+	vshl.u32	q6, q9, #12
+	vsri.u32	q5, q8, #20
+	vsri.u32	q6, q9, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q8, q15, q0
+	veor		q9, q12, q1
+	vshl.u32	q15, q8, #8
+	vshl.u32	q12, q9, #8
+	vsri.u32	q15, q8, #24
+	vsri.u32	q12, q9, #24
+
+	veor		q8, q13, q2
+	veor		q9, q14, q3
+	vshl.u32	q13, q8, #8
+	vshl.u32	q14, q9, #8
+	vsri.u32	q13, q8, #24
+	vsri.u32	q14, q9, #24
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #7
+	vshl.u32	q4, q9, #7
+	vsri.u32	q7, q8, #25
+	vsri.u32	q4, q9, #25
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #7
+	vshl.u32	q6, q9, #7
+	vsri.u32	q5, q8, #25
+	vsri.u32	q6, q9, #25
+
+	subs		r3, r3, #1
+	beq		0f
+
+	vld1.32		{q8-q9}, [sp, :256]
+	b		.Ldoubleround4
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+0:	ldmia		r0!, {r3-r6}
+	vdup.32		q8, r3
+	vdup.32		q9, r4
+	vadd.i32	q0, q0, q8
+	vadd.i32	q1, q1, q9
+	vdup.32		q8, r5
+	vdup.32		q9, r6
+	vadd.i32	q2, q2, q8
+	vadd.i32	q3, q3, q9
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q8, r3
+	vdup.32		q9, r4
+	vadd.i32	q4, q4, q8
+	vadd.i32	q5, q5, q9
+	vdup.32		q8, r5
+	vdup.32		q9, r6
+	vadd.i32	q6, q6, q8
+	vadd.i32	q7, q7, q9
+
+	// interleave 32-bit words in state n, n+1
+	vzip.32		q0, q1
+	vzip.32		q2, q3
+	vzip.32		q4, q5
+	vzip.32		q6, q7
+
+	// interleave 64-bit words in state n, n+2
+	vswp		d1, d4
+	vswp		d3, d6
+	vswp		d9, d12
+	vswp		d11, d14
+
+	// xor with corresponding input, write to output
+	vld1.8		{q8-q9}, [r2]!
+	veor		q8, q8, q0
+	veor		q9, q9, q4
+	vst1.8		{q8-q9}, [r1]!
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q0, r3
+	vdup.32		q4, r4
+	vadd.i32	q8, q8, q0
+	vadd.i32	q9, q9, q4
+	vdup.32		q0, r5
+	vdup.32		q4, r6
+	vadd.i32	q10, q10, q0
+	vadd.i32	q11, q11, q4
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q0, r3
+	vdup.32		q4, r4
+	adr		r3, CTRINC
+	vadd.i32	q12, q12, q0
+	vld1.32		{q0}, [r3, :128]
+	vadd.i32	q13, q13, q4
+	vadd.i32	q12, q12, q0		// x12 += counter values 0-3
+
+	vdup.32		q0, r5
+	vdup.32		q4, r6
+	vadd.i32	q14, q14, q0
+	vadd.i32	q15, q15, q4
+
+	// interleave 32-bit words in state n, n+1
+	vzip.32		q8, q9
+	vzip.32		q10, q11
+	vzip.32		q12, q13
+	vzip.32		q14, q15
+
+	// interleave 64-bit words in state n, n+2
+	vswp		d17, d20
+	vswp		d19, d22
+	vswp		d25, d28
+	vswp		d27, d30
+
+	vmov		q4, q1
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q8
+	veor		q1, q1, q12
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q2
+	veor		q1, q1, q6
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q10
+	veor		q1, q1, q14
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q4
+	veor		q1, q1, q5
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q9
+	veor		q1, q1, q13
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q3
+	veor		q1, q1, q7
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]
+	veor		q0, q0, q11
+	veor		q1, q1, q15
+	vst1.8		{q0-q1}, [r1]
+
+	mov		sp, ip
+	pop		{r4-r6, pc}
+ENDPROC(chacha20_4block_xor_neon)
+
+	.align		4
+CTRINC:	.word		0, 1, 2, 3
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..59a7be08e80c
--- /dev/null
+++ b/arch/arm/crypto/chacha20-neon-glue.c
@@ -0,0 +1,127 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_neon(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha20_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	crypto_chacha20_init(state, ctx, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+				nbytes);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-neon",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= chacha20_neon,
+	.decrypt		= chacha20_neon,
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_skcipher(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 33b744d54739..6fc6f5a2a6e5 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -516,4 +516,3 @@ CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 # CONFIG_CRYPTO_AES_ARM64_NEON_BLK is not set
-CONFIG_CRYPTO_CRC32_ARM64=y
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 450a85df041a..d92293747d63 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -37,10 +37,14 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	select CRYPTO_HASH
 
 config CRYPTO_CRC32_ARM64_CE
-	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
-	depends on KERNEL_MODE_NEON && CRC32
+	tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
+	depends on CRC32
 	select CRYPTO_HASH
 
+config CRYPTO_AES_ARM64
+	tristate "AES core cipher using scalar instructions"
+	select CRYPTO_AES
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
@@ -67,9 +71,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	select CRYPTO_AES
 	select CRYPTO_SIMD
 
-config CRYPTO_CRC32_ARM64
-	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
-	depends on ARM64
-	select CRYPTO_HASH
+config CRYPTO_CHACHA20_NEON
+	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CHACHA20
+
+config CRYPTO_AES_ARM64_BS
+	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES_ARM64_NEON_BLK
+	select CRYPTO_SIMD
 
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index aa8888d7b744..b5edc5918c28 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -41,15 +41,20 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
+aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
+aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
+
 AFLAGS_aes-ce.o		:= -DINTERLEAVE=4
 AFLAGS_aes-neon.o	:= -DINTERLEAVE=4
 
 CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS
 
-obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o
-
-CFLAGS_crc32-arm64.o	:= -mcpu=generic+crc
-
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
 	$(call if_changed_rule,cc_o_c)
 
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index cc5515dac74a..6a7dbc7c83a6 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -258,7 +258,6 @@ static struct aead_alg ccm_aes_alg = {
 		.cra_priority		= 300,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.ivsize		= AES_BLOCK_SIZE,
diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
new file mode 100644
index 000000000000..f2f9cc519309
--- /dev/null
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -0,0 +1,110 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	rk		.req	x0
+	out		.req	x1
+	in		.req	x2
+	rounds		.req	x3
+	tt		.req	x4
+	lt		.req	x2
+
+	.macro		__pair, enc, reg0, reg1, in0, in1e, in1d, shift
+	ubfx		\reg0, \in0, #\shift, #8
+	.if		\enc
+	ubfx		\reg1, \in1e, #\shift, #8
+	.else
+	ubfx		\reg1, \in1d, #\shift, #8
+	.endif
+	ldr		\reg0, [tt, \reg0, uxtw #2]
+	ldr		\reg1, [tt, \reg1, uxtw #2]
+	.endm
+
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+	ldp		\out0, \out1, [rk], #8
+
+	__pair		\enc, w13, w14, \in0, \in1, \in3, 0
+	__pair		\enc, w15, w16, \in1, \in2, \in0, 8
+	__pair		\enc, w17, w18, \in2, \in3, \in1, 16
+	__pair		\enc, \t0, \t1, \in3, \in0, \in2, 24
+
+	eor		\out0, \out0, w13
+	eor		\out1, \out1, w14
+	eor		\out0, \out0, w15, ror #24
+	eor		\out1, \out1, w16, ror #24
+	eor		\out0, \out0, w17, ror #16
+	eor		\out1, \out1, w18, ror #16
+	eor		\out0, \out0, \t0, ror #8
+	eor		\out1, \out1, \t1, ror #8
+	.endm
+
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+	.endm
+
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+	.endm
+
+	.macro		do_crypt, round, ttab, ltab
+	ldp		w5, w6, [in]
+	ldp		w7, w8, [in, #8]
+	ldp		w9, w10, [rk], #16
+	ldp		w11, w12, [rk, #-8]
+
+CPU_BE(	rev		w5, w5		)
+CPU_BE(	rev		w6, w6		)
+CPU_BE(	rev		w7, w7		)
+CPU_BE(	rev		w8, w8		)
+
+	eor		w5, w5, w9
+	eor		w6, w6, w10
+	eor		w7, w7, w11
+	eor		w8, w8, w12
+
+	adr_l		tt, \ttab
+	adr_l		lt, \ltab
+
+	tbnz		rounds, #1, 1f
+
+0:	\round		w9, w10, w11, w12, w5, w6, w7, w8
+	\round		w5, w6, w7, w8, w9, w10, w11, w12
+
+1:	subs		rounds, rounds, #4
+	\round		w9, w10, w11, w12, w5, w6, w7, w8
+	csel		tt, tt, lt, hi
+	\round		w5, w6, w7, w8, w9, w10, w11, w12
+	b.hi		0b
+
+CPU_BE(	rev		w5, w5		)
+CPU_BE(	rev		w6, w6		)
+CPU_BE(	rev		w7, w7		)
+CPU_BE(	rev		w8, w8		)
+
+	stp		w5, w6, [out]
+	stp		w7, w8, [out, #8]
+	ret
+	.endm
+
+	.align		5
+ENTRY(__aes_arm64_encrypt)
+	do_crypt	fround, crypto_ft_tab, crypto_fl_tab
+ENDPROC(__aes_arm64_encrypt)
+
+	.align		5
+ENTRY(__aes_arm64_decrypt)
+	do_crypt	iround, crypto_it_tab, crypto_il_tab
+ENDPROC(__aes_arm64_decrypt)
diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c
new file mode 100644
index 000000000000..7288e7cbebff
--- /dev/null
+++ b/arch/arm64/crypto/aes-cipher-glue.c
@@ -0,0 +1,69 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+EXPORT_SYMBOL(__aes_arm64_encrypt);
+
+asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+EXPORT_SYMBOL(__aes_arm64_decrypt);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm64_decrypt(ctx->key_dec, out, in, rounds);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name			= "aes",
+	.cra_driver_name		= "aes-arm64",
+	.cra_priority			= 200,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= AES_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
+	.cra_module			= THIS_MODULE,
+
+	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
+	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
+	.cra_cipher.cia_setkey		= crypto_aes_set_key,
+	.cra_cipher.cia_encrypt		= aes_encrypt,
+	.cra_cipher.cia_decrypt		= aes_decrypt
+};
+
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Scalar AES cipher for arm64");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 4e3f8adb1793..bcf596b0197e 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,7 @@
 #include <asm/neon.h>
 #include <asm/hwcap.h>
 #include <crypto/aes.h>
+#include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/module.h>
@@ -31,6 +32,7 @@
 #define aes_ctr_encrypt		ce_aes_ctr_encrypt
 #define aes_xts_encrypt		ce_aes_xts_encrypt
 #define aes_xts_decrypt		ce_aes_xts_decrypt
+#define aes_mac_update		ce_aes_mac_update
 MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
 #else
 #define MODE			"neon"
@@ -44,11 +46,15 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
 #define aes_ctr_encrypt		neon_aes_ctr_encrypt
 #define aes_xts_encrypt		neon_aes_xts_encrypt
 #define aes_xts_decrypt		neon_aes_xts_decrypt
+#define aes_mac_update		neon_aes_mac_update
 MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
 MODULE_ALIAS_CRYPTO("ecb(aes)");
 MODULE_ALIAS_CRYPTO("cbc(aes)");
 MODULE_ALIAS_CRYPTO("ctr(aes)");
 MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_ALIAS_CRYPTO("cmac(aes)");
+MODULE_ALIAS_CRYPTO("xcbc(aes)");
+MODULE_ALIAS_CRYPTO("cbcmac(aes)");
 #endif
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
@@ -75,11 +81,25 @@ asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
 				int rounds, int blocks, u8 const rk2[], u8 iv[],
 				int first);
 
+asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+			       int blocks, u8 dg[], int enc_before,
+			       int enc_after);
+
 struct crypto_aes_xts_ctx {
 	struct crypto_aes_ctx key1;
 	struct crypto_aes_ctx __aligned(8) key2;
 };
 
+struct mac_tfm_ctx {
+	struct crypto_aes_ctx key;
+	u8 __aligned(8) consts[];
+};
+
+struct mac_desc_ctx {
+	unsigned int len;
+	u8 dg[AES_BLOCK_SIZE];
+};
+
 static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			       unsigned int key_len)
 {
@@ -215,14 +235,15 @@ static int ctr_encrypt(struct skcipher_request *req)
 		u8 *tsrc = walk.src.virt.addr;
 
 		/*
-		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
-		 * to tell aes_ctr_encrypt() to only read half a block.
+		 * Tell aes_ctr_encrypt() to process a tail block.
 		 */
-		blocks = (nbytes <= 8) ? -1 : 1;
+		blocks = -1;
 
-		aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
+		aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
 				blocks, walk.iv, first);
-		memcpy(tdst, tail, nbytes);
+		if (tdst != tsrc)
+			memcpy(tdst, tsrc, nbytes);
+		crypto_xor(tdst, tail, nbytes);
 		err = skcipher_walk_done(&walk, 0);
 	}
 	kernel_neon_end();
@@ -282,7 +303,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -298,7 +318,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -315,7 +334,22 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
+		.cra_module		= THIS_MODULE,
+	},
+	.min_keysize	= AES_MIN_KEY_SIZE,
+	.max_keysize	= AES_MAX_KEY_SIZE,
+	.ivsize		= AES_BLOCK_SIZE,
+	.chunksize	= AES_BLOCK_SIZE,
+	.setkey		= skcipher_aes_setkey,
+	.encrypt	= ctr_encrypt,
+	.decrypt	= ctr_encrypt,
+}, {
+	.base = {
+		.cra_name		= "ctr(aes)",
+		.cra_driver_name	= "ctr-aes-" MODE,
+		.cra_priority		= PRIO - 1,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -333,7 +367,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
@@ -344,15 +377,228 @@ static struct skcipher_alg aes_algs[] = { {
 	.decrypt	= xts_decrypt,
 } };
 
+static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+			 unsigned int key_len)
+{
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	int err;
+
+	err = aes_expandkey(&ctx->key, in_key, key_len);
+	if (err)
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+
+	return err;
+}
+
+static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+
+	y->a = cpu_to_be64((a << 1) | (b >> 63));
+	y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
+}
+
+static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	be128 *consts = (be128 *)ctx->consts;
+	u8 *rk = (u8 *)ctx->key.key_enc;
+	int rounds = 6 + key_len / 4;
+	int err;
+
+	err = cbcmac_setkey(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	/* encrypt the zero vector */
+	kernel_neon_begin();
+	aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1);
+	kernel_neon_end();
+
+	cmac_gf128_mul_by_x(consts, consts);
+	cmac_gf128_mul_by_x(consts + 1, consts);
+
+	return 0;
+}
+
+static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	static u8 const ks[3][AES_BLOCK_SIZE] = {
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x1 },
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x2 },
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x3 },
+	};
+
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	u8 *rk = (u8 *)ctx->key.key_enc;
+	int rounds = 6 + key_len / 4;
+	u8 key[AES_BLOCK_SIZE];
+	int err;
+
+	err = cbcmac_setkey(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	kernel_neon_begin();
+	aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1);
+	aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0);
+	kernel_neon_end();
+
+	return cbcmac_setkey(tfm, key, sizeof(key));
+}
+
+static int mac_init(struct shash_desc *desc)
+{
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	memset(ctx->dg, 0, AES_BLOCK_SIZE);
+	ctx->len = 0;
+
+	return 0;
+}
+
+static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int rounds = 6 + tctx->key.key_length / 4;
+
+	while (len > 0) {
+		unsigned int l;
+
+		if ((ctx->len % AES_BLOCK_SIZE) == 0 &&
+		    (ctx->len + len) > AES_BLOCK_SIZE) {
+
+			int blocks = len / AES_BLOCK_SIZE;
+
+			len %= AES_BLOCK_SIZE;
+
+			kernel_neon_begin();
+			aes_mac_update(p, tctx->key.key_enc, rounds, blocks,
+				       ctx->dg, (ctx->len != 0), (len != 0));
+			kernel_neon_end();
+
+			p += blocks * AES_BLOCK_SIZE;
+
+			if (!len) {
+				ctx->len = AES_BLOCK_SIZE;
+				break;
+			}
+			ctx->len = 0;
+		}
+
+		l = min(len, AES_BLOCK_SIZE - ctx->len);
+
+		if (l <= AES_BLOCK_SIZE) {
+			crypto_xor(ctx->dg + ctx->len, p, l);
+			ctx->len += l;
+			len -= l;
+			p += l;
+		}
+	}
+
+	return 0;
+}
+
+static int cbcmac_final(struct shash_desc *desc, u8 *out)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int rounds = 6 + tctx->key.key_length / 4;
+
+	kernel_neon_begin();
+	aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0);
+	kernel_neon_end();
+
+	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+	return 0;
+}
+
+static int cmac_final(struct shash_desc *desc, u8 *out)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int rounds = 6 + tctx->key.key_length / 4;
+	u8 *consts = tctx->consts;
+
+	if (ctx->len != AES_BLOCK_SIZE) {
+		ctx->dg[ctx->len] ^= 0x80;
+		consts += AES_BLOCK_SIZE;
+	}
+
+	kernel_neon_begin();
+	aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1);
+	kernel_neon_end();
+
+	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg mac_algs[] = { {
+	.base.cra_name		= "cmac(aes)",
+	.base.cra_driver_name	= "cmac-aes-" MODE,
+	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
+				  2 * AES_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cmac_final,
+	.setkey			= cmac_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+}, {
+	.base.cra_name		= "xcbc(aes)",
+	.base.cra_driver_name	= "xcbc-aes-" MODE,
+	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
+				  2 * AES_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cmac_final,
+	.setkey			= xcbc_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+}, {
+	.base.cra_name		= "cbcmac(aes)",
+	.base.cra_driver_name	= "cbcmac-aes-" MODE,
+	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cbcmac_final,
+	.setkey			= cbcmac_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+} };
+
 static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
 
 static void aes_exit(void)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++)
-		simd_skcipher_free(aes_simd_algs[i]);
+	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+		if (aes_simd_algs[i])
+			simd_skcipher_free(aes_simd_algs[i]);
 
+	crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
 	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
@@ -369,7 +615,14 @@ static int __init aes_init(void)
 	if (err)
 		return err;
 
+	err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs));
+	if (err)
+		goto unregister_ciphers;
+
 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
 		algname = aes_algs[i].base.cra_name + 2;
 		drvname = aes_algs[i].base.cra_driver_name + 2;
 		basename = aes_algs[i].base.cra_driver_name;
@@ -385,6 +638,8 @@ static int __init aes_init(void)
 
 unregister_simds:
 	aes_exit();
+unregister_ciphers:
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 	return err;
 }
 
@@ -392,5 +647,7 @@ unregister_simds:
 module_cpu_feature_match(AES, aes_init);
 #else
 module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
 #endif
 module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 838dad5c209f..2674d43d1384 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -337,7 +337,7 @@ AES_ENTRY(aes_ctr_encrypt)
 
 .Lctrcarrydone:
 	subs		w4, w4, #1
-	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
+	bmi		.Lctrtailblock		/* blocks <0 means tail block */
 	ld1		{v3.16b}, [x1], #16
 	eor		v3.16b, v0.16b, v3.16b
 	st1		{v3.16b}, [x0], #16
@@ -348,10 +348,8 @@ AES_ENTRY(aes_ctr_encrypt)
 	FRAME_POP
 	ret
 
-.Lctrhalfblock:
-	ld1		{v3.8b}, [x1]
-	eor		v3.8b, v0.8b, v3.8b
-	st1		{v3.8b}, [x0]
+.Lctrtailblock:
+	st1		{v0.16b}, [x0]
 	FRAME_POP
 	ret
 
@@ -527,3 +525,30 @@ AES_ENTRY(aes_xts_decrypt)
 	FRAME_POP
 	ret
 AES_ENDPROC(aes_xts_decrypt)
+
+	/*
+	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
+	 */
+AES_ENTRY(aes_mac_update)
+	ld1		{v0.16b}, [x4]			/* get dg */
+	enc_prepare	w2, x1, x7
+	cbnz		w5, .Lmacenc
+
+.Lmacloop:
+	cbz		w3, .Lmacout
+	ld1		{v1.16b}, [x0], #16		/* get next pt block */
+	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
+
+	subs		w3, w3, #1
+	csinv		x5, x6, xzr, eq
+	cbz		w5, .Lmacout
+
+.Lmacenc:
+	encrypt_block	v0, w2, x1, x7, w8
+	b		.Lmacloop
+
+.Lmacout:
+	st1		{v0.16b}, [x4]			/* return dg */
+	ret
+AES_ENDPROC(aes_mac_update)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 85f07ead7c5c..f1e3aa2732f9 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
-	add		\out, \in, \in
+	shl		\out, \in, #1
 	and		\temp, \temp, \const
 	eor		\out, \out, \temp
 	.endm
 
+	/* multiply by polynomial 'x^2' in GF(2^8) */
+	.macro		mul_by_x2, out, in, temp, const
+	ushr		\temp, \in, #6
+	shl		\out, \in, #2
+	pmul		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
 	/* preload the entire Sbox */
 	.macro		prepare, sbox, shiftrows, temp
 	adr		\temp, \sbox
-	movi		v12.16b, #0x40
+	movi		v12.16b, #0x1b
 	ldr		q13, \shiftrows
-	movi		v14.16b, #0x1b
+	ldr		q14, .Lror32by8
 	ld1		{v16.16b-v19.16b}, [\temp], #64
 	ld1		{v20.16b-v23.16b}, [\temp], #64
 	ld1		{v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@
 
 	/* apply SubBytes transformation using the the preloaded Sbox */
 	.macro		sub_bytes, in
-	sub		v9.16b, \in\().16b, v12.16b
+	sub		v9.16b, \in\().16b, v15.16b
 	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
-	sub		v10.16b, v9.16b, v12.16b
+	sub		v10.16b, v9.16b, v15.16b
 	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v11.16b, v10.16b, v12.16b
+	sub		v11.16b, v10.16b, v15.16b
 	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
 	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm
 
 	/* apply MixColumns transformation */
-	.macro		mix_columns, in
-	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
-	rev32		v8.8h, \in\().8h
-	eor		\in\().16b, v10.16b, \in\().16b
-	shl		v9.4s, v8.4s, #24
-	shl		v11.4s, \in\().4s, #24
-	sri		v9.4s, v8.4s, #8
-	sri		v11.4s, \in\().4s, #8
-	eor		v9.16b, v9.16b, v8.16b
-	eor		v10.16b, v10.16b, v9.16b
-	eor		\in\().16b, v10.16b, v11.16b
-	.endm
-
+	.macro		mix_columns, in, enc
+	.if		\enc == 0
 	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
-	.macro		inv_mix_columns, in
-	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
-	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
-	eor		\in\().16b, \in\().16b, v11.16b
-	rev32		v11.8h, v11.8h
-	eor		\in\().16b, \in\().16b, v11.16b
-	mix_columns	\in
+	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	rev32		v8.8h, v8.8h
+	eor		\in\().16b, \in\().16b, v8.16b
+	.endif
+
+	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
+	rev32		v8.8h, \in\().8h
+	eor		v8.16b, v8.16b, v9.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	tbl		\in\().16b, {\in\().16b}, v14.16b
+	eor		\in\().16b, \in\().16b, v8.16b
 	.endm
 
 	.macro		do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	movi		v15.16b, #0x40
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns	\in
-	.else
-	inv_mix_columns	\in
-	.endif
+	mix_columns	\in, \enc
 	b		1111b
 2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	.endm
@@ -116,139 +115,114 @@
 	 */
 
 	.macro		sub_bytes_2x, in0, in1
-	sub		v8.16b, \in0\().16b, v12.16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, v8.16b, v12.16b
-	sub		v11.16b, v9.16b, v12.16b
+	sub		v10.16b, v8.16b, v15.16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	sub		v11.16b, v9.16b, v15.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v10.16b, v12.16b
-	sub		v9.16b, v11.16b, v12.16b
+	sub		v8.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	.endm
 
 	.macro		sub_bytes_4x, in0, in1, in2, in3
-	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, \in2\().16b, v12.16b
+	sub		v10.16b, \in2\().16b, v15.16b
 	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
-	sub		v11.16b, \in3\().16b, v12.16b
+	sub		v11.16b, \in3\().16b, v15.16b
 	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
 	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm
 
 	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
-	sshr		\tmp0\().16b, \in0\().16b,  #7
-	add		\out0\().16b, \in0\().16b,  \in0\().16b
-	sshr		\tmp1\().16b, \in1\().16b,  #7
+	sshr		\tmp0\().16b, \in0\().16b, #7
+	shl		\out0\().16b, \in0\().16b, #1
+	sshr		\tmp1\().16b, \in1\().16b, #7
 	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
-	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	shl		\out1\().16b, \in1\().16b, #1
 	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
 	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
 	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm
 
-	.macro		mix_columns_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	ushr		\tmp0\().16b, \in0\().16b, #6
+	shl		\out0\().16b, \in0\().16b, #2
+	ushr		\tmp1\().16b, \in1\().16b, #6
+	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	shl		\out1\().16b, \in1\().16b, #2
+	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
+	.endm
+
+	.macro		mix_columns_2x, in0, in1, enc
+	.if		\enc == 0
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	rev32		v8.8h, v8.8h
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	rev32		v9.8h, v9.8h
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	.endif
+
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
 	rev32		v10.8h, \in0\().8h
 	rev32		v11.8h, \in1\().8h
-	eor		\in0\().16b, v8.16b, \in0\().16b
-	eor		\in1\().16b, v9.16b, \in1\().16b
-	shl		v12.4s, v10.4s, #24
-	shl		v13.4s, v11.4s, #24
-	eor		v8.16b, v8.16b, v10.16b
-	sri		v12.4s, v10.4s, #8
-	shl		v10.4s, \in0\().4s, #24
-	eor		v9.16b, v9.16b, v11.16b
-	sri		v13.4s, v11.4s, #8
-	shl		v11.4s, \in1\().4s, #24
-	sri		v10.4s, \in0\().4s, #8
-	eor		\in0\().16b, v8.16b, v12.16b
-	sri		v11.4s, \in1\().4s, #8
-	eor		\in1\().16b, v9.16b, v13.16b
-	eor		\in0\().16b, v10.16b, \in0\().16b
-	eor		\in1\().16b, v11.16b, \in1\().16b
+	eor		v10.16b, v10.16b, v8.16b
+	eor		v11.16b, v11.16b, v9.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
+	tbl		\in0\().16b, {\in0\().16b}, v14.16b
+	tbl		\in1\().16b, {\in1\().16b}, v14.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
 	.endm
 
-	.macro		inv_mix_cols_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	rev32		v8.8h, v8.8h
-	rev32		v9.8h, v9.8h
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	mix_columns_2x	\in0, \in1
-	.endm
-
-	.macro		inv_mix_cols_4x, in0, in1, in2, in3
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
-	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
-	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	rev32		v8.8h, v8.8h
-	rev32		v9.8h, v9.8h
-	rev32		v10.8h, v10.8h
-	rev32		v11.8h, v11.8h
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
-	.endm
-
-	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	sub_bytes_2x	\in0, \in1
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_2x	\in0, \in1
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_2x	\in0, \in1
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -262,23 +236,17 @@
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
-	sub_bytes_4x	\in0, \in1, \in2, \in3
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_4x	\in0, \in1, \in2, \in3
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_4x	\in0, \in1, \in2, \in3
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
+	mix_columns_2x	\in2, \in3, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -305,19 +273,7 @@
 #include "aes-modes.S"
 
 	.text
-	.align		4
-.LForward_ShiftRows:
-CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
-CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
-CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
-CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)
-
-.LReverse_ShiftRows:
-CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
-CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
-CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
-CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
-
+	.align		6
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
 	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
 	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
 	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+	.octa		0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+	.octa		0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+	.octa		0x0c0f0e0d080b0a090407060500030201
diff --git a/arch/arm64/crypto/aes-neonbs-core.S b/arch/arm64/crypto/aes-neonbs-core.S
new file mode 100644
index 000000000000..ca0472500433
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-core.S
@@ -0,0 +1,972 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	rounds		.req	x11
+	bskey		.req	x12
+
+	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	eor		\b2, \b2, \b1
+	eor		\b5, \b5, \b6
+	eor		\b3, \b3, \b0
+	eor		\b6, \b6, \b2
+	eor		\b5, \b5, \b0
+	eor		\b6, \b6, \b3
+	eor		\b3, \b3, \b7
+	eor		\b7, \b7, \b5
+	eor		\b3, \b3, \b4
+	eor		\b4, \b4, \b5
+	eor		\b2, \b2, \b7
+	eor		\b3, \b3, \b1
+	eor		\b1, \b1, \b5
+	.endm
+
+	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	eor		\b0, \b0, \b6
+	eor		\b1, \b1, \b4
+	eor		\b4, \b4, \b6
+	eor		\b2, \b2, \b0
+	eor		\b6, \b6, \b1
+	eor		\b1, \b1, \b5
+	eor		\b5, \b5, \b3
+	eor		\b3, \b3, \b7
+	eor		\b7, \b7, \b5
+	eor		\b2, \b2, \b5
+	eor		\b4, \b4, \b7
+	.endm
+
+	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+	eor		\b1, \b1, \b7
+	eor		\b4, \b4, \b7
+	eor		\b7, \b7, \b5
+	eor		\b1, \b1, \b3
+	eor		\b2, \b2, \b5
+	eor		\b3, \b3, \b7
+	eor		\b6, \b6, \b1
+	eor		\b2, \b2, \b0
+	eor		\b5, \b5, \b3
+	eor		\b4, \b4, \b6
+	eor		\b0, \b0, \b6
+	eor		\b1, \b1, \b4
+	.endm
+
+	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+	eor		\b1, \b1, \b5
+	eor		\b2, \b2, \b7
+	eor		\b3, \b3, \b1
+	eor		\b4, \b4, \b5
+	eor		\b7, \b7, \b5
+	eor		\b3, \b3, \b4
+	eor 		\b5, \b5, \b0
+	eor		\b3, \b3, \b7
+	eor		\b6, \b6, \b2
+	eor		\b2, \b2, \b1
+	eor		\b6, \b6, \b3
+	eor		\b3, \b3, \b0
+	eor		\b5, \b5, \b6
+	.endm
+
+	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
+	eor 		\t0, \y0, \y1
+	and		\t0, \t0, \x0
+	eor		\x0, \x0, \x1
+	and		\t1, \x1, \y0
+	and		\x0, \x0, \y1
+	eor		\x1, \t1, \t0
+	eor		\x0, \x0, \t1
+	.endm
+
+	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+	eor		\t0, \y0, \y1
+	eor 		\t1, \y2, \y3
+	and		\t0, \t0, \x0
+	and		\t1, \t1, \x2
+	eor		\x0, \x0, \x1
+	eor		\x2, \x2, \x3
+	and		\x1, \x1, \y0
+	and		\x3, \x3, \y2
+	and		\x0, \x0, \y1
+	and		\x2, \x2, \y3
+	eor		\x1, \x1, \x0
+	eor		\x2, \x2, \x3
+	eor		\x0, \x0, \t0
+	eor		\x3, \x3, \t1
+	.endm
+
+	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    y0, y1, y2, y3, t0, t1, t2, t3
+	eor		\t0, \x0, \x2
+	eor		\t1, \x1, \x3
+	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
+	eor		\y0, \y0, \y2
+	eor		\y1, \y1, \y3
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+	eor		\x0, \x0, \t0
+	eor		\x2, \x2, \t0
+	eor		\x1, \x1, \t1
+	eor		\x3, \x3, \t1
+	eor		\t0, \x4, \x6
+	eor		\t1, \x5, \x7
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+	eor		\y0, \y0, \y2
+	eor		\y1, \y1, \y3
+	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
+	eor		\x4, \x4, \t0
+	eor		\x6, \x6, \t0
+	eor		\x5, \x5, \t1
+	eor		\x7, \x7, \t1
+	.endm
+
+	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+				   t0, t1, t2, t3, s0, s1, s2, s3
+	eor		\t3, \x4, \x6
+	eor		\t0, \x5, \x7
+	eor		\t1, \x1, \x3
+	eor		\s1, \x7, \x6
+	eor		\s0, \x0, \x2
+	eor		\s3, \t3, \t0
+	orr		\t2, \t0, \t1
+	and		\s2, \t3, \s0
+	orr		\t3, \t3, \s0
+	eor		\s0, \s0, \t1
+	and		\t0, \t0, \t1
+	eor		\t1, \x3, \x2
+	and		\s3, \s3, \s0
+	and		\s1, \s1, \t1
+	eor		\t1, \x4, \x5
+	eor		\s0, \x1, \x0
+	eor		\t3, \t3, \s1
+	eor		\t2, \t2, \s1
+	and		\s1, \t1, \s0
+	orr		\t1, \t1, \s0
+	eor		\t3, \t3, \s3
+	eor		\t0, \t0, \s1
+	eor		\t2, \t2, \s2
+	eor		\t1, \t1, \s3
+	eor		\t0, \t0, \s2
+	and		\s0, \x7, \x3
+	eor		\t1, \t1, \s2
+	and		\s1, \x6, \x2
+	and		\s2, \x5, \x1
+	orr		\s3, \x4, \x0
+	eor		\t3, \t3, \s0
+	eor		\t1, \t1, \s2
+	eor		\s0, \t0, \s3
+	eor		\t2, \t2, \s1
+	and		\s2, \t3, \t1
+	eor		\s1, \t2, \s2
+	eor		\s3, \s0, \s2
+	bsl		\s1, \t1, \s0
+	not		\t0, \s0
+	bsl		\s0, \s1, \s3
+	bsl		\t0, \s1, \s3
+	bsl		\s3, \t3, \t2
+	eor		\t3, \t3, \t2
+	and		\s2, \s0, \s3
+	eor		\t1, \t1, \t0
+	eor		\s2, \s2, \t3
+	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	.endm
+
+	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+			      t0, t1, t2, t3, s0, s1, s2, s3
+	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
+			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
+	.endm
+
+	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+				  t0, t1, t2, t3, s0, s1, s2, s3
+	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
+			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
+	.endm
+
+	.macro		enc_next_rk
+	ldp		q16, q17, [bskey], #128
+	ldp		q18, q19, [bskey, #-96]
+	ldp		q20, q21, [bskey, #-64]
+	ldp		q22, q23, [bskey, #-32]
+	.endm
+
+	.macro		dec_next_rk
+	ldp		q16, q17, [bskey, #-128]!
+	ldp		q18, q19, [bskey, #32]
+	ldp		q20, q21, [bskey, #64]
+	ldp		q22, q23, [bskey, #96]
+	.endm
+
+	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
+	eor		\x0\().16b, \x0\().16b, v16.16b
+	eor		\x1\().16b, \x1\().16b, v17.16b
+	eor		\x2\().16b, \x2\().16b, v18.16b
+	eor		\x3\().16b, \x3\().16b, v19.16b
+	eor		\x4\().16b, \x4\().16b, v20.16b
+	eor		\x5\().16b, \x5\().16b, v21.16b
+	eor		\x6\().16b, \x6\().16b, v22.16b
+	eor		\x7\().16b, \x7\().16b, v23.16b
+	.endm
+
+	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
+	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
+	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
+	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
+	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
+	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
+	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
+	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
+	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
+	.endm
+
+	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				  t0, t1, t2, t3, t4, t5, t6, t7, inv
+	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
+	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
+	eor		\x0\().16b, \x0\().16b, \t0\().16b
+	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
+	eor		\x1\().16b, \x1\().16b, \t1\().16b
+	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
+	eor		\x2\().16b, \x2\().16b, \t2\().16b
+	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
+	eor		\x3\().16b, \x3\().16b, \t3\().16b
+	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
+	eor		\x4\().16b, \x4\().16b, \t4\().16b
+	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
+	eor		\x5\().16b, \x5\().16b, \t5\().16b
+	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
+	eor		\x6\().16b, \x6\().16b, \t6\().16b
+	eor		\t1\().16b, \t1\().16b, \x0\().16b
+	eor		\x7\().16b, \x7\().16b, \t7\().16b
+	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
+	eor		\t2\().16b, \t2\().16b, \x1\().16b
+	eor		\t0\().16b, \t0\().16b, \x7\().16b
+	eor		\t1\().16b, \t1\().16b, \x7\().16b
+	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
+	eor		\t5\().16b, \t5\().16b, \x4\().16b
+	eor		\x0\().16b, \x0\().16b, \t0\().16b
+	eor		\t6\().16b, \t6\().16b, \x5\().16b
+	eor		\x1\().16b, \x1\().16b, \t1\().16b
+	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
+	eor		\t4\().16b, \t4\().16b, \x3\().16b
+	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
+	eor		\t7\().16b, \t7\().16b, \x6\().16b
+	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x2\().16b
+	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
+	eor		\t4\().16b, \t4\().16b, \x7\().16b
+	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x7\().16b
+	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
+	eor		\x7\().16b, \t1\().16b, \t5\().16b
+	.ifb		\inv
+	eor		\x2\().16b, \t0\().16b, \t4\().16b
+	eor		\x4\().16b, \x4\().16b, \t3\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x3\().16b, \x3\().16b, \t6\().16b
+	eor		\x6\().16b, \x6\().16b, \t2\().16b
+	.else
+	eor		\t3\().16b, \t3\().16b, \x4\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x2\().16b, \x3\().16b, \t6\().16b
+	eor		\x3\().16b, \t0\().16b, \t4\().16b
+	eor		\x4\().16b, \x6\().16b, \t2\().16b
+	mov		\x6\().16b, \t3\().16b
+	.endif
+	.endm
+
+	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				      t0, t1, t2, t3, t4, t5, t6, t7
+	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
+	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
+	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
+	eor		\t0\().16b, \t0\().16b, \x0\().16b
+	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
+	eor		\t6\().16b, \t6\().16b, \x6\().16b
+	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
+	eor		\t7\().16b, \t7\().16b, \x7\().16b
+	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
+	eor		\t1\().16b, \t1\().16b, \x1\().16b
+	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
+	eor		\t2\().16b, \t2\().16b, \x2\().16b
+	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x3\().16b
+	eor		\t4\().16b, \t4\().16b, \x4\().16b
+	eor		\t5\().16b, \t5\().16b, \x5\().16b
+	eor		\x0\().16b, \x0\().16b, \t6\().16b
+	eor		\x1\().16b, \x1\().16b, \t6\().16b
+	eor		\x2\().16b, \x2\().16b, \t0\().16b
+	eor		\x4\().16b, \x4\().16b, \t2\().16b
+	eor		\x3\().16b, \x3\().16b, \t1\().16b
+	eor		\x1\().16b, \x1\().16b, \t7\().16b
+	eor		\x2\().16b, \x2\().16b, \t7\().16b
+	eor		\x4\().16b, \x4\().16b, \t6\().16b
+	eor		\x5\().16b, \x5\().16b, \t3\().16b
+	eor		\x3\().16b, \x3\().16b, \t6\().16b
+	eor		\x6\().16b, \x6\().16b, \t4\().16b
+	eor		\x4\().16b, \x4\().16b, \t7\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x7\().16b, \x7\().16b, \t5\().16b
+	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+	.endm
+
+	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+	ushr		\t0\().2d, \b0\().2d, #\n
+	ushr		\t1\().2d, \b1\().2d, #\n
+	eor		\t0\().16b, \t0\().16b, \a0\().16b
+	eor		\t1\().16b, \t1\().16b, \a1\().16b
+	and		\t0\().16b, \t0\().16b, \mask\().16b
+	and		\t1\().16b, \t1\().16b, \mask\().16b
+	eor		\a0\().16b, \a0\().16b, \t0\().16b
+	shl		\t0\().2d, \t0\().2d, #\n
+	eor		\a1\().16b, \a1\().16b, \t1\().16b
+	shl		\t1\().2d, \t1\().2d, #\n
+	eor		\b0\().16b, \b0\().16b, \t0\().16b
+	eor		\b1\().16b, \b1\().16b, \t1\().16b
+	.endm
+
+	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+	movi		\t0\().16b, #0x55
+	movi		\t1\().16b, #0x33
+	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+	movi		\t0\().16b, #0x0f
+	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+	.endm
+
+
+	.align		6
+M0:	.octa		0x0004080c0105090d02060a0e03070b0f
+
+M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
+SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
+SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
+
+M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
+ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
+ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
+
+	/*
+	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
+	 */
+ENTRY(aesbs_convert_key)
+	ld1		{v7.4s}, [x1], #16		// load round 0 key
+	ld1		{v17.4s}, [x1], #16		// load round 1 key
+
+	movi		v8.16b,  #0x01			// bit masks
+	movi		v9.16b,  #0x02
+	movi		v10.16b, #0x04
+	movi		v11.16b, #0x08
+	movi		v12.16b, #0x10
+	movi		v13.16b, #0x20
+	movi		v14.16b, #0x40
+	movi		v15.16b, #0x80
+	ldr		q16, M0
+
+	sub		x2, x2, #1
+	str		q7, [x0], #16		// save round 0 key
+
+.Lkey_loop:
+	tbl		v7.16b ,{v17.16b}, v16.16b
+	ld1		{v17.4s}, [x1], #16		// load next round key
+
+	cmtst		v0.16b, v7.16b, v8.16b
+	cmtst		v1.16b, v7.16b, v9.16b
+	cmtst		v2.16b, v7.16b, v10.16b
+	cmtst		v3.16b, v7.16b, v11.16b
+	cmtst		v4.16b, v7.16b, v12.16b
+	cmtst		v5.16b, v7.16b, v13.16b
+	cmtst		v6.16b, v7.16b, v14.16b
+	cmtst		v7.16b, v7.16b, v15.16b
+	not		v0.16b, v0.16b
+	not		v1.16b, v1.16b
+	not		v5.16b, v5.16b
+	not		v6.16b, v6.16b
+
+	subs		x2, x2, #1
+	stp		q0, q1, [x0], #128
+	stp		q2, q3, [x0, #-96]
+	stp		q4, q5, [x0, #-64]
+	stp		q6, q7, [x0, #-32]
+	b.ne		.Lkey_loop
+
+	movi		v7.16b, #0x63			// compose .L63
+	eor		v17.16b, v17.16b, v7.16b
+	str		q17, [x0]
+	ret
+ENDPROC(aesbs_convert_key)
+
+	.align		4
+aesbs_encrypt8:
+	ldr		q9, [bskey], #16		// round 0 key
+	ldr		q8, M0SR
+	ldr		q24, SR
+
+	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
+	eor		v11.16b, v1.16b, v9.16b
+	tbl		v0.16b, {v10.16b}, v8.16b
+	eor		v12.16b, v2.16b, v9.16b
+	tbl		v1.16b, {v11.16b}, v8.16b
+	eor		v13.16b, v3.16b, v9.16b
+	tbl		v2.16b, {v12.16b}, v8.16b
+	eor		v14.16b, v4.16b, v9.16b
+	tbl		v3.16b, {v13.16b}, v8.16b
+	eor		v15.16b, v5.16b, v9.16b
+	tbl		v4.16b, {v14.16b}, v8.16b
+	eor		v10.16b, v6.16b, v9.16b
+	tbl		v5.16b, {v15.16b}, v8.16b
+	eor		v11.16b, v7.16b, v9.16b
+	tbl		v6.16b, {v10.16b}, v8.16b
+	tbl		v7.16b, {v11.16b}, v8.16b
+
+	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+	sub		rounds, rounds, #1
+	b		.Lenc_sbox
+
+.Lenc_loop:
+	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Lenc_sbox:
+	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+	subs		rounds, rounds, #1
+	b.cc		.Lenc_done
+
+	enc_next_rk
+
+	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+
+	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
+
+	b.ne		.Lenc_loop
+	ldr		q24, SRM0
+	b		.Lenc_loop
+
+.Lenc_done:
+	ldr		q12, [bskey]			// last round key
+
+	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
+
+	eor		v0.16b, v0.16b, v12.16b
+	eor		v1.16b, v1.16b, v12.16b
+	eor		v4.16b, v4.16b, v12.16b
+	eor		v6.16b, v6.16b, v12.16b
+	eor		v3.16b, v3.16b, v12.16b
+	eor		v7.16b, v7.16b, v12.16b
+	eor		v2.16b, v2.16b, v12.16b
+	eor		v5.16b, v5.16b, v12.16b
+	ret
+ENDPROC(aesbs_encrypt8)
+
+	.align		4
+aesbs_decrypt8:
+	lsl		x9, rounds, #7
+	add		bskey, bskey, x9
+
+	ldr		q9, [bskey, #-112]!		// round 0 key
+	ldr		q8, M0ISR
+	ldr		q24, ISR
+
+	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
+	eor		v11.16b, v1.16b, v9.16b
+	tbl		v0.16b, {v10.16b}, v8.16b
+	eor		v12.16b, v2.16b, v9.16b
+	tbl		v1.16b, {v11.16b}, v8.16b
+	eor		v13.16b, v3.16b, v9.16b
+	tbl		v2.16b, {v12.16b}, v8.16b
+	eor		v14.16b, v4.16b, v9.16b
+	tbl		v3.16b, {v13.16b}, v8.16b
+	eor		v15.16b, v5.16b, v9.16b
+	tbl		v4.16b, {v14.16b}, v8.16b
+	eor		v10.16b, v6.16b, v9.16b
+	tbl		v5.16b, {v15.16b}, v8.16b
+	eor		v11.16b, v7.16b, v9.16b
+	tbl		v6.16b, {v10.16b}, v8.16b
+	tbl		v7.16b, {v11.16b}, v8.16b
+
+	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+	sub		rounds, rounds, #1
+	b		.Ldec_sbox
+
+.Ldec_loop:
+	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Ldec_sbox:
+	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+	subs		rounds, rounds, #1
+	b.cc		.Ldec_done
+
+	dec_next_rk
+
+	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
+
+	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+
+	b.ne		.Ldec_loop
+	ldr		q24, ISRM0
+	b		.Ldec_loop
+.Ldec_done:
+	ldr		q12, [bskey, #-16]		// last round key
+
+	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
+
+	eor		v0.16b, v0.16b, v12.16b
+	eor		v1.16b, v1.16b, v12.16b
+	eor		v6.16b, v6.16b, v12.16b
+	eor		v4.16b, v4.16b, v12.16b
+	eor		v2.16b, v2.16b, v12.16b
+	eor		v7.16b, v7.16b, v12.16b
+	eor		v3.16b, v3.16b, v12.16b
+	eor		v5.16b, v5.16b, v12.16b
+	ret
+ENDPROC(aesbs_decrypt8)
+
+	/*
+	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 */
+	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+99:	mov		x5, #1
+	lsl		x5, x5, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x5, x5, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	tbnz		x5, #1, 0f
+	ld1		{v1.16b}, [x1], #16
+	tbnz		x5, #2, 0f
+	ld1		{v2.16b}, [x1], #16
+	tbnz		x5, #3, 0f
+	ld1		{v3.16b}, [x1], #16
+	tbnz		x5, #4, 0f
+	ld1		{v4.16b}, [x1], #16
+	tbnz		x5, #5, 0f
+	ld1		{v5.16b}, [x1], #16
+	tbnz		x5, #6, 0f
+	ld1		{v6.16b}, [x1], #16
+	tbnz		x5, #7, 0f
+	ld1		{v7.16b}, [x1], #16
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		\do8
+
+	st1		{\o0\().16b}, [x0], #16
+	tbnz		x5, #1, 1f
+	st1		{\o1\().16b}, [x0], #16
+	tbnz		x5, #2, 1f
+	st1		{\o2\().16b}, [x0], #16
+	tbnz		x5, #3, 1f
+	st1		{\o3\().16b}, [x0], #16
+	tbnz		x5, #4, 1f
+	st1		{\o4\().16b}, [x0], #16
+	tbnz		x5, #5, 1f
+	st1		{\o5\().16b}, [x0], #16
+	tbnz		x5, #6, 1f
+	st1		{\o6\().16b}, [x0], #16
+	tbnz		x5, #7, 1f
+	st1		{\o7\().16b}, [x0], #16
+
+	cbnz		x4, 99b
+
+1:	ldp		x29, x30, [sp], #16
+	ret
+	.endm
+
+	.align		4
+ENTRY(aesbs_ecb_encrypt)
+	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_ecb_encrypt)
+
+	.align		4
+ENTRY(aesbs_ecb_decrypt)
+	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_ecb_decrypt)
+
+	/*
+	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+	.align		4
+ENTRY(aesbs_cbc_decrypt)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+99:	mov		x6, #1
+	lsl		x6, x6, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x6, x6, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	mov		v25.16b, v0.16b
+	tbnz		x6, #1, 0f
+	ld1		{v1.16b}, [x1], #16
+	mov		v26.16b, v1.16b
+	tbnz		x6, #2, 0f
+	ld1		{v2.16b}, [x1], #16
+	mov		v27.16b, v2.16b
+	tbnz		x6, #3, 0f
+	ld1		{v3.16b}, [x1], #16
+	mov		v28.16b, v3.16b
+	tbnz		x6, #4, 0f
+	ld1		{v4.16b}, [x1], #16
+	mov		v29.16b, v4.16b
+	tbnz		x6, #5, 0f
+	ld1		{v5.16b}, [x1], #16
+	mov		v30.16b, v5.16b
+	tbnz		x6, #6, 0f
+	ld1		{v6.16b}, [x1], #16
+	mov		v31.16b, v6.16b
+	tbnz		x6, #7, 0f
+	ld1		{v7.16b}, [x1]
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		aesbs_decrypt8
+
+	ld1		{v24.16b}, [x5]			// load IV
+
+	eor		v1.16b, v1.16b, v25.16b
+	eor		v6.16b, v6.16b, v26.16b
+	eor		v4.16b, v4.16b, v27.16b
+	eor		v2.16b, v2.16b, v28.16b
+	eor		v7.16b, v7.16b, v29.16b
+	eor		v0.16b, v0.16b, v24.16b
+	eor		v3.16b, v3.16b, v30.16b
+	eor		v5.16b, v5.16b, v31.16b
+
+	st1		{v0.16b}, [x0], #16
+	mov		v24.16b, v25.16b
+	tbnz		x6, #1, 1f
+	st1		{v1.16b}, [x0], #16
+	mov		v24.16b, v26.16b
+	tbnz		x6, #2, 1f
+	st1		{v6.16b}, [x0], #16
+	mov		v24.16b, v27.16b
+	tbnz		x6, #3, 1f
+	st1		{v4.16b}, [x0], #16
+	mov		v24.16b, v28.16b
+	tbnz		x6, #4, 1f
+	st1		{v2.16b}, [x0], #16
+	mov		v24.16b, v29.16b
+	tbnz		x6, #5, 1f
+	st1		{v7.16b}, [x0], #16
+	mov		v24.16b, v30.16b
+	tbnz		x6, #6, 1f
+	st1		{v3.16b}, [x0], #16
+	mov		v24.16b, v31.16b
+	tbnz		x6, #7, 1f
+	ld1		{v24.16b}, [x1], #16
+	st1		{v5.16b}, [x0], #16
+1:	st1		{v24.16b}, [x5]			// store IV
+
+	cbnz		x4, 99b
+
+	ldp		x29, x30, [sp], #16
+	ret
+ENDPROC(aesbs_cbc_decrypt)
+
+	.macro		next_tweak, out, in, const, tmp
+	sshr		\tmp\().2d,  \in\().2d,   #63
+	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	add		\out\().2d,  \in\().2d,   \in\().2d
+	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+	eor		\out\().16b, \out\().16b, \tmp\().16b
+	.endm
+
+	.align		4
+.Lxts_mul_x:
+CPU_LE(	.quad		1, 0x87		)
+CPU_BE(	.quad		0x87, 1		)
+
+	/*
+	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+__xts_crypt8:
+	mov		x6, #1
+	lsl		x6, x6, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x6, x6, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	next_tweak	v26, v25, v30, v31
+	eor		v0.16b, v0.16b, v25.16b
+	tbnz		x6, #1, 0f
+
+	ld1		{v1.16b}, [x1], #16
+	next_tweak	v27, v26, v30, v31
+	eor		v1.16b, v1.16b, v26.16b
+	tbnz		x6, #2, 0f
+
+	ld1		{v2.16b}, [x1], #16
+	next_tweak	v28, v27, v30, v31
+	eor		v2.16b, v2.16b, v27.16b
+	tbnz		x6, #3, 0f
+
+	ld1		{v3.16b}, [x1], #16
+	next_tweak	v29, v28, v30, v31
+	eor		v3.16b, v3.16b, v28.16b
+	tbnz		x6, #4, 0f
+
+	ld1		{v4.16b}, [x1], #16
+	str		q29, [sp, #16]
+	eor		v4.16b, v4.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #5, 0f
+
+	ld1		{v5.16b}, [x1], #16
+	str		q29, [sp, #32]
+	eor		v5.16b, v5.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #6, 0f
+
+	ld1		{v6.16b}, [x1], #16
+	str		q29, [sp, #48]
+	eor		v6.16b, v6.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #7, 0f
+
+	ld1		{v7.16b}, [x1], #16
+	str		q29, [sp, #64]
+	eor		v7.16b, v7.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	br		x7
+ENDPROC(__xts_crypt8)
+
+	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	stp		x29, x30, [sp, #-80]!
+	mov		x29, sp
+
+	ldr		q30, .Lxts_mul_x
+	ld1		{v25.16b}, [x5]
+
+99:	adr		x7, \do8
+	bl		__xts_crypt8
+
+	ldp		q16, q17, [sp, #16]
+	ldp		q18, q19, [sp, #48]
+
+	eor		\o0\().16b, \o0\().16b, v25.16b
+	eor		\o1\().16b, \o1\().16b, v26.16b
+	eor		\o2\().16b, \o2\().16b, v27.16b
+	eor		\o3\().16b, \o3\().16b, v28.16b
+
+	st1		{\o0\().16b}, [x0], #16
+	mov		v25.16b, v26.16b
+	tbnz		x6, #1, 1f
+	st1		{\o1\().16b}, [x0], #16
+	mov		v25.16b, v27.16b
+	tbnz		x6, #2, 1f
+	st1		{\o2\().16b}, [x0], #16
+	mov		v25.16b, v28.16b
+	tbnz		x6, #3, 1f
+	st1		{\o3\().16b}, [x0], #16
+	mov		v25.16b, v29.16b
+	tbnz		x6, #4, 1f
+
+	eor		\o4\().16b, \o4\().16b, v16.16b
+	eor		\o5\().16b, \o5\().16b, v17.16b
+	eor		\o6\().16b, \o6\().16b, v18.16b
+	eor		\o7\().16b, \o7\().16b, v19.16b
+
+	st1		{\o4\().16b}, [x0], #16
+	tbnz		x6, #5, 1f
+	st1		{\o5\().16b}, [x0], #16
+	tbnz		x6, #6, 1f
+	st1		{\o6\().16b}, [x0], #16
+	tbnz		x6, #7, 1f
+	st1		{\o7\().16b}, [x0], #16
+
+	cbnz		x4, 99b
+
+1:	st1		{v25.16b}, [x5]
+	ldp		x29, x30, [sp], #80
+	ret
+	.endm
+
+ENTRY(aesbs_xts_encrypt)
+	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_xts_decrypt)
+
+	.macro		next_ctr, v
+	mov		\v\().d[1], x8
+	adds		x8, x8, #1
+	mov		\v\().d[0], x7
+	adc		x7, x7, xzr
+	rev64		\v\().16b, \v\().16b
+	.endm
+
+	/*
+	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 iv[], u8 final[])
+	 */
+ENTRY(aesbs_ctr_encrypt)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	cmp		x6, #0
+	cset		x10, ne
+	add		x4, x4, x10		// do one extra block if final
+
+	ldp		x7, x8, [x5]
+	ld1		{v0.16b}, [x5]
+CPU_LE(	rev		x7, x7		)
+CPU_LE(	rev		x8, x8		)
+	adds		x8, x8, #1
+	adc		x7, x7, xzr
+
+99:	mov		x9, #1
+	lsl		x9, x9, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x9, x9, xzr, le
+
+	tbnz		x9, #1, 0f
+	next_ctr	v1
+	tbnz		x9, #2, 0f
+	next_ctr	v2
+	tbnz		x9, #3, 0f
+	next_ctr	v3
+	tbnz		x9, #4, 0f
+	next_ctr	v4
+	tbnz		x9, #5, 0f
+	next_ctr	v5
+	tbnz		x9, #6, 0f
+	next_ctr	v6
+	tbnz		x9, #7, 0f
+	next_ctr	v7
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		aesbs_encrypt8
+
+	lsr		x9, x9, x10		// disregard the extra block
+	tbnz		x9, #0, 0f
+
+	ld1		{v8.16b}, [x1], #16
+	eor		v0.16b, v0.16b, v8.16b
+	st1		{v0.16b}, [x0], #16
+	tbnz		x9, #1, 1f
+
+	ld1		{v9.16b}, [x1], #16
+	eor		v1.16b, v1.16b, v9.16b
+	st1		{v1.16b}, [x0], #16
+	tbnz		x9, #2, 2f
+
+	ld1		{v10.16b}, [x1], #16
+	eor		v4.16b, v4.16b, v10.16b
+	st1		{v4.16b}, [x0], #16
+	tbnz		x9, #3, 3f
+
+	ld1		{v11.16b}, [x1], #16
+	eor		v6.16b, v6.16b, v11.16b
+	st1		{v6.16b}, [x0], #16
+	tbnz		x9, #4, 4f
+
+	ld1		{v12.16b}, [x1], #16
+	eor		v3.16b, v3.16b, v12.16b
+	st1		{v3.16b}, [x0], #16
+	tbnz		x9, #5, 5f
+
+	ld1		{v13.16b}, [x1], #16
+	eor		v7.16b, v7.16b, v13.16b
+	st1		{v7.16b}, [x0], #16
+	tbnz		x9, #6, 6f
+
+	ld1		{v14.16b}, [x1], #16
+	eor		v2.16b, v2.16b, v14.16b
+	st1		{v2.16b}, [x0], #16
+	tbnz		x9, #7, 7f
+
+	ld1		{v15.16b}, [x1], #16
+	eor		v5.16b, v5.16b, v15.16b
+	st1		{v5.16b}, [x0], #16
+
+8:	next_ctr	v0
+	cbnz		x4, 99b
+
+0:	st1		{v0.16b}, [x5]
+	ldp		x29, x30, [sp], #16
+	ret
+
+	/*
+	 * If we are handling the tail of the input (x6 != NULL), return the
+	 * final keystream block back to the caller.
+	 */
+1:	cbz		x6, 8b
+	st1		{v1.16b}, [x6]
+	b		8b
+2:	cbz		x6, 8b
+	st1		{v4.16b}, [x6]
+	b		8b
+3:	cbz		x6, 8b
+	st1		{v6.16b}, [x6]
+	b		8b
+4:	cbz		x6, 8b
+	st1		{v3.16b}, [x6]
+	b		8b
+5:	cbz		x6, 8b
+	st1		{v7.16b}, [x6]
+	b		8b
+6:	cbz		x6, 8b
+	st1		{v2.16b}, [x6]
+	b		8b
+7:	cbz		x6, 8b
+	st1		{v5.16b}, [x6]
+	b		8b
+ENDPROC(aesbs_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
new file mode 100644
index 000000000000..db2501d93550
--- /dev/null
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -0,0 +1,439 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+
+asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[], u8 final[]);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+/* borrowed from aes-neon-blk.ko */
+asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
+				     int rounds, int blocks, int first);
+asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
+				     int rounds, int blocks, u8 iv[],
+				     int first);
+
+struct aesbs_ctx {
+	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32];
+	int	rounds;
+} __aligned(AES_BLOCK_SIZE);
+
+struct aesbs_cbc_ctx {
+	struct aesbs_ctx	key;
+	u32			enc[AES_MAX_KEYLENGTH_U32];
+};
+
+struct aesbs_xts_ctx {
+	struct aesbs_ctx	key;
+	u32			twkey[AES_MAX_KEYLENGTH_U32];
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->rounds = 6 + key_len / 4;
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int __ecb_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
+		   ctx->rounds, blocks);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->key.rounds = 6 + key_len / 4;
+
+	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err, first = 1;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		/* fall back to the non-bitsliced NEON implementation */
+		neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				     ctx->enc, ctx->key.rounds, blocks, walk.iv,
+				     first);
+		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+		first = 0;
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->key.rk, ctx->key.rounds, blocks,
+				  walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u8 buf[AES_BLOCK_SIZE];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+		u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+
+		if (walk.nbytes < walk.total) {
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+			final = NULL;
+		}
+
+		aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->rk, ctx->rounds, blocks, walk.iv, final);
+
+		if (final) {
+			u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+			u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+			if (dst != src)
+				memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+			crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+
+			err = skcipher_walk_done(&walk, 0);
+			break;
+		}
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = xts_verify_key(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	key_len /= 2;
+	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	if (err)
+		return err;
+
+	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
+
+	return aesbs_setkey(tfm, in_key, key_len);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+
+	neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
+			     ctx->key.rounds, 1, 1);
+
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
+		   ctx->key.rounds, blocks, walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static struct skcipher_alg aes_algs[] = { {
+	.base.cra_name		= "__ecb(aes)",
+	.base.cra_driver_name	= "__ecb-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ecb_encrypt,
+	.decrypt		= ecb_decrypt,
+}, {
+	.base.cra_name		= "__cbc(aes)",
+	.base.cra_driver_name	= "__cbc-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_cbc_setkey,
+	.encrypt		= cbc_encrypt,
+	.decrypt		= cbc_decrypt,
+}, {
+	.base.cra_name		= "__ctr(aes)",
+	.base.cra_driver_name	= "__ctr-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+}, {
+	.base.cra_name		= "ctr(aes)",
+	.base.cra_driver_name	= "ctr-aes-neonbs",
+	.base.cra_priority	= 250 - 1,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+}, {
+	.base.cra_name		= "__xts(aes)",
+	.base.cra_driver_name	= "__xts-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
+	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_xts_setkey,
+	.encrypt		= xts_encrypt,
+	.decrypt		= xts_decrypt,
+} };
+
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+static void aes_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+		if (aes_simd_algs[i])
+			simd_skcipher_free(aes_simd_algs[i]);
+
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static int __init aes_init(void)
+{
+	struct simd_skcipher_alg *simd;
+	const char *basename;
+	const char *algname;
+	const char *drvname;
+	int err;
+	int i;
+
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
+		algname = aes_algs[i].base.cra_name + 2;
+		drvname = aes_algs[i].base.cra_driver_name + 2;
+		basename = aes_algs[i].base.cra_driver_name;
+		simd = simd_skcipher_create_compat(algname, drvname, basename);
+		err = PTR_ERR(simd);
+		if (IS_ERR(simd))
+			goto unregister_simds;
+
+		aes_simd_algs[i] = simd;
+	}
+	return 0;
+
+unregister_simds:
+	aes_exit();
+	return err;
+}
+
+module_init(aes_init);
+module_exit(aes_exit);
diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S
new file mode 100644
index 000000000000..13c85e272c2a
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-core.S
@@ -0,0 +1,450 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.align		6
+
+ENTRY(chacha20_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+
+	//
+	// This function encrypts one ChaCha20 block by loading the state matrix
+	// in four NEON registers. It performs matrix operation on four words in
+	// parallel, but requires shuffling to rearrange the words after each
+	// round.
+	//
+
+	// x0..3 = s0..3
+	adr		x3, ROT8
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+	ld1		{v12.4s}, [x3]
+
+	mov		x3, #10
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	ext		v1.16b, v1.16b, v1.16b, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	ext		v3.16b, v3.16b, v3.16b, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	ext		v1.16b, v1.16b, v1.16b, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	ext		v3.16b, v3.16b, v3.16b, #4
+
+	subs		x3, x3, #1
+	b.ne		.Ldoubleround
+
+	ld1		{v4.16b-v7.16b}, [x2]
+
+	// o0 = i0 ^ (x0 + s0)
+	add		v0.4s, v0.4s, v8.4s
+	eor		v0.16b, v0.16b, v4.16b
+
+	// o1 = i1 ^ (x1 + s1)
+	add		v1.4s, v1.4s, v9.4s
+	eor		v1.16b, v1.16b, v5.16b
+
+	// o2 = i2 ^ (x2 + s2)
+	add		v2.4s, v2.4s, v10.4s
+	eor		v2.16b, v2.16b, v6.16b
+
+	// o3 = i3 ^ (x3 + s3)
+	add		v3.4s, v3.4s, v11.4s
+	eor		v3.16b, v3.16b, v7.16b
+
+	st1		{v0.16b-v3.16b}, [x1]
+
+	ret
+ENDPROC(chacha20_block_xor_neon)
+
+	.align		6
+ENTRY(chacha20_4block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 4 data blocks output, o
+	// x2: 4 data blocks input, i
+
+	//
+	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+	adr		x3, CTRINC		// ... and ROT8
+	ld1		{v30.4s-v31.4s}, [x3]
+
+	// x0..15[0-3] = s0..3[0..3]
+	mov		x4, x0
+	ld4r		{ v0.4s- v3.4s}, [x4], #16
+	ld4r		{ v4.4s- v7.4s}, [x4], #16
+	ld4r		{ v8.4s-v11.4s}, [x4], #16
+	ld4r		{v12.4s-v15.4s}, [x4]
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	mov		x3, #10
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+	rev32		v15.8h, v15.8h
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v16.16b, v4.16b, v8.16b
+	eor		v17.16b, v5.16b, v9.16b
+	eor		v18.16b, v6.16b, v10.16b
+	eor		v19.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v16.4s, #12
+	shl		v5.4s, v17.4s, #12
+	shl		v6.4s, v18.4s, #12
+	shl		v7.4s, v19.4s, #12
+
+	sri		v4.4s, v16.4s, #20
+	sri		v5.4s, v17.4s, #20
+	sri		v6.4s, v18.4s, #20
+	sri		v7.4s, v19.4s, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	tbl		v12.16b, {v12.16b}, v31.16b
+	tbl		v13.16b, {v13.16b}, v31.16b
+	tbl		v14.16b, {v14.16b}, v31.16b
+	tbl		v15.16b, {v15.16b}, v31.16b
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v16.16b, v4.16b, v8.16b
+	eor		v17.16b, v5.16b, v9.16b
+	eor		v18.16b, v6.16b, v10.16b
+	eor		v19.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v16.4s, #7
+	shl		v5.4s, v17.4s, #7
+	shl		v6.4s, v18.4s, #7
+	shl		v7.4s, v19.4s, #7
+
+	sri		v4.4s, v16.4s, #25
+	sri		v5.4s, v17.4s, #25
+	sri		v6.4s, v18.4s, #25
+	sri		v7.4s, v19.4s, #25
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	rev32		v15.8h, v15.8h
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v16.16b, v5.16b, v10.16b
+	eor		v17.16b, v6.16b, v11.16b
+	eor		v18.16b, v7.16b, v8.16b
+	eor		v19.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v16.4s, #12
+	shl		v6.4s, v17.4s, #12
+	shl		v7.4s, v18.4s, #12
+	shl		v4.4s, v19.4s, #12
+
+	sri		v5.4s, v16.4s, #20
+	sri		v6.4s, v17.4s, #20
+	sri		v7.4s, v18.4s, #20
+	sri		v4.4s, v19.4s, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	tbl		v15.16b, {v15.16b}, v31.16b
+	tbl		v12.16b, {v12.16b}, v31.16b
+	tbl		v13.16b, {v13.16b}, v31.16b
+	tbl		v14.16b, {v14.16b}, v31.16b
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v16.16b, v5.16b, v10.16b
+	eor		v17.16b, v6.16b, v11.16b
+	eor		v18.16b, v7.16b, v8.16b
+	eor		v19.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v16.4s, #7
+	shl		v6.4s, v17.4s, #7
+	shl		v7.4s, v18.4s, #7
+	shl		v4.4s, v19.4s, #7
+
+	sri		v5.4s, v16.4s, #25
+	sri		v6.4s, v17.4s, #25
+	sri		v7.4s, v18.4s, #25
+	sri		v4.4s, v19.4s, #25
+
+	subs		x3, x3, #1
+	b.ne		.Ldoubleround4
+
+	ld4r		{v16.4s-v19.4s}, [x0], #16
+	ld4r		{v20.4s-v23.4s}, [x0], #16
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+	add		v0.4s, v0.4s, v16.4s
+	add		v1.4s, v1.4s, v17.4s
+	add		v2.4s, v2.4s, v18.4s
+	add		v3.4s, v3.4s, v19.4s
+
+	ld4r		{v24.4s-v27.4s}, [x0], #16
+	ld4r		{v28.4s-v31.4s}, [x0]
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	add		v4.4s, v4.4s, v20.4s
+	add		v5.4s, v5.4s, v21.4s
+	add		v6.4s, v6.4s, v22.4s
+	add		v7.4s, v7.4s, v23.4s
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	add		v8.4s, v8.4s, v24.4s
+	add		v9.4s, v9.4s, v25.4s
+	add		v10.4s, v10.4s, v26.4s
+	add		v11.4s, v11.4s, v27.4s
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	add		v12.4s, v12.4s, v28.4s
+	add		v13.4s, v13.4s, v29.4s
+	add		v14.4s, v14.4s, v30.4s
+	add		v15.4s, v15.4s, v31.4s
+
+	// interleave 32-bit words in state n, n+1
+	zip1		v16.4s, v0.4s, v1.4s
+	zip2		v17.4s, v0.4s, v1.4s
+	zip1		v18.4s, v2.4s, v3.4s
+	zip2		v19.4s, v2.4s, v3.4s
+	zip1		v20.4s, v4.4s, v5.4s
+	zip2		v21.4s, v4.4s, v5.4s
+	zip1		v22.4s, v6.4s, v7.4s
+	zip2		v23.4s, v6.4s, v7.4s
+	zip1		v24.4s, v8.4s, v9.4s
+	zip2		v25.4s, v8.4s, v9.4s
+	zip1		v26.4s, v10.4s, v11.4s
+	zip2		v27.4s, v10.4s, v11.4s
+	zip1		v28.4s, v12.4s, v13.4s
+	zip2		v29.4s, v12.4s, v13.4s
+	zip1		v30.4s, v14.4s, v15.4s
+	zip2		v31.4s, v14.4s, v15.4s
+
+	// interleave 64-bit words in state n, n+2
+	zip1		v0.2d, v16.2d, v18.2d
+	zip2		v4.2d, v16.2d, v18.2d
+	zip1		v8.2d, v17.2d, v19.2d
+	zip2		v12.2d, v17.2d, v19.2d
+	ld1		{v16.16b-v19.16b}, [x2], #64
+
+	zip1		v1.2d, v20.2d, v22.2d
+	zip2		v5.2d, v20.2d, v22.2d
+	zip1		v9.2d, v21.2d, v23.2d
+	zip2		v13.2d, v21.2d, v23.2d
+	ld1		{v20.16b-v23.16b}, [x2], #64
+
+	zip1		v2.2d, v24.2d, v26.2d
+	zip2		v6.2d, v24.2d, v26.2d
+	zip1		v10.2d, v25.2d, v27.2d
+	zip2		v14.2d, v25.2d, v27.2d
+	ld1		{v24.16b-v27.16b}, [x2], #64
+
+	zip1		v3.2d, v28.2d, v30.2d
+	zip2		v7.2d, v28.2d, v30.2d
+	zip1		v11.2d, v29.2d, v31.2d
+	zip2		v15.2d, v29.2d, v31.2d
+	ld1		{v28.16b-v31.16b}, [x2]
+
+	// xor with corresponding input, write to output
+	eor		v16.16b, v16.16b, v0.16b
+	eor		v17.16b, v17.16b, v1.16b
+	eor		v18.16b, v18.16b, v2.16b
+	eor		v19.16b, v19.16b, v3.16b
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+	eor		v24.16b, v24.16b, v8.16b
+	eor		v25.16b, v25.16b, v9.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+	eor		v26.16b, v26.16b, v10.16b
+	eor		v27.16b, v27.16b, v11.16b
+	eor		v28.16b, v28.16b, v12.16b
+	st1		{v24.16b-v27.16b}, [x1], #64
+	eor		v29.16b, v29.16b, v13.16b
+	eor		v30.16b, v30.16b, v14.16b
+	eor		v31.16b, v31.16b, v15.16b
+	st1		{v28.16b-v31.16b}, [x1]
+
+	ret
+ENDPROC(chacha20_4block_xor_neon)
+
+CTRINC:	.word		0, 1, 2, 3
+ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
new file mode 100644
index 000000000000..a7cd575ea223
--- /dev/null
+++ b/arch/arm64/crypto/chacha20-neon-glue.c
@@ -0,0 +1,126 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_neon(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA20_BLOCK_SIZE)
+		return crypto_chacha20_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	crypto_chacha20_init(state, ctx, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+				nbytes);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-neon",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= chacha20_neon,
+	.decrypt		= chacha20_neon,
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_skcipher(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm64/crypto/crc32-arm64.c b/arch/arm64/crypto/crc32-arm64.c
deleted file mode 100644
index 6a37c3c6b11d..000000000000
--- a/arch/arm64/crypto/crc32-arm64.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
- *
- * Module based on crypto/crc32c_generic.c
- *
- * CRC32 loop taken from Ed Nevill's Hadoop CRC patch
- * http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
- *
- * Using inline assembly instead of intrinsics in order to be backwards
- * compatible with older compilers.
- *
- * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/unaligned/access_ok.h>
-#include <linux/cpufeature.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-
-MODULE_AUTHOR("Yazen Ghannam <yazen.ghannam@linaro.org>");
-MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
-MODULE_LICENSE("GPL v2");
-
-#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-
-static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
-{
-	s64 length = len;
-
-	while ((length -= sizeof(u64)) >= 0) {
-		CRC32X(crc, get_unaligned_le64(p));
-		p += sizeof(u64);
-	}
-
-	/* The following is more efficient than the straight loop */
-	if (length & sizeof(u32)) {
-		CRC32W(crc, get_unaligned_le32(p));
-		p += sizeof(u32);
-	}
-	if (length & sizeof(u16)) {
-		CRC32H(crc, get_unaligned_le16(p));
-		p += sizeof(u16);
-	}
-	if (length & sizeof(u8))
-		CRC32B(crc, *p);
-
-	return crc;
-}
-
-static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
-{
-	s64 length = len;
-
-	while ((length -= sizeof(u64)) >= 0) {
-		CRC32CX(crc, get_unaligned_le64(p));
-		p += sizeof(u64);
-	}
-
-	/* The following is more efficient than the straight loop */
-	if (length & sizeof(u32)) {
-		CRC32CW(crc, get_unaligned_le32(p));
-		p += sizeof(u32);
-	}
-	if (length & sizeof(u16)) {
-		CRC32CH(crc, get_unaligned_le16(p));
-		p += sizeof(u16);
-	}
-	if (length & sizeof(u8))
-		CRC32CB(crc, *p);
-
-	return crc;
-}
-
-#define CHKSUM_BLOCK_SIZE	1
-#define CHKSUM_DIGEST_SIZE	4
-
-struct chksum_ctx {
-	u32 key;
-};
-
-struct chksum_desc_ctx {
-	u32 crc;
-};
-
-static int chksum_init(struct shash_desc *desc)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	ctx->crc = mctx->key;
-
-	return 0;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
-			 unsigned int keylen)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
-
-	if (keylen != sizeof(mctx->key)) {
-		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	mctx->key = get_unaligned_le32(key);
-	return 0;
-}
-
-static int chksum_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length);
-	return 0;
-}
-
-static int chksumc_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length);
-	return 0;
-}
-
-static int chksum_final(struct shash_desc *desc, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	put_unaligned_le32(ctx->crc, out);
-	return 0;
-}
-
-static int chksumc_final(struct shash_desc *desc, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	put_unaligned_le32(~ctx->crc, out);
-	return 0;
-}
-
-static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
-{
-	put_unaligned_le32(crc32_arm64_le_hw(crc, data, len), out);
-	return 0;
-}
-
-static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
-{
-	put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out);
-	return 0;
-}
-
-static int chksum_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	return __chksum_finup(ctx->crc, data, len, out);
-}
-
-static int chksumc_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	return __chksumc_finup(ctx->crc, data, len, out);
-}
-
-static int chksum_digest(struct shash_desc *desc, const u8 *data,
-			 unsigned int length, u8 *out)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-
-	return __chksum_finup(mctx->key, data, length, out);
-}
-
-static int chksumc_digest(struct shash_desc *desc, const u8 *data,
-			 unsigned int length, u8 *out)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-
-	return __chksumc_finup(mctx->key, data, length, out);
-}
-
-static int crc32_cra_init(struct crypto_tfm *tfm)
-{
-	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
-
-	mctx->key = 0;
-	return 0;
-}
-
-static int crc32c_cra_init(struct crypto_tfm *tfm)
-{
-	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
-
-	mctx->key = ~0;
-	return 0;
-}
-
-static struct shash_alg crc32_alg = {
-	.digestsize		=	CHKSUM_DIGEST_SIZE,
-	.setkey			=	chksum_setkey,
-	.init			=	chksum_init,
-	.update			=	chksum_update,
-	.final			=	chksum_final,
-	.finup			=	chksum_finup,
-	.digest			=	chksum_digest,
-	.descsize		=	sizeof(struct chksum_desc_ctx),
-	.base			=	{
-		.cra_name		=	"crc32",
-		.cra_driver_name	=	"crc32-arm64-hw",
-		.cra_priority		=	300,
-		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
-		.cra_alignmask		=	0,
-		.cra_ctxsize		=	sizeof(struct chksum_ctx),
-		.cra_module		=	THIS_MODULE,
-		.cra_init		=	crc32_cra_init,
-	}
-};
-
-static struct shash_alg crc32c_alg = {
-	.digestsize		=	CHKSUM_DIGEST_SIZE,
-	.setkey			=	chksum_setkey,
-	.init			=	chksum_init,
-	.update			=	chksumc_update,
-	.final			=	chksumc_final,
-	.finup			=	chksumc_finup,
-	.digest			=	chksumc_digest,
-	.descsize		=	sizeof(struct chksum_desc_ctx),
-	.base			=	{
-		.cra_name		=	"crc32c",
-		.cra_driver_name	=	"crc32c-arm64-hw",
-		.cra_priority		=	300,
-		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
-		.cra_alignmask		=	0,
-		.cra_ctxsize		=	sizeof(struct chksum_ctx),
-		.cra_module		=	THIS_MODULE,
-		.cra_init		=	crc32c_cra_init,
-	}
-};
-
-static int __init crc32_mod_init(void)
-{
-	int err;
-
-	err = crypto_register_shash(&crc32_alg);
-
-	if (err)
-		return err;
-
-	err = crypto_register_shash(&crc32c_alg);
-
-	if (err) {
-		crypto_unregister_shash(&crc32_alg);
-		return err;
-	}
-
-	return 0;
-}
-
-static void __exit crc32_mod_exit(void)
-{
-	crypto_unregister_shash(&crc32_alg);
-	crypto_unregister_shash(&crc32c_alg);
-}
-
-module_cpu_feature_match(CRC32, crc32_mod_init);
-module_exit(crc32_mod_exit);
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
index 8594127d5e01..eccb1ae90064 100644
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -72,6 +72,24 @@ static int crc32_pmull_init(struct shash_desc *desc)
 	return 0;
 }
 
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+			unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = crc32_armv8_le(*crc, data, length);
+	return 0;
+}
+
+static int crc32c_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = crc32c_armv8_le(*crc, data, length);
+	return 0;
+}
+
 static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int length)
 {
@@ -156,7 +174,7 @@ static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
 static struct shash_alg crc32_pmull_algs[] = { {
 	.setkey			= crc32_pmull_setkey,
 	.init			= crc32_pmull_init,
-	.update			= crc32_pmull_update,
+	.update			= crc32_update,
 	.final			= crc32_pmull_final,
 	.descsize		= sizeof(u32),
 	.digestsize		= sizeof(u32),
@@ -171,7 +189,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
 }, {
 	.setkey			= crc32_pmull_setkey,
 	.init			= crc32_pmull_init,
-	.update			= crc32c_pmull_update,
+	.update			= crc32c_update,
 	.final			= crc32c_pmull_final,
 	.descsize		= sizeof(u32),
 	.digestsize		= sizeof(u32),
@@ -187,14 +205,20 @@ static struct shash_alg crc32_pmull_algs[] = { {
 
 static int __init crc32_pmull_mod_init(void)
 {
-	if (elf_hwcap & HWCAP_CRC32) {
-		fallback_crc32 = crc32_armv8_le;
-		fallback_crc32c = crc32c_armv8_le;
-	} else {
-		fallback_crc32 = crc32_le;
-		fallback_crc32c = __crc32c_le;
-	}
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
+		crc32_pmull_algs[0].update = crc32_pmull_update;
+		crc32_pmull_algs[1].update = crc32c_pmull_update;
 
+		if (elf_hwcap & HWCAP_CRC32) {
+			fallback_crc32 = crc32_armv8_le;
+			fallback_crc32c = crc32c_armv8_le;
+		} else {
+			fallback_crc32 = crc32_le;
+			fallback_crc32c = __crc32c_le;
+		}
+	} else if (!(elf_hwcap & HWCAP_CRC32)) {
+		return -ENODEV;
+	}
 	return crypto_register_shashes(crc32_pmull_algs,
 				       ARRAY_SIZE(crc32_pmull_algs));
 }
@@ -205,7 +229,12 @@ static void __exit crc32_pmull_mod_exit(void)
 				  ARRAY_SIZE(crc32_pmull_algs));
 }
 
-module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+static const struct cpu_feature crc32_cpu_feature[] = {
+	{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
+};
+MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
+
+module_init(crc32_pmull_mod_init);
 module_exit(crc32_pmull_mod_exit);
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 383a6f84a060..3c465184ff8a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -46,27 +46,48 @@
 
 #ifdef __x86_64__
 
-.data
+# constants in mergeable sections, linker can reorder and merge
+.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
 .align 16
 .Lgf128mul_x_ble_mask:
 	.octa 0x00000000000000010000000000000087
+.section	.rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
 POLY:   .octa 0xC2000000000000000000000000000001
+.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
 TWOONE: .octa 0x00000001000000000000000000000001
 
+.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+.section	.rodata.cst16.MASK1, "aM", @progbits, 16
+.align 16
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+.section	.rodata.cst16.MASK2, "aM", @progbits, 16
+.align 16
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+.section	.rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
+ONE:        .octa 0x00000000000000000000000000000001
+.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
+.align 16
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+.section	.rodata.cst16.dec, "aM", @progbits, 16
+.align 16
+dec:        .octa 0x1
+.section	.rodata.cst16.enc, "aM", @progbits, 16
+.align 16
+enc:        .octa 0x2
+
 # order of these constants should not change.
 # more specifically, ALL_F should follow SHIFT_MASK,
-# and ZERO should follow ALL_F
-
-SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
-MASK1:      .octa 0x0000000000000000ffffffffffffffff
-MASK2:      .octa 0xffffffffffffffff0000000000000000
+# and zero should follow ALL_F
+.section	.rodata, "a", @progbits
+.align 16
 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
-ZERO:       .octa 0x00000000000000000000000000000000
-ONE:        .octa 0x00000000000000000000000000000001
-F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
-dec:        .octa 0x1
-enc:        .octa 0x2
+            .octa 0x00000000000000000000000000000000
 
 
 .text
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 522ab68d1c88..d664382c6e56 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -122,22 +122,38 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
-.data
+# constants in mergeable sections, linker can reorder and merge
+.section	.rodata.cst16.POLY, "aM", @progbits, 16
 .align 16
-
 POLY:            .octa     0xC2000000000000000000000000000001
+
+.section	.rodata.cst16.POLY2, "aM", @progbits, 16
+.align 16
 POLY2:           .octa     0xC20000000000000000000001C2000000
+
+.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
 TWOONE:          .octa     0x00000001000000000000000000000001
 
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
-
+.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
 SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
+
+.section	.rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
+ONE:             .octa     0x00000000000000000000000000000001
+
+.section	.rodata.cst16.ONEf, "aM", @progbits, 16
+.align 16
+ONEf:            .octa     0x01000000000000000000000000000000
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
+.section	.rodata, "a", @progbits
+.align 16
 SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
 ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
-ZERO:            .octa     0x00000000000000000000000000000000
-ONE:             .octa     0x00000000000000000000000000000001
-ONEf:            .octa     0x01000000000000000000000000000000
+                 .octa     0x00000000000000000000000000000000
 
 .text
 
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 7ff1b0c86a8e..93de8ea51548 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -740,9 +740,11 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
 	*((__be32 *)(iv+12)) = counter;
 
 	if (sg_is_last(req->src) &&
-	    req->src->offset + req->src->length <= PAGE_SIZE &&
+	    (!PageHighMem(sg_page(req->src)) ||
+	    req->src->offset + req->src->length <= PAGE_SIZE) &&
 	    sg_is_last(req->dst) &&
-	    req->dst->offset + req->dst->length <= PAGE_SIZE) {
+	    (!PageHighMem(sg_page(req->dst)) ||
+	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
 		assoc = scatterwalk_map(&src_sg_walk);
@@ -822,9 +824,11 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
 	*((__be32 *)(iv+12)) = counter;
 
 	if (sg_is_last(req->src) &&
-	    req->src->offset + req->src->length <= PAGE_SIZE &&
+	    (!PageHighMem(sg_page(req->src)) ||
+	    req->src->offset + req->src->length <= PAGE_SIZE) &&
 	    sg_is_last(req->dst) &&
-	    req->dst->offset + req->dst->length <= PAGE_SIZE) {
+	    (!PageHighMem(sg_page(req->dst)) ||
+	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
 		assoc = scatterwalk_map(&src_sg_walk);
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index aa9e8bd163f6..f7c495e2863c 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -571,7 +571,9 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vmovdqu y6, 14 * 16(rio); \
 	vmovdqu y7, 15 * 16(rio);
 
-.data
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section	.rodata.cst16, "aM", @progbits, 16
 .align 16
 
 #define SHUFB_BYTES(idx) \
@@ -711,6 +713,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 
 /* 4-bit mask */
+.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 .align 4
 .L0f0f0f0f:
 	.long 0x0f0f0f0f
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 16186c18656d..eee5b3982cfd 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -610,20 +610,25 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vmovdqu y6, 14 * 32(rio); \
 	vmovdqu y7, 15 * 32(rio);
 
-.data
-.align 32
 
+.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
 #define SHUFB_BYTES(idx) \
 	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
-
 .Lshufb_16x16b:
 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 
+.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
+.align 32
 .Lpack_bswap:
 	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section	.rodata.cst16, "aM", @progbits, 16
+.align 16
+
 /* For CTR-mode IV byteswap */
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -750,6 +755,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
 	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 
+.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 14fa1966bf01..b4a8806234ea 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -195,19 +195,29 @@
 	vpshufb rmask,	x0, x0;           \
 	vpshufb rmask,	x1, x1;
 
-.data
-
+.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section	.rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
+.align 16
 .Lbswap_iv_mask:
 	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst4.16_mask, "aM", @progbits, 4
+.align 4
 .L16_mask:
 	.byte 16, 16, 16, 16
+.section	.rodata.cst4.32_mask, "aM", @progbits, 4
+.align 4
 .L32_mask:
 	.byte 32, 0, 0, 0
+.section	.rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
 .Lfirst_mask:
 	.byte 0x1f, 0, 0, 0
 
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index c419389889cd..952d3156a933 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -225,8 +225,7 @@
 	vpshufb rmask,		x2, x2;       \
 	vpshufb rmask,		x3, x3;
 
-.data
-
+.section	.rodata.cst16, "aM", @progbits, 16
 .align 16
 .Lxts_gf128mul_and_shl1_mask:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
@@ -244,10 +243,19 @@
 	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
+.align 4
 .L16_mask:
 	.byte 16, 16, 16, 16
+
+.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
+.align 4
 .L32_mask:
 	.byte 32, 0, 0, 0
+
+.section	.rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
 .Lfirst_mask:
 	.byte 0x1f, 0, 0, 0
 
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 16694e625f77..3a2dc3dc6cac 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -11,13 +11,18 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst32.ROT8, "aM", @progbits, 32
 .align 32
-
 ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
 	.octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section	.rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
 ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
 	.octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
 CTRINC:	.octa 0x00000003000000020000000100000000
 	.octa 0x00000007000000060000000500000004
 
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 3a33124e9112..3f511a7d73b8 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -11,11 +11,14 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst16.ROT8, "aM", @progbits, 16
 .align 16
-
 ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+.section	.rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
 ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
 CTRINC:	.octa 0x00000003000000020000000100000000
 
 .text
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index f910d1d449f0..1e6af1b35f7b 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -11,7 +11,7 @@
 
 #include <crypto/algapi.h>
 #include <crypto/chacha20.h>
-#include <linux/crypto.h>
+#include <crypto/internal/skcipher.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <asm/fpu/api.h>
@@ -63,36 +63,37 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
-static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
-			 struct scatterlist *src, unsigned int nbytes)
+static int chacha20_simd(struct skcipher_request *req)
 {
-	u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1];
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	struct skcipher_walk walk;
 	int err;
 
-	if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(desc, dst, src, nbytes);
+	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
 
-	state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
+	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha20_crypt(req);
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+	err = skcipher_walk_virt(&walk, req, true);
 
-	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+	crypto_chacha20_init(state, ctx, walk.iv);
 
 	kernel_fpu_begin();
 
 	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = blkcipher_walk_done(desc, &walk,
-					  walk.nbytes % CHACHA20_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes % CHACHA20_BLOCK_SIZE);
 	}
 
 	if (walk.nbytes) {
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				walk.nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
+		err = skcipher_walk_done(&walk, 0);
 	}
 
 	kernel_fpu_end();
@@ -100,27 +101,22 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg alg = {
-	.cra_name		= "chacha20",
-	.cra_driver_name	= "chacha20-simd",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_ctxsize		= sizeof(struct chacha20_ctx),
-	.cra_alignmask		= sizeof(u32) - 1,
-	.cra_module		= THIS_MODULE,
-	.cra_u			= {
-		.blkcipher = {
-			.min_keysize	= CHACHA20_KEY_SIZE,
-			.max_keysize	= CHACHA20_KEY_SIZE,
-			.ivsize		= CHACHA20_IV_SIZE,
-			.geniv		= "seqiv",
-			.setkey		= crypto_chacha20_setkey,
-			.encrypt	= chacha20_simd,
-			.decrypt	= chacha20_simd,
-		},
-	},
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-simd",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_alignmask	= sizeof(u32) - 1,
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= chacha20_simd,
+	.decrypt		= chacha20_simd,
 };
 
 static int __init chacha20_simd_mod_init(void)
@@ -133,12 +129,12 @@ static int __init chacha20_simd_mod_init(void)
 			    boot_cpu_has(X86_FEATURE_AVX2) &&
 			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
 #endif
-	return crypto_register_alg(&alg);
+	return crypto_register_skcipher(&alg);
 }
 
 static void __exit chacha20_simd_mod_fini(void)
 {
-	crypto_unregister_alg(&alg);
+	crypto_unregister_skcipher(&alg);
 }
 
 module_init(chacha20_simd_mod_init);
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dc05f010ca9b..7a7de27c6f41 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -312,7 +312,7 @@ do_return:
         ret
 ENDPROC(crc_pcl)
 
-.section	.rodata, "a", %progbits
+.section	.rodata, "a", @progbits
         ################################################################
         ## jump table        Table is 129 entries x 2 bytes each
         ################################################################
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index 35e97569d05f..de04d3e98d8d 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -554,12 +554,11 @@ _only_less_than_2:
 
 ENDPROC(crc_t10dif_pcl)
 
-.data
-
+.section	.rodata, "a", @progbits
+.align 16
 # precomputed constants
 # these constants are precomputed from the poly:
 # 0x8bb70000 (0x8bb7 scaled to 32 bits)
-.align 16
 # Q = 0x18BB70000
 # rk1 = 2^(32*3) mod Q << 32
 # rk2 = 2^(32*5) mod Q << 32
@@ -613,14 +612,23 @@ rk20:
 
 
 
+.section	.rodata.cst16.mask1, "aM", @progbits, 16
+.align 16
 mask1:
 .octa 0x80808080808080808080808080808080
+
+.section	.rodata.cst16.mask2, "aM", @progbits, 16
+.align 16
 mask2:
 .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
 
+.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
 SHUF_MASK:
 .octa 0x000102030405060708090A0B0C0D0E0F
 
+.section	.rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
+.align 32
 pshufb_shf_table:
 # use these values for shift constants for the pshufb instruction
 # different alignments result in values as shown:
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index 038f6ae87c5e..f3e91647ca27 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -537,7 +537,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	ret;
 ENDPROC(des3_ede_x86_64_crypt_blk_3way)
 
-.data
+.section	.rodata, "a", @progbits
 .align 16
 .L_s1:
 	.quad 0x0010100001010400, 0x0000000000000000
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index eed55c8cca4f..f94375a8dcd1 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -20,8 +20,7 @@
 #include <asm/inst.h>
 #include <asm/frame.h>
 
-.data
-
+.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
 .align 16
 .Lbswap_mask:
 	.octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index eff2f414e22b..3b6e70d085da 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -11,11 +11,13 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst32.ANMASK, "aM", @progbits, 32
 .align 32
-
 ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
 	.octa 0x0000000003ffffff0000000003ffffff
+
+.section	.rodata.cst32.ORMASK, "aM", @progbits, 32
+.align 32
 ORMASK:	.octa 0x00000000010000000000000001000000
 	.octa 0x00000000010000000000000001000000
 
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index 338c748054ed..c88c670cb5fc 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -11,10 +11,12 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
 .align 16
-
 ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+
+.section	.rodata.cst16.ORMASK, "aM", @progbits, 16
+.align 16
 ORMASK:	.octa 0x00000000010000000000000001000000
 
 .text
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 8be571808342..2925077f8c6a 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -29,11 +29,12 @@
 
 .file "serpent-avx-x86_64-asm_64.S"
 
-.data
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
 .align 16
-
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index 97c48add33ed..d67888f2a52a 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -20,13 +20,18 @@
 
 .file "serpent-avx2-asm_64.S"
 
-.data
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
 .align 16
-
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask_0:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask_1:
 	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
 
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
index 96df6a39d7e2..93b945597ecf 100644
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
@@ -281,11 +281,13 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2)
 	ret
 ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 .octa	0x000000000000000000000000FFFFFFF0
+
+.section	.rodata.cst8, "aM", @progbits, 8
+.align 8
 one:
 .quad  1
 two:
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
index 63a0d9c8e31f..7a93b1c0d69a 100644
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
@@ -203,8 +203,7 @@ return_null:
 
 ENDPROC(sha1_mb_mgr_submit_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 	.octa	0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
index c9dae1cd2919..20f77aa633de 100644
--- a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
+++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
@@ -461,21 +461,32 @@ lloop:
 ENDPROC(sha1_x8_avx2)
 
 
-.data
-
+.section	.rodata.cst32.K00_19, "aM", @progbits, 32
 .align 32
 K00_19:
 .octa 0x5A8279995A8279995A8279995A827999
 .octa 0x5A8279995A8279995A8279995A827999
+
+.section	.rodata.cst32.K20_39, "aM", @progbits, 32
+.align 32
 K20_39:
 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+
+.section	.rodata.cst32.K40_59, "aM", @progbits, 32
+.align 32
 K40_59:
 .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
 .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+
+.section	.rodata.cst32.K60_79, "aM", @progbits, 32
+.align 32
 K60_79:
 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK:
 .octa 0x0c0d0e0f08090a0b0405060700010203
 .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 874a651b9e7d..ebbdba72ae07 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -293,10 +293,12 @@ ENTRY(sha1_ni_transform)
 	ret
 ENDPROC(sha1_ni_transform)
 
-.data
-
-.align 64
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x000102030405060708090a0b0c0d0e0f
+
+.section	.rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
+.align 16
 UPPER_WORD_MASK:
 	.octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 92b3b5d75ba9..e08888a1a5f2 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -463,7 +463,7 @@ done_hash:
 	ret
 ENDPROC(sha256_transform_avx)
 
-.data
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
 	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -483,14 +483,21 @@ K256:
 	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
 
+.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
 # shuffle xBxA -> 00BA
 _SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
 
+.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
 # shuffle xDxC -> DC00
 _SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
 #endif
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 570ec5ec62d7..89c8f09787d2 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -723,7 +723,7 @@ done_hash:
 	ret
 ENDPROC(sha256_transform_rorx)
 
-.data
+.section	.rodata.cst512.K256, "aM", @progbits, 512
 .align 64
 K256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -759,14 +759,21 @@ K256:
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
 
 # shuffle xBxA -> 00BA
+.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
 _SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
 
 # shuffle xDxC -> DC00
+.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
 _SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+
 #endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
index a78a0694ddef..8fe6338bcc84 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -284,11 +284,13 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2)
 	ret
 ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 .octa	0x000000000000000000000000FFFFFFF0
+
+.section	.rodata.cst8, "aM", @progbits, 8
+.align 8
 one:
 .quad	1
 two:
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
index 7ea670e25acc..b36ae7454084 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
@@ -208,8 +208,7 @@ return_null:
 
 ENDPROC(sha256_mb_mgr_submit_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 	.octa	0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
index aa21aea4c722..1687c80c5995 100644
--- a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
+++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
@@ -437,7 +437,8 @@ Lrounds_16_xx:
 
 	ret
 ENDPROC(sha256_x8_avx2)
-.data
+
+.section	.rodata.K256_8, "a", @progbits
 .align 64
 K256_8:
 	.octa	0x428a2f98428a2f98428a2f98428a2f98
@@ -568,10 +569,14 @@ K256_8:
 	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
 	.octa	0xc67178f2c67178f2c67178f2c67178f2
 	.octa	0xc67178f2c67178f2c67178f2c67178f2
+
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK:
 .octa 0x0c0d0e0f08090a0b0405060700010203
 .octa 0x0c0d0e0f08090a0b0405060700010203
 
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 .global K256
 K256:
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 2cedc44e8121..39b83c93e7fd 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -474,7 +474,7 @@ done_hash:
 	ret
 ENDPROC(sha256_transform_ssse3)
 
-.data
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
         .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -494,13 +494,19 @@ K256:
         .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
         .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
 
+.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
 # shuffle xBxA -> 00BA
 _SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
 
+.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
 # shuffle xDxC -> DC00
 _SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 748cdf21a938..fb58f58ecfbc 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -329,7 +329,7 @@ ENTRY(sha256_ni_transform)
 	ret
 ENDPROC(sha256_ni_transform)
 
-.data
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -349,5 +349,7 @@ K256:
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 565274d6a641..39235fefe6f7 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -370,14 +370,17 @@ ENDPROC(sha512_transform_avx)
 ########################################################################
 ### Binary Data
 
-.data
-
+.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
 .align 16
-
 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 XMM_QWORD_BSWAP:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
 # K[t] used in SHA512 hashing
 K512:
 	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 1f20b35d8573..7f5f6c6ec72e 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -684,8 +684,11 @@ ENDPROC(sha512_transform_rorx)
 ########################################################################
 ### Binary Data
 
-.data
 
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
 .align 64
 # K[t] used in SHA512 hashing
 K512:
@@ -730,14 +733,17 @@ K512:
 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
 .align 32
-
 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 	.octa 0x18191a1b1c1d1e1f1011121314151617
 
+.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
 MASK_YMM_LO:
 	.octa 0x00000000000000000000000000000000
 	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+
 #endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
index 9c1bb6d58141..2dd3674b5a1e 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -221,7 +221,7 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
 }
 
 static struct sha512_hash_ctx
-		*sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr)
+		*sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate)
 {
 	/*
 	 * If get_comp_job returns NULL, there are no jobs complete.
@@ -233,11 +233,17 @@ static struct sha512_hash_ctx
 	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
 	 * still need processing.
 	 */
+	struct sha512_ctx_mgr *mgr;
 	struct sha512_hash_ctx *ctx;
+	unsigned long flags;
 
+	mgr = cstate->mgr;
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	ctx = (struct sha512_hash_ctx *)
 				sha512_job_mgr_get_comp_job(&mgr->mgr);
-	return sha512_ctx_mgr_resubmit(mgr, ctx);
+	ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
+	return ctx;
 }
 
 static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
@@ -246,12 +252,17 @@ static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
 }
 
 static struct sha512_hash_ctx
-			*sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr,
+			*sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate,
 					  struct sha512_hash_ctx *ctx,
 					  const void *buffer,
 					  uint32_t len,
 					  int flags)
 {
+	struct sha512_ctx_mgr *mgr;
+	unsigned long irqflags;
+
+	mgr = cstate->mgr;
+	spin_lock_irqsave(&cstate->work_lock, irqflags);
 	if (flags & (~HASH_ENTIRE)) {
 		/*
 		 * User should not pass anything other than FIRST, UPDATE, or
@@ -351,20 +362,26 @@ static struct sha512_hash_ctx
 		}
 	}
 
-	return sha512_ctx_mgr_resubmit(mgr, ctx);
+	ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+	spin_unlock_irqrestore(&cstate->work_lock, irqflags);
+	return ctx;
 }
 
-static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
+static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate)
 {
+	struct sha512_ctx_mgr *mgr;
 	struct sha512_hash_ctx *ctx;
+	unsigned long flags;
 
+	mgr = cstate->mgr;
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	while (1) {
 		ctx = (struct sha512_hash_ctx *)
 					sha512_job_mgr_flush(&mgr->mgr);
 
 		/* If flush returned 0, there are no more jobs in flight. */
 		if (!ctx)
-			return NULL;
+			break;
 
 		/*
 		 * If flush returned a job, resubmit the job to finish
@@ -378,8 +395,10 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
 		 * the sha512_ctx_mgr still need processing. Loop.
 		 */
 		if (ctx)
-			return ctx;
+			break;
 	}
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
+	return ctx;
 }
 
 static int sha512_mb_init(struct ahash_request *areq)
@@ -439,11 +458,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
 		sha_ctx = (struct sha512_hash_ctx *)
 						ahash_request_ctx(&rctx->areq);
 		kernel_fpu_begin();
-		sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx,
+		sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx,
 						rctx->walk.data, nbytes, flag);
 		if (!sha_ctx) {
 			if (flush)
-				sha_ctx = sha512_ctx_mgr_flush(cstate->mgr);
+				sha_ctx = sha512_ctx_mgr_flush(cstate);
 		}
 		kernel_fpu_end();
 		if (sha_ctx)
@@ -471,11 +490,12 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
 	struct sha512_hash_ctx *sha_ctx;
 	struct mcryptd_hash_request_ctx *req_ctx;
 	int ret;
+	unsigned long flags;
 
 	/* remove from work list */
-	spin_lock(&cstate->work_lock);
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	list_del(&rctx->waiter);
-	spin_unlock(&cstate->work_lock);
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
 
 	if (irqs_disabled())
 		rctx->complete(&req->base, err);
@@ -486,14 +506,14 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
 	}
 
 	/* check to see if there are other jobs that are done */
-	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
 	while (sha_ctx) {
 		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
 		ret = sha_finish_walk(&req_ctx, cstate, false);
 		if (req_ctx) {
-			spin_lock(&cstate->work_lock);
+			spin_lock_irqsave(&cstate->work_lock, flags);
 			list_del(&req_ctx->waiter);
-			spin_unlock(&cstate->work_lock);
+			spin_unlock_irqrestore(&cstate->work_lock, flags);
 
 			req = cast_mcryptd_ctx_to_req(req_ctx);
 			if (irqs_disabled())
@@ -504,7 +524,7 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
 				local_bh_enable();
 			}
 		}
-		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
 	}
 
 	return 0;
@@ -515,6 +535,7 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
 {
 	unsigned long next_flush;
 	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+	unsigned long flags;
 
 	/* initialize tag */
 	rctx->tag.arrival = jiffies;    /* tag the arrival time */
@@ -522,9 +543,9 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
 	next_flush = rctx->tag.arrival + delay;
 	rctx->tag.expire = next_flush;
 
-	spin_lock(&cstate->work_lock);
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	list_add_tail(&rctx->waiter, &cstate->work_list);
-	spin_unlock(&cstate->work_lock);
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
 
 	mcryptd_arm_flusher(cstate, delay);
 }
@@ -565,7 +586,7 @@ static int sha512_mb_update(struct ahash_request *areq)
 	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
 	sha512_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+	sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
 							nbytes, HASH_UPDATE);
 	kernel_fpu_end();
 
@@ -628,7 +649,7 @@ static int sha512_mb_finup(struct ahash_request *areq)
 	sha512_mb_add_list(rctx, cstate);
 
 	kernel_fpu_begin();
-	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+	sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
 								nbytes, flag);
 	kernel_fpu_end();
 
@@ -677,8 +698,7 @@ static int sha512_mb_final(struct ahash_request *areq)
 	/* flag HASH_FINAL and 0 data size */
 	sha512_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
-								HASH_LAST);
+	sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -940,7 +960,7 @@ static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
 			break;
 		kernel_fpu_begin();
 		sha_ctx = (struct sha512_hash_ctx *)
-					sha512_ctx_mgr_flush(cstate->mgr);
+					sha512_ctx_mgr_flush(cstate);
 		kernel_fpu_end();
 		if (!sha_ctx) {
 			pr_err("sha512_mb error: nothing got flushed for"
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
index 3ddba19a0db6..7c629caebc05 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
@@ -280,12 +280,18 @@ ENTRY(sha512_mb_mgr_get_comp_job_avx2)
 	pop     %rbx
         ret
 ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
-.data
 
-.align 16
+.section	.rodata.cst8.one, "aM", @progbits, 8
+.align 8
 one:
 .quad  1
+
+.section	.rodata.cst8.two, "aM", @progbits, 8
+.align 8
 two:
 .quad  2
+
+.section	.rodata.cst8.three, "aM", @progbits, 8
+.align 8
 three:
 .quad  3
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
index 815f07bdd1f8..4ba709ba78e5 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
@@ -209,8 +209,9 @@ return_null:
 	xor     job_rax, job_rax
 	jmp     return
 ENDPROC(sha512_mb_mgr_submit_avx2)
-.data
 
+/* UNUSED?
+.section	.rodata.cst16, "aM", @progbits, 16
 .align 16
 H0:     .int  0x6a09e667
 H1:     .int  0xbb67ae85
@@ -220,3 +221,4 @@ H4:     .int  0x510e527f
 H5:     .int  0x9b05688c
 H6:     .int  0x1f83d9ab
 H7:     .int  0x5be0cd19
+*/
diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
index 31ab1eff6413..e22e907643a6 100644
--- a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
+++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
@@ -361,7 +361,7 @@ Lrounds_16_xx:
 	ret
 ENDPROC(sha512_x4_avx2)
 
-.data
+.section	.rodata.K512_4, "a", @progbits
 .align 64
 K512_4:
 	.octa 0x428a2f98d728ae22428a2f98d728ae22,\
@@ -525,5 +525,7 @@ K512_4:
 	.octa 0x6c44198c4a4758176c44198c4a475817,\
 		0x6c44198c4a4758176c44198c4a475817
 
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
                          .octa 0x18191a1b1c1d1e1f1011121314151617
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index e610e29cbc81..66bbd9058a90 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -369,14 +369,17 @@ ENDPROC(sha512_transform_ssse3)
 ########################################################################
 ### Binary Data
 
-.data
-
+.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
 .align 16
-
 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 XMM_QWORD_BSWAP:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
 # K[t] used in SHA512 hashing
 K512:
 	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index dc66273e610d..b3f49d286348 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -29,11 +29,13 @@
 
 .file "twofish-avx-x86_64-asm_64.S"
 
-.data
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
 .align 16
-
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 160f08e721cc..f37e9cca50e1 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -263,6 +263,7 @@ comment "Authenticated Encryption with Associated Data"
 config CRYPTO_CCM
 	tristate "CCM support"
 	select CRYPTO_CTR
+	select CRYPTO_HASH
 	select CRYPTO_AEAD
 	help
 	  Support for Counter with CBC MAC. Required for IPsec.
@@ -374,6 +375,7 @@ config CRYPTO_XTS
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_MANAGER
 	select CRYPTO_GF128MUL
+	select CRYPTO_ECB
 	help
 	  XTS: IEEE1619/D16 narrow block cipher use with aes-xts-plain,
 	  key size 256, 384 or 512 bits. This implementation currently
@@ -895,6 +897,23 @@ config CRYPTO_AES
 
 	  See <http://csrc.nist.gov/CryptoToolkit/aes/> for more information.
 
+config CRYPTO_AES_TI
+	tristate "Fixed time AES cipher"
+	select CRYPTO_ALGAPI
+	help
+	  This is a generic implementation of AES that attempts to eliminate
+	  data dependent latencies as much as possible without affecting
+	  performance too much. It is intended for use by the generic CCM
+	  and GCM drivers, and other CTR or CMAC/XCBC based modes that rely
+	  solely on encryption (although decryption is supported as well, but
+	  with a more dramatic performance hit)
+
+	  Instead of using 16 lookup tables of 1 KB each, (8 for encryption and
+	  8 for decryption), this implementation only uses just two S-boxes of
+	  256 bytes each, and attempts to eliminate data dependent latencies by
+	  prefetching the entire table into the cache at the start of each
+	  block.
+
 config CRYPTO_AES_586
 	tristate "AES cipher algorithms (i586)"
 	depends on (X86 || UML_X86) && !64BIT
diff --git a/crypto/Makefile b/crypto/Makefile
index b8f0e3eb0791..8a44057240d5 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
 obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
+CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
 obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
 obj-$(CONFIG_CRYPTO_ECB) += ecb.o
@@ -98,7 +99,9 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_COMMON) += blowfish_common.o
 obj-$(CONFIG_CRYPTO_TWOFISH) += twofish_generic.o
 obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
 obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
+CFLAGS_serpent_generic.o := $(call cc-option,-fsched-pressure)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
+obj-$(CONFIG_CRYPTO_AES_TI) += aes_ti.o
 obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
 obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
 obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o
diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index d676fc59521a..d880a4897159 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include <crypto/scatterwalk.h>
@@ -394,7 +395,7 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
@@ -468,7 +469,7 @@ static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
diff --git a/crypto/acompress.c b/crypto/acompress.c
index 887783d8e9a9..47d11627cd20 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -20,6 +20,7 @@
 #include <linux/crypto.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 #include <crypto/internal/acompress.h>
 #include <crypto/internal/scompress.h>
@@ -50,7 +51,7 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/aead.c b/crypto/aead.c
index 3f5c5ff004ab..f794b30a9407 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -132,7 +133,7 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct aead_alg *aead = container_of(alg, struct aead_alg, base);
diff --git a/crypto/aes_generic.c b/crypto/aes_generic.c
index 3dd101144a58..ca554d57d01e 100644
--- a/crypto/aes_generic.c
+++ b/crypto/aes_generic.c
@@ -54,6 +54,7 @@
 #include <linux/errno.h>
 #include <linux/crypto.h>
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
 
 static inline u8 byte(const u32 x, const unsigned n)
 {
@@ -1216,7 +1217,6 @@ EXPORT_SYMBOL_GPL(crypto_il_tab);
 int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		unsigned int key_len)
 {
-	const __le32 *key = (const __le32 *)in_key;
 	u32 i, t, u, v, w, j;
 
 	if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
@@ -1225,10 +1225,15 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 
 	ctx->key_length = key_len;
 
-	ctx->key_dec[key_len + 24] = ctx->key_enc[0] = le32_to_cpu(key[0]);
-	ctx->key_dec[key_len + 25] = ctx->key_enc[1] = le32_to_cpu(key[1]);
-	ctx->key_dec[key_len + 26] = ctx->key_enc[2] = le32_to_cpu(key[2]);
-	ctx->key_dec[key_len + 27] = ctx->key_enc[3] = le32_to_cpu(key[3]);
+	ctx->key_enc[0] = get_unaligned_le32(in_key);
+	ctx->key_enc[1] = get_unaligned_le32(in_key + 4);
+	ctx->key_enc[2] = get_unaligned_le32(in_key + 8);
+	ctx->key_enc[3] = get_unaligned_le32(in_key + 12);
+
+	ctx->key_dec[key_len + 24] = ctx->key_enc[0];
+	ctx->key_dec[key_len + 25] = ctx->key_enc[1];
+	ctx->key_dec[key_len + 26] = ctx->key_enc[2];
+	ctx->key_dec[key_len + 27] = ctx->key_enc[3];
 
 	switch (key_len) {
 	case AES_KEYSIZE_128:
@@ -1238,17 +1243,17 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		break;
 
 	case AES_KEYSIZE_192:
-		ctx->key_enc[4] = le32_to_cpu(key[4]);
-		t = ctx->key_enc[5] = le32_to_cpu(key[5]);
+		ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
+		t = ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
 		for (i = 0; i < 8; ++i)
 			loop6(i);
 		break;
 
 	case AES_KEYSIZE_256:
-		ctx->key_enc[4] = le32_to_cpu(key[4]);
-		ctx->key_enc[5] = le32_to_cpu(key[5]);
-		ctx->key_enc[6] = le32_to_cpu(key[6]);
-		t = ctx->key_enc[7] = le32_to_cpu(key[7]);
+		ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
+		ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
+		ctx->key_enc[6] = get_unaligned_le32(in_key + 24);
+		t = ctx->key_enc[7] = get_unaligned_le32(in_key + 28);
 		for (i = 0; i < 6; ++i)
 			loop8(i);
 		loop8tophalf(i);
@@ -1329,16 +1334,14 @@ EXPORT_SYMBOL_GPL(crypto_aes_set_key);
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	const __le32 *src = (const __le32 *)in;
-	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
 	const u32 *kp = ctx->key_enc + 4;
 	const int key_len = ctx->key_length;
 
-	b0[0] = le32_to_cpu(src[0]) ^ ctx->key_enc[0];
-	b0[1] = le32_to_cpu(src[1]) ^ ctx->key_enc[1];
-	b0[2] = le32_to_cpu(src[2]) ^ ctx->key_enc[2];
-	b0[3] = le32_to_cpu(src[3]) ^ ctx->key_enc[3];
+	b0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
+	b0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
+	b0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
+	b0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
 	if (key_len > 24) {
 		f_nround(b1, b0, kp);
@@ -1361,10 +1364,10 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	f_nround(b1, b0, kp);
 	f_lround(b0, b1, kp);
 
-	dst[0] = cpu_to_le32(b0[0]);
-	dst[1] = cpu_to_le32(b0[1]);
-	dst[2] = cpu_to_le32(b0[2]);
-	dst[3] = cpu_to_le32(b0[3]);
+	put_unaligned_le32(b0[0], out);
+	put_unaligned_le32(b0[1], out + 4);
+	put_unaligned_le32(b0[2], out + 8);
+	put_unaligned_le32(b0[3], out + 12);
 }
 
 /* decrypt a block of text */
@@ -1401,16 +1404,14 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	const __le32 *src = (const __le32 *)in;
-	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
 	const int key_len = ctx->key_length;
 	const u32 *kp = ctx->key_dec + 4;
 
-	b0[0] = le32_to_cpu(src[0]) ^  ctx->key_dec[0];
-	b0[1] = le32_to_cpu(src[1]) ^  ctx->key_dec[1];
-	b0[2] = le32_to_cpu(src[2]) ^  ctx->key_dec[2];
-	b0[3] = le32_to_cpu(src[3]) ^  ctx->key_dec[3];
+	b0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
+	b0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
+	b0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
+	b0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
 
 	if (key_len > 24) {
 		i_nround(b1, b0, kp);
@@ -1433,10 +1434,10 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	i_nround(b1, b0, kp);
 	i_lround(b0, b1, kp);
 
-	dst[0] = cpu_to_le32(b0[0]);
-	dst[1] = cpu_to_le32(b0[1]);
-	dst[2] = cpu_to_le32(b0[2]);
-	dst[3] = cpu_to_le32(b0[3]);
+	put_unaligned_le32(b0[0], out);
+	put_unaligned_le32(b0[1], out + 4);
+	put_unaligned_le32(b0[2], out + 8);
+	put_unaligned_le32(b0[3], out + 12);
 }
 
 static struct crypto_alg aes_alg = {
@@ -1446,7 +1447,6 @@ static struct crypto_alg aes_alg = {
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	AES_BLOCK_SIZE,
 	.cra_ctxsize		=	sizeof(struct crypto_aes_ctx),
-	.cra_alignmask		=	3,
 	.cra_module		=	THIS_MODULE,
 	.cra_u			=	{
 		.cipher = {
diff --git a/crypto/aes_ti.c b/crypto/aes_ti.c
new file mode 100644
index 000000000000..92644fd1ac19
--- /dev/null
+++ b/crypto/aes_ti.c
@@ -0,0 +1,375 @@
+/*
+ * Scalar fixed time AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <asm/unaligned.h>
+
+/*
+ * Emit the sbox as volatile const to prevent the compiler from doing
+ * constant folding on sbox references involving fixed indexes.
+ */
+static volatile const u8 __cacheline_aligned __aesti_sbox[] = {
+	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
+};
+
+static volatile const u8 __cacheline_aligned __aesti_inv_sbox[] = {
+	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
+};
+
+static u32 mul_by_x(u32 w)
+{
+	u32 x = w & 0x7f7f7f7f;
+	u32 y = w & 0x80808080;
+
+	/* multiply by polynomial 'x' (0b10) in GF(2^8) */
+	return (x << 1) ^ (y >> 7) * 0x1b;
+}
+
+static u32 mul_by_x2(u32 w)
+{
+	u32 x = w & 0x3f3f3f3f;
+	u32 y = w & 0x80808080;
+	u32 z = w & 0x40404040;
+
+	/* multiply by polynomial 'x^2' (0b100) in GF(2^8) */
+	return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
+}
+
+static u32 mix_columns(u32 x)
+{
+	/*
+	 * Perform the following matrix multiplication in GF(2^8)
+	 *
+	 * | 0x2 0x3 0x1 0x1 |   | x[0] |
+	 * | 0x1 0x2 0x3 0x1 |   | x[1] |
+	 * | 0x1 0x1 0x2 0x3 | x | x[2] |
+	 * | 0x3 0x1 0x1 0x3 |   | x[3] |
+	 */
+	u32 y = mul_by_x(x) ^ ror32(x, 16);
+
+	return y ^ ror32(x ^ y, 8);
+}
+
+static u32 inv_mix_columns(u32 x)
+{
+	/*
+	 * Perform the following matrix multiplication in GF(2^8)
+	 *
+	 * | 0xe 0xb 0xd 0x9 |   | x[0] |
+	 * | 0x9 0xe 0xb 0xd |   | x[1] |
+	 * | 0xd 0x9 0xe 0xb | x | x[2] |
+	 * | 0xb 0xd 0x9 0xe |   | x[3] |
+	 *
+	 * which can conveniently be reduced to
+	 *
+	 * | 0x2 0x3 0x1 0x1 |   | 0x5 0x0 0x4 0x0 |   | x[0] |
+	 * | 0x1 0x2 0x3 0x1 |   | 0x0 0x5 0x0 0x4 |   | x[1] |
+	 * | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] |
+	 * | 0x3 0x1 0x1 0x2 |   | 0x0 0x4 0x0 0x5 |   | x[3] |
+	 */
+	u32 y = mul_by_x2(x);
+
+	return mix_columns(x ^ y ^ ror32(y, 16));
+}
+
+static __always_inline u32 subshift(u32 in[], int pos)
+{
+	return (__aesti_sbox[in[pos] & 0xff]) ^
+	       (__aesti_sbox[(in[(pos + 1) % 4] >>  8) & 0xff] <<  8) ^
+	       (__aesti_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
+	       (__aesti_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24);
+}
+
+static __always_inline u32 inv_subshift(u32 in[], int pos)
+{
+	return (__aesti_inv_sbox[in[pos] & 0xff]) ^
+	       (__aesti_inv_sbox[(in[(pos + 3) % 4] >>  8) & 0xff] <<  8) ^
+	       (__aesti_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
+	       (__aesti_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24);
+}
+
+static u32 subw(u32 in)
+{
+	return (__aesti_sbox[in & 0xff]) ^
+	       (__aesti_sbox[(in >>  8) & 0xff] <<  8) ^
+	       (__aesti_sbox[(in >> 16) & 0xff] << 16) ^
+	       (__aesti_sbox[(in >> 24) & 0xff] << 24);
+}
+
+static int aesti_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+			    unsigned int key_len)
+{
+	u32 kwords = key_len / sizeof(u32);
+	u32 rc, i, j;
+
+	if (key_len != AES_KEYSIZE_128 &&
+	    key_len != AES_KEYSIZE_192 &&
+	    key_len != AES_KEYSIZE_256)
+		return -EINVAL;
+
+	ctx->key_length = key_len;
+
+	for (i = 0; i < kwords; i++)
+		ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
+
+	for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) {
+		u32 *rki = ctx->key_enc + (i * kwords);
+		u32 *rko = rki + kwords;
+
+		rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0];
+		rko[1] = rko[0] ^ rki[1];
+		rko[2] = rko[1] ^ rki[2];
+		rko[3] = rko[2] ^ rki[3];
+
+		if (key_len == 24) {
+			if (i >= 7)
+				break;
+			rko[4] = rko[3] ^ rki[4];
+			rko[5] = rko[4] ^ rki[5];
+		} else if (key_len == 32) {
+			if (i >= 6)
+				break;
+			rko[4] = subw(rko[3]) ^ rki[4];
+			rko[5] = rko[4] ^ rki[5];
+			rko[6] = rko[5] ^ rki[6];
+			rko[7] = rko[6] ^ rki[7];
+		}
+	}
+
+	/*
+	 * Generate the decryption keys for the Equivalent Inverse Cipher.
+	 * This involves reversing the order of the round keys, and applying
+	 * the Inverse Mix Columns transformation to all but the first and
+	 * the last one.
+	 */
+	ctx->key_dec[0] = ctx->key_enc[key_len + 24];
+	ctx->key_dec[1] = ctx->key_enc[key_len + 25];
+	ctx->key_dec[2] = ctx->key_enc[key_len + 26];
+	ctx->key_dec[3] = ctx->key_enc[key_len + 27];
+
+	for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
+		ctx->key_dec[i]     = inv_mix_columns(ctx->key_enc[j]);
+		ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]);
+		ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]);
+		ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]);
+	}
+
+	ctx->key_dec[i]     = ctx->key_enc[0];
+	ctx->key_dec[i + 1] = ctx->key_enc[1];
+	ctx->key_dec[i + 2] = ctx->key_enc[2];
+	ctx->key_dec[i + 3] = ctx->key_enc[3];
+
+	return 0;
+}
+
+static int aesti_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			 unsigned int key_len)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = aesti_expand_key(ctx, in_key, key_len);
+	if (err)
+		return err;
+
+	/*
+	 * In order to force the compiler to emit data independent Sbox lookups
+	 * at the start of each block, xor the first round key with values at
+	 * fixed indexes in the Sbox. This will need to be repeated each time
+	 * the key is used, which will pull the entire Sbox into the D-cache
+	 * before any data dependent Sbox lookups are performed.
+	 */
+	ctx->key_enc[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
+	ctx->key_enc[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
+	ctx->key_enc[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
+	ctx->key_enc[3] ^= __aesti_sbox[96] ^ __aesti_sbox[224];
+
+	ctx->key_dec[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
+	ctx->key_dec[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
+	ctx->key_dec[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
+	ctx->key_dec[3] ^= __aesti_inv_sbox[96] ^ __aesti_inv_sbox[224];
+
+	return 0;
+}
+
+static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *rkp = ctx->key_enc + 4;
+	int rounds = 6 + ctx->key_length / 4;
+	u32 st0[4], st1[4];
+	int round;
+
+	st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
+	st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
+	st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
+	st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
+
+	st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
+	st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
+	st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
+	st0[3] ^= __aesti_sbox[96] ^ __aesti_sbox[224];
+
+	for (round = 0;; round += 2, rkp += 8) {
+		st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0];
+		st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1];
+		st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2];
+		st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3];
+
+		if (round == rounds - 2)
+			break;
+
+		st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4];
+		st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5];
+		st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6];
+		st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7];
+	}
+
+	put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out);
+	put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
+	put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
+	put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
+}
+
+static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *rkp = ctx->key_dec + 4;
+	int rounds = 6 + ctx->key_length / 4;
+	u32 st0[4], st1[4];
+	int round;
+
+	st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
+	st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
+	st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
+	st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
+
+	st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
+	st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
+	st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
+	st0[3] ^= __aesti_inv_sbox[96] ^ __aesti_inv_sbox[224];
+
+	for (round = 0;; round += 2, rkp += 8) {
+		st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0];
+		st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1];
+		st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2];
+		st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3];
+
+		if (round == rounds - 2)
+			break;
+
+		st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4];
+		st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5];
+		st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6];
+		st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7];
+	}
+
+	put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out);
+	put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
+	put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
+	put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name			= "aes",
+	.cra_driver_name		= "aes-fixed-time",
+	.cra_priority			= 100 + 1,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= AES_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
+	.cra_module			= THIS_MODULE,
+
+	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
+	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
+	.cra_cipher.cia_setkey		= aesti_set_key,
+	.cra_cipher.cia_encrypt		= aesti_encrypt,
+	.cra_cipher.cia_decrypt		= aesti_decrypt
+};
+
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Generic fixed time AES");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/crypto/ahash.c b/crypto/ahash.c
index 2ce8bcb9049c..e58c4970c22b 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -493,7 +494,7 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : ahash\n");
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index def301ed1288..cfbdb06d8ca8 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/compiler.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
@@ -47,7 +48,7 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 1fad2a6b3bbb..6b52e8f0b95f 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -962,34 +962,66 @@ void crypto_inc(u8 *a, unsigned int size)
 	__be32 *b = (__be32 *)(a + size);
 	u32 c;
 
-	for (; size >= 4; size -= 4) {
-		c = be32_to_cpu(*--b) + 1;
-		*b = cpu_to_be32(c);
-		if (c)
-			return;
-	}
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    !((unsigned long)b & (__alignof__(*b) - 1)))
+		for (; size >= 4; size -= 4) {
+			c = be32_to_cpu(*--b) + 1;
+			*b = cpu_to_be32(c);
+			if (c)
+				return;
+		}
 
 	crypto_inc_byte(a, size);
 }
 EXPORT_SYMBOL_GPL(crypto_inc);
 
-static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size)
+void __crypto_xor(u8 *dst, const u8 *src, unsigned int len)
 {
-	for (; size; size--)
-		*a++ ^= *b++;
+	int relalign = 0;
+
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+		int size = sizeof(unsigned long);
+		int d = ((unsigned long)dst ^ (unsigned long)src) & (size - 1);
+
+		relalign = d ? 1 << __ffs(d) : size;
+
+		/*
+		 * If we care about alignment, process as many bytes as
+		 * needed to advance dst and src to values whose alignments
+		 * equal their relative alignment. This will allow us to
+		 * process the remainder of the input using optimal strides.
+		 */
+		while (((unsigned long)dst & (relalign - 1)) && len > 0) {
+			*dst++ ^= *src++;
+			len--;
+		}
+	}
+
+	while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
+		*(u64 *)dst ^= *(u64 *)src;
+		dst += 8;
+		src += 8;
+		len -= 8;
+	}
+
+	while (len >= 4 && !(relalign & 3)) {
+		*(u32 *)dst ^= *(u32 *)src;
+		dst += 4;
+		src += 4;
+		len -= 4;
+	}
+
+	while (len >= 2 && !(relalign & 1)) {
+		*(u16 *)dst ^= *(u16 *)src;
+		dst += 2;
+		src += 2;
+		len -= 2;
+	}
+
+	while (len--)
+		*dst++ ^= *src++;
 }
-
-void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
-{
-	u32 *a = (u32 *)dst;
-	u32 *b = (u32 *)src;
-
-	for (; size >= 4; size -= 4)
-		*a++ ^= *b++;
-
-	crypto_xor_byte((u8 *)a, (u8 *)b, size);
-}
-EXPORT_SYMBOL_GPL(crypto_xor);
+EXPORT_SYMBOL_GPL(__crypto_xor);
 
 unsigned int crypto_alg_extsize(struct crypto_alg *alg)
 {
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index d19b09cdf284..54fc90e8339c 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -245,7 +245,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
 	struct ahash_request *req = &ctx->req;
-	char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req))];
+	char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req)) ? : 1];
 	struct sock *sk2;
 	struct alg_sock *ask2;
 	struct hash_ctx *ctx2;
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index a832426820e8..6c43a0a17a55 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -1,6 +1,6 @@
 /*
  * Block chaining cipher operations.
- * 
+ *
  * Generic encrypt/decrypt wrapper for ciphers, handles operations across
  * multiple page boundaries by using temporary blocks.  In user context,
  * the kernel is given a chance to schedule us once per page.
@@ -9,7 +9,7 @@
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) 
+ * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
  */
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -534,7 +535,7 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : blkcipher\n");
diff --git a/crypto/cbc.c b/crypto/cbc.c
index 68f751a41a84..bc160a3186dc 100644
--- a/crypto/cbc.c
+++ b/crypto/cbc.c
@@ -145,9 +145,6 @@ static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb)
 	inst->alg.base.cra_blocksize = alg->cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->cra_alignmask;
 
-	/* We access the data as u32s when xoring. */
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	inst->alg.ivsize = alg->cra_blocksize;
 	inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
 	inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;
diff --git a/crypto/ccm.c b/crypto/ccm.c
index 26b924d1e582..442848807a52 100644
--- a/crypto/ccm.c
+++ b/crypto/ccm.c
@@ -11,6 +11,7 @@
  */
 
 #include <crypto/internal/aead.h>
+#include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/err.h>
@@ -23,11 +24,11 @@
 
 struct ccm_instance_ctx {
 	struct crypto_skcipher_spawn ctr;
-	struct crypto_spawn cipher;
+	struct crypto_ahash_spawn mac;
 };
 
 struct crypto_ccm_ctx {
-	struct crypto_cipher *cipher;
+	struct crypto_ahash *mac;
 	struct crypto_skcipher *ctr;
 };
 
@@ -44,15 +45,21 @@ struct crypto_rfc4309_req_ctx {
 
 struct crypto_ccm_req_priv_ctx {
 	u8 odata[16];
-	u8 idata[16];
 	u8 auth_tag[16];
-	u32 ilen;
 	u32 flags;
 	struct scatterlist src[3];
 	struct scatterlist dst[3];
 	struct skcipher_request skreq;
 };
 
+struct cbcmac_tfm_ctx {
+	struct crypto_cipher *child;
+};
+
+struct cbcmac_desc_ctx {
+	unsigned int len;
+};
+
 static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx(
 	struct aead_request *req)
 {
@@ -84,7 +91,7 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
 {
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
 	struct crypto_skcipher *ctr = ctx->ctr;
-	struct crypto_cipher *tfm = ctx->cipher;
+	struct crypto_ahash *mac = ctx->mac;
 	int err = 0;
 
 	crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK);
@@ -96,11 +103,11 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
 	if (err)
 		goto out;
 
-	crypto_cipher_clear_flags(tfm, CRYPTO_TFM_REQ_MASK);
-	crypto_cipher_set_flags(tfm, crypto_aead_get_flags(aead) &
+	crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK);
+	crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) &
 				    CRYPTO_TFM_REQ_MASK);
-	err = crypto_cipher_setkey(tfm, key, keylen);
-	crypto_aead_set_flags(aead, crypto_cipher_get_flags(tfm) &
+	err = crypto_ahash_setkey(mac, key, keylen);
+	crypto_aead_set_flags(aead, crypto_ahash_get_flags(mac) &
 			      CRYPTO_TFM_RES_MASK);
 
 out:
@@ -167,119 +174,61 @@ static int format_adata(u8 *adata, unsigned int a)
 	return len;
 }
 
-static void compute_mac(struct crypto_cipher *tfm, u8 *data, int n,
-		       struct crypto_ccm_req_priv_ctx *pctx)
-{
-	unsigned int bs = 16;
-	u8 *odata = pctx->odata;
-	u8 *idata = pctx->idata;
-	int datalen, getlen;
-
-	datalen = n;
-
-	/* first time in here, block may be partially filled. */
-	getlen = bs - pctx->ilen;
-	if (datalen >= getlen) {
-		memcpy(idata + pctx->ilen, data, getlen);
-		crypto_xor(odata, idata, bs);
-		crypto_cipher_encrypt_one(tfm, odata, odata);
-		datalen -= getlen;
-		data += getlen;
-		pctx->ilen = 0;
-	}
-
-	/* now encrypt rest of data */
-	while (datalen >= bs) {
-		crypto_xor(odata, data, bs);
-		crypto_cipher_encrypt_one(tfm, odata, odata);
-
-		datalen -= bs;
-		data += bs;
-	}
-
-	/* check and see if there's leftover data that wasn't
-	 * enough to fill a block.
-	 */
-	if (datalen) {
-		memcpy(idata + pctx->ilen, data, datalen);
-		pctx->ilen += datalen;
-	}
-}
-
-static void get_data_to_compute(struct crypto_cipher *tfm,
-			       struct crypto_ccm_req_priv_ctx *pctx,
-			       struct scatterlist *sg, unsigned int len)
-{
-	struct scatter_walk walk;
-	u8 *data_src;
-	int n;
-
-	scatterwalk_start(&walk, sg);
-
-	while (len) {
-		n = scatterwalk_clamp(&walk, len);
-		if (!n) {
-			scatterwalk_start(&walk, sg_next(walk.sg));
-			n = scatterwalk_clamp(&walk, len);
-		}
-		data_src = scatterwalk_map(&walk);
-
-		compute_mac(tfm, data_src, n, pctx);
-		len -= n;
-
-		scatterwalk_unmap(data_src);
-		scatterwalk_advance(&walk, n);
-		scatterwalk_done(&walk, 0, len);
-		if (len)
-			crypto_yield(pctx->flags);
-	}
-
-	/* any leftover needs padding and then encrypted */
-	if (pctx->ilen) {
-		int padlen;
-		u8 *odata = pctx->odata;
-		u8 *idata = pctx->idata;
-
-		padlen = 16 - pctx->ilen;
-		memset(idata + pctx->ilen, 0, padlen);
-		crypto_xor(odata, idata, 16);
-		crypto_cipher_encrypt_one(tfm, odata, odata);
-		pctx->ilen = 0;
-	}
-}
-
 static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain,
 			   unsigned int cryptlen)
 {
+	struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
-	struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
-	struct crypto_cipher *cipher = ctx->cipher;
+	AHASH_REQUEST_ON_STACK(ahreq, ctx->mac);
 	unsigned int assoclen = req->assoclen;
-	u8 *odata = pctx->odata;
-	u8 *idata = pctx->idata;
-	int err;
+	struct scatterlist sg[3];
+	u8 odata[16];
+	u8 idata[16];
+	int ilen, err;
 
 	/* format control data for input */
 	err = format_input(odata, req, cryptlen);
 	if (err)
 		goto out;
 
-	/* encrypt first block to use as start in computing mac  */
-	crypto_cipher_encrypt_one(cipher, odata, odata);
+	sg_init_table(sg, 3);
+	sg_set_buf(&sg[0], odata, 16);
 
 	/* format associated data and compute into mac */
 	if (assoclen) {
-		pctx->ilen = format_adata(idata, assoclen);
-		get_data_to_compute(cipher, pctx, req->src, req->assoclen);
+		ilen = format_adata(idata, assoclen);
+		sg_set_buf(&sg[1], idata, ilen);
+		sg_chain(sg, 3, req->src);
 	} else {
-		pctx->ilen = 0;
+		ilen = 0;
+		sg_chain(sg, 2, req->src);
 	}
 
-	/* compute plaintext into mac */
-	if (cryptlen)
-		get_data_to_compute(cipher, pctx, plain, cryptlen);
+	ahash_request_set_tfm(ahreq, ctx->mac);
+	ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL);
+	ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16);
+	err = crypto_ahash_init(ahreq);
+	if (err)
+		goto out;
+	err = crypto_ahash_update(ahreq);
+	if (err)
+		goto out;
 
+	/* we need to pad the MAC input to a round multiple of the block size */
+	ilen = 16 - (assoclen + ilen) % 16;
+	if (ilen < 16) {
+		memset(idata, 0, ilen);
+		sg_init_table(sg, 2);
+		sg_set_buf(&sg[0], idata, ilen);
+		if (plain)
+			sg_chain(sg, 2, plain);
+		plain = sg;
+		cryptlen += ilen;
+	}
+
+	ahash_request_set_crypt(ahreq, plain, pctx->odata, cryptlen);
+	err = crypto_ahash_finup(ahreq);
 out:
 	return err;
 }
@@ -453,21 +402,21 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
 	struct aead_instance *inst = aead_alg_instance(tfm);
 	struct ccm_instance_ctx *ictx = aead_instance_ctx(inst);
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
-	struct crypto_cipher *cipher;
+	struct crypto_ahash *mac;
 	struct crypto_skcipher *ctr;
 	unsigned long align;
 	int err;
 
-	cipher = crypto_spawn_cipher(&ictx->cipher);
-	if (IS_ERR(cipher))
-		return PTR_ERR(cipher);
+	mac = crypto_spawn_ahash(&ictx->mac);
+	if (IS_ERR(mac))
+		return PTR_ERR(mac);
 
 	ctr = crypto_spawn_skcipher(&ictx->ctr);
 	err = PTR_ERR(ctr);
 	if (IS_ERR(ctr))
-		goto err_free_cipher;
+		goto err_free_mac;
 
-	ctx->cipher = cipher;
+	ctx->mac = mac;
 	ctx->ctr = ctr;
 
 	align = crypto_aead_alignmask(tfm);
@@ -479,8 +428,8 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
 
 	return 0;
 
-err_free_cipher:
-	crypto_free_cipher(cipher);
+err_free_mac:
+	crypto_free_ahash(mac);
 	return err;
 }
 
@@ -488,7 +437,7 @@ static void crypto_ccm_exit_tfm(struct crypto_aead *tfm)
 {
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
 
-	crypto_free_cipher(ctx->cipher);
+	crypto_free_ahash(ctx->mac);
 	crypto_free_skcipher(ctx->ctr);
 }
 
@@ -496,7 +445,7 @@ static void crypto_ccm_free(struct aead_instance *inst)
 {
 	struct ccm_instance_ctx *ctx = aead_instance_ctx(inst);
 
-	crypto_drop_spawn(&ctx->cipher);
+	crypto_drop_ahash(&ctx->mac);
 	crypto_drop_skcipher(&ctx->ctr);
 	kfree(inst);
 }
@@ -505,12 +454,13 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 				    struct rtattr **tb,
 				    const char *full_name,
 				    const char *ctr_name,
-				    const char *cipher_name)
+				    const char *mac_name)
 {
 	struct crypto_attr_type *algt;
 	struct aead_instance *inst;
 	struct skcipher_alg *ctr;
-	struct crypto_alg *cipher;
+	struct crypto_alg *mac_alg;
+	struct hash_alg_common *mac;
 	struct ccm_instance_ctx *ictx;
 	int err;
 
@@ -521,25 +471,26 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
-	cipher = crypto_alg_mod_lookup(cipher_name,  CRYPTO_ALG_TYPE_CIPHER,
-				       CRYPTO_ALG_TYPE_MASK);
-	if (IS_ERR(cipher))
-		return PTR_ERR(cipher);
+	mac_alg = crypto_find_alg(mac_name, &crypto_ahash_type,
+				  CRYPTO_ALG_TYPE_HASH,
+				  CRYPTO_ALG_TYPE_AHASH_MASK |
+				  CRYPTO_ALG_ASYNC);
+	if (IS_ERR(mac_alg))
+		return PTR_ERR(mac_alg);
 
+	mac = __crypto_hash_alg_common(mac_alg);
 	err = -EINVAL;
-	if (cipher->cra_blocksize != 16)
-		goto out_put_cipher;
+	if (mac->digestsize != 16)
+		goto out_put_mac;
 
 	inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
 	err = -ENOMEM;
 	if (!inst)
-		goto out_put_cipher;
+		goto out_put_mac;
 
 	ictx = aead_instance_ctx(inst);
-
-	err = crypto_init_spawn(&ictx->cipher, cipher,
-				aead_crypto_instance(inst),
-				CRYPTO_ALG_TYPE_MASK);
+	err = crypto_init_ahash_spawn(&ictx->mac, mac,
+				      aead_crypto_instance(inst));
 	if (err)
 		goto err_free_inst;
 
@@ -548,7 +499,7 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 				   crypto_requires_sync(algt->type,
 							algt->mask));
 	if (err)
-		goto err_drop_cipher;
+		goto err_drop_mac;
 
 	ctr = crypto_spawn_skcipher_alg(&ictx->ctr);
 
@@ -564,18 +515,17 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	err = -ENAMETOOLONG;
 	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 		     "ccm_base(%s,%s)", ctr->base.cra_driver_name,
-		     cipher->cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+		     mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
 		goto err_drop_ctr;
 
 	memcpy(inst->alg.base.cra_name, full_name, CRYPTO_MAX_ALG_NAME);
 
 	inst->alg.base.cra_flags = ctr->base.cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_priority = (cipher->cra_priority +
+	inst->alg.base.cra_priority = (mac->base.cra_priority +
 				       ctr->base.cra_priority) / 2;
 	inst->alg.base.cra_blocksize = 1;
-	inst->alg.base.cra_alignmask = cipher->cra_alignmask |
-				       ctr->base.cra_alignmask |
-				       (__alignof__(u32) - 1);
+	inst->alg.base.cra_alignmask = mac->base.cra_alignmask |
+				       ctr->base.cra_alignmask;
 	inst->alg.ivsize = 16;
 	inst->alg.chunksize = crypto_skcipher_alg_chunksize(ctr);
 	inst->alg.maxauthsize = 16;
@@ -593,23 +543,24 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	if (err)
 		goto err_drop_ctr;
 
-out_put_cipher:
-	crypto_mod_put(cipher);
+out_put_mac:
+	crypto_mod_put(mac_alg);
 	return err;
 
 err_drop_ctr:
 	crypto_drop_skcipher(&ictx->ctr);
-err_drop_cipher:
-	crypto_drop_spawn(&ictx->cipher);
+err_drop_mac:
+	crypto_drop_ahash(&ictx->mac);
 err_free_inst:
 	kfree(inst);
-	goto out_put_cipher;
+	goto out_put_mac;
 }
 
 static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
 {
 	const char *cipher_name;
 	char ctr_name[CRYPTO_MAX_ALG_NAME];
+	char mac_name[CRYPTO_MAX_ALG_NAME];
 	char full_name[CRYPTO_MAX_ALG_NAME];
 
 	cipher_name = crypto_attr_alg_name(tb[1]);
@@ -620,12 +571,16 @@ static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
 		     cipher_name) >= CRYPTO_MAX_ALG_NAME)
 		return -ENAMETOOLONG;
 
+	if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)",
+		     cipher_name) >= CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
 	if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "ccm(%s)", cipher_name) >=
 	    CRYPTO_MAX_ALG_NAME)
 		return -ENAMETOOLONG;
 
 	return crypto_ccm_create_common(tmpl, tb, full_name, ctr_name,
-					cipher_name);
+					mac_name);
 }
 
 static struct crypto_template crypto_ccm_tmpl = {
@@ -899,14 +854,164 @@ static struct crypto_template crypto_rfc4309_tmpl = {
 	.module = THIS_MODULE,
 };
 
+static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent,
+				     const u8 *inkey, unsigned int keylen)
+{
+	struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
+
+	return crypto_cipher_setkey(ctx->child, inkey, keylen);
+}
+
+static int crypto_cbcmac_digest_init(struct shash_desc *pdesc)
+{
+	struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	int bs = crypto_shash_digestsize(pdesc->tfm);
+	u8 *dg = (u8 *)ctx + crypto_shash_descsize(pdesc->tfm) - bs;
+
+	ctx->len = 0;
+	memset(dg, 0, bs);
+
+	return 0;
+}
+
+static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p,
+				       unsigned int len)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_digestsize(parent);
+	u8 *dg = (u8 *)ctx + crypto_shash_descsize(parent) - bs;
+
+	while (len > 0) {
+		unsigned int l = min(len, bs - ctx->len);
+
+		crypto_xor(dg + ctx->len, p, l);
+		ctx->len +=l;
+		len -= l;
+		p += l;
+
+		if (ctx->len == bs) {
+			crypto_cipher_encrypt_one(tfm, dg, dg);
+			ctx->len = 0;
+		}
+	}
+
+	return 0;
+}
+
+static int crypto_cbcmac_digest_final(struct shash_desc *pdesc, u8 *out)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_digestsize(parent);
+	u8 *dg = (u8 *)ctx + crypto_shash_descsize(parent) - bs;
+
+	if (ctx->len)
+		crypto_cipher_encrypt_one(tfm, dg, dg);
+
+	memcpy(out, dg, bs);
+	return 0;
+}
+
+static int cbcmac_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_cipher *cipher;
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cipher = crypto_spawn_cipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+
+	return 0;
+};
+
+static void cbcmac_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_cipher(ctx->child);
+}
+
+static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct shash_instance *inst;
+	struct crypto_alg *alg;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
+	if (err)
+		return err;
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	inst = shash_alloc_instance("cbcmac", alg);
+	err = PTR_ERR(inst);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	err = crypto_init_spawn(shash_instance_ctx(inst), alg,
+				shash_crypto_instance(inst),
+				CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_free_inst;
+
+	inst->alg.base.cra_priority = alg->cra_priority;
+	inst->alg.base.cra_blocksize = 1;
+
+	inst->alg.digestsize = alg->cra_blocksize;
+	inst->alg.descsize = ALIGN(sizeof(struct cbcmac_desc_ctx),
+				   alg->cra_alignmask + 1) +
+			     alg->cra_blocksize;
+
+	inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx);
+	inst->alg.base.cra_init = cbcmac_init_tfm;
+	inst->alg.base.cra_exit = cbcmac_exit_tfm;
+
+	inst->alg.init = crypto_cbcmac_digest_init;
+	inst->alg.update = crypto_cbcmac_digest_update;
+	inst->alg.final = crypto_cbcmac_digest_final;
+	inst->alg.setkey = crypto_cbcmac_digest_setkey;
+
+	err = shash_register_instance(tmpl, inst);
+
+out_free_inst:
+	if (err)
+		shash_free_instance(shash_crypto_instance(inst));
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
+static struct crypto_template crypto_cbcmac_tmpl = {
+	.name = "cbcmac",
+	.create = cbcmac_create,
+	.free = shash_free_instance,
+	.module = THIS_MODULE,
+};
+
 static int __init crypto_ccm_module_init(void)
 {
 	int err;
 
-	err = crypto_register_template(&crypto_ccm_base_tmpl);
+	err = crypto_register_template(&crypto_cbcmac_tmpl);
 	if (err)
 		goto out;
 
+	err = crypto_register_template(&crypto_ccm_base_tmpl);
+	if (err)
+		goto out_undo_cbcmac;
+
 	err = crypto_register_template(&crypto_ccm_tmpl);
 	if (err)
 		goto out_undo_base;
@@ -922,6 +1027,8 @@ out_undo_ccm:
 	crypto_unregister_template(&crypto_ccm_tmpl);
 out_undo_base:
 	crypto_unregister_template(&crypto_ccm_base_tmpl);
+out_undo_cbcmac:
+	crypto_register_template(&crypto_cbcmac_tmpl);
 	goto out;
 }
 
@@ -930,6 +1037,7 @@ static void __exit crypto_ccm_module_exit(void)
 	crypto_unregister_template(&crypto_rfc4309_tmpl);
 	crypto_unregister_template(&crypto_ccm_tmpl);
 	crypto_unregister_template(&crypto_ccm_base_tmpl);
+	crypto_unregister_template(&crypto_cbcmac_tmpl);
 }
 
 module_init(crypto_ccm_module_init);
diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
index 1cab83146e33..8b3c04d625c3 100644
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
@@ -10,10 +10,9 @@
  */
 
 #include <crypto/algapi.h>
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
 #include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/module.h>
 
 static inline u32 le32_to_cpuvp(const void *p)
 {
@@ -63,10 +62,10 @@ void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_init);
 
-int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 			   unsigned int keysize)
 {
-	struct chacha20_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int i;
 
 	if (keysize != CHACHA20_KEY_SIZE)
@@ -79,66 +78,54 @@ int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
 
-int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-			  struct scatterlist *src, unsigned int nbytes)
+int crypto_chacha20_crypt(struct skcipher_request *req)
 {
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	u32 state[16];
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+	err = skcipher_walk_virt(&walk, req, true);
 
-	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+	crypto_chacha20_init(state, ctx, walk.iv);
 
-	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
-				 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = blkcipher_walk_done(desc, &walk,
-					  walk.nbytes % CHACHA20_BLOCK_SIZE);
-	}
-
-	if (walk.nbytes) {
+	while (walk.nbytes > 0) {
 		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
 				 walk.nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
+		err = skcipher_walk_done(&walk, 0);
 	}
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
 
-static struct crypto_alg alg = {
-	.cra_name		= "chacha20",
-	.cra_driver_name	= "chacha20-generic",
-	.cra_priority		= 100,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_ctxsize		= sizeof(struct chacha20_ctx),
-	.cra_alignmask		= sizeof(u32) - 1,
-	.cra_module		= THIS_MODULE,
-	.cra_u			= {
-		.blkcipher = {
-			.min_keysize	= CHACHA20_KEY_SIZE,
-			.max_keysize	= CHACHA20_KEY_SIZE,
-			.ivsize		= CHACHA20_IV_SIZE,
-			.geniv		= "seqiv",
-			.setkey		= crypto_chacha20_setkey,
-			.encrypt	= crypto_chacha20_crypt,
-			.decrypt	= crypto_chacha20_crypt,
-		},
-	},
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-generic",
+	.base.cra_priority	= 100,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_alignmask	= sizeof(u32) - 1,
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= crypto_chacha20_crypt,
+	.decrypt		= crypto_chacha20_crypt,
 };
 
 static int __init chacha20_generic_mod_init(void)
 {
-	return crypto_register_alg(&alg);
+	return crypto_register_skcipher(&alg);
 }
 
 static void __exit chacha20_generic_mod_fini(void)
 {
-	crypto_unregister_alg(&alg);
+	crypto_unregister_skcipher(&alg);
 }
 
 module_init(chacha20_generic_mod_init);
diff --git a/crypto/cmac.c b/crypto/cmac.c
index 04080dca8f0c..16301f52858c 100644
--- a/crypto/cmac.c
+++ b/crypto/cmac.c
@@ -260,8 +260,7 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
 	if (err)
 		goto out_free_inst;
 
-	/* We access the data as u32s when xoring. */
-	alignmask = alg->cra_alignmask | (__alignof__(u32) - 1);
+	alignmask = alg->cra_alignmask;
 	inst->alg.base.cra_alignmask = alignmask;
 	inst->alg.base.cra_priority = alg->cra_priority;
 	inst->alg.base.cra_blocksize = alg->cra_blocksize;
diff --git a/crypto/ctr.c b/crypto/ctr.c
index a9a7a44f2783..a4f4a8983169 100644
--- a/crypto/ctr.c
+++ b/crypto/ctr.c
@@ -209,7 +209,7 @@ static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb)
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER;
 	inst->alg.cra_priority = alg->cra_priority;
 	inst->alg.cra_blocksize = 1;
-	inst->alg.cra_alignmask = alg->cra_alignmask | (__alignof__(u32) - 1);
+	inst->alg.cra_alignmask = alg->cra_alignmask;
 	inst->alg.cra_type = &crypto_blkcipher_type;
 
 	inst->alg.cra_blkcipher.ivsize = alg->cra_blocksize;
diff --git a/crypto/cts.c b/crypto/cts.c
index 00254d76b21b..243f591dc409 100644
--- a/crypto/cts.c
+++ b/crypto/cts.c
@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <crypto/scatterwalk.h>
 #include <linux/slab.h>
+#include <linux/compiler.h>
 
 struct crypto_cts_ctx {
 	struct crypto_skcipher *child;
@@ -103,7 +104,7 @@ static int cts_cbc_encrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_request *subreq = &rctx->subreq;
 	int bsize = crypto_skcipher_blocksize(tfm);
-	u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
+	u8 d[bsize * 2] __aligned(__alignof__(u32));
 	struct scatterlist *sg;
 	unsigned int offset;
 	int lastn;
@@ -183,7 +184,7 @@ static int cts_cbc_decrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_request *subreq = &rctx->subreq;
 	int bsize = crypto_skcipher_blocksize(tfm);
-	u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
+	u8 d[bsize * 2] __aligned(__alignof__(u32));
 	struct scatterlist *sg;
 	unsigned int offset;
 	u8 *space;
@@ -373,9 +374,6 @@ static int crypto_cts_create(struct crypto_template *tmpl, struct rtattr **tb)
 	inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
 
-	/* We access the data as u32s when xoring. */
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	inst->alg.ivsize = alg->base.cra_blocksize;
 	inst->alg.chunksize = crypto_skcipher_alg_chunksize(alg);
 	inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);
diff --git a/crypto/kpp.c b/crypto/kpp.c
index d36ce05eee43..a90edc27af77 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -19,6 +19,7 @@
 #include <linux/crypto.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 #include <crypto/kpp.h>
 #include <crypto/internal/kpp.h>
@@ -47,7 +48,7 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/pcbc.c b/crypto/pcbc.c
index e4538e07f7ca..29dd2b4a3b85 100644
--- a/crypto/pcbc.c
+++ b/crypto/pcbc.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/compiler.h>
 
 struct crypto_pcbc_ctx {
 	struct crypto_cipher *child;
@@ -146,7 +147,7 @@ static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req,
 	unsigned int nbytes = walk->nbytes;
 	u8 *src = walk->src.virt.addr;
 	u8 *iv = walk->iv;
-	u8 tmpbuf[bsize] __attribute__ ((aligned(__alignof__(u32))));
+	u8 tmpbuf[bsize] __aligned(__alignof__(u32));
 
 	do {
 		memcpy(tmpbuf, src, bsize);
@@ -259,9 +260,6 @@ static int crypto_pcbc_create(struct crypto_template *tmpl, struct rtattr **tb)
 	inst->alg.base.cra_blocksize = alg->cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->cra_alignmask;
 
-	/* We access the data as u32s when xoring. */
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	inst->alg.ivsize = alg->cra_blocksize;
 	inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
 	inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;
diff --git a/crypto/rng.c b/crypto/rng.c
index b81cffb13bab..f46dac5288b9 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -95,7 +96,7 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : rng\n");
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 35e396d154b7..6b048b36312d 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/compiler.h>
 #include <linux/vmalloc.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
@@ -57,7 +58,7 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
 {
diff --git a/crypto/seqiv.c b/crypto/seqiv.c
index c7049231861f..570b7d1aa0ca 100644
--- a/crypto/seqiv.c
+++ b/crypto/seqiv.c
@@ -153,8 +153,6 @@ static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb)
 	if (IS_ERR(inst))
 		return PTR_ERR(inst);
 
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	spawn = aead_instance_ctx(inst);
 	alg = crypto_spawn_aead_alg(spawn);
 
diff --git a/crypto/shash.c b/crypto/shash.c
index a051541a4a17..5e31c8d776df 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
+#include <linux/compiler.h>
 
 #include "internal.h"
 
@@ -67,7 +68,7 @@ EXPORT_SYMBOL_GPL(crypto_shash_setkey);
 static inline unsigned int shash_align_buffer_size(unsigned len,
 						   unsigned long mask)
 {
-	typedef u8 __attribute__ ((aligned)) u8_aligned;
+	typedef u8 __aligned_largest u8_aligned;
 	return len + (mask & ~(__alignof__(u8_aligned) - 1));
 }
 
@@ -80,7 +81,7 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
 	unsigned int unaligned_len = alignmask + 1 -
 				     ((unsigned long)data & alignmask);
 	u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -116,7 +117,7 @@ static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned int ds = crypto_shash_digestsize(tfm);
 	u8 ubuf[shash_align_buffer_size(ds, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -403,7 +404,7 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct shash_alg *salg = __crypto_shash_alg(alg);
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 0e1e6c35188e..014af741fc6a 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -19,6 +19,7 @@
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
@@ -185,12 +186,12 @@ void skcipher_walk_complete(struct skcipher_walk *walk, int err)
 		data = p->data;
 		if (!data) {
 			data = PTR_ALIGN(&p->buffer[0], walk->alignmask + 1);
-			data = skcipher_get_spot(data, walk->chunksize);
+			data = skcipher_get_spot(data, walk->stride);
 		}
 
 		scatterwalk_copychunks(data, &p->dst, p->len, 1);
 
-		if (offset_in_page(p->data) + p->len + walk->chunksize >
+		if (offset_in_page(p->data) + p->len + walk->stride >
 		    PAGE_SIZE)
 			free_page((unsigned long)p->data);
 
@@ -299,7 +300,7 @@ static int skcipher_next_copy(struct skcipher_walk *walk)
 	p->len = walk->nbytes;
 	skcipher_queue_write(walk, p);
 
-	if (offset_in_page(walk->page) + walk->nbytes + walk->chunksize >
+	if (offset_in_page(walk->page) + walk->nbytes + walk->stride >
 	    PAGE_SIZE)
 		walk->page = NULL;
 	else
@@ -344,7 +345,7 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
 			 SKCIPHER_WALK_DIFF);
 
 	n = walk->total;
-	bsize = min(walk->chunksize, max(n, walk->blocksize));
+	bsize = min(walk->stride, max(n, walk->blocksize));
 	n = scatterwalk_clamp(&walk->in, n);
 	n = scatterwalk_clamp(&walk->out, n);
 
@@ -393,7 +394,7 @@ static int skcipher_copy_iv(struct skcipher_walk *walk)
 	unsigned a = crypto_tfm_ctx_alignment() - 1;
 	unsigned alignmask = walk->alignmask;
 	unsigned ivsize = walk->ivsize;
-	unsigned bs = walk->chunksize;
+	unsigned bs = walk->stride;
 	unsigned aligned_bs;
 	unsigned size;
 	u8 *iv;
@@ -463,7 +464,7 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 		       SKCIPHER_WALK_SLEEP : 0;
 
 	walk->blocksize = crypto_skcipher_blocksize(tfm);
-	walk->chunksize = crypto_skcipher_chunksize(tfm);
+	walk->stride = crypto_skcipher_walksize(tfm);
 	walk->ivsize = crypto_skcipher_ivsize(tfm);
 	walk->alignmask = crypto_skcipher_alignmask(tfm);
 
@@ -525,7 +526,7 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
 		walk->flags &= ~SKCIPHER_WALK_SLEEP;
 
 	walk->blocksize = crypto_aead_blocksize(tfm);
-	walk->chunksize = crypto_aead_chunksize(tfm);
+	walk->stride = crypto_aead_chunksize(tfm);
 	walk->ivsize = crypto_aead_ivsize(tfm);
 	walk->alignmask = crypto_aead_alignmask(tfm);
 
@@ -807,7 +808,7 @@ static void crypto_skcipher_free_instance(struct crypto_instance *inst)
 }
 
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
@@ -821,6 +822,7 @@ static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "max keysize  : %u\n", skcipher->max_keysize);
 	seq_printf(m, "ivsize       : %u\n", skcipher->ivsize);
 	seq_printf(m, "chunksize    : %u\n", skcipher->chunksize);
+	seq_printf(m, "walksize     : %u\n", skcipher->walksize);
 }
 
 #ifdef CONFIG_NET
@@ -893,11 +895,14 @@ static int skcipher_prepare_alg(struct skcipher_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
 
-	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8)
+	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 ||
+	    alg->walksize > PAGE_SIZE / 8)
 		return -EINVAL;
 
 	if (!alg->chunksize)
 		alg->chunksize = base->cra_blocksize;
+	if (!alg->walksize)
+		alg->walksize = alg->chunksize;
 
 	base->cra_type = &crypto_skcipher_type2;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index ae22f05d5936..9a11f3c2bf98 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -22,6 +22,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <crypto/aead.h>
 #include <crypto/hash.h>
 #include <crypto/skcipher.h>
@@ -1010,6 +1012,8 @@ static inline int tcrypt_test(const char *alg)
 {
 	int ret;
 
+	pr_debug("testing %s\n", alg);
+
 	ret = alg_test(alg, alg, 0, 0);
 	/* non-fips algs return -EINVAL in fips mode */
 	if (fips_enabled && ret == -EINVAL)
@@ -2059,6 +2063,8 @@ static int __init tcrypt_mod_init(void)
 	if (err) {
 		printk(KERN_ERR "tcrypt: one or more tests failed!\n");
 		goto err_free_tv;
+	} else {
+		pr_debug("all tests passed\n");
 	}
 
 	/* We intentionaly return -EAGAIN to prevent keeping the module,
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 44e888b0b041..f9c378af3907 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -265,6 +265,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 		       const int align_offset)
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm));
+	size_t digest_size = crypto_ahash_digestsize(tfm);
 	unsigned int i, j, k, temp;
 	struct scatterlist sg[8];
 	char *result;
@@ -275,7 +276,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 	char *xbuf[XBUFSIZE];
 	int ret = -ENOMEM;
 
-	result = kmalloc(MAX_DIGEST_SIZE, GFP_KERNEL);
+	result = kmalloc(digest_size, GFP_KERNEL);
 	if (!result)
 		return ret;
 	key = kmalloc(MAX_KEYLEN, GFP_KERNEL);
@@ -305,7 +306,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			goto out;
 
 		j++;
-		memset(result, 0, MAX_DIGEST_SIZE);
+		memset(result, 0, digest_size);
 
 		hash_buff = xbuf[0];
 		hash_buff += align_offset;
@@ -380,7 +381,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			continue;
 
 		j++;
-		memset(result, 0, MAX_DIGEST_SIZE);
+		memset(result, 0, digest_size);
 
 		temp = 0;
 		sg_init_table(sg, template[i].np);
@@ -458,7 +459,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			continue;
 
 		j++;
-		memset(result, 0, MAX_DIGEST_SIZE);
+		memset(result, 0, digest_size);
 
 		ret = -EINVAL;
 		hash_buff = xbuf[0];
@@ -1463,13 +1464,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 		int ilen = ctemplate[i].inlen;
 		void *input_vec;
 
-		input_vec = kmalloc(ilen, GFP_KERNEL);
+		input_vec = kmemdup(ctemplate[i].input, ilen, GFP_KERNEL);
 		if (!input_vec) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-		memcpy(input_vec, ctemplate[i].input, ilen);
 		memset(output, 0, dlen);
 		init_completion(&result.completion);
 		sg_init_one(&src, input_vec, ilen);
@@ -1525,13 +1525,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 		int ilen = dtemplate[i].inlen;
 		void *input_vec;
 
-		input_vec = kmalloc(ilen, GFP_KERNEL);
+		input_vec = kmemdup(dtemplate[i].input, ilen, GFP_KERNEL);
 		if (!input_vec) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-		memcpy(input_vec, dtemplate[i].input, ilen);
 		memset(output, 0, dlen);
 		init_completion(&result.completion);
 		sg_init_one(&src, input_vec, ilen);
@@ -2251,30 +2250,23 @@ static int alg_test_null(const struct alg_test_desc *desc,
 	return 0;
 }
 
+#define __VECS(tv)	{ .vecs = tv, .count = ARRAY_SIZE(tv) }
+
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
 		.alg = "ansi_cprng",
 		.test = alg_test_cprng,
 		.suite = {
-			.cprng = {
-				.vecs = ansi_cprng_aes_tv_template,
-				.count = ANSI_CPRNG_AES_TEST_VECTORS
-			}
+			.cprng = __VECS(ansi_cprng_aes_tv_template)
 		}
 	}, {
 		.alg = "authenc(hmac(md5),ecb(cipher_null))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = hmac_md5_ecb_cipher_null_enc_tv_template,
-					.count = HMAC_MD5_ECB_CIPHER_NULL_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = hmac_md5_ecb_cipher_null_dec_tv_template,
-					.count = HMAC_MD5_ECB_CIPHER_NULL_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(hmac_md5_ecb_cipher_null_enc_tv_template),
+				.dec = __VECS(hmac_md5_ecb_cipher_null_dec_tv_template)
 			}
 		}
 	}, {
@@ -2282,12 +2274,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_aes_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA1_AES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_aes_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2295,12 +2282,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA1_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2309,12 +2291,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA1_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2326,18 +2303,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_ecb_cipher_null_enc_tv_temp,
-					.count =
-					HMAC_SHA1_ECB_CIPHER_NULL_ENC_TEST_VEC
-				},
-				.dec = {
-					.vecs =
-					hmac_sha1_ecb_cipher_null_dec_tv_temp,
-					.count =
-					HMAC_SHA1_ECB_CIPHER_NULL_DEC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_ecb_cipher_null_enc_tv_temp),
+				.dec = __VECS(hmac_sha1_ecb_cipher_null_dec_tv_temp)
 			}
 		}
 	}, {
@@ -2349,12 +2316,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha224_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA224_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha224_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2363,12 +2325,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha224_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA224_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha224_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2377,12 +2334,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha256_aes_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA256_AES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha256_aes_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2390,12 +2342,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha256_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA256_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha256_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2404,12 +2351,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha256_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA256_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha256_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2425,12 +2367,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha384_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA384_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha384_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2439,12 +2376,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha384_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA384_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha384_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2461,12 +2393,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha512_aes_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA512_AES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha512_aes_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2474,12 +2401,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha512_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA512_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha512_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2488,12 +2410,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha512_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA512_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha512_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2510,14 +2427,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_cbc_enc_tv_template,
-					.count = AES_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_cbc_dec_tv_template,
-					.count = AES_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_cbc_enc_tv_template),
+				.dec = __VECS(aes_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2525,14 +2436,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = anubis_cbc_enc_tv_template,
-					.count = ANUBIS_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = anubis_cbc_dec_tv_template,
-					.count = ANUBIS_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(anubis_cbc_enc_tv_template),
+				.dec = __VECS(anubis_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2540,14 +2445,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = bf_cbc_enc_tv_template,
-					.count = BF_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = bf_cbc_dec_tv_template,
-					.count = BF_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(bf_cbc_enc_tv_template),
+				.dec = __VECS(bf_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2555,14 +2454,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_cbc_enc_tv_template,
-					.count = CAMELLIA_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_cbc_dec_tv_template,
-					.count = CAMELLIA_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_cbc_enc_tv_template),
+				.dec = __VECS(camellia_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2570,14 +2463,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast5_cbc_enc_tv_template,
-					.count = CAST5_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast5_cbc_dec_tv_template,
-					.count = CAST5_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast5_cbc_enc_tv_template),
+				.dec = __VECS(cast5_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2585,14 +2472,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_cbc_enc_tv_template,
-					.count = CAST6_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_cbc_dec_tv_template,
-					.count = CAST6_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_cbc_enc_tv_template),
+				.dec = __VECS(cast6_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2600,14 +2481,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des_cbc_enc_tv_template,
-					.count = DES_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des_cbc_dec_tv_template,
-					.count = DES_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des_cbc_enc_tv_template),
+				.dec = __VECS(des_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2616,14 +2491,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des3_ede_cbc_enc_tv_template,
-					.count = DES3_EDE_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des3_ede_cbc_dec_tv_template,
-					.count = DES3_EDE_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des3_ede_cbc_enc_tv_template),
+				.dec = __VECS(des3_ede_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2631,14 +2500,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_cbc_enc_tv_template,
-					.count = SERPENT_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_cbc_dec_tv_template,
-					.count = SERPENT_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_cbc_enc_tv_template),
+				.dec = __VECS(serpent_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2646,30 +2509,25 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_cbc_enc_tv_template,
-					.count = TF_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_cbc_dec_tv_template,
-					.count = TF_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_cbc_enc_tv_template),
+				.dec = __VECS(tf_cbc_dec_tv_template)
 			}
 		}
+	}, {
+		.alg = "cbcmac(aes)",
+		.fips_allowed = 1,
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(aes_cbcmac_tv_template)
+		}
 	}, {
 		.alg = "ccm(aes)",
 		.test = alg_test_aead,
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_ccm_enc_tv_template,
-					.count = AES_CCM_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ccm_dec_tv_template,
-					.count = AES_CCM_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ccm_enc_tv_template),
+				.dec = __VECS(aes_ccm_dec_tv_template)
 			}
 		}
 	}, {
@@ -2677,14 +2535,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = chacha20_enc_tv_template,
-					.count = CHACHA20_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = chacha20_enc_tv_template,
-					.count = CHACHA20_ENC_TEST_VECTORS
-				},
+				.enc = __VECS(chacha20_enc_tv_template),
+				.dec = __VECS(chacha20_enc_tv_template),
 			}
 		}
 	}, {
@@ -2692,20 +2544,14 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = aes_cmac128_tv_template,
-				.count = CMAC_AES_TEST_VECTORS
-			}
+			.hash = __VECS(aes_cmac128_tv_template)
 		}
 	}, {
 		.alg = "cmac(des3_ede)",
 		.fips_allowed = 1,
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = des3_ede_cmac64_tv_template,
-				.count = CMAC_DES3_EDE_TEST_VECTORS
-			}
+			.hash = __VECS(des3_ede_cmac64_tv_template)
 		}
 	}, {
 		.alg = "compress_null",
@@ -2714,30 +2560,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "crc32",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = crc32_tv_template,
-				.count = CRC32_TEST_VECTORS
-			}
+			.hash = __VECS(crc32_tv_template)
 		}
 	}, {
 		.alg = "crc32c",
 		.test = alg_test_crc32c,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = crc32c_tv_template,
-				.count = CRC32C_TEST_VECTORS
-			}
+			.hash = __VECS(crc32c_tv_template)
 		}
 	}, {
 		.alg = "crct10dif",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = crct10dif_tv_template,
-				.count = CRCT10DIF_TEST_VECTORS
-			}
+			.hash = __VECS(crct10dif_tv_template)
 		}
 	}, {
 		.alg = "ctr(aes)",
@@ -2745,14 +2582,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_ctr_enc_tv_template,
-					.count = AES_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ctr_dec_tv_template,
-					.count = AES_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ctr_enc_tv_template),
+				.dec = __VECS(aes_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2760,14 +2591,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = bf_ctr_enc_tv_template,
-					.count = BF_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = bf_ctr_dec_tv_template,
-					.count = BF_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(bf_ctr_enc_tv_template),
+				.dec = __VECS(bf_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2775,14 +2600,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_ctr_enc_tv_template,
-					.count = CAMELLIA_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_ctr_dec_tv_template,
-					.count = CAMELLIA_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_ctr_enc_tv_template),
+				.dec = __VECS(camellia_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2790,14 +2609,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast5_ctr_enc_tv_template,
-					.count = CAST5_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast5_ctr_dec_tv_template,
-					.count = CAST5_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast5_ctr_enc_tv_template),
+				.dec = __VECS(cast5_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2805,14 +2618,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_ctr_enc_tv_template,
-					.count = CAST6_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_ctr_dec_tv_template,
-					.count = CAST6_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_ctr_enc_tv_template),
+				.dec = __VECS(cast6_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2820,14 +2627,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des_ctr_enc_tv_template,
-					.count = DES_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des_ctr_dec_tv_template,
-					.count = DES_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des_ctr_enc_tv_template),
+				.dec = __VECS(des_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2835,14 +2636,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des3_ede_ctr_enc_tv_template,
-					.count = DES3_EDE_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des3_ede_ctr_dec_tv_template,
-					.count = DES3_EDE_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des3_ede_ctr_enc_tv_template),
+				.dec = __VECS(des3_ede_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2850,14 +2645,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_ctr_enc_tv_template,
-					.count = SERPENT_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_ctr_dec_tv_template,
-					.count = SERPENT_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_ctr_enc_tv_template),
+				.dec = __VECS(serpent_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2865,14 +2654,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_ctr_enc_tv_template,
-					.count = TF_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_ctr_dec_tv_template,
-					.count = TF_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_ctr_enc_tv_template),
+				.dec = __VECS(tf_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2880,14 +2663,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cts_mode_enc_tv_template,
-					.count = CTS_MODE_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cts_mode_dec_tv_template,
-					.count = CTS_MODE_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cts_mode_enc_tv_template),
+				.dec = __VECS(cts_mode_dec_tv_template)
 			}
 		}
 	}, {
@@ -2896,14 +2673,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = deflate_comp_tv_template,
-					.count = DEFLATE_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = deflate_decomp_tv_template,
-					.count = DEFLATE_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(deflate_comp_tv_template),
+				.decomp = __VECS(deflate_decomp_tv_template)
 			}
 		}
 	}, {
@@ -2911,10 +2682,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_kpp,
 		.fips_allowed = 1,
 		.suite = {
-			.kpp = {
-				.vecs = dh_tv_template,
-				.count = DH_TEST_VECTORS
-			}
+			.kpp = __VECS(dh_tv_template)
 		}
 	}, {
 		.alg = "digest_null",
@@ -2924,30 +2692,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_ctr_aes128_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_ctr_aes128_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_ctr_aes128_tv_template)
 		}
 	}, {
 		.alg = "drbg_nopr_ctr_aes192",
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_ctr_aes192_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_ctr_aes192_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_ctr_aes192_tv_template)
 		}
 	}, {
 		.alg = "drbg_nopr_ctr_aes256",
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_ctr_aes256_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_ctr_aes256_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_ctr_aes256_tv_template)
 		}
 	}, {
 		/*
@@ -2962,11 +2721,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_hmac_sha256_tv_template,
-				.count =
-				ARRAY_SIZE(drbg_nopr_hmac_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_hmac_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_nopr_hmac_sha256 test */
@@ -2986,10 +2741,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_sha256_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_nopr_sha256 test */
@@ -3005,10 +2757,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_pr_ctr_aes128_tv_template,
-				.count = ARRAY_SIZE(drbg_pr_ctr_aes128_tv_template)
-			}
+			.drbg = __VECS(drbg_pr_ctr_aes128_tv_template)
 		}
 	}, {
 		/* covered by drbg_pr_ctr_aes128 test */
@@ -3028,10 +2777,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_pr_hmac_sha256_tv_template,
-				.count = ARRAY_SIZE(drbg_pr_hmac_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_pr_hmac_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_pr_hmac_sha256 test */
@@ -3051,10 +2797,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_pr_sha256_tv_template,
-				.count = ARRAY_SIZE(drbg_pr_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_pr_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_pr_sha256 test */
@@ -3071,14 +2814,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_enc_tv_template,
-					.count = AES_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_dec_tv_template,
-					.count = AES_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_enc_tv_template),
+				.dec = __VECS(aes_dec_tv_template)
 			}
 		}
 	}, {
@@ -3086,14 +2823,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = anubis_enc_tv_template,
-					.count = ANUBIS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = anubis_dec_tv_template,
-					.count = ANUBIS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(anubis_enc_tv_template),
+				.dec = __VECS(anubis_dec_tv_template)
 			}
 		}
 	}, {
@@ -3101,14 +2832,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = arc4_enc_tv_template,
-					.count = ARC4_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = arc4_dec_tv_template,
-					.count = ARC4_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(arc4_enc_tv_template),
+				.dec = __VECS(arc4_dec_tv_template)
 			}
 		}
 	}, {
@@ -3116,14 +2841,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = bf_enc_tv_template,
-					.count = BF_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = bf_dec_tv_template,
-					.count = BF_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(bf_enc_tv_template),
+				.dec = __VECS(bf_dec_tv_template)
 			}
 		}
 	}, {
@@ -3131,14 +2850,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_enc_tv_template,
-					.count = CAMELLIA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_dec_tv_template,
-					.count = CAMELLIA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_enc_tv_template),
+				.dec = __VECS(camellia_dec_tv_template)
 			}
 		}
 	}, {
@@ -3146,14 +2859,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast5_enc_tv_template,
-					.count = CAST5_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast5_dec_tv_template,
-					.count = CAST5_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast5_enc_tv_template),
+				.dec = __VECS(cast5_dec_tv_template)
 			}
 		}
 	}, {
@@ -3161,14 +2868,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_enc_tv_template,
-					.count = CAST6_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_dec_tv_template,
-					.count = CAST6_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_enc_tv_template),
+				.dec = __VECS(cast6_dec_tv_template)
 			}
 		}
 	}, {
@@ -3179,14 +2880,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des_enc_tv_template,
-					.count = DES_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des_dec_tv_template,
-					.count = DES_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des_enc_tv_template),
+				.dec = __VECS(des_dec_tv_template)
 			}
 		}
 	}, {
@@ -3195,14 +2890,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des3_ede_enc_tv_template,
-					.count = DES3_EDE_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des3_ede_dec_tv_template,
-					.count = DES3_EDE_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des3_ede_enc_tv_template),
+				.dec = __VECS(des3_ede_dec_tv_template)
 			}
 		}
 	}, {
@@ -3225,14 +2914,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = khazad_enc_tv_template,
-					.count = KHAZAD_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = khazad_dec_tv_template,
-					.count = KHAZAD_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(khazad_enc_tv_template),
+				.dec = __VECS(khazad_dec_tv_template)
 			}
 		}
 	}, {
@@ -3240,14 +2923,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = seed_enc_tv_template,
-					.count = SEED_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = seed_dec_tv_template,
-					.count = SEED_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(seed_enc_tv_template),
+				.dec = __VECS(seed_dec_tv_template)
 			}
 		}
 	}, {
@@ -3255,14 +2932,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_enc_tv_template,
-					.count = SERPENT_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_dec_tv_template,
-					.count = SERPENT_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_enc_tv_template),
+				.dec = __VECS(serpent_dec_tv_template)
 			}
 		}
 	}, {
@@ -3270,14 +2941,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tea_enc_tv_template,
-					.count = TEA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tea_dec_tv_template,
-					.count = TEA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tea_enc_tv_template),
+				.dec = __VECS(tea_dec_tv_template)
 			}
 		}
 	}, {
@@ -3285,14 +2950,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tnepres_enc_tv_template,
-					.count = TNEPRES_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tnepres_dec_tv_template,
-					.count = TNEPRES_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tnepres_enc_tv_template),
+				.dec = __VECS(tnepres_dec_tv_template)
 			}
 		}
 	}, {
@@ -3300,14 +2959,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_enc_tv_template,
-					.count = TF_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_dec_tv_template,
-					.count = TF_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_enc_tv_template),
+				.dec = __VECS(tf_dec_tv_template)
 			}
 		}
 	}, {
@@ -3315,14 +2968,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = xeta_enc_tv_template,
-					.count = XETA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = xeta_dec_tv_template,
-					.count = XETA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(xeta_enc_tv_template),
+				.dec = __VECS(xeta_dec_tv_template)
 			}
 		}
 	}, {
@@ -3330,14 +2977,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = xtea_enc_tv_template,
-					.count = XTEA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = xtea_dec_tv_template,
-					.count = XTEA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(xtea_enc_tv_template),
+				.dec = __VECS(xtea_dec_tv_template)
 			}
 		}
 	}, {
@@ -3345,10 +2986,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_kpp,
 		.fips_allowed = 1,
 		.suite = {
-			.kpp = {
-				.vecs = ecdh_tv_template,
-				.count = ECDH_TEST_VECTORS
-			}
+			.kpp = __VECS(ecdh_tv_template)
 		}
 	}, {
 		.alg = "gcm(aes)",
@@ -3356,14 +2994,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_gcm_enc_tv_template,
-					.count = AES_GCM_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_gcm_dec_tv_template,
-					.count = AES_GCM_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_gcm_enc_tv_template),
+				.dec = __VECS(aes_gcm_dec_tv_template)
 			}
 		}
 	}, {
@@ -3371,136 +3003,94 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = ghash_tv_template,
-				.count = GHASH_TEST_VECTORS
-			}
+			.hash = __VECS(ghash_tv_template)
 		}
 	}, {
 		.alg = "hmac(crc32)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = bfin_crc_tv_template,
-				.count = BFIN_CRC_TEST_VECTORS
-			}
+			.hash = __VECS(bfin_crc_tv_template)
 		}
 	}, {
 		.alg = "hmac(md5)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = hmac_md5_tv_template,
-				.count = HMAC_MD5_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_md5_tv_template)
 		}
 	}, {
 		.alg = "hmac(rmd128)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = hmac_rmd128_tv_template,
-				.count = HMAC_RMD128_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_rmd128_tv_template)
 		}
 	}, {
 		.alg = "hmac(rmd160)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = hmac_rmd160_tv_template,
-				.count = HMAC_RMD160_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_rmd160_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha1)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha1_tv_template,
-				.count = HMAC_SHA1_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha1_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha224)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha224_tv_template,
-				.count = HMAC_SHA224_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha224_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha256)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha256_tv_template,
-				.count = HMAC_SHA256_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha256_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-224)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_224_tv_template,
-				.count = HMAC_SHA3_224_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_224_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-256)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_256_tv_template,
-				.count = HMAC_SHA3_256_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_256_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-384)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_384_tv_template,
-				.count = HMAC_SHA3_384_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_384_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-512)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_512_tv_template,
-				.count = HMAC_SHA3_512_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_512_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha384)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha384_tv_template,
-				.count = HMAC_SHA384_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha384_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha512)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha512_tv_template,
-				.count = HMAC_SHA512_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha512_tv_template)
 		}
 	}, {
 		.alg = "jitterentropy_rng",
@@ -3512,14 +3102,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_kw_enc_tv_template,
-					.count = ARRAY_SIZE(aes_kw_enc_tv_template)
-				},
-				.dec = {
-					.vecs = aes_kw_dec_tv_template,
-					.count = ARRAY_SIZE(aes_kw_dec_tv_template)
-				}
+				.enc = __VECS(aes_kw_enc_tv_template),
+				.dec = __VECS(aes_kw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3527,14 +3111,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_lrw_enc_tv_template,
-					.count = AES_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_lrw_dec_tv_template,
-					.count = AES_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_lrw_enc_tv_template),
+				.dec = __VECS(aes_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3542,14 +3120,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_lrw_enc_tv_template,
-					.count = CAMELLIA_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_lrw_dec_tv_template,
-					.count = CAMELLIA_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_lrw_enc_tv_template),
+				.dec = __VECS(camellia_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3557,14 +3129,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_lrw_enc_tv_template,
-					.count = CAST6_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_lrw_dec_tv_template,
-					.count = CAST6_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_lrw_enc_tv_template),
+				.dec = __VECS(cast6_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3572,14 +3138,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_lrw_enc_tv_template,
-					.count = SERPENT_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_lrw_dec_tv_template,
-					.count = SERPENT_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_lrw_enc_tv_template),
+				.dec = __VECS(serpent_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3587,14 +3147,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_lrw_enc_tv_template,
-					.count = TF_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_lrw_dec_tv_template,
-					.count = TF_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_lrw_enc_tv_template),
+				.dec = __VECS(tf_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3603,14 +3157,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = lz4_comp_tv_template,
-					.count = LZ4_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = lz4_decomp_tv_template,
-					.count = LZ4_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(lz4_comp_tv_template),
+				.decomp = __VECS(lz4_decomp_tv_template)
 			}
 		}
 	}, {
@@ -3619,14 +3167,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = lz4hc_comp_tv_template,
-					.count = LZ4HC_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = lz4hc_decomp_tv_template,
-					.count = LZ4HC_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(lz4hc_comp_tv_template),
+				.decomp = __VECS(lz4hc_decomp_tv_template)
 			}
 		}
 	}, {
@@ -3635,42 +3177,27 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = lzo_comp_tv_template,
-					.count = LZO_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = lzo_decomp_tv_template,
-					.count = LZO_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(lzo_comp_tv_template),
+				.decomp = __VECS(lzo_decomp_tv_template)
 			}
 		}
 	}, {
 		.alg = "md4",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = md4_tv_template,
-				.count = MD4_TEST_VECTORS
-			}
+			.hash = __VECS(md4_tv_template)
 		}
 	}, {
 		.alg = "md5",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = md5_tv_template,
-				.count = MD5_TEST_VECTORS
-			}
+			.hash = __VECS(md5_tv_template)
 		}
 	}, {
 		.alg = "michael_mic",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = michael_mic_tv_template,
-				.count = MICHAEL_MIC_TEST_VECTORS
-			}
+			.hash = __VECS(michael_mic_tv_template)
 		}
 	}, {
 		.alg = "ofb(aes)",
@@ -3678,14 +3205,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_ofb_enc_tv_template,
-					.count = AES_OFB_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ofb_dec_tv_template,
-					.count = AES_OFB_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ofb_enc_tv_template),
+				.dec = __VECS(aes_ofb_dec_tv_template)
 			}
 		}
 	}, {
@@ -3693,24 +3214,15 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = fcrypt_pcbc_enc_tv_template,
-					.count = FCRYPT_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = fcrypt_pcbc_dec_tv_template,
-					.count = FCRYPT_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(fcrypt_pcbc_enc_tv_template),
+				.dec = __VECS(fcrypt_pcbc_dec_tv_template)
 			}
 		}
 	}, {
 		.alg = "poly1305",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = poly1305_tv_template,
-				.count = POLY1305_TEST_VECTORS
-			}
+			.hash = __VECS(poly1305_tv_template)
 		}
 	}, {
 		.alg = "rfc3686(ctr(aes))",
@@ -3718,14 +3230,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_ctr_rfc3686_enc_tv_template,
-					.count = AES_CTR_3686_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ctr_rfc3686_dec_tv_template,
-					.count = AES_CTR_3686_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ctr_rfc3686_enc_tv_template),
+				.dec = __VECS(aes_ctr_rfc3686_dec_tv_template)
 			}
 		}
 	}, {
@@ -3734,14 +3240,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_gcm_rfc4106_enc_tv_template,
-					.count = AES_GCM_4106_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_gcm_rfc4106_dec_tv_template,
-					.count = AES_GCM_4106_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_gcm_rfc4106_enc_tv_template),
+				.dec = __VECS(aes_gcm_rfc4106_dec_tv_template)
 			}
 		}
 	}, {
@@ -3750,14 +3250,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_ccm_rfc4309_enc_tv_template,
-					.count = AES_CCM_4309_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ccm_rfc4309_dec_tv_template,
-					.count = AES_CCM_4309_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ccm_rfc4309_enc_tv_template),
+				.dec = __VECS(aes_ccm_rfc4309_dec_tv_template)
 			}
 		}
 	}, {
@@ -3765,14 +3259,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_gcm_rfc4543_enc_tv_template,
-					.count = AES_GCM_4543_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_gcm_rfc4543_dec_tv_template,
-					.count = AES_GCM_4543_DEC_TEST_VECTORS
-				},
+				.enc = __VECS(aes_gcm_rfc4543_enc_tv_template),
+				.dec = __VECS(aes_gcm_rfc4543_dec_tv_template),
 			}
 		}
 	}, {
@@ -3780,14 +3268,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = rfc7539_enc_tv_template,
-					.count = RFC7539_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = rfc7539_dec_tv_template,
-					.count = RFC7539_DEC_TEST_VECTORS
-				},
+				.enc = __VECS(rfc7539_enc_tv_template),
+				.dec = __VECS(rfc7539_dec_tv_template),
 			}
 		}
 	}, {
@@ -3795,71 +3277,47 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = rfc7539esp_enc_tv_template,
-					.count = RFC7539ESP_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = rfc7539esp_dec_tv_template,
-					.count = RFC7539ESP_DEC_TEST_VECTORS
-				},
+				.enc = __VECS(rfc7539esp_enc_tv_template),
+				.dec = __VECS(rfc7539esp_dec_tv_template),
 			}
 		}
 	}, {
 		.alg = "rmd128",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd128_tv_template,
-				.count = RMD128_TEST_VECTORS
-			}
+			.hash = __VECS(rmd128_tv_template)
 		}
 	}, {
 		.alg = "rmd160",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd160_tv_template,
-				.count = RMD160_TEST_VECTORS
-			}
+			.hash = __VECS(rmd160_tv_template)
 		}
 	}, {
 		.alg = "rmd256",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd256_tv_template,
-				.count = RMD256_TEST_VECTORS
-			}
+			.hash = __VECS(rmd256_tv_template)
 		}
 	}, {
 		.alg = "rmd320",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd320_tv_template,
-				.count = RMD320_TEST_VECTORS
-			}
+			.hash = __VECS(rmd320_tv_template)
 		}
 	}, {
 		.alg = "rsa",
 		.test = alg_test_akcipher,
 		.fips_allowed = 1,
 		.suite = {
-			.akcipher = {
-				.vecs = rsa_tv_template,
-				.count = RSA_TEST_VECTORS
-			}
+			.akcipher = __VECS(rsa_tv_template)
 		}
 	}, {
 		.alg = "salsa20",
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = salsa20_stream_enc_tv_template,
-					.count = SALSA20_STREAM_ENC_TEST_VECTORS
-				}
+				.enc = __VECS(salsa20_stream_enc_tv_template)
 			}
 		}
 	}, {
@@ -3867,162 +3325,111 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha1_tv_template,
-				.count = SHA1_TEST_VECTORS
-			}
+			.hash = __VECS(sha1_tv_template)
 		}
 	}, {
 		.alg = "sha224",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha224_tv_template,
-				.count = SHA224_TEST_VECTORS
-			}
+			.hash = __VECS(sha224_tv_template)
 		}
 	}, {
 		.alg = "sha256",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha256_tv_template,
-				.count = SHA256_TEST_VECTORS
-			}
+			.hash = __VECS(sha256_tv_template)
 		}
 	}, {
 		.alg = "sha3-224",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_224_tv_template,
-				.count = SHA3_224_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_224_tv_template)
 		}
 	}, {
 		.alg = "sha3-256",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_256_tv_template,
-				.count = SHA3_256_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_256_tv_template)
 		}
 	}, {
 		.alg = "sha3-384",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_384_tv_template,
-				.count = SHA3_384_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_384_tv_template)
 		}
 	}, {
 		.alg = "sha3-512",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_512_tv_template,
-				.count = SHA3_512_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_512_tv_template)
 		}
 	}, {
 		.alg = "sha384",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha384_tv_template,
-				.count = SHA384_TEST_VECTORS
-			}
+			.hash = __VECS(sha384_tv_template)
 		}
 	}, {
 		.alg = "sha512",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha512_tv_template,
-				.count = SHA512_TEST_VECTORS
-			}
+			.hash = __VECS(sha512_tv_template)
 		}
 	}, {
 		.alg = "tgr128",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = tgr128_tv_template,
-				.count = TGR128_TEST_VECTORS
-			}
+			.hash = __VECS(tgr128_tv_template)
 		}
 	}, {
 		.alg = "tgr160",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = tgr160_tv_template,
-				.count = TGR160_TEST_VECTORS
-			}
+			.hash = __VECS(tgr160_tv_template)
 		}
 	}, {
 		.alg = "tgr192",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = tgr192_tv_template,
-				.count = TGR192_TEST_VECTORS
-			}
+			.hash = __VECS(tgr192_tv_template)
 		}
 	}, {
 		.alg = "vmac(aes)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = aes_vmac128_tv_template,
-				.count = VMAC_AES_TEST_VECTORS
-			}
+			.hash = __VECS(aes_vmac128_tv_template)
 		}
 	}, {
 		.alg = "wp256",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = wp256_tv_template,
-				.count = WP256_TEST_VECTORS
-			}
+			.hash = __VECS(wp256_tv_template)
 		}
 	}, {
 		.alg = "wp384",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = wp384_tv_template,
-				.count = WP384_TEST_VECTORS
-			}
+			.hash = __VECS(wp384_tv_template)
 		}
 	}, {
 		.alg = "wp512",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = wp512_tv_template,
-				.count = WP512_TEST_VECTORS
-			}
+			.hash = __VECS(wp512_tv_template)
 		}
 	}, {
 		.alg = "xcbc(aes)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = aes_xcbc128_tv_template,
-				.count = XCBC_AES_TEST_VECTORS
-			}
+			.hash = __VECS(aes_xcbc128_tv_template)
 		}
 	}, {
 		.alg = "xts(aes)",
@@ -4030,14 +3437,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_xts_enc_tv_template,
-					.count = AES_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_xts_dec_tv_template,
-					.count = AES_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_xts_enc_tv_template),
+				.dec = __VECS(aes_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4045,14 +3446,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_xts_enc_tv_template,
-					.count = CAMELLIA_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_xts_dec_tv_template,
-					.count = CAMELLIA_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_xts_enc_tv_template),
+				.dec = __VECS(camellia_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4060,14 +3455,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_xts_enc_tv_template,
-					.count = CAST6_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_xts_dec_tv_template,
-					.count = CAST6_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_xts_enc_tv_template),
+				.dec = __VECS(cast6_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4075,14 +3464,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_xts_enc_tv_template,
-					.count = SERPENT_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_xts_dec_tv_template,
-					.count = SERPENT_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_xts_enc_tv_template),
+				.dec = __VECS(serpent_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4090,14 +3473,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_xts_enc_tv_template,
-					.count = TF_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_xts_dec_tv_template,
-					.count = TF_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_xts_enc_tv_template),
+				.dec = __VECS(tf_xts_dec_tv_template)
 			}
 		}
 	}
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 9b656be7f52f..f85e51cf7dcc 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -151,11 +151,6 @@ static char zeroed_string[48];
 /*
  * RSA test vectors. Borrowed from openSSL.
  */
-#ifdef CONFIG_CRYPTO_FIPS
-#define RSA_TEST_VECTORS	2
-#else
-#define RSA_TEST_VECTORS	5
-#endif
 static struct akcipher_testvec rsa_tv_template[] = {
 	{
 #ifndef CONFIG_CRYPTO_FIPS
@@ -340,6 +335,7 @@ static struct akcipher_testvec rsa_tv_template[] = {
 	.m_size = 8,
 	.c_size = 256,
 	.public_key_vec = true,
+#ifndef CONFIG_CRYPTO_FIPS
 	}, {
 	.key =
 	"\x30\x82\x09\x29" /* sequence of 2345 bytes */
@@ -538,11 +534,10 @@ static struct akcipher_testvec rsa_tv_template[] = {
 	.key_len = 2349,
 	.m_size = 8,
 	.c_size = 512,
+#endif
 	}
 };
 
-#define DH_TEST_VECTORS 2
-
 struct kpp_testvec dh_tv_template[] = {
 	{
 	.secret =
@@ -760,11 +755,6 @@ struct kpp_testvec dh_tv_template[] = {
 	}
 };
 
-#ifdef CONFIG_CRYPTO_FIPS
-#define ECDH_TEST_VECTORS 1
-#else
-#define ECDH_TEST_VECTORS 2
-#endif
 struct kpp_testvec ecdh_tv_template[] = {
 	{
 #ifndef CONFIG_CRYPTO_FIPS
@@ -856,8 +846,6 @@ struct kpp_testvec ecdh_tv_template[] = {
 /*
  * MD4 test vectors from RFC1320
  */
-#define MD4_TEST_VECTORS	7
-
 static struct hash_testvec md4_tv_template [] = {
 	{
 		.plaintext = "",
@@ -899,7 +887,6 @@ static struct hash_testvec md4_tv_template [] = {
 	},
 };
 
-#define SHA3_224_TEST_VECTORS	3
 static struct hash_testvec sha3_224_tv_template[] = {
 	{
 		.plaintext = "",
@@ -925,7 +912,6 @@ static struct hash_testvec sha3_224_tv_template[] = {
 	},
 };
 
-#define SHA3_256_TEST_VECTORS	3
 static struct hash_testvec sha3_256_tv_template[] = {
 	{
 		.plaintext = "",
@@ -952,7 +938,6 @@ static struct hash_testvec sha3_256_tv_template[] = {
 };
 
 
-#define SHA3_384_TEST_VECTORS	3
 static struct hash_testvec sha3_384_tv_template[] = {
 	{
 		.plaintext = "",
@@ -985,7 +970,6 @@ static struct hash_testvec sha3_384_tv_template[] = {
 };
 
 
-#define SHA3_512_TEST_VECTORS	3
 static struct hash_testvec sha3_512_tv_template[] = {
 	{
 		.plaintext = "",
@@ -1027,8 +1011,6 @@ static struct hash_testvec sha3_512_tv_template[] = {
 /*
  * MD5 test vectors from RFC1321
  */
-#define MD5_TEST_VECTORS	7
-
 static struct hash_testvec md5_tv_template[] = {
 	{
 		.digest	= "\xd4\x1d\x8c\xd9\x8f\x00\xb2\x04"
@@ -1073,8 +1055,6 @@ static struct hash_testvec md5_tv_template[] = {
 /*
  * RIPEMD-128 test vectors from ISO/IEC 10118-3:2004(E)
  */
-#define RMD128_TEST_VECTORS     10
-
 static struct hash_testvec rmd128_tv_template[] = {
 	{
 		.digest	= "\xcd\xf2\x62\x13\xa1\x50\xdc\x3e"
@@ -1137,8 +1117,6 @@ static struct hash_testvec rmd128_tv_template[] = {
 /*
  * RIPEMD-160 test vectors from ISO/IEC 10118-3:2004(E)
  */
-#define RMD160_TEST_VECTORS     10
-
 static struct hash_testvec rmd160_tv_template[] = {
 	{
 		.digest	= "\x9c\x11\x85\xa5\xc5\xe9\xfc\x54\x61\x28"
@@ -1201,8 +1179,6 @@ static struct hash_testvec rmd160_tv_template[] = {
 /*
  * RIPEMD-256 test vectors
  */
-#define RMD256_TEST_VECTORS     8
-
 static struct hash_testvec rmd256_tv_template[] = {
 	{
 		.digest	= "\x02\xba\x4c\x4e\x5f\x8e\xcd\x18"
@@ -1269,8 +1245,6 @@ static struct hash_testvec rmd256_tv_template[] = {
 /*
  * RIPEMD-320 test vectors
  */
-#define RMD320_TEST_VECTORS     8
-
 static struct hash_testvec rmd320_tv_template[] = {
 	{
 		.digest	= "\x22\xd6\x5d\x56\x61\x53\x6c\xdc\x75\xc1"
@@ -1334,7 +1308,6 @@ static struct hash_testvec rmd320_tv_template[] = {
 	}
 };
 
-#define CRCT10DIF_TEST_VECTORS	ARRAY_SIZE(crct10dif_tv_template)
 static struct hash_testvec crct10dif_tv_template[] = {
 	{
 		.plaintext	= "abc",
@@ -1385,8 +1358,6 @@ static struct hash_testvec crct10dif_tv_template[] = {
  * SHA1 test vectors  from from FIPS PUB 180-1
  * Long vector from CAVS 5.0
  */
-#define SHA1_TEST_VECTORS	6
-
 static struct hash_testvec sha1_tv_template[] = {
 	{
 		.plaintext = "",
@@ -1577,8 +1548,6 @@ static struct hash_testvec sha1_tv_template[] = {
 /*
  * SHA224 test vectors from from FIPS PUB 180-2
  */
-#define SHA224_TEST_VECTORS     5
-
 static struct hash_testvec sha224_tv_template[] = {
 	{
 		.plaintext = "",
@@ -1751,8 +1720,6 @@ static struct hash_testvec sha224_tv_template[] = {
 /*
  * SHA256 test vectors from from NIST
  */
-#define SHA256_TEST_VECTORS	5
-
 static struct hash_testvec sha256_tv_template[] = {
 	{
 		.plaintext = "",
@@ -1924,8 +1891,6 @@ static struct hash_testvec sha256_tv_template[] = {
 /*
  * SHA384 test vectors from from NIST and kerneli
  */
-#define SHA384_TEST_VECTORS	6
-
 static struct hash_testvec sha384_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2118,8 +2083,6 @@ static struct hash_testvec sha384_tv_template[] = {
 /*
  * SHA512 test vectors from from NIST and kerneli
  */
-#define SHA512_TEST_VECTORS	6
-
 static struct hash_testvec sha512_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2327,8 +2290,6 @@ static struct hash_testvec sha512_tv_template[] = {
  * by Vincent Rijmen and Paulo S. L. M. Barreto as part of the NESSIE
  * submission
  */
-#define WP512_TEST_VECTORS	8
-
 static struct hash_testvec wp512_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2425,8 +2386,6 @@ static struct hash_testvec wp512_tv_template[] = {
 	},
 };
 
-#define WP384_TEST_VECTORS	8
-
 static struct hash_testvec wp384_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2507,8 +2466,6 @@ static struct hash_testvec wp384_tv_template[] = {
 	},
 };
 
-#define WP256_TEST_VECTORS	8
-
 static struct hash_testvec wp256_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2576,8 +2533,6 @@ static struct hash_testvec wp256_tv_template[] = {
 /*
  * TIGER test vectors from Tiger website
  */
-#define TGR192_TEST_VECTORS	6
-
 static struct hash_testvec tgr192_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2621,8 +2576,6 @@ static struct hash_testvec tgr192_tv_template[] = {
 	},
 };
 
-#define TGR160_TEST_VECTORS	6
-
 static struct hash_testvec tgr160_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2666,8 +2619,6 @@ static struct hash_testvec tgr160_tv_template[] = {
 	},
 };
 
-#define TGR128_TEST_VECTORS	6
-
 static struct hash_testvec tgr128_tv_template[] = {
 	{
 		.plaintext = "",
@@ -2705,8 +2656,6 @@ static struct hash_testvec tgr128_tv_template[] = {
 	},
 };
 
-#define GHASH_TEST_VECTORS 6
-
 static struct hash_testvec ghash_tv_template[] =
 {
 	{
@@ -2822,8 +2771,6 @@ static struct hash_testvec ghash_tv_template[] =
  * HMAC-MD5 test vectors from RFC2202
  * (These need to be fixed to not use strlen).
  */
-#define HMAC_MD5_TEST_VECTORS	7
-
 static struct hash_testvec hmac_md5_tv_template[] =
 {
 	{
@@ -2904,8 +2851,6 @@ static struct hash_testvec hmac_md5_tv_template[] =
 /*
  * HMAC-RIPEMD128 test vectors from RFC2286
  */
-#define HMAC_RMD128_TEST_VECTORS	7
-
 static struct hash_testvec hmac_rmd128_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
@@ -2985,8 +2930,6 @@ static struct hash_testvec hmac_rmd128_tv_template[] = {
 /*
  * HMAC-RIPEMD160 test vectors from RFC2286
  */
-#define HMAC_RMD160_TEST_VECTORS	7
-
 static struct hash_testvec hmac_rmd160_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
@@ -3066,8 +3009,6 @@ static struct hash_testvec hmac_rmd160_tv_template[] = {
 /*
  * HMAC-SHA1 test vectors from RFC2202
  */
-#define HMAC_SHA1_TEST_VECTORS	7
-
 static struct hash_testvec hmac_sha1_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
@@ -3149,8 +3090,6 @@ static struct hash_testvec hmac_sha1_tv_template[] = {
 /*
  * SHA224 HMAC test vectors from RFC4231
  */
-#define HMAC_SHA224_TEST_VECTORS    4
-
 static struct hash_testvec hmac_sha224_tv_template[] = {
 	{
 		.key    = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -3264,8 +3203,6 @@ static struct hash_testvec hmac_sha224_tv_template[] = {
  * HMAC-SHA256 test vectors from
  * draft-ietf-ipsec-ciph-sha-256-01.txt
  */
-#define HMAC_SHA256_TEST_VECTORS	10
-
 static struct hash_testvec hmac_sha256_tv_template[] = {
 	{
 		.key	= "\x01\x02\x03\x04\x05\x06\x07\x08"
@@ -3401,8 +3338,6 @@ static struct hash_testvec hmac_sha256_tv_template[] = {
 	},
 };
 
-#define CMAC_AES_TEST_VECTORS 6
-
 static struct hash_testvec aes_cmac128_tv_template[] = {
 	{ /* From NIST Special Publication 800-38B, AES-128 */
 		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
@@ -3478,7 +3413,65 @@ static struct hash_testvec aes_cmac128_tv_template[] = {
 	}
 };
 
-#define CMAC_DES3_EDE_TEST_VECTORS 4
+static struct hash_testvec aes_cbcmac_tv_template[] = {
+	{
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+		.digest		= "\x3a\xd7\x7b\xb4\x0d\x7a\x36\x60"
+				  "\xa8\x9e\xca\xf3\x24\x66\xef\x97",
+		.psize		= 16,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30",
+		.digest		= "\x9d\x0d\xd0\x63\xfb\xcb\x24\x43"
+				  "\xf8\xf2\x76\x03\xac\x39\xb0\x9d",
+		.psize		= 33,
+		.ksize		= 16,
+		.np		= 2,
+		.tap		= { 7, 26 },
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37",
+		.digest		= "\xc0\x71\x73\xb8\xa0\x2c\x11\x7c"
+				  "\xaf\xdc\xb2\xf8\x89\x32\xa3\x3a",
+		.psize		= 63,
+		.ksize		= 16,
+	}, {
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10"
+				  "\x1c",
+		.digest		= "\x6a\x4e\xdb\x21\x47\x51\xdf\x4f"
+				  "\xa8\x4d\x4c\x10\x3b\x72\x7d\xd6",
+		.psize		= 65,
+		.ksize		= 32,
+	}
+};
 
 static struct hash_testvec des3_ede_cmac64_tv_template[] = {
 /*
@@ -3526,8 +3519,6 @@ static struct hash_testvec des3_ede_cmac64_tv_template[] = {
 	}
 };
 
-#define XCBC_AES_TEST_VECTORS 6
-
 static struct hash_testvec aes_xcbc128_tv_template[] = {
 	{
 		.key	= "\x00\x01\x02\x03\x04\x05\x06\x07"
@@ -3594,7 +3585,6 @@ static struct hash_testvec aes_xcbc128_tv_template[] = {
 	}
 };
 
-#define VMAC_AES_TEST_VECTORS	11
 static char vmac_string1[128] = {'\x01', '\x01', '\x01', '\x01',
 				'\x02', '\x03', '\x02', '\x02',
 				'\x02', '\x04', '\x01', '\x07',
@@ -3701,8 +3691,6 @@ static struct hash_testvec aes_vmac128_tv_template[] = {
  * SHA384 HMAC test vectors from RFC4231
  */
 
-#define HMAC_SHA384_TEST_VECTORS	4
-
 static struct hash_testvec hmac_sha384_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -3801,8 +3789,6 @@ static struct hash_testvec hmac_sha384_tv_template[] = {
  * SHA512 HMAC test vectors from RFC4231
  */
 
-#define HMAC_SHA512_TEST_VECTORS	4
-
 static struct hash_testvec hmac_sha512_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -3908,8 +3894,6 @@ static struct hash_testvec hmac_sha512_tv_template[] = {
 	},
 };
 
-#define HMAC_SHA3_224_TEST_VECTORS	4
-
 static struct hash_testvec hmac_sha3_224_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -3999,8 +3983,6 @@ static struct hash_testvec hmac_sha3_224_tv_template[] = {
 	},
 };
 
-#define HMAC_SHA3_256_TEST_VECTORS	4
-
 static struct hash_testvec hmac_sha3_256_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -4090,8 +4072,6 @@ static struct hash_testvec hmac_sha3_256_tv_template[] = {
 	},
 };
 
-#define HMAC_SHA3_384_TEST_VECTORS	4
-
 static struct hash_testvec hmac_sha3_384_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -4189,8 +4169,6 @@ static struct hash_testvec hmac_sha3_384_tv_template[] = {
 	},
 };
 
-#define HMAC_SHA3_512_TEST_VECTORS	4
-
 static struct hash_testvec hmac_sha3_512_tv_template[] = {
 	{
 		.key	= "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
@@ -4300,8 +4278,6 @@ static struct hash_testvec hmac_sha3_512_tv_template[] = {
  * Poly1305 test vectors from RFC7539 A.3.
  */
 
-#define POLY1305_TEST_VECTORS	11
-
 static struct hash_testvec poly1305_tv_template[] = {
 	{ /* Test Vector #1 */
 		.plaintext	= "\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -4547,19 +4523,6 @@ static struct hash_testvec poly1305_tv_template[] = {
 /*
  * DES test vectors.
  */
-#define DES_ENC_TEST_VECTORS		11
-#define DES_DEC_TEST_VECTORS		5
-#define DES_CBC_ENC_TEST_VECTORS	6
-#define DES_CBC_DEC_TEST_VECTORS	5
-#define DES_CTR_ENC_TEST_VECTORS	2
-#define DES_CTR_DEC_TEST_VECTORS	2
-#define DES3_EDE_ENC_TEST_VECTORS	4
-#define DES3_EDE_DEC_TEST_VECTORS	4
-#define DES3_EDE_CBC_ENC_TEST_VECTORS	2
-#define DES3_EDE_CBC_DEC_TEST_VECTORS	2
-#define DES3_EDE_CTR_ENC_TEST_VECTORS	2
-#define DES3_EDE_CTR_DEC_TEST_VECTORS	2
-
 static struct cipher_testvec des_enc_tv_template[] = {
 	{ /* From Applied Cryptography */
 		.key	= "\x01\x23\x45\x67\x89\xab\xcd\xef",
@@ -6620,13 +6583,6 @@ static struct cipher_testvec des3_ede_ctr_dec_tv_template[] = {
 /*
  * Blowfish test vectors.
  */
-#define BF_ENC_TEST_VECTORS	7
-#define BF_DEC_TEST_VECTORS	7
-#define BF_CBC_ENC_TEST_VECTORS	2
-#define BF_CBC_DEC_TEST_VECTORS	2
-#define BF_CTR_ENC_TEST_VECTORS	2
-#define BF_CTR_DEC_TEST_VECTORS	2
-
 static struct cipher_testvec bf_enc_tv_template[] = {
 	{ /* DES test vectors from OpenSSL */
 		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -8152,17 +8108,6 @@ static struct cipher_testvec bf_ctr_dec_tv_template[] = {
 /*
  * Twofish test vectors.
  */
-#define TF_ENC_TEST_VECTORS		4
-#define TF_DEC_TEST_VECTORS		4
-#define TF_CBC_ENC_TEST_VECTORS		5
-#define TF_CBC_DEC_TEST_VECTORS		5
-#define TF_CTR_ENC_TEST_VECTORS		2
-#define TF_CTR_DEC_TEST_VECTORS		2
-#define TF_LRW_ENC_TEST_VECTORS		8
-#define TF_LRW_DEC_TEST_VECTORS		8
-#define TF_XTS_ENC_TEST_VECTORS		5
-#define TF_XTS_DEC_TEST_VECTORS		5
-
 static struct cipher_testvec tf_enc_tv_template[] = {
 	{
 		.key	= zeroed_string,
@@ -10881,24 +10826,6 @@ static struct cipher_testvec tf_xts_dec_tv_template[] = {
  * Serpent test vectors.  These are backwards because Serpent writes
  * octet sequences in right-to-left mode.
  */
-#define SERPENT_ENC_TEST_VECTORS	5
-#define SERPENT_DEC_TEST_VECTORS	5
-
-#define TNEPRES_ENC_TEST_VECTORS	4
-#define TNEPRES_DEC_TEST_VECTORS	4
-
-#define SERPENT_CBC_ENC_TEST_VECTORS	1
-#define SERPENT_CBC_DEC_TEST_VECTORS	1
-
-#define SERPENT_CTR_ENC_TEST_VECTORS	2
-#define SERPENT_CTR_DEC_TEST_VECTORS	2
-
-#define SERPENT_LRW_ENC_TEST_VECTORS	8
-#define SERPENT_LRW_DEC_TEST_VECTORS	8
-
-#define SERPENT_XTS_ENC_TEST_VECTORS	5
-#define SERPENT_XTS_DEC_TEST_VECTORS	5
-
 static struct cipher_testvec serpent_enc_tv_template[] = {
 	{
 		.input	= "\x00\x01\x02\x03\x04\x05\x06\x07"
@@ -13637,17 +13564,6 @@ static struct cipher_testvec serpent_xts_dec_tv_template[] = {
 };
 
 /* Cast6 test vectors from RFC 2612 */
-#define CAST6_ENC_TEST_VECTORS		4
-#define CAST6_DEC_TEST_VECTORS		4
-#define CAST6_CBC_ENC_TEST_VECTORS	1
-#define CAST6_CBC_DEC_TEST_VECTORS	1
-#define CAST6_CTR_ENC_TEST_VECTORS	2
-#define CAST6_CTR_DEC_TEST_VECTORS	2
-#define CAST6_LRW_ENC_TEST_VECTORS	1
-#define CAST6_LRW_DEC_TEST_VECTORS	1
-#define CAST6_XTS_ENC_TEST_VECTORS	1
-#define CAST6_XTS_DEC_TEST_VECTORS	1
-
 static struct cipher_testvec cast6_enc_tv_template[] = {
 	{
 		.key	= "\x23\x42\xbb\x9e\xfa\x38\x54\x2c"
@@ -15182,38 +15098,6 @@ static struct cipher_testvec cast6_xts_dec_tv_template[] = {
 /*
  * AES test vectors.
  */
-#define AES_ENC_TEST_VECTORS 4
-#define AES_DEC_TEST_VECTORS 4
-#define AES_CBC_ENC_TEST_VECTORS 5
-#define AES_CBC_DEC_TEST_VECTORS 5
-#define HMAC_MD5_ECB_CIPHER_NULL_ENC_TEST_VECTORS 2
-#define HMAC_MD5_ECB_CIPHER_NULL_DEC_TEST_VECTORS 2
-#define HMAC_SHA1_ECB_CIPHER_NULL_ENC_TEST_VEC 2
-#define HMAC_SHA1_ECB_CIPHER_NULL_DEC_TEST_VEC 2
-#define HMAC_SHA1_AES_CBC_ENC_TEST_VEC 7
-#define HMAC_SHA256_AES_CBC_ENC_TEST_VEC 7
-#define HMAC_SHA512_AES_CBC_ENC_TEST_VEC 7
-#define AES_LRW_ENC_TEST_VECTORS 8
-#define AES_LRW_DEC_TEST_VECTORS 8
-#define AES_XTS_ENC_TEST_VECTORS 5
-#define AES_XTS_DEC_TEST_VECTORS 5
-#define AES_CTR_ENC_TEST_VECTORS 5
-#define AES_CTR_DEC_TEST_VECTORS 5
-#define AES_OFB_ENC_TEST_VECTORS 1
-#define AES_OFB_DEC_TEST_VECTORS 1
-#define AES_CTR_3686_ENC_TEST_VECTORS 7
-#define AES_CTR_3686_DEC_TEST_VECTORS 6
-#define AES_GCM_ENC_TEST_VECTORS 9
-#define AES_GCM_DEC_TEST_VECTORS 8
-#define AES_GCM_4106_ENC_TEST_VECTORS 23
-#define AES_GCM_4106_DEC_TEST_VECTORS 23
-#define AES_GCM_4543_ENC_TEST_VECTORS 1
-#define AES_GCM_4543_DEC_TEST_VECTORS 2
-#define AES_CCM_ENC_TEST_VECTORS 8
-#define AES_CCM_DEC_TEST_VECTORS 7
-#define AES_CCM_4309_ENC_TEST_VECTORS 7
-#define AES_CCM_4309_DEC_TEST_VECTORS 10
-
 static struct cipher_testvec aes_enc_tv_template[] = {
 	{ /* From FIPS-197 */
 		.key	= "\x00\x01\x02\x03\x04\x05\x06\x07"
@@ -17069,8 +16953,6 @@ static struct aead_testvec hmac_sha512_aes_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA1_DES_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha1_des_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17130,8 +17012,6 @@ static struct aead_testvec hmac_sha1_des_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA224_DES_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha224_des_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17191,8 +17071,6 @@ static struct aead_testvec hmac_sha224_des_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA256_DES_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha256_des_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17254,8 +17132,6 @@ static struct aead_testvec hmac_sha256_des_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA384_DES_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha384_des_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17321,8 +17197,6 @@ static struct aead_testvec hmac_sha384_des_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA512_DES_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha512_des_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17392,8 +17266,6 @@ static struct aead_testvec hmac_sha512_des_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA1_DES3_EDE_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha1_des3_ede_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17455,8 +17327,6 @@ static struct aead_testvec hmac_sha1_des3_ede_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA224_DES3_EDE_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha224_des3_ede_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17518,8 +17388,6 @@ static struct aead_testvec hmac_sha224_des3_ede_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA256_DES3_EDE_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha256_des3_ede_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17583,8 +17451,6 @@ static struct aead_testvec hmac_sha256_des3_ede_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA384_DES3_EDE_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha384_des3_ede_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -17652,8 +17518,6 @@ static struct aead_testvec hmac_sha384_des3_ede_cbc_enc_tv_temp[] = {
 	},
 };
 
-#define HMAC_SHA512_DES3_EDE_CBC_ENC_TEST_VEC	1
-
 static struct aead_testvec hmac_sha512_des3_ede_cbc_enc_tv_temp[] = {
 	{ /*Generated with cryptopp*/
 #ifdef __LITTLE_ENDIAN
@@ -24434,8 +24298,6 @@ static struct aead_testvec aes_ccm_rfc4309_dec_tv_template[]	= {
 /*
  * ChaCha20-Poly1305 AEAD test vectors from RFC7539 2.8.2./A.5.
  */
-#define RFC7539_ENC_TEST_VECTORS 2
-#define RFC7539_DEC_TEST_VECTORS 2
 static struct aead_testvec rfc7539_enc_tv_template[] = {
 	{
 		.key	= "\x80\x81\x82\x83\x84\x85\x86\x87"
@@ -24703,8 +24565,6 @@ static struct aead_testvec rfc7539_dec_tv_template[] = {
 /*
  * draft-irtf-cfrg-chacha20-poly1305
  */
-#define RFC7539ESP_DEC_TEST_VECTORS 1
-#define RFC7539ESP_ENC_TEST_VECTORS 1
 static struct aead_testvec rfc7539esp_enc_tv_template[] = {
 	{
 		.key	= "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a"
@@ -24927,8 +24787,6 @@ static struct cipher_testvec aes_kw_dec_tv_template[] = {
  *     http://csrc.nist.gov/groups/STM/cavp/documents/rng/RNGVS.pdf
  * Only AES-128 is supported at this time.
  */
-#define ANSI_CPRNG_AES_TEST_VECTORS	6
-
 static struct cprng_testvec ansi_cprng_aes_tv_template[] = {
 	{
 		.key	= "\xf3\xb1\x66\x6d\x13\x60\x72\x42"
@@ -25846,13 +25704,6 @@ static struct drbg_testvec drbg_nopr_ctr_aes128_tv_template[] = {
 };
 
 /* Cast5 test vectors from RFC 2144 */
-#define CAST5_ENC_TEST_VECTORS		4
-#define CAST5_DEC_TEST_VECTORS		4
-#define CAST5_CBC_ENC_TEST_VECTORS	1
-#define CAST5_CBC_DEC_TEST_VECTORS	1
-#define CAST5_CTR_ENC_TEST_VECTORS	2
-#define CAST5_CTR_DEC_TEST_VECTORS	2
-
 static struct cipher_testvec cast5_enc_tv_template[] = {
 	{
 		.key	= "\x01\x23\x45\x67\x12\x34\x56\x78"
@@ -26756,9 +26607,6 @@ static struct cipher_testvec cast5_ctr_dec_tv_template[] = {
 /*
  * ARC4 test vectors from OpenSSL
  */
-#define ARC4_ENC_TEST_VECTORS	7
-#define ARC4_DEC_TEST_VECTORS	7
-
 static struct cipher_testvec arc4_enc_tv_template[] = {
 	{
 		.key	= "\x01\x23\x45\x67\x89\xab\xcd\xef",
@@ -26894,9 +26742,6 @@ static struct cipher_testvec arc4_dec_tv_template[] = {
 /*
  * TEA test vectors
  */
-#define TEA_ENC_TEST_VECTORS	4
-#define TEA_DEC_TEST_VECTORS	4
-
 static struct cipher_testvec tea_enc_tv_template[] = {
 	{
 		.key    = zeroed_string,
@@ -26986,9 +26831,6 @@ static struct cipher_testvec tea_dec_tv_template[] = {
 /*
  * XTEA test vectors
  */
-#define XTEA_ENC_TEST_VECTORS	4
-#define XTEA_DEC_TEST_VECTORS	4
-
 static struct cipher_testvec xtea_enc_tv_template[] = {
 	{
 		.key    = zeroed_string,
@@ -27078,9 +26920,6 @@ static struct cipher_testvec xtea_dec_tv_template[] = {
 /*
  * KHAZAD test vectors.
  */
-#define KHAZAD_ENC_TEST_VECTORS 5
-#define KHAZAD_DEC_TEST_VECTORS 5
-
 static struct cipher_testvec khazad_enc_tv_template[] = {
 	{
 		.key	= "\x80\x00\x00\x00\x00\x00\x00\x00"
@@ -27177,11 +27016,6 @@ static struct cipher_testvec khazad_dec_tv_template[] = {
  * Anubis test vectors.
  */
 
-#define ANUBIS_ENC_TEST_VECTORS			5
-#define ANUBIS_DEC_TEST_VECTORS			5
-#define ANUBIS_CBC_ENC_TEST_VECTORS		2
-#define ANUBIS_CBC_DEC_TEST_VECTORS		2
-
 static struct cipher_testvec anubis_enc_tv_template[] = {
 	{
 		.key	= "\xfe\xfe\xfe\xfe\xfe\xfe\xfe\xfe"
@@ -27381,9 +27215,6 @@ static struct cipher_testvec anubis_cbc_dec_tv_template[] = {
 /*
  * XETA test vectors
  */
-#define XETA_ENC_TEST_VECTORS	4
-#define XETA_DEC_TEST_VECTORS	4
-
 static struct cipher_testvec xeta_enc_tv_template[] = {
 	{
 		.key    = zeroed_string,
@@ -27473,9 +27304,6 @@ static struct cipher_testvec xeta_dec_tv_template[] = {
 /*
  * FCrypt test vectors
  */
-#define FCRYPT_ENC_TEST_VECTORS	ARRAY_SIZE(fcrypt_pcbc_enc_tv_template)
-#define FCRYPT_DEC_TEST_VECTORS	ARRAY_SIZE(fcrypt_pcbc_dec_tv_template)
-
 static struct cipher_testvec fcrypt_pcbc_enc_tv_template[] = {
 	{ /* http://www.openafs.org/pipermail/openafs-devel/2000-December/005320.html */
 		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00",
@@ -27601,17 +27429,6 @@ static struct cipher_testvec fcrypt_pcbc_dec_tv_template[] = {
 /*
  * CAMELLIA test vectors.
  */
-#define CAMELLIA_ENC_TEST_VECTORS 4
-#define CAMELLIA_DEC_TEST_VECTORS 4
-#define CAMELLIA_CBC_ENC_TEST_VECTORS 3
-#define CAMELLIA_CBC_DEC_TEST_VECTORS 3
-#define CAMELLIA_CTR_ENC_TEST_VECTORS 2
-#define CAMELLIA_CTR_DEC_TEST_VECTORS 2
-#define CAMELLIA_LRW_ENC_TEST_VECTORS 8
-#define CAMELLIA_LRW_DEC_TEST_VECTORS 8
-#define CAMELLIA_XTS_ENC_TEST_VECTORS 5
-#define CAMELLIA_XTS_DEC_TEST_VECTORS 5
-
 static struct cipher_testvec camellia_enc_tv_template[] = {
 	{
 		.key	= "\x01\x23\x45\x67\x89\xab\xcd\xef"
@@ -31331,9 +31148,6 @@ static struct cipher_testvec camellia_xts_dec_tv_template[] = {
 /*
  * SEED test vectors
  */
-#define SEED_ENC_TEST_VECTORS	4
-#define SEED_DEC_TEST_VECTORS	4
-
 static struct cipher_testvec seed_enc_tv_template[] = {
 	{
 		.key    = zeroed_string,
@@ -31418,7 +31232,6 @@ static struct cipher_testvec seed_dec_tv_template[] = {
 	}
 };
 
-#define SALSA20_STREAM_ENC_TEST_VECTORS 5
 static struct cipher_testvec salsa20_stream_enc_tv_template[] = {
 	/*
 	* Testvectors from verified.test-vectors submitted to ECRYPT.
@@ -32588,7 +32401,6 @@ static struct cipher_testvec salsa20_stream_enc_tv_template[] = {
 	},
 };
 
-#define CHACHA20_ENC_TEST_VECTORS 4
 static struct cipher_testvec chacha20_enc_tv_template[] = {
 	{ /* RFC7539 A.2. Test Vector #1 */
 		.key	= "\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -33100,8 +32912,6 @@ static struct cipher_testvec chacha20_enc_tv_template[] = {
 /*
  * CTS (Cipher Text Stealing) mode tests
  */
-#define CTS_MODE_ENC_TEST_VECTORS 6
-#define CTS_MODE_DEC_TEST_VECTORS 6
 static struct cipher_testvec cts_mode_enc_tv_template[] = {
 	{ /* from rfc3962 */
 		.klen	= 16,
@@ -33322,9 +33132,6 @@ struct comp_testvec {
  * Params: winbits=-11, Z_DEFAULT_COMPRESSION, MAX_MEM_LEVEL.
  */
 
-#define DEFLATE_COMP_TEST_VECTORS 2
-#define DEFLATE_DECOMP_TEST_VECTORS 2
-
 static struct comp_testvec deflate_comp_tv_template[] = {
 	{
 		.inlen	= 70,
@@ -33400,9 +33207,6 @@ static struct comp_testvec deflate_decomp_tv_template[] = {
 /*
  * LZO test vectors (null-terminated strings).
  */
-#define LZO_COMP_TEST_VECTORS 2
-#define LZO_DECOMP_TEST_VECTORS 2
-
 static struct comp_testvec lzo_comp_tv_template[] = {
 	{
 		.inlen	= 70,
@@ -33534,8 +33338,6 @@ static struct hash_testvec michael_mic_tv_template[] = {
 /*
  * CRC32 test vectors
  */
-#define CRC32_TEST_VECTORS 14
-
 static struct hash_testvec crc32_tv_template[] = {
 	{
 		.key = "\x87\xa9\xcb\xed",
@@ -33968,8 +33770,6 @@ static struct hash_testvec crc32_tv_template[] = {
 /*
  * CRC32C test vectors
  */
-#define CRC32C_TEST_VECTORS 15
-
 static struct hash_testvec crc32c_tv_template[] = {
 	{
 		.psize = 0,
@@ -34406,8 +34206,6 @@ static struct hash_testvec crc32c_tv_template[] = {
 /*
  * Blakcifn CRC test vectors
  */
-#define BFIN_CRC_TEST_VECTORS 6
-
 static struct hash_testvec bfin_crc_tv_template[] = {
 	{
 		.psize = 0,
@@ -34493,9 +34291,6 @@ static struct hash_testvec bfin_crc_tv_template[] = {
 
 };
 
-#define LZ4_COMP_TEST_VECTORS 1
-#define LZ4_DECOMP_TEST_VECTORS 1
-
 static struct comp_testvec lz4_comp_tv_template[] = {
 	{
 		.inlen	= 70,
@@ -34526,9 +34321,6 @@ static struct comp_testvec lz4_decomp_tv_template[] = {
 	},
 };
 
-#define LZ4HC_COMP_TEST_VECTORS 1
-#define LZ4HC_DECOMP_TEST_VECTORS 1
-
 static struct comp_testvec lz4hc_comp_tv_template[] = {
 	{
 		.inlen	= 70,
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index ceff2fc524b1..0cafe08919c9 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -172,8 +172,8 @@ config HW_RANDOM_OMAP
 	default HW_RANDOM
  	---help---
  	  This driver provides kernel-side support for the Random Number
-	  Generator hardware found on OMAP16xx, OMAP2/3/4/5 and AM33xx/AM43xx
-	  multimedia processors.
+	  Generator hardware found on OMAP16xx, OMAP2/3/4/5, AM33xx/AM43xx
+	  multimedia processors, and Marvell Armada 7k/8k SoCs.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called omap-rng.
diff --git a/drivers/char/hw_random/cavium-rng-vf.c b/drivers/char/hw_random/cavium-rng-vf.c
index 066ae0e78d63..dd1007aecb10 100644
--- a/drivers/char/hw_random/cavium-rng-vf.c
+++ b/drivers/char/hw_random/cavium-rng-vf.c
@@ -57,7 +57,11 @@ static int cavium_rng_probe_vf(struct	pci_dev		*pdev,
 		return -ENOMEM;
 	}
 
-	rng->ops.name    = "cavium rng";
+	rng->ops.name = devm_kasprintf(&pdev->dev, GFP_KERNEL,
+				       "cavium-rng-%s", dev_name(&pdev->dev));
+	if (!rng->ops.name)
+		return -ENOMEM;
+
 	rng->ops.read    = cavium_rng_read;
 	rng->ops.quality = 1000;
 
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 87fba424817e..5c654b5d4adf 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -1,55 +1,30 @@
 /*
-        Added support for the AMD Geode LX RNG
-	(c) Copyright 2004-2005 Advanced Micro Devices, Inc.
-
-	derived from
-
- 	Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG)
-	(c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com>
-
- 	derived from
-
-        Hardware driver for the AMD 768 Random Number Generator (RNG)
-        (c) Copyright 2001 Red Hat Inc <alan@redhat.com>
-
- 	derived from
-
-	Hardware driver for Intel i810 Random Number Generator (RNG)
-	Copyright 2000,2001 Jeff Garzik <jgarzik@pobox.com>
-	Copyright 2000,2001 Philipp Rumpf <prumpf@mandrakesoft.com>
-
-	Added generic RNG API
-	Copyright 2006 Michael Buesch <m@bues.ch>
-	Copyright 2005 (c) MontaVista Software, Inc.
-
-	Please read Documentation/hw_random.txt for details on use.
-
-	----------------------------------------------------------
-	This software may be used and distributed according to the terms
-        of the GNU General Public License, incorporated herein by reference.
-
+ * hw_random/core.c: HWRNG core API
+ *
+ * Copyright 2006 Michael Buesch <m@bues.ch>
+ * Copyright 2005 (c) MontaVista Software, Inc.
+ *
+ * Please read Documentation/hw_random.txt for details on use.
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
  */
 
-
-#include <linux/device.h>
-#include <linux/hw_random.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/miscdevice.h>
-#include <linux/kthread.h>
 #include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/random.h>
+#include <linux/device.h>
 #include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/hw_random.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/uaccess.h>
 
-
 #define RNG_MODULE_NAME		"hw_random"
-#define PFX			RNG_MODULE_NAME ": "
-#define RNG_MISCDEV_MINOR	183 /* official */
-
 
 static struct hwrng *current_rng;
 static struct task_struct *hwrng_fill;
@@ -296,7 +271,6 @@ out_put:
 	goto out;
 }
 
-
 static const struct file_operations rng_chrdev_ops = {
 	.owner		= THIS_MODULE,
 	.open		= rng_dev_open,
@@ -307,14 +281,13 @@ static const struct file_operations rng_chrdev_ops = {
 static const struct attribute_group *rng_dev_groups[];
 
 static struct miscdevice rng_miscdev = {
-	.minor		= RNG_MISCDEV_MINOR,
+	.minor		= HWRNG_MINOR,
 	.name		= RNG_MODULE_NAME,
 	.nodename	= "hwrng",
 	.fops		= &rng_chrdev_ops,
 	.groups		= rng_dev_groups,
 };
 
-
 static ssize_t hwrng_attr_current_store(struct device *dev,
 					struct device_attribute *attr,
 					const char *buf, size_t len)
@@ -444,8 +417,7 @@ int hwrng_register(struct hwrng *rng)
 	int err = -EINVAL;
 	struct hwrng *old_rng, *tmp;
 
-	if (rng->name == NULL ||
-	    (rng->data_read == NULL && rng->read == NULL))
+	if (!rng->name || (!rng->data_read && !rng->read))
 		goto out;
 
 	mutex_lock(&rng_mutex);
diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c
index 3b06c1d6cfb2..31cbdbbaebfc 100644
--- a/drivers/char/hw_random/n2-drv.c
+++ b/drivers/char/hw_random/n2-drv.c
@@ -21,11 +21,11 @@
 
 #define DRV_MODULE_NAME		"n2rng"
 #define PFX DRV_MODULE_NAME	": "
-#define DRV_MODULE_VERSION	"0.2"
-#define DRV_MODULE_RELDATE	"July 27, 2011"
+#define DRV_MODULE_VERSION	"0.3"
+#define DRV_MODULE_RELDATE	"Jan 7, 2017"
 
 static char version[] =
-	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+	DRV_MODULE_NAME " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
 
 MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
 MODULE_DESCRIPTION("Niagara2 RNG driver");
@@ -302,26 +302,57 @@ static int n2rng_try_read_ctl(struct n2rng *np)
 	return n2rng_hv_err_trans(hv_err);
 }
 
-#define CONTROL_DEFAULT_BASE		\
-	((2 << RNG_CTL_ASEL_SHIFT) |	\
-	 (N2RNG_ACCUM_CYCLES_DEFAULT << RNG_CTL_WAIT_SHIFT) |	\
-	 RNG_CTL_LFSR)
+static u64 n2rng_control_default(struct n2rng *np, int ctl)
+{
+	u64 val = 0;
 
-#define CONTROL_DEFAULT_0		\
-	(CONTROL_DEFAULT_BASE |		\
-	 (1 << RNG_CTL_VCO_SHIFT) |	\
-	 RNG_CTL_ES1)
-#define CONTROL_DEFAULT_1		\
-	(CONTROL_DEFAULT_BASE |		\
-	 (2 << RNG_CTL_VCO_SHIFT) |	\
-	 RNG_CTL_ES2)
-#define CONTROL_DEFAULT_2		\
-	(CONTROL_DEFAULT_BASE |		\
-	 (3 << RNG_CTL_VCO_SHIFT) |	\
-	 RNG_CTL_ES3)
-#define CONTROL_DEFAULT_3		\
-	(CONTROL_DEFAULT_BASE |		\
-	 RNG_CTL_ES1 | RNG_CTL_ES2 | RNG_CTL_ES3)
+	if (np->data->chip_version == 1) {
+		val = ((2 << RNG_v1_CTL_ASEL_SHIFT) |
+			(N2RNG_ACCUM_CYCLES_DEFAULT << RNG_v1_CTL_WAIT_SHIFT) |
+			 RNG_CTL_LFSR);
+
+		switch (ctl) {
+		case 0:
+			val |= (1 << RNG_v1_CTL_VCO_SHIFT) | RNG_CTL_ES1;
+			break;
+		case 1:
+			val |= (2 << RNG_v1_CTL_VCO_SHIFT) | RNG_CTL_ES2;
+			break;
+		case 2:
+			val |= (3 << RNG_v1_CTL_VCO_SHIFT) | RNG_CTL_ES3;
+			break;
+		case 3:
+			val |= RNG_CTL_ES1 | RNG_CTL_ES2 | RNG_CTL_ES3;
+			break;
+		default:
+			break;
+		}
+
+	} else {
+		val = ((2 << RNG_v2_CTL_ASEL_SHIFT) |
+			(N2RNG_ACCUM_CYCLES_DEFAULT << RNG_v2_CTL_WAIT_SHIFT) |
+			 RNG_CTL_LFSR);
+
+		switch (ctl) {
+		case 0:
+			val |= (1 << RNG_v2_CTL_VCO_SHIFT) | RNG_CTL_ES1;
+			break;
+		case 1:
+			val |= (2 << RNG_v2_CTL_VCO_SHIFT) | RNG_CTL_ES2;
+			break;
+		case 2:
+			val |= (3 << RNG_v2_CTL_VCO_SHIFT) | RNG_CTL_ES3;
+			break;
+		case 3:
+			val |= RNG_CTL_ES1 | RNG_CTL_ES2 | RNG_CTL_ES3;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return val;
+}
 
 static void n2rng_control_swstate_init(struct n2rng *np)
 {
@@ -336,10 +367,10 @@ static void n2rng_control_swstate_init(struct n2rng *np)
 	for (i = 0; i < np->num_units; i++) {
 		struct n2rng_unit *up = &np->units[i];
 
-		up->control[0] = CONTROL_DEFAULT_0;
-		up->control[1] = CONTROL_DEFAULT_1;
-		up->control[2] = CONTROL_DEFAULT_2;
-		up->control[3] = CONTROL_DEFAULT_3;
+		up->control[0] = n2rng_control_default(np, 0);
+		up->control[1] = n2rng_control_default(np, 1);
+		up->control[2] = n2rng_control_default(np, 2);
+		up->control[3] = n2rng_control_default(np, 3);
 	}
 
 	np->hv_state = HV_RNG_STATE_UNCONFIGURED;
@@ -399,6 +430,7 @@ static int n2rng_data_read(struct hwrng *rng, u32 *data)
 	} else {
 		int err = n2rng_generic_read_data(ra);
 		if (!err) {
+			np->flags |= N2RNG_FLAG_BUFFER_VALID;
 			np->buffer = np->test_data >> 32;
 			*data = np->test_data & 0xffffffff;
 			len = 4;
@@ -487,9 +519,21 @@ static void n2rng_dump_test_buffer(struct n2rng *np)
 
 static int n2rng_check_selftest_buffer(struct n2rng *np, unsigned long unit)
 {
-	u64 val = SELFTEST_VAL;
+	u64 val;
 	int err, matches, limit;
 
+	switch (np->data->id) {
+	case N2_n2_rng:
+	case N2_vf_rng:
+	case N2_kt_rng:
+	case N2_m4_rng:  /* yes, m4 uses the old value */
+		val = RNG_v1_SELFTEST_VAL;
+		break;
+	default:
+		val = RNG_v2_SELFTEST_VAL;
+		break;
+	}
+
 	matches = 0;
 	for (limit = 0; limit < SELFTEST_LOOPS_MAX; limit++) {
 		matches += n2rng_test_buffer_find(np, val);
@@ -512,14 +556,32 @@ static int n2rng_check_selftest_buffer(struct n2rng *np, unsigned long unit)
 static int n2rng_control_selftest(struct n2rng *np, unsigned long unit)
 {
 	int err;
+	u64 base, base3;
 
-	np->test_control[0] = (0x2 << RNG_CTL_ASEL_SHIFT);
-	np->test_control[1] = (0x2 << RNG_CTL_ASEL_SHIFT);
-	np->test_control[2] = (0x2 << RNG_CTL_ASEL_SHIFT);
-	np->test_control[3] = ((0x2 << RNG_CTL_ASEL_SHIFT) |
-			       RNG_CTL_LFSR |
-			       ((SELFTEST_TICKS - 2) << RNG_CTL_WAIT_SHIFT));
+	switch (np->data->id) {
+	case N2_n2_rng:
+	case N2_vf_rng:
+	case N2_kt_rng:
+		base = RNG_v1_CTL_ASEL_NOOUT << RNG_v1_CTL_ASEL_SHIFT;
+		base3 = base | RNG_CTL_LFSR |
+			((RNG_v1_SELFTEST_TICKS - 2) << RNG_v1_CTL_WAIT_SHIFT);
+		break;
+	case N2_m4_rng:
+		base = RNG_v2_CTL_ASEL_NOOUT << RNG_v2_CTL_ASEL_SHIFT;
+		base3 = base | RNG_CTL_LFSR |
+			((RNG_v1_SELFTEST_TICKS - 2) << RNG_v2_CTL_WAIT_SHIFT);
+		break;
+	default:
+		base = RNG_v2_CTL_ASEL_NOOUT << RNG_v2_CTL_ASEL_SHIFT;
+		base3 = base | RNG_CTL_LFSR |
+			(RNG_v2_SELFTEST_TICKS << RNG_v2_CTL_WAIT_SHIFT);
+		break;
+	}
 
+	np->test_control[0] = base;
+	np->test_control[1] = base;
+	np->test_control[2] = base;
+	np->test_control[3] = base3;
 
 	err = n2rng_entropy_diag_read(np, unit, np->test_control,
 				      HV_RNG_STATE_HEALTHCHECK,
@@ -557,11 +619,19 @@ static int n2rng_control_configure_units(struct n2rng *np)
 		struct n2rng_unit *up = &np->units[unit];
 		unsigned long ctl_ra = __pa(&up->control[0]);
 		int esrc;
-		u64 base;
+		u64 base, shift;
 
-		base = ((np->accum_cycles << RNG_CTL_WAIT_SHIFT) |
-			(2 << RNG_CTL_ASEL_SHIFT) |
-			RNG_CTL_LFSR);
+		if (np->data->chip_version == 1) {
+			base = ((np->accum_cycles << RNG_v1_CTL_WAIT_SHIFT) |
+			      (RNG_v1_CTL_ASEL_NOOUT << RNG_v1_CTL_ASEL_SHIFT) |
+			      RNG_CTL_LFSR);
+			shift = RNG_v1_CTL_VCO_SHIFT;
+		} else {
+			base = ((np->accum_cycles << RNG_v2_CTL_WAIT_SHIFT) |
+			      (RNG_v2_CTL_ASEL_NOOUT << RNG_v2_CTL_ASEL_SHIFT) |
+			      RNG_CTL_LFSR);
+			shift = RNG_v2_CTL_VCO_SHIFT;
+		}
 
 		/* XXX This isn't the best.  We should fetch a bunch
 		 * XXX of words using each entropy source combined XXX
@@ -570,7 +640,7 @@ static int n2rng_control_configure_units(struct n2rng *np)
 		 */
 		for (esrc = 0; esrc < 3; esrc++)
 			up->control[esrc] = base |
-				(esrc << RNG_CTL_VCO_SHIFT) |
+				(esrc << shift) |
 				(RNG_CTL_ES1 << esrc);
 
 		up->control[3] = base |
@@ -589,6 +659,7 @@ static void n2rng_work(struct work_struct *work)
 {
 	struct n2rng *np = container_of(work, struct n2rng, work.work);
 	int err = 0;
+	static int retries = 4;
 
 	if (!(np->flags & N2RNG_FLAG_CONTROL)) {
 		err = n2rng_guest_check(np);
@@ -606,7 +677,9 @@ static void n2rng_work(struct work_struct *work)
 		dev_info(&np->op->dev, "RNG ready\n");
 	}
 
-	if (err && !(np->flags & N2RNG_FLAG_SHUTDOWN))
+	if (--retries == 0)
+		dev_err(&np->op->dev, "Self-test retries failed, RNG not ready\n");
+	else if (err && !(np->flags & N2RNG_FLAG_SHUTDOWN))
 		schedule_delayed_work(&np->work, HZ * 2);
 }
 
@@ -622,24 +695,23 @@ static const struct of_device_id n2rng_match[];
 static int n2rng_probe(struct platform_device *op)
 {
 	const struct of_device_id *match;
-	int multi_capable;
 	int err = -ENOMEM;
 	struct n2rng *np;
 
 	match = of_match_device(n2rng_match, &op->dev);
 	if (!match)
 		return -EINVAL;
-	multi_capable = (match->data != NULL);
 
 	n2rng_driver_version();
 	np = devm_kzalloc(&op->dev, sizeof(*np), GFP_KERNEL);
 	if (!np)
 		goto out;
 	np->op = op;
+	np->data = (struct n2rng_template *)match->data;
 
 	INIT_DELAYED_WORK(&np->work, n2rng_work);
 
-	if (multi_capable)
+	if (np->data->multi_capable)
 		np->flags |= N2RNG_FLAG_MULTI;
 
 	err = -ENODEV;
@@ -670,8 +742,9 @@ static int n2rng_probe(struct platform_device *op)
 			dev_err(&op->dev, "VF RNG lacks rng-#units property\n");
 			goto out_hvapi_unregister;
 		}
-	} else
+	} else {
 		np->num_units = 1;
+	}
 
 	dev_info(&op->dev, "Registered RNG HVAPI major %lu minor %lu\n",
 		 np->hvapi_major, np->hvapi_minor);
@@ -692,7 +765,7 @@ static int n2rng_probe(struct platform_device *op)
 		  "multi-unit-capable" : "single-unit"),
 		 np->num_units);
 
-	np->hwrng.name = "n2rng";
+	np->hwrng.name = DRV_MODULE_NAME;
 	np->hwrng.data_read = n2rng_data_read;
 	np->hwrng.priv = (unsigned long) np;
 
@@ -728,30 +801,61 @@ static int n2rng_remove(struct platform_device *op)
 	return 0;
 }
 
+static struct n2rng_template n2_template = {
+	.id = N2_n2_rng,
+	.multi_capable = 0,
+	.chip_version = 1,
+};
+
+static struct n2rng_template vf_template = {
+	.id = N2_vf_rng,
+	.multi_capable = 1,
+	.chip_version = 1,
+};
+
+static struct n2rng_template kt_template = {
+	.id = N2_kt_rng,
+	.multi_capable = 1,
+	.chip_version = 1,
+};
+
+static struct n2rng_template m4_template = {
+	.id = N2_m4_rng,
+	.multi_capable = 1,
+	.chip_version = 2,
+};
+
+static struct n2rng_template m7_template = {
+	.id = N2_m7_rng,
+	.multi_capable = 1,
+	.chip_version = 2,
+};
+
 static const struct of_device_id n2rng_match[] = {
 	{
 		.name		= "random-number-generator",
 		.compatible	= "SUNW,n2-rng",
+		.data		= &n2_template,
 	},
 	{
 		.name		= "random-number-generator",
 		.compatible	= "SUNW,vf-rng",
-		.data		= (void *) 1,
+		.data		= &vf_template,
 	},
 	{
 		.name		= "random-number-generator",
 		.compatible	= "SUNW,kt-rng",
-		.data		= (void *) 1,
+		.data		= &kt_template,
 	},
 	{
 		.name		= "random-number-generator",
 		.compatible	= "ORCL,m4-rng",
-		.data		= (void *) 1,
+		.data		= &m4_template,
 	},
 	{
 		.name		= "random-number-generator",
 		.compatible	= "ORCL,m7-rng",
-		.data		= (void *) 1,
+		.data		= &m7_template,
 	},
 	{},
 };
diff --git a/drivers/char/hw_random/n2rng.h b/drivers/char/hw_random/n2rng.h
index f244ac89087f..6bad6cc634e8 100644
--- a/drivers/char/hw_random/n2rng.h
+++ b/drivers/char/hw_random/n2rng.h
@@ -6,18 +6,34 @@
 #ifndef _N2RNG_H
 #define _N2RNG_H
 
-#define RNG_CTL_WAIT       0x0000000001fffe00ULL /* Minimum wait time       */
-#define RNG_CTL_WAIT_SHIFT 9
-#define RNG_CTL_BYPASS     0x0000000000000100ULL /* VCO voltage source      */
-#define RNG_CTL_VCO        0x00000000000000c0ULL /* VCO rate control        */
-#define RNG_CTL_VCO_SHIFT  6
-#define RNG_CTL_ASEL       0x0000000000000030ULL /* Analog MUX select       */
-#define RNG_CTL_ASEL_SHIFT 4
+/* ver1 devices - n2-rng, vf-rng, kt-rng */
+#define RNG_v1_CTL_WAIT       0x0000000001fffe00ULL /* Minimum wait time    */
+#define RNG_v1_CTL_WAIT_SHIFT 9
+#define RNG_v1_CTL_BYPASS     0x0000000000000100ULL /* VCO voltage source   */
+#define RNG_v1_CTL_VCO        0x00000000000000c0ULL /* VCO rate control     */
+#define RNG_v1_CTL_VCO_SHIFT  6
+#define RNG_v1_CTL_ASEL       0x0000000000000030ULL /* Analog MUX select    */
+#define RNG_v1_CTL_ASEL_SHIFT 4
+#define RNG_v1_CTL_ASEL_NOOUT 2
+
+/* these are the same in v2 as in v1 */
 #define RNG_CTL_LFSR       0x0000000000000008ULL /* Use LFSR or plain shift */
 #define RNG_CTL_ES3        0x0000000000000004ULL /* Enable entropy source 3 */
 #define RNG_CTL_ES2        0x0000000000000002ULL /* Enable entropy source 2 */
 #define RNG_CTL_ES1        0x0000000000000001ULL /* Enable entropy source 1 */
 
+/* ver2 devices - m4-rng, m7-rng */
+#define RNG_v2_CTL_WAIT       0x0000000007fff800ULL /* Minimum wait time    */
+#define RNG_v2_CTL_WAIT_SHIFT 12
+#define RNG_v2_CTL_BYPASS     0x0000000000000400ULL /* VCO voltage source   */
+#define RNG_v2_CTL_VCO        0x0000000000000300ULL /* VCO rate control     */
+#define RNG_v2_CTL_VCO_SHIFT  9
+#define RNG_v2_CTL_PERF       0x0000000000000180ULL /* Perf */
+#define RNG_v2_CTL_ASEL       0x0000000000000070ULL /* Analog MUX select    */
+#define RNG_v2_CTL_ASEL_SHIFT 4
+#define RNG_v2_CTL_ASEL_NOOUT 7
+
+
 #define HV_FAST_RNG_GET_DIAG_CTL	0x130
 #define HV_FAST_RNG_CTL_READ		0x131
 #define HV_FAST_RNG_CTL_WRITE		0x132
@@ -60,6 +76,20 @@ extern unsigned long sun4v_rng_data_read_diag_v2(unsigned long data_ra,
 extern unsigned long sun4v_rng_data_read(unsigned long data_ra,
 					 unsigned long *tick_delta);
 
+enum n2rng_compat_id {
+	N2_n2_rng,
+	N2_vf_rng,
+	N2_kt_rng,
+	N2_m4_rng,
+	N2_m7_rng,
+};
+
+struct n2rng_template {
+	enum n2rng_compat_id id;
+	int multi_capable;
+	int chip_version;
+};
+
 struct n2rng_unit {
 	u64			control[HV_RNG_NUM_CONTROL];
 };
@@ -74,6 +104,7 @@ struct n2rng {
 #define N2RNG_FLAG_SHUTDOWN	0x00000010 /* Driver unregistering        */
 #define N2RNG_FLAG_BUFFER_VALID	0x00000020 /* u32 buffer holds valid data */
 
+	struct n2rng_template	*data;
 	int			num_units;
 	struct n2rng_unit	*units;
 
@@ -97,8 +128,10 @@ struct n2rng {
 
 	u64			scratch_control[HV_RNG_NUM_CONTROL];
 
-#define SELFTEST_TICKS		38859
-#define SELFTEST_VAL		((u64)0xB8820C7BD387E32C)
+#define RNG_v1_SELFTEST_TICKS	38859
+#define RNG_v1_SELFTEST_VAL	((u64)0xB8820C7BD387E32C)
+#define RNG_v2_SELFTEST_TICKS	64
+#define RNG_v2_SELFTEST_VAL	((u64)0xffffffffffffffff)
 #define SELFTEST_POLY		((u64)0x231DCEE91262B8A3)
 #define SELFTEST_MATCH_GOAL	6
 #define SELFTEST_LOOPS_MAX	40000
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 79564785ae30..2cac445b02fd 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -339,7 +339,7 @@ config CRYPTO_DEV_OMAP_DES
 
 config CRYPTO_DEV_PICOXCELL
 	tristate "Support for picoXcell IPSEC and Layer2 crypto engines"
-	depends on ARCH_PICOXCELL && HAVE_CLK
+	depends on (ARCH_PICOXCELL || COMPILE_TEST) && HAVE_CLK
 	select CRYPTO_AEAD
 	select CRYPTO_AES
 	select CRYPTO_AUTHENC
@@ -415,10 +415,23 @@ config CRYPTO_DEV_BFIN_CRC
 	  Newer Blackfin processors have CRC hardware. Select this if you
 	  want to use the Blackfin CRC module.
 
+config CRYPTO_DEV_ATMEL_AUTHENC
+	tristate "Support for Atmel IPSEC/SSL hw accelerator"
+	depends on HAS_DMA
+	depends on ARCH_AT91 || COMPILE_TEST
+	select CRYPTO_AUTHENC
+	select CRYPTO_DEV_ATMEL_AES
+	select CRYPTO_DEV_ATMEL_SHA
+	help
+	  Some Atmel processors can combine the AES and SHA hw accelerators
+	  to enhance support of IPSEC/SSL.
+	  Select this if you want to use the Atmel modules for
+	  authenc(hmac(shaX),Y(cbc)) algorithms.
+
 config CRYPTO_DEV_ATMEL_AES
 	tristate "Support for Atmel AES hw accelerator"
 	depends on HAS_DMA
-	depends on AT_XDMAC || AT_HDMAC || COMPILE_TEST
+	depends on ARCH_AT91 || COMPILE_TEST
 	select CRYPTO_AES
 	select CRYPTO_AEAD
 	select CRYPTO_BLKCIPHER
@@ -432,7 +445,7 @@ config CRYPTO_DEV_ATMEL_AES
 
 config CRYPTO_DEV_ATMEL_TDES
 	tristate "Support for Atmel DES/TDES hw accelerator"
-	depends on ARCH_AT91
+	depends on ARCH_AT91 || COMPILE_TEST
 	select CRYPTO_DES
 	select CRYPTO_BLKCIPHER
 	help
@@ -445,7 +458,7 @@ config CRYPTO_DEV_ATMEL_TDES
 
 config CRYPTO_DEV_ATMEL_SHA
 	tristate "Support for Atmel SHA hw accelerator"
-	depends on ARCH_AT91
+	depends on ARCH_AT91 || COMPILE_TEST
 	select CRYPTO_HASH
 	help
 	  Some Atmel processors have SHA1/SHA224/SHA256/SHA384/SHA512
@@ -484,6 +497,7 @@ config CRYPTO_DEV_MXS_DCP
 	  will be called mxs-dcp.
 
 source "drivers/crypto/qat/Kconfig"
+source "drivers/crypto/cavium/cpt/Kconfig"
 
 config CRYPTO_DEV_QCE
 	tristate "Qualcomm crypto engine accelerator"
@@ -553,8 +567,39 @@ config CRYPTO_DEV_ROCKCHIP
 	  This driver interfaces with the hardware crypto accelerator.
 	  Supporting cbc/ecb chainmode, and aes/des/des3_ede cipher mode.
 
+config CRYPTO_DEV_MEDIATEK
+	tristate "MediaTek's EIP97 Cryptographic Engine driver"
+	depends on (ARM && ARCH_MEDIATEK) || COMPILE_TEST
+	select CRYPTO_AES
+	select CRYPTO_AEAD
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CTR
+	select CRYPTO_SHA1
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	select CRYPTO_HMAC
+	help
+	  This driver allows you to utilize the hardware crypto accelerator
+	  EIP97 which can be found on the MT7623 MT2701, MT8521p, etc ....
+	  Select this if you want to use it for AES/SHA1/SHA2 algorithms.
+
 source "drivers/crypto/chelsio/Kconfig"
 
 source "drivers/crypto/virtio/Kconfig"
 
+config CRYPTO_DEV_BCM_SPU
+	tristate "Broadcom symmetric crypto/hash acceleration support"
+	depends on ARCH_BCM_IPROC
+	depends on BCM_PDC_MBOX
+	default m
+	select CRYPTO_DES
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	help
+	  This driver provides support for Broadcom crypto acceleration using the
+	  Secure Processing Unit (SPU). The SPU driver registers ablkcipher,
+	  ahash, and aead algorithms with the kernel cryptographic API.
+
 endif # CRYPTO_HW
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index bc53cb833a06..739609471169 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -3,6 +3,8 @@ obj-$(CONFIG_CRYPTO_DEV_ATMEL_SHA) += atmel-sha.o
 obj-$(CONFIG_CRYPTO_DEV_ATMEL_TDES) += atmel-tdes.o
 obj-$(CONFIG_CRYPTO_DEV_BFIN_CRC) += bfin_crc.o
 obj-$(CONFIG_CRYPTO_DEV_CCP) += ccp/
+obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chelsio/
+obj-$(CONFIG_CRYPTO_DEV_CPT) += cavium/cpt/
 obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM) += caam/
 obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-aes.o
 obj-$(CONFIG_CRYPTO_DEV_HIFN_795X) += hifn_795x.o
@@ -10,7 +12,9 @@ obj-$(CONFIG_CRYPTO_DEV_IMGTEC_HASH) += img-hash.o
 obj-$(CONFIG_CRYPTO_DEV_IXP4XX) += ixp4xx_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_MV_CESA) += mv_cesa.o
 obj-$(CONFIG_CRYPTO_DEV_MARVELL_CESA) += marvell/
+obj-$(CONFIG_CRYPTO_DEV_MEDIATEK) += mediatek/
 obj-$(CONFIG_CRYPTO_DEV_MXS_DCP) += mxs-dcp.o
+obj-$(CONFIG_CRYPTO_DEV_MXC_SCC) += mxc-scc.o
 obj-$(CONFIG_CRYPTO_DEV_NIAGARA2) += n2_crypto.o
 n2_crypto-y := n2_core.o n2_asm.o
 obj-$(CONFIG_CRYPTO_DEV_NX) += nx/
@@ -21,15 +25,14 @@ obj-$(CONFIG_CRYPTO_DEV_PADLOCK_AES) += padlock-aes.o
 obj-$(CONFIG_CRYPTO_DEV_PADLOCK_SHA) += padlock-sha.o
 obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_PPC4XX) += amcc/
-obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
-obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o
-obj-$(CONFIG_CRYPTO_DEV_MXC_SCC) += mxc-scc.o
-obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
-obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
 obj-$(CONFIG_CRYPTO_DEV_QAT) += qat/
 obj-$(CONFIG_CRYPTO_DEV_QCE) += qce/
-obj-$(CONFIG_CRYPTO_DEV_VMX) += vmx/
-obj-$(CONFIG_CRYPTO_DEV_SUN4I_SS) += sunxi-ss/
 obj-$(CONFIG_CRYPTO_DEV_ROCKCHIP) += rockchip/
-obj-$(CONFIG_CRYPTO_DEV_CHELSIO) += chelsio/
+obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
+obj-$(CONFIG_CRYPTO_DEV_SAHARA) += sahara.o
+obj-$(CONFIG_CRYPTO_DEV_SUN4I_SS) += sunxi-ss/
+obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
+obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
 obj-$(CONFIG_CRYPTO_DEV_VIRTIO) += virtio/
+obj-$(CONFIG_CRYPTO_DEV_VMX) += vmx/
+obj-$(CONFIG_CRYPTO_DEV_BCM_SPU) += bcm/
diff --git a/drivers/crypto/atmel-aes-regs.h b/drivers/crypto/atmel-aes-regs.h
index 0ec04407b533..7694679802b3 100644
--- a/drivers/crypto/atmel-aes-regs.h
+++ b/drivers/crypto/atmel-aes-regs.h
@@ -68,6 +68,22 @@
 #define AES_CTRR	0x98
 #define AES_GCMHR(x)	(0x9c + ((x) * 0x04))
 
+#define AES_EMR		0xb0
+#define AES_EMR_APEN		BIT(0)	/* Auto Padding Enable */
+#define AES_EMR_APM		BIT(1)	/* Auto Padding Mode */
+#define AES_EMR_APM_IPSEC	0x0
+#define AES_EMR_APM_SSL		BIT(1)
+#define AES_EMR_PLIPEN		BIT(4)	/* PLIP Enable */
+#define AES_EMR_PLIPD		BIT(5)	/* PLIP Decipher */
+#define AES_EMR_PADLEN_MASK	(0xFu << 8)
+#define AES_EMR_PADLEN_OFFSET	8
+#define AES_EMR_PADLEN(padlen)	(((padlen) << AES_EMR_PADLEN_OFFSET) &\
+				 AES_EMR_PADLEN_MASK)
+#define AES_EMR_NHEAD_MASK	(0xFu << 16)
+#define AES_EMR_NHEAD_OFFSET	16
+#define AES_EMR_NHEAD(nhead)	(((nhead) << AES_EMR_NHEAD_OFFSET) &\
+				 AES_EMR_NHEAD_MASK)
+
 #define AES_TWR(x)	(0xc0 + ((x) * 0x04))
 #define AES_ALPHAR(x)	(0xd0 + ((x) * 0x04))
 
diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c
index 0e3d0d655b96..29e20c37f3a6 100644
--- a/drivers/crypto/atmel-aes.c
+++ b/drivers/crypto/atmel-aes.c
@@ -41,6 +41,7 @@
 #include <linux/platform_data/crypto-atmel.h>
 #include <dt-bindings/dma/at91.h>
 #include "atmel-aes-regs.h"
+#include "atmel-authenc.h"
 
 #define ATMEL_AES_PRIORITY	300
 
@@ -78,6 +79,7 @@
 #define AES_FLAGS_INIT		BIT(2)
 #define AES_FLAGS_BUSY		BIT(3)
 #define AES_FLAGS_DUMP_REG	BIT(4)
+#define AES_FLAGS_OWN_SHA	BIT(5)
 
 #define AES_FLAGS_PERSISTENT	(AES_FLAGS_INIT | AES_FLAGS_BUSY)
 
@@ -92,6 +94,7 @@ struct atmel_aes_caps {
 	bool			has_ctr32;
 	bool			has_gcm;
 	bool			has_xts;
+	bool			has_authenc;
 	u32			max_burst_size;
 };
 
@@ -144,10 +147,31 @@ struct atmel_aes_xts_ctx {
 	u32			key2[AES_KEYSIZE_256 / sizeof(u32)];
 };
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+struct atmel_aes_authenc_ctx {
+	struct atmel_aes_base_ctx	base;
+	struct atmel_sha_authenc_ctx	*auth;
+};
+#endif
+
 struct atmel_aes_reqctx {
 	unsigned long		mode;
 };
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+struct atmel_aes_authenc_reqctx {
+	struct atmel_aes_reqctx	base;
+
+	struct scatterlist	src[2];
+	struct scatterlist	dst[2];
+	size_t			textlen;
+	u32			digest[SHA512_DIGEST_SIZE / sizeof(u32)];
+
+	/* auth_req MUST be place last. */
+	struct ahash_request	auth_req;
+};
+#endif
+
 struct atmel_aes_dma {
 	struct dma_chan		*chan;
 	struct scatterlist	*sg;
@@ -291,6 +315,9 @@ static const char *atmel_aes_reg_name(u32 offset, char *tmp, size_t sz)
 		snprintf(tmp, sz, "GCMHR[%u]", (offset - AES_GCMHR(0)) >> 2);
 		break;
 
+	case AES_EMR:
+		return "EMR";
+
 	case AES_TWR(0):
 	case AES_TWR(1):
 	case AES_TWR(2):
@@ -463,8 +490,16 @@ static inline bool atmel_aes_is_encrypt(const struct atmel_aes_dev *dd)
 	return (dd->flags & AES_FLAGS_ENCRYPT);
 }
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+static void atmel_aes_authenc_complete(struct atmel_aes_dev *dd, int err);
+#endif
+
 static inline int atmel_aes_complete(struct atmel_aes_dev *dd, int err)
 {
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+	atmel_aes_authenc_complete(dd, err);
+#endif
+
 	clk_disable(dd->iclk);
 	dd->flags &= ~AES_FLAGS_BUSY;
 
@@ -879,6 +914,7 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
 	struct crypto_async_request *areq, *backlog;
 	struct atmel_aes_base_ctx *ctx;
 	unsigned long flags;
+	bool start_async;
 	int err, ret = 0;
 
 	spin_lock_irqsave(&dd->lock, flags);
@@ -904,10 +940,12 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
 
 	dd->areq = areq;
 	dd->ctx = ctx;
-	dd->is_async = (areq != new_areq);
+	start_async = (areq != new_areq);
+	dd->is_async = start_async;
 
+	/* WARNING: ctx->start() MAY change dd->is_async. */
 	err = ctx->start(dd);
-	return (dd->is_async) ? ret : err;
+	return (start_async) ? ret : err;
 }
 
 
@@ -1928,6 +1966,384 @@ static struct crypto_alg aes_xts_alg = {
 	}
 };
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+/* authenc aead functions */
+
+static int atmel_aes_authenc_start(struct atmel_aes_dev *dd);
+static int atmel_aes_authenc_init(struct atmel_aes_dev *dd, int err,
+				  bool is_async);
+static int atmel_aes_authenc_transfer(struct atmel_aes_dev *dd, int err,
+				      bool is_async);
+static int atmel_aes_authenc_digest(struct atmel_aes_dev *dd);
+static int atmel_aes_authenc_final(struct atmel_aes_dev *dd, int err,
+				   bool is_async);
+
+static void atmel_aes_authenc_complete(struct atmel_aes_dev *dd, int err)
+{
+	struct aead_request *req = aead_request_cast(dd->areq);
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+
+	if (err && (dd->flags & AES_FLAGS_OWN_SHA))
+		atmel_sha_authenc_abort(&rctx->auth_req);
+	dd->flags &= ~AES_FLAGS_OWN_SHA;
+}
+
+static int atmel_aes_authenc_start(struct atmel_aes_dev *dd)
+{
+	struct aead_request *req = aead_request_cast(dd->areq);
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
+	int err;
+
+	atmel_aes_set_mode(dd, &rctx->base);
+
+	err = atmel_aes_hw_init(dd);
+	if (err)
+		return atmel_aes_complete(dd, err);
+
+	return atmel_sha_authenc_schedule(&rctx->auth_req, ctx->auth,
+					  atmel_aes_authenc_init, dd);
+}
+
+static int atmel_aes_authenc_init(struct atmel_aes_dev *dd, int err,
+				  bool is_async)
+{
+	struct aead_request *req = aead_request_cast(dd->areq);
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+
+	if (is_async)
+		dd->is_async = true;
+	if (err)
+		return atmel_aes_complete(dd, err);
+
+	/* If here, we've got the ownership of the SHA device. */
+	dd->flags |= AES_FLAGS_OWN_SHA;
+
+	/* Configure the SHA device. */
+	return atmel_sha_authenc_init(&rctx->auth_req,
+				      req->src, req->assoclen,
+				      rctx->textlen,
+				      atmel_aes_authenc_transfer, dd);
+}
+
+static int atmel_aes_authenc_transfer(struct atmel_aes_dev *dd, int err,
+				      bool is_async)
+{
+	struct aead_request *req = aead_request_cast(dd->areq);
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+	bool enc = atmel_aes_is_encrypt(dd);
+	struct scatterlist *src, *dst;
+	u32 iv[AES_BLOCK_SIZE / sizeof(u32)];
+	u32 emr;
+
+	if (is_async)
+		dd->is_async = true;
+	if (err)
+		return atmel_aes_complete(dd, err);
+
+	/* Prepare src and dst scatter-lists to transfer cipher/plain texts. */
+	src = scatterwalk_ffwd(rctx->src, req->src, req->assoclen);
+	dst = src;
+
+	if (req->src != req->dst)
+		dst = scatterwalk_ffwd(rctx->dst, req->dst, req->assoclen);
+
+	/* Configure the AES device. */
+	memcpy(iv, req->iv, sizeof(iv));
+
+	/*
+	 * Here we always set the 2nd parameter of atmel_aes_write_ctrl() to
+	 * 'true' even if the data transfer is actually performed by the CPU (so
+	 * not by the DMA) because we must force the AES_MR_SMOD bitfield to the
+	 * value AES_MR_SMOD_IDATAR0. Indeed, both AES_MR_SMOD and SHA_MR_SMOD
+	 * must be set to *_MR_SMOD_IDATAR0.
+	 */
+	atmel_aes_write_ctrl(dd, true, iv);
+	emr = AES_EMR_PLIPEN;
+	if (!enc)
+		emr |= AES_EMR_PLIPD;
+	atmel_aes_write(dd, AES_EMR, emr);
+
+	/* Transfer data. */
+	return atmel_aes_dma_start(dd, src, dst, rctx->textlen,
+				   atmel_aes_authenc_digest);
+}
+
+static int atmel_aes_authenc_digest(struct atmel_aes_dev *dd)
+{
+	struct aead_request *req = aead_request_cast(dd->areq);
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+
+	/* atmel_sha_authenc_final() releases the SHA device. */
+	dd->flags &= ~AES_FLAGS_OWN_SHA;
+	return atmel_sha_authenc_final(&rctx->auth_req,
+				       rctx->digest, sizeof(rctx->digest),
+				       atmel_aes_authenc_final, dd);
+}
+
+static int atmel_aes_authenc_final(struct atmel_aes_dev *dd, int err,
+				   bool is_async)
+{
+	struct aead_request *req = aead_request_cast(dd->areq);
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	bool enc = atmel_aes_is_encrypt(dd);
+	u32 idigest[SHA512_DIGEST_SIZE / sizeof(u32)], *odigest = rctx->digest;
+	u32 offs, authsize;
+
+	if (is_async)
+		dd->is_async = true;
+	if (err)
+		goto complete;
+
+	offs = req->assoclen + rctx->textlen;
+	authsize = crypto_aead_authsize(tfm);
+	if (enc) {
+		scatterwalk_map_and_copy(odigest, req->dst, offs, authsize, 1);
+	} else {
+		scatterwalk_map_and_copy(idigest, req->src, offs, authsize, 0);
+		if (crypto_memneq(idigest, odigest, authsize))
+			err = -EBADMSG;
+	}
+
+complete:
+	return atmel_aes_complete(dd, err);
+}
+
+static int atmel_aes_authenc_setkey(struct crypto_aead *tfm, const u8 *key,
+				    unsigned int keylen)
+{
+	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
+	struct crypto_authenc_keys keys;
+	u32 flags;
+	int err;
+
+	if (crypto_authenc_extractkeys(&keys, key, keylen) != 0)
+		goto badkey;
+
+	if (keys.enckeylen > sizeof(ctx->base.key))
+		goto badkey;
+
+	/* Save auth key. */
+	flags = crypto_aead_get_flags(tfm);
+	err = atmel_sha_authenc_setkey(ctx->auth,
+				       keys.authkey, keys.authkeylen,
+				       &flags);
+	crypto_aead_set_flags(tfm, flags & CRYPTO_TFM_RES_MASK);
+	if (err) {
+		memzero_explicit(&keys, sizeof(keys));
+		return err;
+	}
+
+	/* Save enc key. */
+	ctx->base.keylen = keys.enckeylen;
+	memcpy(ctx->base.key, keys.enckey, keys.enckeylen);
+
+	memzero_explicit(&keys, sizeof(keys));
+	return 0;
+
+badkey:
+	crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	memzero_explicit(&key, sizeof(keys));
+	return -EINVAL;
+}
+
+static int atmel_aes_authenc_init_tfm(struct crypto_aead *tfm,
+				      unsigned long auth_mode)
+{
+	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
+	unsigned int auth_reqsize = atmel_sha_authenc_get_reqsize();
+
+	ctx->auth = atmel_sha_authenc_spawn(auth_mode);
+	if (IS_ERR(ctx->auth))
+		return PTR_ERR(ctx->auth);
+
+	crypto_aead_set_reqsize(tfm, (sizeof(struct atmel_aes_authenc_reqctx) +
+				      auth_reqsize));
+	ctx->base.start = atmel_aes_authenc_start;
+
+	return 0;
+}
+
+static int atmel_aes_authenc_hmac_sha1_init_tfm(struct crypto_aead *tfm)
+{
+	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA1);
+}
+
+static int atmel_aes_authenc_hmac_sha224_init_tfm(struct crypto_aead *tfm)
+{
+	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA224);
+}
+
+static int atmel_aes_authenc_hmac_sha256_init_tfm(struct crypto_aead *tfm)
+{
+	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA256);
+}
+
+static int atmel_aes_authenc_hmac_sha384_init_tfm(struct crypto_aead *tfm)
+{
+	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA384);
+}
+
+static int atmel_aes_authenc_hmac_sha512_init_tfm(struct crypto_aead *tfm)
+{
+	return atmel_aes_authenc_init_tfm(tfm, SHA_FLAGS_HMAC_SHA512);
+}
+
+static void atmel_aes_authenc_exit_tfm(struct crypto_aead *tfm)
+{
+	struct atmel_aes_authenc_ctx *ctx = crypto_aead_ctx(tfm);
+
+	atmel_sha_authenc_free(ctx->auth);
+}
+
+static int atmel_aes_authenc_crypt(struct aead_request *req,
+				   unsigned long mode)
+{
+	struct atmel_aes_authenc_reqctx *rctx = aead_request_ctx(req);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct atmel_aes_base_ctx *ctx = crypto_aead_ctx(tfm);
+	u32 authsize = crypto_aead_authsize(tfm);
+	bool enc = (mode & AES_FLAGS_ENCRYPT);
+	struct atmel_aes_dev *dd;
+
+	/* Compute text length. */
+	if (!enc && req->cryptlen < authsize)
+		return -EINVAL;
+	rctx->textlen = req->cryptlen - (enc ? 0 : authsize);
+
+	/*
+	 * Currently, empty messages are not supported yet:
+	 * the SHA auto-padding can be used only on non-empty messages.
+	 * Hence a special case needs to be implemented for empty message.
+	 */
+	if (!rctx->textlen && !req->assoclen)
+		return -EINVAL;
+
+	rctx->base.mode = mode;
+	ctx->block_size = AES_BLOCK_SIZE;
+
+	dd = atmel_aes_find_dev(ctx);
+	if (!dd)
+		return -ENODEV;
+
+	return atmel_aes_handle_queue(dd, &req->base);
+}
+
+static int atmel_aes_authenc_cbc_aes_encrypt(struct aead_request *req)
+{
+	return atmel_aes_authenc_crypt(req, AES_FLAGS_CBC | AES_FLAGS_ENCRYPT);
+}
+
+static int atmel_aes_authenc_cbc_aes_decrypt(struct aead_request *req)
+{
+	return atmel_aes_authenc_crypt(req, AES_FLAGS_CBC);
+}
+
+static struct aead_alg aes_authenc_algs[] = {
+{
+	.setkey		= atmel_aes_authenc_setkey,
+	.encrypt	= atmel_aes_authenc_cbc_aes_encrypt,
+	.decrypt	= atmel_aes_authenc_cbc_aes_decrypt,
+	.init		= atmel_aes_authenc_hmac_sha1_init_tfm,
+	.exit		= atmel_aes_authenc_exit_tfm,
+	.ivsize		= AES_BLOCK_SIZE,
+	.maxauthsize	= SHA1_DIGEST_SIZE,
+
+	.base = {
+		.cra_name		= "authenc(hmac(sha1),cbc(aes))",
+		.cra_driver_name	= "atmel-authenc-hmac-sha1-cbc-aes",
+		.cra_priority		= ATMEL_AES_PRIORITY,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct atmel_aes_authenc_ctx),
+		.cra_alignmask		= 0xf,
+		.cra_module		= THIS_MODULE,
+	},
+},
+{
+	.setkey		= atmel_aes_authenc_setkey,
+	.encrypt	= atmel_aes_authenc_cbc_aes_encrypt,
+	.decrypt	= atmel_aes_authenc_cbc_aes_decrypt,
+	.init		= atmel_aes_authenc_hmac_sha224_init_tfm,
+	.exit		= atmel_aes_authenc_exit_tfm,
+	.ivsize		= AES_BLOCK_SIZE,
+	.maxauthsize	= SHA224_DIGEST_SIZE,
+
+	.base = {
+		.cra_name		= "authenc(hmac(sha224),cbc(aes))",
+		.cra_driver_name	= "atmel-authenc-hmac-sha224-cbc-aes",
+		.cra_priority		= ATMEL_AES_PRIORITY,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct atmel_aes_authenc_ctx),
+		.cra_alignmask		= 0xf,
+		.cra_module		= THIS_MODULE,
+	},
+},
+{
+	.setkey		= atmel_aes_authenc_setkey,
+	.encrypt	= atmel_aes_authenc_cbc_aes_encrypt,
+	.decrypt	= atmel_aes_authenc_cbc_aes_decrypt,
+	.init		= atmel_aes_authenc_hmac_sha256_init_tfm,
+	.exit		= atmel_aes_authenc_exit_tfm,
+	.ivsize		= AES_BLOCK_SIZE,
+	.maxauthsize	= SHA256_DIGEST_SIZE,
+
+	.base = {
+		.cra_name		= "authenc(hmac(sha256),cbc(aes))",
+		.cra_driver_name	= "atmel-authenc-hmac-sha256-cbc-aes",
+		.cra_priority		= ATMEL_AES_PRIORITY,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct atmel_aes_authenc_ctx),
+		.cra_alignmask		= 0xf,
+		.cra_module		= THIS_MODULE,
+	},
+},
+{
+	.setkey		= atmel_aes_authenc_setkey,
+	.encrypt	= atmel_aes_authenc_cbc_aes_encrypt,
+	.decrypt	= atmel_aes_authenc_cbc_aes_decrypt,
+	.init		= atmel_aes_authenc_hmac_sha384_init_tfm,
+	.exit		= atmel_aes_authenc_exit_tfm,
+	.ivsize		= AES_BLOCK_SIZE,
+	.maxauthsize	= SHA384_DIGEST_SIZE,
+
+	.base = {
+		.cra_name		= "authenc(hmac(sha384),cbc(aes))",
+		.cra_driver_name	= "atmel-authenc-hmac-sha384-cbc-aes",
+		.cra_priority		= ATMEL_AES_PRIORITY,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct atmel_aes_authenc_ctx),
+		.cra_alignmask		= 0xf,
+		.cra_module		= THIS_MODULE,
+	},
+},
+{
+	.setkey		= atmel_aes_authenc_setkey,
+	.encrypt	= atmel_aes_authenc_cbc_aes_encrypt,
+	.decrypt	= atmel_aes_authenc_cbc_aes_decrypt,
+	.init		= atmel_aes_authenc_hmac_sha512_init_tfm,
+	.exit		= atmel_aes_authenc_exit_tfm,
+	.ivsize		= AES_BLOCK_SIZE,
+	.maxauthsize	= SHA512_DIGEST_SIZE,
+
+	.base = {
+		.cra_name		= "authenc(hmac(sha512),cbc(aes))",
+		.cra_driver_name	= "atmel-authenc-hmac-sha512-cbc-aes",
+		.cra_priority		= ATMEL_AES_PRIORITY,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct atmel_aes_authenc_ctx),
+		.cra_alignmask		= 0xf,
+		.cra_module		= THIS_MODULE,
+	},
+},
+};
+#endif /* CONFIG_CRYPTO_DEV_ATMEL_AUTHENC */
 
 /* Probe functions */
 
@@ -2037,6 +2453,12 @@ static void atmel_aes_unregister_algs(struct atmel_aes_dev *dd)
 {
 	int i;
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+	if (dd->caps.has_authenc)
+		for (i = 0; i < ARRAY_SIZE(aes_authenc_algs); i++)
+			crypto_unregister_aead(&aes_authenc_algs[i]);
+#endif
+
 	if (dd->caps.has_xts)
 		crypto_unregister_alg(&aes_xts_alg);
 
@@ -2078,8 +2500,25 @@ static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
 			goto err_aes_xts_alg;
 	}
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+	if (dd->caps.has_authenc) {
+		for (i = 0; i < ARRAY_SIZE(aes_authenc_algs); i++) {
+			err = crypto_register_aead(&aes_authenc_algs[i]);
+			if (err)
+				goto err_aes_authenc_alg;
+		}
+	}
+#endif
+
 	return 0;
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+	/* i = ARRAY_SIZE(aes_authenc_algs); */
+err_aes_authenc_alg:
+	for (j = 0; j < i; j++)
+		crypto_unregister_aead(&aes_authenc_algs[j]);
+	crypto_unregister_alg(&aes_xts_alg);
+#endif
 err_aes_xts_alg:
 	crypto_unregister_aead(&aes_gcm_alg);
 err_aes_gcm_alg:
@@ -2100,6 +2539,7 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
 	dd->caps.has_ctr32 = 0;
 	dd->caps.has_gcm = 0;
 	dd->caps.has_xts = 0;
+	dd->caps.has_authenc = 0;
 	dd->caps.max_burst_size = 1;
 
 	/* keep only major version number */
@@ -2110,6 +2550,7 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
 		dd->caps.has_ctr32 = 1;
 		dd->caps.has_gcm = 1;
 		dd->caps.has_xts = 1;
+		dd->caps.has_authenc = 1;
 		dd->caps.max_burst_size = 4;
 		break;
 	case 0x200:
@@ -2268,6 +2709,13 @@ static int atmel_aes_probe(struct platform_device *pdev)
 
 	atmel_aes_get_cap(aes_dd);
 
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+	if (aes_dd->caps.has_authenc && !atmel_sha_authenc_is_ready()) {
+		err = -EPROBE_DEFER;
+		goto iclk_unprepare;
+	}
+#endif
+
 	err = atmel_aes_buff_init(aes_dd);
 	if (err)
 		goto err_aes_buff;
@@ -2304,7 +2752,8 @@ res_err:
 	tasklet_kill(&aes_dd->done_task);
 	tasklet_kill(&aes_dd->queue_task);
 aes_dd_err:
-	dev_err(dev, "initialization failed.\n");
+	if (err != -EPROBE_DEFER)
+		dev_err(dev, "initialization failed.\n");
 
 	return err;
 }
diff --git a/drivers/crypto/atmel-authenc.h b/drivers/crypto/atmel-authenc.h
new file mode 100644
index 000000000000..2a60d1224143
--- /dev/null
+++ b/drivers/crypto/atmel-authenc.h
@@ -0,0 +1,64 @@
+/*
+ * API for Atmel Secure Protocol Layers Improved Performances (SPLIP)
+ *
+ * Copyright (C) 2016 Atmel Corporation
+ *
+ * Author: Cyrille Pitchen <cyrille.pitchen@atmel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This driver is based on drivers/mtd/spi-nor/fsl-quadspi.c from Freescale.
+ */
+
+#ifndef __ATMEL_AUTHENC_H__
+#define __ATMEL_AUTHENC_H__
+
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+
+#include <crypto/authenc.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "atmel-sha-regs.h"
+
+struct atmel_aes_dev;
+typedef int (*atmel_aes_authenc_fn_t)(struct atmel_aes_dev *, int, bool);
+
+struct atmel_sha_authenc_ctx;
+
+bool atmel_sha_authenc_is_ready(void);
+unsigned int atmel_sha_authenc_get_reqsize(void);
+
+struct atmel_sha_authenc_ctx *atmel_sha_authenc_spawn(unsigned long mode);
+void atmel_sha_authenc_free(struct atmel_sha_authenc_ctx *auth);
+int atmel_sha_authenc_setkey(struct atmel_sha_authenc_ctx *auth,
+			     const u8 *key, unsigned int keylen,
+			     u32 *flags);
+
+int atmel_sha_authenc_schedule(struct ahash_request *req,
+			       struct atmel_sha_authenc_ctx *auth,
+			       atmel_aes_authenc_fn_t cb,
+			       struct atmel_aes_dev *dd);
+int atmel_sha_authenc_init(struct ahash_request *req,
+			   struct scatterlist *assoc, unsigned int assoclen,
+			   unsigned int textlen,
+			   atmel_aes_authenc_fn_t cb,
+			   struct atmel_aes_dev *dd);
+int atmel_sha_authenc_final(struct ahash_request *req,
+			    u32 *digest, unsigned int digestlen,
+			    atmel_aes_authenc_fn_t cb,
+			    struct atmel_aes_dev *dd);
+void  atmel_sha_authenc_abort(struct ahash_request *req);
+
+#endif /* CONFIG_CRYPTO_DEV_ATMEL_AUTHENC */
+
+#endif /* __ATMEL_AUTHENC_H__ */
diff --git a/drivers/crypto/atmel-sha-regs.h b/drivers/crypto/atmel-sha-regs.h
index e08897109cab..1b0eba4a2706 100644
--- a/drivers/crypto/atmel-sha-regs.h
+++ b/drivers/crypto/atmel-sha-regs.h
@@ -16,16 +16,33 @@
 #define SHA_MR_MODE_MANUAL		0x0
 #define SHA_MR_MODE_AUTO		0x1
 #define SHA_MR_MODE_PDC			0x2
+#define SHA_MR_MODE_IDATAR0		0x2
 #define SHA_MR_PROCDLY			(1 << 4)
 #define SHA_MR_UIHV			(1 << 5)
 #define SHA_MR_UIEHV			(1 << 6)
+#define SHA_MR_ALGO_MASK		GENMASK(10, 8)
 #define SHA_MR_ALGO_SHA1		(0 << 8)
 #define SHA_MR_ALGO_SHA256		(1 << 8)
 #define SHA_MR_ALGO_SHA384		(2 << 8)
 #define SHA_MR_ALGO_SHA512		(3 << 8)
 #define SHA_MR_ALGO_SHA224		(4 << 8)
+#define SHA_MR_HMAC			(1 << 11)
 #define	SHA_MR_DUALBUFF			(1 << 16)
 
+#define SHA_FLAGS_ALGO_MASK	SHA_MR_ALGO_MASK
+#define SHA_FLAGS_SHA1		SHA_MR_ALGO_SHA1
+#define SHA_FLAGS_SHA256	SHA_MR_ALGO_SHA256
+#define SHA_FLAGS_SHA384	SHA_MR_ALGO_SHA384
+#define SHA_FLAGS_SHA512	SHA_MR_ALGO_SHA512
+#define SHA_FLAGS_SHA224	SHA_MR_ALGO_SHA224
+#define SHA_FLAGS_HMAC		SHA_MR_HMAC
+#define SHA_FLAGS_HMAC_SHA1	(SHA_FLAGS_HMAC | SHA_FLAGS_SHA1)
+#define SHA_FLAGS_HMAC_SHA256	(SHA_FLAGS_HMAC | SHA_FLAGS_SHA256)
+#define SHA_FLAGS_HMAC_SHA384	(SHA_FLAGS_HMAC | SHA_FLAGS_SHA384)
+#define SHA_FLAGS_HMAC_SHA512	(SHA_FLAGS_HMAC | SHA_FLAGS_SHA512)
+#define SHA_FLAGS_HMAC_SHA224	(SHA_FLAGS_HMAC | SHA_FLAGS_SHA224)
+#define SHA_FLAGS_MODE_MASK	(SHA_FLAGS_HMAC | SHA_FLAGS_ALGO_MASK)
+
 #define SHA_IER				0x10
 #define SHA_IDR				0x14
 #define SHA_IMR				0x18
@@ -40,6 +57,9 @@
 #define SHA_ISR_URAT_MR			(0x2 << 12)
 #define SHA_ISR_URAT_WO			(0x5 << 12)
 
+#define SHA_MSR				0x20
+#define SHA_BCR				0x30
+
 #define	SHA_HW_VERSION		0xFC
 
 #define SHA_TPR				0x108
diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c
index 97e34799e077..a9482023d7d3 100644
--- a/drivers/crypto/atmel-sha.c
+++ b/drivers/crypto/atmel-sha.c
@@ -41,6 +41,7 @@
 #include <crypto/internal/hash.h>
 #include <linux/platform_data/crypto-atmel.h>
 #include "atmel-sha-regs.h"
+#include "atmel-authenc.h"
 
 /* SHA flags */
 #define SHA_FLAGS_BUSY			BIT(0)
@@ -50,21 +51,22 @@
 #define SHA_FLAGS_INIT			BIT(4)
 #define SHA_FLAGS_CPU			BIT(5)
 #define SHA_FLAGS_DMA_READY		BIT(6)
+#define SHA_FLAGS_DUMP_REG	BIT(7)
+
+/* bits[11:8] are reserved. */
 
 #define SHA_FLAGS_FINUP		BIT(16)
 #define SHA_FLAGS_SG		BIT(17)
-#define SHA_FLAGS_ALGO_MASK	GENMASK(22, 18)
-#define SHA_FLAGS_SHA1		BIT(18)
-#define SHA_FLAGS_SHA224	BIT(19)
-#define SHA_FLAGS_SHA256	BIT(20)
-#define SHA_FLAGS_SHA384	BIT(21)
-#define SHA_FLAGS_SHA512	BIT(22)
 #define SHA_FLAGS_ERROR		BIT(23)
 #define SHA_FLAGS_PAD		BIT(24)
 #define SHA_FLAGS_RESTORE	BIT(25)
+#define SHA_FLAGS_IDATAR0	BIT(26)
+#define SHA_FLAGS_WAIT_DATARDY	BIT(27)
 
+#define SHA_OP_INIT	0
 #define SHA_OP_UPDATE	1
 #define SHA_OP_FINAL	2
+#define SHA_OP_DIGEST	3
 
 #define SHA_BUFFER_LEN		(PAGE_SIZE / 16)
 
@@ -76,6 +78,7 @@ struct atmel_sha_caps {
 	bool	has_sha224;
 	bool	has_sha_384_512;
 	bool	has_uihv;
+	bool	has_hmac;
 };
 
 struct atmel_sha_dev;
@@ -101,12 +104,16 @@ struct atmel_sha_reqctx {
 	unsigned int	total;	/* total request */
 
 	size_t block_size;
+	size_t hash_size;
 
 	u8 buffer[SHA_BUFFER_LEN + SHA512_BLOCK_SIZE] __aligned(sizeof(u32));
 };
 
+typedef int (*atmel_sha_fn_t)(struct atmel_sha_dev *);
+
 struct atmel_sha_ctx {
 	struct atmel_sha_dev	*dd;
+	atmel_sha_fn_t		start;
 
 	unsigned long		flags;
 };
@@ -116,6 +123,9 @@ struct atmel_sha_ctx {
 struct atmel_sha_dma {
 	struct dma_chan			*chan;
 	struct dma_slave_config dma_conf;
+	struct scatterlist	*sg;
+	int			nents;
+	unsigned int		last_sg_length;
 };
 
 struct atmel_sha_dev {
@@ -134,11 +144,17 @@ struct atmel_sha_dev {
 	unsigned long		flags;
 	struct crypto_queue	queue;
 	struct ahash_request	*req;
+	bool			is_async;
+	bool			force_complete;
+	atmel_sha_fn_t		resume;
+	atmel_sha_fn_t		cpu_transfer_complete;
 
 	struct atmel_sha_dma	dma_lch_in;
 
 	struct atmel_sha_caps	caps;
 
+	struct scatterlist	tmp;
+
 	u32	hw_version;
 };
 
@@ -152,17 +168,140 @@ static struct atmel_sha_drv atmel_sha = {
 	.lock = __SPIN_LOCK_UNLOCKED(atmel_sha.lock),
 };
 
+#ifdef VERBOSE_DEBUG
+static const char *atmel_sha_reg_name(u32 offset, char *tmp, size_t sz, bool wr)
+{
+	switch (offset) {
+	case SHA_CR:
+		return "CR";
+
+	case SHA_MR:
+		return "MR";
+
+	case SHA_IER:
+		return "IER";
+
+	case SHA_IDR:
+		return "IDR";
+
+	case SHA_IMR:
+		return "IMR";
+
+	case SHA_ISR:
+		return "ISR";
+
+	case SHA_MSR:
+		return "MSR";
+
+	case SHA_BCR:
+		return "BCR";
+
+	case SHA_REG_DIN(0):
+	case SHA_REG_DIN(1):
+	case SHA_REG_DIN(2):
+	case SHA_REG_DIN(3):
+	case SHA_REG_DIN(4):
+	case SHA_REG_DIN(5):
+	case SHA_REG_DIN(6):
+	case SHA_REG_DIN(7):
+	case SHA_REG_DIN(8):
+	case SHA_REG_DIN(9):
+	case SHA_REG_DIN(10):
+	case SHA_REG_DIN(11):
+	case SHA_REG_DIN(12):
+	case SHA_REG_DIN(13):
+	case SHA_REG_DIN(14):
+	case SHA_REG_DIN(15):
+		snprintf(tmp, sz, "IDATAR[%u]", (offset - SHA_REG_DIN(0)) >> 2);
+		break;
+
+	case SHA_REG_DIGEST(0):
+	case SHA_REG_DIGEST(1):
+	case SHA_REG_DIGEST(2):
+	case SHA_REG_DIGEST(3):
+	case SHA_REG_DIGEST(4):
+	case SHA_REG_DIGEST(5):
+	case SHA_REG_DIGEST(6):
+	case SHA_REG_DIGEST(7):
+	case SHA_REG_DIGEST(8):
+	case SHA_REG_DIGEST(9):
+	case SHA_REG_DIGEST(10):
+	case SHA_REG_DIGEST(11):
+	case SHA_REG_DIGEST(12):
+	case SHA_REG_DIGEST(13):
+	case SHA_REG_DIGEST(14):
+	case SHA_REG_DIGEST(15):
+		if (wr)
+			snprintf(tmp, sz, "IDATAR[%u]",
+				 16u + ((offset - SHA_REG_DIGEST(0)) >> 2));
+		else
+			snprintf(tmp, sz, "ODATAR[%u]",
+				 (offset - SHA_REG_DIGEST(0)) >> 2);
+		break;
+
+	case SHA_HW_VERSION:
+		return "HWVER";
+
+	default:
+		snprintf(tmp, sz, "0x%02x", offset);
+		break;
+	}
+
+	return tmp;
+}
+
+#endif /* VERBOSE_DEBUG */
+
 static inline u32 atmel_sha_read(struct atmel_sha_dev *dd, u32 offset)
 {
-	return readl_relaxed(dd->io_base + offset);
+	u32 value = readl_relaxed(dd->io_base + offset);
+
+#ifdef VERBOSE_DEBUG
+	if (dd->flags & SHA_FLAGS_DUMP_REG) {
+		char tmp[16];
+
+		dev_vdbg(dd->dev, "read 0x%08x from %s\n", value,
+			 atmel_sha_reg_name(offset, tmp, sizeof(tmp), false));
+	}
+#endif /* VERBOSE_DEBUG */
+
+	return value;
 }
 
 static inline void atmel_sha_write(struct atmel_sha_dev *dd,
 					u32 offset, u32 value)
 {
+#ifdef VERBOSE_DEBUG
+	if (dd->flags & SHA_FLAGS_DUMP_REG) {
+		char tmp[16];
+
+		dev_vdbg(dd->dev, "write 0x%08x into %s\n", value,
+			 atmel_sha_reg_name(offset, tmp, sizeof(tmp), true));
+	}
+#endif /* VERBOSE_DEBUG */
+
 	writel_relaxed(value, dd->io_base + offset);
 }
 
+static inline int atmel_sha_complete(struct atmel_sha_dev *dd, int err)
+{
+	struct ahash_request *req = dd->req;
+
+	dd->flags &= ~(SHA_FLAGS_BUSY | SHA_FLAGS_FINAL | SHA_FLAGS_CPU |
+		       SHA_FLAGS_DMA_READY | SHA_FLAGS_OUTPUT_READY |
+		       SHA_FLAGS_DUMP_REG);
+
+	clk_disable(dd->iclk);
+
+	if ((dd->is_async || dd->force_complete) && req->base.complete)
+		req->base.complete(&req->base, err);
+
+	/* handle new request */
+	tasklet_schedule(&dd->queue_task);
+
+	return err;
+}
+
 static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
 {
 	size_t count;
@@ -241,7 +380,9 @@ static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length)
 	bits[1] = cpu_to_be64(size[0] << 3);
 	bits[0] = cpu_to_be64(size[1] << 3 | size[0] >> 61);
 
-	if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) {
+	switch (ctx->flags & SHA_FLAGS_ALGO_MASK) {
+	case SHA_FLAGS_SHA384:
+	case SHA_FLAGS_SHA512:
 		index = ctx->bufcnt & 0x7f;
 		padlen = (index < 112) ? (112 - index) : ((128+112) - index);
 		*(ctx->buffer + ctx->bufcnt) = 0x80;
@@ -249,7 +390,9 @@ static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length)
 		memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16);
 		ctx->bufcnt += padlen + 16;
 		ctx->flags |= SHA_FLAGS_PAD;
-	} else {
+		break;
+
+	default:
 		index = ctx->bufcnt & 0x3f;
 		padlen = (index < 56) ? (56 - index) : ((64+56) - index);
 		*(ctx->buffer + ctx->bufcnt) = 0x80;
@@ -257,14 +400,12 @@ static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length)
 		memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8);
 		ctx->bufcnt += padlen + 8;
 		ctx->flags |= SHA_FLAGS_PAD;
+		break;
 	}
 }
 
-static int atmel_sha_init(struct ahash_request *req)
+static struct atmel_sha_dev *atmel_sha_find_dev(struct atmel_sha_ctx *tctx)
 {
-	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct atmel_sha_ctx *tctx = crypto_ahash_ctx(tfm);
-	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
 	struct atmel_sha_dev *dd = NULL;
 	struct atmel_sha_dev *tmp;
 
@@ -281,6 +422,16 @@ static int atmel_sha_init(struct ahash_request *req)
 
 	spin_unlock_bh(&atmel_sha.lock);
 
+	return dd;
+}
+
+static int atmel_sha_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_ctx *tctx = crypto_ahash_ctx(tfm);
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct atmel_sha_dev *dd = atmel_sha_find_dev(tctx);
+
 	ctx->dd = dd;
 
 	ctx->flags = 0;
@@ -397,6 +548,19 @@ static void atmel_sha_write_ctrl(struct atmel_sha_dev *dd, int dma)
 	atmel_sha_write(dd, SHA_MR, valmr);
 }
 
+static inline int atmel_sha_wait_for_data_ready(struct atmel_sha_dev *dd,
+						atmel_sha_fn_t resume)
+{
+	u32 isr = atmel_sha_read(dd, SHA_ISR);
+
+	if (unlikely(isr & SHA_INT_DATARDY))
+		return resume(dd);
+
+	dd->resume = resume;
+	atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
+	return -EINPROGRESS;
+}
+
 static int atmel_sha_xmit_cpu(struct atmel_sha_dev *dd, const u8 *buf,
 			      size_t length, int final)
 {
@@ -404,7 +568,7 @@ static int atmel_sha_xmit_cpu(struct atmel_sha_dev *dd, const u8 *buf,
 	int count, len32;
 	const u32 *buffer = (const u32 *)buf;
 
-	dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+	dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %zd, final: %d\n",
 		ctx->digcnt[1], ctx->digcnt[0], length, final);
 
 	atmel_sha_write_ctrl(dd, 0);
@@ -433,7 +597,7 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
 	int len32;
 
-	dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+	dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %zd, final: %d\n",
 		ctx->digcnt[1], ctx->digcnt[0], length1, final);
 
 	len32 = DIV_ROUND_UP(length1, sizeof(u32));
@@ -467,6 +631,8 @@ static void atmel_sha_dma_callback(void *data)
 {
 	struct atmel_sha_dev *dd = data;
 
+	dd->is_async = true;
+
 	/* dma_lch_in - completed - wait DATRDY */
 	atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
 }
@@ -478,7 +644,7 @@ static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	struct dma_async_tx_descriptor	*in_desc;
 	struct scatterlist sg[2];
 
-	dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+	dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %zd, final: %d\n",
 		ctx->digcnt[1], ctx->digcnt[0], length1, final);
 
 	dd->dma_lch_in.dma_conf.src_maxburst = 16;
@@ -502,7 +668,7 @@ static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	}
 	if (!in_desc)
-		return -EINVAL;
+		return atmel_sha_complete(dd, -EINVAL);
 
 	in_desc->callback = atmel_sha_dma_callback;
 	in_desc->callback_param = dd;
@@ -557,9 +723,9 @@ static int atmel_sha_xmit_dma_map(struct atmel_sha_dev *dd,
 	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
 				ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
 	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
-		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen +
+		dev_err(dd->dev, "dma %zu bytes error\n", ctx->buflen +
 				ctx->block_size);
-		return -EINVAL;
+		return atmel_sha_complete(dd, -EINVAL);
 	}
 
 	ctx->flags &= ~SHA_FLAGS_SG;
@@ -578,7 +744,7 @@ static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
 
 	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
 
-	dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n",
+	dev_dbg(dd->dev, "slow: bufcnt: %zu, digcnt: 0x%llx 0x%llx, final: %d\n",
 		 ctx->bufcnt, ctx->digcnt[1], ctx->digcnt[0], final);
 
 	if (final)
@@ -606,7 +772,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 	if (ctx->bufcnt || ctx->offset)
 		return atmel_sha_update_dma_slow(dd);
 
-	dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n",
+	dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %zd, total: %u\n",
 		ctx->digcnt[1], ctx->digcnt[0], ctx->bufcnt, ctx->total);
 
 	sg = ctx->sg;
@@ -648,9 +814,9 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 		ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
 			ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
 		if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
-			dev_err(dd->dev, "dma %u bytes error\n",
+			dev_err(dd->dev, "dma %zu bytes error\n",
 				ctx->buflen + ctx->block_size);
-			return -EINVAL;
+			return atmel_sha_complete(dd, -EINVAL);
 		}
 
 		if (length == 0) {
@@ -664,7 +830,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 			if (!dma_map_sg(dd->dev, ctx->sg, 1,
 				DMA_TO_DEVICE)) {
 					dev_err(dd->dev, "dma_map_sg  error\n");
-					return -EINVAL;
+					return atmel_sha_complete(dd, -EINVAL);
 			}
 
 			ctx->flags |= SHA_FLAGS_SG;
@@ -678,7 +844,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 
 	if (!dma_map_sg(dd->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
 		dev_err(dd->dev, "dma_map_sg  error\n");
-		return -EINVAL;
+		return atmel_sha_complete(dd, -EINVAL);
 	}
 
 	ctx->flags |= SHA_FLAGS_SG;
@@ -796,16 +962,28 @@ static void atmel_sha_copy_ready_hash(struct ahash_request *req)
 	if (!req->result)
 		return;
 
-	if (ctx->flags & SHA_FLAGS_SHA1)
+	switch (ctx->flags & SHA_FLAGS_ALGO_MASK) {
+	default:
+	case SHA_FLAGS_SHA1:
 		memcpy(req->result, ctx->digest, SHA1_DIGEST_SIZE);
-	else if (ctx->flags & SHA_FLAGS_SHA224)
+		break;
+
+	case SHA_FLAGS_SHA224:
 		memcpy(req->result, ctx->digest, SHA224_DIGEST_SIZE);
-	else if (ctx->flags & SHA_FLAGS_SHA256)
+		break;
+
+	case SHA_FLAGS_SHA256:
 		memcpy(req->result, ctx->digest, SHA256_DIGEST_SIZE);
-	else if (ctx->flags & SHA_FLAGS_SHA384)
+		break;
+
+	case SHA_FLAGS_SHA384:
 		memcpy(req->result, ctx->digest, SHA384_DIGEST_SIZE);
-	else
+		break;
+
+	case SHA_FLAGS_SHA512:
 		memcpy(req->result, ctx->digest, SHA512_DIGEST_SIZE);
+		break;
+	}
 }
 
 static int atmel_sha_finish(struct ahash_request *req)
@@ -816,7 +994,7 @@ static int atmel_sha_finish(struct ahash_request *req)
 	if (ctx->digcnt[0] || ctx->digcnt[1])
 		atmel_sha_copy_ready_hash(req);
 
-	dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1],
+	dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %zd\n", ctx->digcnt[1],
 		ctx->digcnt[0], ctx->bufcnt);
 
 	return 0;
@@ -836,16 +1014,7 @@ static void atmel_sha_finish_req(struct ahash_request *req, int err)
 	}
 
 	/* atomic operation is not needed here */
-	dd->flags &= ~(SHA_FLAGS_BUSY | SHA_FLAGS_FINAL | SHA_FLAGS_CPU |
-			SHA_FLAGS_DMA_READY | SHA_FLAGS_OUTPUT_READY);
-
-	clk_disable(dd->iclk);
-
-	if (req->base.complete)
-		req->base.complete(&req->base, err);
-
-	/* handle new request */
-	tasklet_schedule(&dd->queue_task);
+	(void)atmel_sha_complete(dd, err);
 }
 
 static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
@@ -886,8 +1055,9 @@ static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
 				  struct ahash_request *req)
 {
 	struct crypto_async_request *async_req, *backlog;
-	struct atmel_sha_reqctx *ctx;
+	struct atmel_sha_ctx *ctx;
 	unsigned long flags;
+	bool start_async;
 	int err = 0, ret = 0;
 
 	spin_lock_irqsave(&dd->lock, flags);
@@ -912,35 +1082,69 @@ static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
 	if (backlog)
 		backlog->complete(backlog, -EINPROGRESS);
 
-	req = ahash_request_cast(async_req);
-	dd->req = req;
-	ctx = ahash_request_ctx(req);
+	ctx = crypto_tfm_ctx(async_req->tfm);
+
+	dd->req = ahash_request_cast(async_req);
+	start_async = (dd->req != req);
+	dd->is_async = start_async;
+	dd->force_complete = false;
+
+	/* WARNING: ctx->start() MAY change dd->is_async. */
+	err = ctx->start(dd);
+	return (start_async) ? ret : err;
+}
+
+static int atmel_sha_done(struct atmel_sha_dev *dd);
+
+static int atmel_sha_start(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	int err;
 
 	dev_dbg(dd->dev, "handling new req, op: %lu, nbytes: %d\n",
 						ctx->op, req->nbytes);
 
 	err = atmel_sha_hw_init(dd);
-
 	if (err)
-		goto err1;
+		return atmel_sha_complete(dd, err);
 
+	/*
+	 * atmel_sha_update_req() and atmel_sha_final_req() can return either:
+	 *  -EINPROGRESS: the hardware is busy and the SHA driver will resume
+	 *                its job later in the done_task.
+	 *                This is the main path.
+	 *
+	 * 0: the SHA driver can continue its job then release the hardware
+	 *    later, if needed, with atmel_sha_finish_req().
+	 *    This is the alternate path.
+	 *
+	 * < 0: an error has occurred so atmel_sha_complete(dd, err) has already
+	 *      been called, hence the hardware has been released.
+	 *      The SHA driver must stop its job without calling
+	 *      atmel_sha_finish_req(), otherwise atmel_sha_complete() would be
+	 *      called a second time.
+	 *
+	 * Please note that currently, atmel_sha_final_req() never returns 0.
+	 */
+
+	dd->resume = atmel_sha_done;
 	if (ctx->op == SHA_OP_UPDATE) {
 		err = atmel_sha_update_req(dd);
-		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP))
+		if (!err && (ctx->flags & SHA_FLAGS_FINUP))
 			/* no final() after finup() */
 			err = atmel_sha_final_req(dd);
 	} else if (ctx->op == SHA_OP_FINAL) {
 		err = atmel_sha_final_req(dd);
 	}
 
-err1:
-	if (err != -EINPROGRESS)
+	if (!err)
 		/* done_task will not finish it, so do it here */
 		atmel_sha_finish_req(req, err);
 
 	dev_dbg(dd->dev, "exit, err: %d\n", err);
 
-	return ret;
+	return err;
 }
 
 static int atmel_sha_enqueue(struct ahash_request *req, unsigned int op)
@@ -1036,8 +1240,11 @@ static int atmel_sha_import(struct ahash_request *req, const void *in)
 
 static int atmel_sha_cra_init(struct crypto_tfm *tfm)
 {
+	struct atmel_sha_ctx *ctx = crypto_tfm_ctx(tfm);
+
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct atmel_sha_reqctx));
+	ctx->start = atmel_sha_start;
 
 	return 0;
 }
@@ -1176,9 +1383,8 @@ static void atmel_sha_queue_task(unsigned long data)
 	atmel_sha_handle_queue(dd, NULL);
 }
 
-static void atmel_sha_done_task(unsigned long data)
+static int atmel_sha_done(struct atmel_sha_dev *dd)
 {
-	struct atmel_sha_dev *dd = (struct atmel_sha_dev *)data;
 	int err = 0;
 
 	if (SHA_FLAGS_CPU & dd->flags) {
@@ -1204,11 +1410,21 @@ static void atmel_sha_done_task(unsigned long data)
 				goto finish;
 		}
 	}
-	return;
+	return err;
 
 finish:
 	/* finish curent request */
 	atmel_sha_finish_req(dd->req, err);
+
+	return err;
+}
+
+static void atmel_sha_done_task(unsigned long data)
+{
+	struct atmel_sha_dev *dd = (struct atmel_sha_dev *)data;
+
+	dd->is_async = true;
+	(void)dd->resume(dd);
 }
 
 static irqreturn_t atmel_sha_irq(int irq, void *dev_id)
@@ -1233,10 +1449,1104 @@ static irqreturn_t atmel_sha_irq(int irq, void *dev_id)
 	return IRQ_NONE;
 }
 
+
+/* DMA transfer functions */
+
+static bool atmel_sha_dma_check_aligned(struct atmel_sha_dev *dd,
+					struct scatterlist *sg,
+					size_t len)
+{
+	struct atmel_sha_dma *dma = &dd->dma_lch_in;
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	size_t bs = ctx->block_size;
+	int nents;
+
+	for (nents = 0; sg; sg = sg_next(sg), ++nents) {
+		if (!IS_ALIGNED(sg->offset, sizeof(u32)))
+			return false;
+
+		/*
+		 * This is the last sg, the only one that is allowed to
+		 * have an unaligned length.
+		 */
+		if (len <= sg->length) {
+			dma->nents = nents + 1;
+			dma->last_sg_length = sg->length;
+			sg->length = ALIGN(len, sizeof(u32));
+			return true;
+		}
+
+		/* All other sg lengths MUST be aligned to the block size. */
+		if (!IS_ALIGNED(sg->length, bs))
+			return false;
+
+		len -= sg->length;
+	}
+
+	return false;
+}
+
+static void atmel_sha_dma_callback2(void *data)
+{
+	struct atmel_sha_dev *dd = data;
+	struct atmel_sha_dma *dma = &dd->dma_lch_in;
+	struct scatterlist *sg;
+	int nents;
+
+	dmaengine_terminate_all(dma->chan);
+	dma_unmap_sg(dd->dev, dma->sg, dma->nents, DMA_TO_DEVICE);
+
+	sg = dma->sg;
+	for (nents = 0; nents < dma->nents - 1; ++nents)
+		sg = sg_next(sg);
+	sg->length = dma->last_sg_length;
+
+	dd->is_async = true;
+	(void)atmel_sha_wait_for_data_ready(dd, dd->resume);
+}
+
+static int atmel_sha_dma_start(struct atmel_sha_dev *dd,
+			       struct scatterlist *src,
+			       size_t len,
+			       atmel_sha_fn_t resume)
+{
+	struct atmel_sha_dma *dma = &dd->dma_lch_in;
+	struct dma_slave_config *config = &dma->dma_conf;
+	struct dma_chan *chan = dma->chan;
+	struct dma_async_tx_descriptor *desc;
+	dma_cookie_t cookie;
+	unsigned int sg_len;
+	int err;
+
+	dd->resume = resume;
+
+	/*
+	 * dma->nents has already been initialized by
+	 * atmel_sha_dma_check_aligned().
+	 */
+	dma->sg = src;
+	sg_len = dma_map_sg(dd->dev, dma->sg, dma->nents, DMA_TO_DEVICE);
+	if (!sg_len) {
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	config->src_maxburst = 16;
+	config->dst_maxburst = 16;
+	err = dmaengine_slave_config(chan, config);
+	if (err)
+		goto unmap_sg;
+
+	desc = dmaengine_prep_slave_sg(chan, dma->sg, sg_len, DMA_MEM_TO_DEV,
+				       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!desc) {
+		err = -ENOMEM;
+		goto unmap_sg;
+	}
+
+	desc->callback = atmel_sha_dma_callback2;
+	desc->callback_param = dd;
+	cookie = dmaengine_submit(desc);
+	err = dma_submit_error(cookie);
+	if (err)
+		goto unmap_sg;
+
+	dma_async_issue_pending(chan);
+
+	return -EINPROGRESS;
+
+unmap_sg:
+	dma_unmap_sg(dd->dev, dma->sg, dma->nents, DMA_TO_DEVICE);
+exit:
+	return atmel_sha_complete(dd, err);
+}
+
+
+/* CPU transfer functions */
+
+static int atmel_sha_cpu_transfer(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	const u32 *words = (const u32 *)ctx->buffer;
+	size_t i, num_words;
+	u32 isr, din, din_inc;
+
+	din_inc = (ctx->flags & SHA_FLAGS_IDATAR0) ? 0 : 1;
+	for (;;) {
+		/* Write data into the Input Data Registers. */
+		num_words = DIV_ROUND_UP(ctx->bufcnt, sizeof(u32));
+		for (i = 0, din = 0; i < num_words; ++i, din += din_inc)
+			atmel_sha_write(dd, SHA_REG_DIN(din), words[i]);
+
+		ctx->offset += ctx->bufcnt;
+		ctx->total -= ctx->bufcnt;
+
+		if (!ctx->total)
+			break;
+
+		/*
+		 * Prepare next block:
+		 * Fill ctx->buffer now with the next data to be written into
+		 * IDATARx: it gives time for the SHA hardware to process
+		 * the current data so the SHA_INT_DATARDY flag might be set
+		 * in SHA_ISR when polling this register at the beginning of
+		 * the next loop.
+		 */
+		ctx->bufcnt = min_t(size_t, ctx->block_size, ctx->total);
+		scatterwalk_map_and_copy(ctx->buffer, ctx->sg,
+					 ctx->offset, ctx->bufcnt, 0);
+
+		/* Wait for hardware to be ready again. */
+		isr = atmel_sha_read(dd, SHA_ISR);
+		if (!(isr & SHA_INT_DATARDY)) {
+			/* Not ready yet. */
+			dd->resume = atmel_sha_cpu_transfer;
+			atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
+			return -EINPROGRESS;
+		}
+	}
+
+	if (unlikely(!(ctx->flags & SHA_FLAGS_WAIT_DATARDY)))
+		return dd->cpu_transfer_complete(dd);
+
+	return atmel_sha_wait_for_data_ready(dd, dd->cpu_transfer_complete);
+}
+
+static int atmel_sha_cpu_start(struct atmel_sha_dev *dd,
+			       struct scatterlist *sg,
+			       unsigned int len,
+			       bool idatar0_only,
+			       bool wait_data_ready,
+			       atmel_sha_fn_t resume)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	if (!len)
+		return resume(dd);
+
+	ctx->flags &= ~(SHA_FLAGS_IDATAR0 | SHA_FLAGS_WAIT_DATARDY);
+
+	if (idatar0_only)
+		ctx->flags |= SHA_FLAGS_IDATAR0;
+
+	if (wait_data_ready)
+		ctx->flags |= SHA_FLAGS_WAIT_DATARDY;
+
+	ctx->sg = sg;
+	ctx->total = len;
+	ctx->offset = 0;
+
+	/* Prepare the first block to be written. */
+	ctx->bufcnt = min_t(size_t, ctx->block_size, ctx->total);
+	scatterwalk_map_and_copy(ctx->buffer, ctx->sg,
+				 ctx->offset, ctx->bufcnt, 0);
+
+	dd->cpu_transfer_complete = resume;
+	return atmel_sha_cpu_transfer(dd);
+}
+
+static int atmel_sha_cpu_hash(struct atmel_sha_dev *dd,
+			      const void *data, unsigned int datalen,
+			      bool auto_padding,
+			      atmel_sha_fn_t resume)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	u32 msglen = (auto_padding) ? datalen : 0;
+	u32 mr = SHA_MR_MODE_AUTO;
+
+	if (!(IS_ALIGNED(datalen, ctx->block_size) || auto_padding))
+		return atmel_sha_complete(dd, -EINVAL);
+
+	mr |= (ctx->flags & SHA_FLAGS_ALGO_MASK);
+	atmel_sha_write(dd, SHA_MR, mr);
+	atmel_sha_write(dd, SHA_MSR, msglen);
+	atmel_sha_write(dd, SHA_BCR, msglen);
+	atmel_sha_write(dd, SHA_CR, SHA_CR_FIRST);
+
+	sg_init_one(&dd->tmp, data, datalen);
+	return atmel_sha_cpu_start(dd, &dd->tmp, datalen, false, true, resume);
+}
+
+
+/* hmac functions */
+
+struct atmel_sha_hmac_key {
+	bool			valid;
+	unsigned int		keylen;
+	u8			buffer[SHA512_BLOCK_SIZE];
+	u8			*keydup;
+};
+
+static inline void atmel_sha_hmac_key_init(struct atmel_sha_hmac_key *hkey)
+{
+	memset(hkey, 0, sizeof(*hkey));
+}
+
+static inline void atmel_sha_hmac_key_release(struct atmel_sha_hmac_key *hkey)
+{
+	kfree(hkey->keydup);
+	memset(hkey, 0, sizeof(*hkey));
+}
+
+static inline int atmel_sha_hmac_key_set(struct atmel_sha_hmac_key *hkey,
+					 const u8 *key,
+					 unsigned int keylen)
+{
+	atmel_sha_hmac_key_release(hkey);
+
+	if (keylen > sizeof(hkey->buffer)) {
+		hkey->keydup = kmemdup(key, keylen, GFP_KERNEL);
+		if (!hkey->keydup)
+			return -ENOMEM;
+
+	} else {
+		memcpy(hkey->buffer, key, keylen);
+	}
+
+	hkey->valid = true;
+	hkey->keylen = keylen;
+	return 0;
+}
+
+static inline bool atmel_sha_hmac_key_get(const struct atmel_sha_hmac_key *hkey,
+					  const u8 **key,
+					  unsigned int *keylen)
+{
+	if (!hkey->valid)
+		return false;
+
+	*keylen = hkey->keylen;
+	*key = (hkey->keydup) ? hkey->keydup : hkey->buffer;
+	return true;
+}
+
+
+struct atmel_sha_hmac_ctx {
+	struct atmel_sha_ctx	base;
+
+	struct atmel_sha_hmac_key	hkey;
+	u32			ipad[SHA512_BLOCK_SIZE / sizeof(u32)];
+	u32			opad[SHA512_BLOCK_SIZE / sizeof(u32)];
+	atmel_sha_fn_t		resume;
+};
+
+static int atmel_sha_hmac_setup(struct atmel_sha_dev *dd,
+				atmel_sha_fn_t resume);
+static int atmel_sha_hmac_prehash_key(struct atmel_sha_dev *dd,
+				      const u8 *key, unsigned int keylen);
+static int atmel_sha_hmac_prehash_key_done(struct atmel_sha_dev *dd);
+static int atmel_sha_hmac_compute_ipad_hash(struct atmel_sha_dev *dd);
+static int atmel_sha_hmac_compute_opad_hash(struct atmel_sha_dev *dd);
+static int atmel_sha_hmac_setup_done(struct atmel_sha_dev *dd);
+
+static int atmel_sha_hmac_init_done(struct atmel_sha_dev *dd);
+static int atmel_sha_hmac_final(struct atmel_sha_dev *dd);
+static int atmel_sha_hmac_final_done(struct atmel_sha_dev *dd);
+static int atmel_sha_hmac_digest2(struct atmel_sha_dev *dd);
+
+static int atmel_sha_hmac_setup(struct atmel_sha_dev *dd,
+				atmel_sha_fn_t resume)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	unsigned int keylen;
+	const u8 *key;
+	size_t bs;
+
+	hmac->resume = resume;
+	switch (ctx->flags & SHA_FLAGS_ALGO_MASK) {
+	case SHA_FLAGS_SHA1:
+		ctx->block_size = SHA1_BLOCK_SIZE;
+		ctx->hash_size = SHA1_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA224:
+		ctx->block_size = SHA224_BLOCK_SIZE;
+		ctx->hash_size = SHA256_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA256:
+		ctx->block_size = SHA256_BLOCK_SIZE;
+		ctx->hash_size = SHA256_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA384:
+		ctx->block_size = SHA384_BLOCK_SIZE;
+		ctx->hash_size = SHA512_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA512:
+		ctx->block_size = SHA512_BLOCK_SIZE;
+		ctx->hash_size = SHA512_DIGEST_SIZE;
+		break;
+
+	default:
+		return atmel_sha_complete(dd, -EINVAL);
+	}
+	bs = ctx->block_size;
+
+	if (likely(!atmel_sha_hmac_key_get(&hmac->hkey, &key, &keylen)))
+		return resume(dd);
+
+	/* Compute K' from K. */
+	if (unlikely(keylen > bs))
+		return atmel_sha_hmac_prehash_key(dd, key, keylen);
+
+	/* Prepare ipad. */
+	memcpy((u8 *)hmac->ipad, key, keylen);
+	memset((u8 *)hmac->ipad + keylen, 0, bs - keylen);
+	return atmel_sha_hmac_compute_ipad_hash(dd);
+}
+
+static int atmel_sha_hmac_prehash_key(struct atmel_sha_dev *dd,
+				      const u8 *key, unsigned int keylen)
+{
+	return atmel_sha_cpu_hash(dd, key, keylen, true,
+				  atmel_sha_hmac_prehash_key_done);
+}
+
+static int atmel_sha_hmac_prehash_key_done(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	size_t ds = crypto_ahash_digestsize(tfm);
+	size_t bs = ctx->block_size;
+	size_t i, num_words = ds / sizeof(u32);
+
+	/* Prepare ipad. */
+	for (i = 0; i < num_words; ++i)
+		hmac->ipad[i] = atmel_sha_read(dd, SHA_REG_DIGEST(i));
+	memset((u8 *)hmac->ipad + ds, 0, bs - ds);
+	return atmel_sha_hmac_compute_ipad_hash(dd);
+}
+
+static int atmel_sha_hmac_compute_ipad_hash(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	size_t bs = ctx->block_size;
+	size_t i, num_words = bs / sizeof(u32);
+
+	memcpy(hmac->opad, hmac->ipad, bs);
+	for (i = 0; i < num_words; ++i) {
+		hmac->ipad[i] ^= 0x36363636;
+		hmac->opad[i] ^= 0x5c5c5c5c;
+	}
+
+	return atmel_sha_cpu_hash(dd, hmac->ipad, bs, false,
+				  atmel_sha_hmac_compute_opad_hash);
+}
+
+static int atmel_sha_hmac_compute_opad_hash(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	size_t bs = ctx->block_size;
+	size_t hs = ctx->hash_size;
+	size_t i, num_words = hs / sizeof(u32);
+
+	for (i = 0; i < num_words; ++i)
+		hmac->ipad[i] = atmel_sha_read(dd, SHA_REG_DIGEST(i));
+	return atmel_sha_cpu_hash(dd, hmac->opad, bs, false,
+				  atmel_sha_hmac_setup_done);
+}
+
+static int atmel_sha_hmac_setup_done(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	size_t hs = ctx->hash_size;
+	size_t i, num_words = hs / sizeof(u32);
+
+	for (i = 0; i < num_words; ++i)
+		hmac->opad[i] = atmel_sha_read(dd, SHA_REG_DIGEST(i));
+	atmel_sha_hmac_key_release(&hmac->hkey);
+	return hmac->resume(dd);
+}
+
+static int atmel_sha_hmac_start(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	int err;
+
+	err = atmel_sha_hw_init(dd);
+	if (err)
+		return atmel_sha_complete(dd, err);
+
+	switch (ctx->op) {
+	case SHA_OP_INIT:
+		err = atmel_sha_hmac_setup(dd, atmel_sha_hmac_init_done);
+		break;
+
+	case SHA_OP_UPDATE:
+		dd->resume = atmel_sha_done;
+		err = atmel_sha_update_req(dd);
+		break;
+
+	case SHA_OP_FINAL:
+		dd->resume = atmel_sha_hmac_final;
+		err = atmel_sha_final_req(dd);
+		break;
+
+	case SHA_OP_DIGEST:
+		err = atmel_sha_hmac_setup(dd, atmel_sha_hmac_digest2);
+		break;
+
+	default:
+		return atmel_sha_complete(dd, -EINVAL);
+	}
+
+	return err;
+}
+
+static int atmel_sha_hmac_setkey(struct crypto_ahash *tfm, const u8 *key,
+				 unsigned int keylen)
+{
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+
+	if (atmel_sha_hmac_key_set(&hmac->hkey, key, keylen)) {
+		crypto_ahash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int atmel_sha_hmac_init(struct ahash_request *req)
+{
+	int err;
+
+	err = atmel_sha_init(req);
+	if (err)
+		return err;
+
+	return atmel_sha_enqueue(req, SHA_OP_INIT);
+}
+
+static int atmel_sha_hmac_init_done(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	size_t bs = ctx->block_size;
+	size_t hs = ctx->hash_size;
+
+	ctx->bufcnt = 0;
+	ctx->digcnt[0] = bs;
+	ctx->digcnt[1] = 0;
+	ctx->flags |= SHA_FLAGS_RESTORE;
+	memcpy(ctx->digest, hmac->ipad, hs);
+	return atmel_sha_complete(dd, 0);
+}
+
+static int atmel_sha_hmac_final(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	u32 *digest = (u32 *)ctx->digest;
+	size_t ds = crypto_ahash_digestsize(tfm);
+	size_t bs = ctx->block_size;
+	size_t hs = ctx->hash_size;
+	size_t i, num_words;
+	u32 mr;
+
+	/* Save d = SHA((K' + ipad) | msg). */
+	num_words = ds / sizeof(u32);
+	for (i = 0; i < num_words; ++i)
+		digest[i] = atmel_sha_read(dd, SHA_REG_DIGEST(i));
+
+	/* Restore context to finish computing SHA((K' + opad) | d). */
+	atmel_sha_write(dd, SHA_CR, SHA_CR_WUIHV);
+	num_words = hs / sizeof(u32);
+	for (i = 0; i < num_words; ++i)
+		atmel_sha_write(dd, SHA_REG_DIN(i), hmac->opad[i]);
+
+	mr = SHA_MR_MODE_AUTO | SHA_MR_UIHV;
+	mr |= (ctx->flags & SHA_FLAGS_ALGO_MASK);
+	atmel_sha_write(dd, SHA_MR, mr);
+	atmel_sha_write(dd, SHA_MSR, bs + ds);
+	atmel_sha_write(dd, SHA_BCR, ds);
+	atmel_sha_write(dd, SHA_CR, SHA_CR_FIRST);
+
+	sg_init_one(&dd->tmp, digest, ds);
+	return atmel_sha_cpu_start(dd, &dd->tmp, ds, false, true,
+				   atmel_sha_hmac_final_done);
+}
+
+static int atmel_sha_hmac_final_done(struct atmel_sha_dev *dd)
+{
+	/*
+	 * req->result might not be sizeof(u32) aligned, so copy the
+	 * digest into ctx->digest[] before memcpy() the data into
+	 * req->result.
+	 */
+	atmel_sha_copy_hash(dd->req);
+	atmel_sha_copy_ready_hash(dd->req);
+	return atmel_sha_complete(dd, 0);
+}
+
+static int atmel_sha_hmac_digest(struct ahash_request *req)
+{
+	int err;
+
+	err = atmel_sha_init(req);
+	if (err)
+		return err;
+
+	return atmel_sha_enqueue(req, SHA_OP_DIGEST);
+}
+
+static int atmel_sha_hmac_digest2(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	size_t hs = ctx->hash_size;
+	size_t i, num_words = hs / sizeof(u32);
+	bool use_dma = false;
+	u32 mr;
+
+	/* Special case for empty message. */
+	if (!req->nbytes)
+		return atmel_sha_complete(dd, -EINVAL); // TODO:
+
+	/* Check DMA threshold and alignment. */
+	if (req->nbytes > ATMEL_SHA_DMA_THRESHOLD &&
+	    atmel_sha_dma_check_aligned(dd, req->src, req->nbytes))
+		use_dma = true;
+
+	/* Write both initial hash values to compute a HMAC. */
+	atmel_sha_write(dd, SHA_CR, SHA_CR_WUIHV);
+	for (i = 0; i < num_words; ++i)
+		atmel_sha_write(dd, SHA_REG_DIN(i), hmac->ipad[i]);
+
+	atmel_sha_write(dd, SHA_CR, SHA_CR_WUIEHV);
+	for (i = 0; i < num_words; ++i)
+		atmel_sha_write(dd, SHA_REG_DIN(i), hmac->opad[i]);
+
+	/* Write the Mode, Message Size, Bytes Count then Control Registers. */
+	mr = (SHA_MR_HMAC | SHA_MR_DUALBUFF);
+	mr |= ctx->flags & SHA_FLAGS_ALGO_MASK;
+	if (use_dma)
+		mr |= SHA_MR_MODE_IDATAR0;
+	else
+		mr |= SHA_MR_MODE_AUTO;
+	atmel_sha_write(dd, SHA_MR, mr);
+
+	atmel_sha_write(dd, SHA_MSR, req->nbytes);
+	atmel_sha_write(dd, SHA_BCR, req->nbytes);
+
+	atmel_sha_write(dd, SHA_CR, SHA_CR_FIRST);
+
+	/* Process data. */
+	if (use_dma)
+		return atmel_sha_dma_start(dd, req->src, req->nbytes,
+					   atmel_sha_hmac_final_done);
+
+	return atmel_sha_cpu_start(dd, req->src, req->nbytes, false, true,
+				   atmel_sha_hmac_final_done);
+}
+
+static int atmel_sha_hmac_cra_init(struct crypto_tfm *tfm)
+{
+	struct atmel_sha_hmac_ctx *hmac = crypto_tfm_ctx(tfm);
+
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct atmel_sha_reqctx));
+	hmac->base.start = atmel_sha_hmac_start;
+	atmel_sha_hmac_key_init(&hmac->hkey);
+
+	return 0;
+}
+
+static void atmel_sha_hmac_cra_exit(struct crypto_tfm *tfm)
+{
+	struct atmel_sha_hmac_ctx *hmac = crypto_tfm_ctx(tfm);
+
+	atmel_sha_hmac_key_release(&hmac->hkey);
+}
+
+static struct ahash_alg sha_hmac_algs[] = {
+{
+	.init		= atmel_sha_hmac_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.digest		= atmel_sha_hmac_digest,
+	.setkey		= atmel_sha_hmac_setkey,
+	.export		= atmel_sha_export,
+	.import		= atmel_sha_import,
+	.halg = {
+		.digestsize	= SHA1_DIGEST_SIZE,
+		.statesize	= sizeof(struct atmel_sha_reqctx),
+		.base	= {
+			.cra_name		= "hmac(sha1)",
+			.cra_driver_name	= "atmel-hmac-sha1",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= SHA1_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_hmac_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_hmac_cra_init,
+			.cra_exit		= atmel_sha_hmac_cra_exit,
+		}
+	}
+},
+{
+	.init		= atmel_sha_hmac_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.digest		= atmel_sha_hmac_digest,
+	.setkey		= atmel_sha_hmac_setkey,
+	.export		= atmel_sha_export,
+	.import		= atmel_sha_import,
+	.halg = {
+		.digestsize	= SHA224_DIGEST_SIZE,
+		.statesize	= sizeof(struct atmel_sha_reqctx),
+		.base	= {
+			.cra_name		= "hmac(sha224)",
+			.cra_driver_name	= "atmel-hmac-sha224",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= SHA224_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_hmac_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_hmac_cra_init,
+			.cra_exit		= atmel_sha_hmac_cra_exit,
+		}
+	}
+},
+{
+	.init		= atmel_sha_hmac_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.digest		= atmel_sha_hmac_digest,
+	.setkey		= atmel_sha_hmac_setkey,
+	.export		= atmel_sha_export,
+	.import		= atmel_sha_import,
+	.halg = {
+		.digestsize	= SHA256_DIGEST_SIZE,
+		.statesize	= sizeof(struct atmel_sha_reqctx),
+		.base	= {
+			.cra_name		= "hmac(sha256)",
+			.cra_driver_name	= "atmel-hmac-sha256",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= SHA256_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_hmac_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_hmac_cra_init,
+			.cra_exit		= atmel_sha_hmac_cra_exit,
+		}
+	}
+},
+{
+	.init		= atmel_sha_hmac_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.digest		= atmel_sha_hmac_digest,
+	.setkey		= atmel_sha_hmac_setkey,
+	.export		= atmel_sha_export,
+	.import		= atmel_sha_import,
+	.halg = {
+		.digestsize	= SHA384_DIGEST_SIZE,
+		.statesize	= sizeof(struct atmel_sha_reqctx),
+		.base	= {
+			.cra_name		= "hmac(sha384)",
+			.cra_driver_name	= "atmel-hmac-sha384",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= SHA384_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_hmac_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_hmac_cra_init,
+			.cra_exit		= atmel_sha_hmac_cra_exit,
+		}
+	}
+},
+{
+	.init		= atmel_sha_hmac_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.digest		= atmel_sha_hmac_digest,
+	.setkey		= atmel_sha_hmac_setkey,
+	.export		= atmel_sha_export,
+	.import		= atmel_sha_import,
+	.halg = {
+		.digestsize	= SHA512_DIGEST_SIZE,
+		.statesize	= sizeof(struct atmel_sha_reqctx),
+		.base	= {
+			.cra_name		= "hmac(sha512)",
+			.cra_driver_name	= "atmel-hmac-sha512",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC,
+			.cra_blocksize		= SHA512_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_hmac_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_hmac_cra_init,
+			.cra_exit		= atmel_sha_hmac_cra_exit,
+		}
+	}
+},
+};
+
+#ifdef CONFIG_CRYPTO_DEV_ATMEL_AUTHENC
+/* authenc functions */
+
+static int atmel_sha_authenc_init2(struct atmel_sha_dev *dd);
+static int atmel_sha_authenc_init_done(struct atmel_sha_dev *dd);
+static int atmel_sha_authenc_final_done(struct atmel_sha_dev *dd);
+
+
+struct atmel_sha_authenc_ctx {
+	struct crypto_ahash	*tfm;
+};
+
+struct atmel_sha_authenc_reqctx {
+	struct atmel_sha_reqctx	base;
+
+	atmel_aes_authenc_fn_t	cb;
+	struct atmel_aes_dev	*aes_dev;
+
+	/* _init() parameters. */
+	struct scatterlist	*assoc;
+	u32			assoclen;
+	u32			textlen;
+
+	/* _final() parameters. */
+	u32			*digest;
+	unsigned int		digestlen;
+};
+
+static void atmel_sha_authenc_complete(struct crypto_async_request *areq,
+				       int err)
+{
+	struct ahash_request *req = areq->data;
+	struct atmel_sha_authenc_reqctx *authctx  = ahash_request_ctx(req);
+
+	authctx->cb(authctx->aes_dev, err, authctx->base.dd->is_async);
+}
+
+static int atmel_sha_authenc_start(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	int err;
+
+	/*
+	 * Force atmel_sha_complete() to call req->base.complete(), ie
+	 * atmel_sha_authenc_complete(), which in turn calls authctx->cb().
+	 */
+	dd->force_complete = true;
+
+	err = atmel_sha_hw_init(dd);
+	return authctx->cb(authctx->aes_dev, err, dd->is_async);
+}
+
+bool atmel_sha_authenc_is_ready(void)
+{
+	struct atmel_sha_ctx dummy;
+
+	dummy.dd = NULL;
+	return (atmel_sha_find_dev(&dummy) != NULL);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_is_ready);
+
+unsigned int atmel_sha_authenc_get_reqsize(void)
+{
+	return sizeof(struct atmel_sha_authenc_reqctx);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_get_reqsize);
+
+struct atmel_sha_authenc_ctx *atmel_sha_authenc_spawn(unsigned long mode)
+{
+	struct atmel_sha_authenc_ctx *auth;
+	struct crypto_ahash *tfm;
+	struct atmel_sha_ctx *tctx;
+	const char *name;
+	int err = -EINVAL;
+
+	switch (mode & SHA_FLAGS_MODE_MASK) {
+	case SHA_FLAGS_HMAC_SHA1:
+		name = "atmel-hmac-sha1";
+		break;
+
+	case SHA_FLAGS_HMAC_SHA224:
+		name = "atmel-hmac-sha224";
+		break;
+
+	case SHA_FLAGS_HMAC_SHA256:
+		name = "atmel-hmac-sha256";
+		break;
+
+	case SHA_FLAGS_HMAC_SHA384:
+		name = "atmel-hmac-sha384";
+		break;
+
+	case SHA_FLAGS_HMAC_SHA512:
+		name = "atmel-hmac-sha512";
+		break;
+
+	default:
+		goto error;
+	}
+
+	tfm = crypto_alloc_ahash(name,
+				 CRYPTO_ALG_TYPE_AHASH,
+				 CRYPTO_ALG_TYPE_AHASH_MASK);
+	if (IS_ERR(tfm)) {
+		err = PTR_ERR(tfm);
+		goto error;
+	}
+	tctx = crypto_ahash_ctx(tfm);
+	tctx->start = atmel_sha_authenc_start;
+	tctx->flags = mode;
+
+	auth = kzalloc(sizeof(*auth), GFP_KERNEL);
+	if (!auth) {
+		err = -ENOMEM;
+		goto err_free_ahash;
+	}
+	auth->tfm = tfm;
+
+	return auth;
+
+err_free_ahash:
+	crypto_free_ahash(tfm);
+error:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_spawn);
+
+void atmel_sha_authenc_free(struct atmel_sha_authenc_ctx *auth)
+{
+	if (auth)
+		crypto_free_ahash(auth->tfm);
+	kfree(auth);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_free);
+
+int atmel_sha_authenc_setkey(struct atmel_sha_authenc_ctx *auth,
+			     const u8 *key, unsigned int keylen,
+			     u32 *flags)
+{
+	struct crypto_ahash *tfm = auth->tfm;
+	int err;
+
+	crypto_ahash_clear_flags(tfm, CRYPTO_TFM_REQ_MASK);
+	crypto_ahash_set_flags(tfm, *flags & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ahash_setkey(tfm, key, keylen);
+	*flags = crypto_ahash_get_flags(tfm);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_setkey);
+
+int atmel_sha_authenc_schedule(struct ahash_request *req,
+			       struct atmel_sha_authenc_ctx *auth,
+			       atmel_aes_authenc_fn_t cb,
+			       struct atmel_aes_dev *aes_dev)
+{
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	struct atmel_sha_reqctx *ctx = &authctx->base;
+	struct crypto_ahash *tfm = auth->tfm;
+	struct atmel_sha_ctx *tctx = crypto_ahash_ctx(tfm);
+	struct atmel_sha_dev *dd;
+
+	/* Reset request context (MUST be done first). */
+	memset(authctx, 0, sizeof(*authctx));
+
+	/* Get SHA device. */
+	dd = atmel_sha_find_dev(tctx);
+	if (!dd)
+		return cb(aes_dev, -ENODEV, false);
+
+	/* Init request context. */
+	ctx->dd = dd;
+	ctx->buflen = SHA_BUFFER_LEN;
+	authctx->cb = cb;
+	authctx->aes_dev = aes_dev;
+	ahash_request_set_tfm(req, tfm);
+	ahash_request_set_callback(req, 0, atmel_sha_authenc_complete, req);
+
+	return atmel_sha_handle_queue(dd, req);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_schedule);
+
+int atmel_sha_authenc_init(struct ahash_request *req,
+			   struct scatterlist *assoc, unsigned int assoclen,
+			   unsigned int textlen,
+			   atmel_aes_authenc_fn_t cb,
+			   struct atmel_aes_dev *aes_dev)
+{
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	struct atmel_sha_reqctx *ctx = &authctx->base;
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	struct atmel_sha_dev *dd = ctx->dd;
+
+	if (unlikely(!IS_ALIGNED(assoclen, sizeof(u32))))
+		return atmel_sha_complete(dd, -EINVAL);
+
+	authctx->cb = cb;
+	authctx->aes_dev = aes_dev;
+	authctx->assoc = assoc;
+	authctx->assoclen = assoclen;
+	authctx->textlen = textlen;
+
+	ctx->flags = hmac->base.flags;
+	return atmel_sha_hmac_setup(dd, atmel_sha_authenc_init2);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_init);
+
+static int atmel_sha_authenc_init2(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	struct atmel_sha_reqctx *ctx = &authctx->base;
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct atmel_sha_hmac_ctx *hmac = crypto_ahash_ctx(tfm);
+	size_t hs = ctx->hash_size;
+	size_t i, num_words = hs / sizeof(u32);
+	u32 mr, msg_size;
+
+	atmel_sha_write(dd, SHA_CR, SHA_CR_WUIHV);
+	for (i = 0; i < num_words; ++i)
+		atmel_sha_write(dd, SHA_REG_DIN(i), hmac->ipad[i]);
+
+	atmel_sha_write(dd, SHA_CR, SHA_CR_WUIEHV);
+	for (i = 0; i < num_words; ++i)
+		atmel_sha_write(dd, SHA_REG_DIN(i), hmac->opad[i]);
+
+	mr = (SHA_MR_MODE_IDATAR0 |
+	      SHA_MR_HMAC |
+	      SHA_MR_DUALBUFF);
+	mr |= ctx->flags & SHA_FLAGS_ALGO_MASK;
+	atmel_sha_write(dd, SHA_MR, mr);
+
+	msg_size = authctx->assoclen + authctx->textlen;
+	atmel_sha_write(dd, SHA_MSR, msg_size);
+	atmel_sha_write(dd, SHA_BCR, msg_size);
+
+	atmel_sha_write(dd, SHA_CR, SHA_CR_FIRST);
+
+	/* Process assoc data. */
+	return atmel_sha_cpu_start(dd, authctx->assoc, authctx->assoclen,
+				   true, false,
+				   atmel_sha_authenc_init_done);
+}
+
+static int atmel_sha_authenc_init_done(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+
+	return authctx->cb(authctx->aes_dev, 0, dd->is_async);
+}
+
+int atmel_sha_authenc_final(struct ahash_request *req,
+			    u32 *digest, unsigned int digestlen,
+			    atmel_aes_authenc_fn_t cb,
+			    struct atmel_aes_dev *aes_dev)
+{
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	struct atmel_sha_reqctx *ctx = &authctx->base;
+	struct atmel_sha_dev *dd = ctx->dd;
+
+	switch (ctx->flags & SHA_FLAGS_ALGO_MASK) {
+	case SHA_FLAGS_SHA1:
+		authctx->digestlen = SHA1_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA224:
+		authctx->digestlen = SHA224_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA256:
+		authctx->digestlen = SHA256_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA384:
+		authctx->digestlen = SHA384_DIGEST_SIZE;
+		break;
+
+	case SHA_FLAGS_SHA512:
+		authctx->digestlen = SHA512_DIGEST_SIZE;
+		break;
+
+	default:
+		return atmel_sha_complete(dd, -EINVAL);
+	}
+	if (authctx->digestlen > digestlen)
+		authctx->digestlen = digestlen;
+
+	authctx->cb = cb;
+	authctx->aes_dev = aes_dev;
+	authctx->digest = digest;
+	return atmel_sha_wait_for_data_ready(dd,
+					     atmel_sha_authenc_final_done);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_final);
+
+static int atmel_sha_authenc_final_done(struct atmel_sha_dev *dd)
+{
+	struct ahash_request *req = dd->req;
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	size_t i, num_words = authctx->digestlen / sizeof(u32);
+
+	for (i = 0; i < num_words; ++i)
+		authctx->digest[i] = atmel_sha_read(dd, SHA_REG_DIGEST(i));
+
+	return atmel_sha_complete(dd, 0);
+}
+
+void atmel_sha_authenc_abort(struct ahash_request *req)
+{
+	struct atmel_sha_authenc_reqctx *authctx = ahash_request_ctx(req);
+	struct atmel_sha_reqctx *ctx = &authctx->base;
+	struct atmel_sha_dev *dd = ctx->dd;
+
+	/* Prevent atmel_sha_complete() from calling req->base.complete(). */
+	dd->is_async = false;
+	dd->force_complete = false;
+	(void)atmel_sha_complete(dd, 0);
+}
+EXPORT_SYMBOL_GPL(atmel_sha_authenc_abort);
+
+#endif /* CONFIG_CRYPTO_DEV_ATMEL_AUTHENC */
+
+
 static void atmel_sha_unregister_algs(struct atmel_sha_dev *dd)
 {
 	int i;
 
+	if (dd->caps.has_hmac)
+		for (i = 0; i < ARRAY_SIZE(sha_hmac_algs); i++)
+			crypto_unregister_ahash(&sha_hmac_algs[i]);
+
 	for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++)
 		crypto_unregister_ahash(&sha_1_256_algs[i]);
 
@@ -1273,8 +2583,21 @@ static int atmel_sha_register_algs(struct atmel_sha_dev *dd)
 		}
 	}
 
+	if (dd->caps.has_hmac) {
+		for (i = 0; i < ARRAY_SIZE(sha_hmac_algs); i++) {
+			err = crypto_register_ahash(&sha_hmac_algs[i]);
+			if (err)
+				goto err_sha_hmac_algs;
+		}
+	}
+
 	return 0;
 
+	/*i = ARRAY_SIZE(sha_hmac_algs);*/
+err_sha_hmac_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_ahash(&sha_hmac_algs[j]);
+	i = ARRAY_SIZE(sha_384_512_algs);
 err_sha_384_512_algs:
 	for (j = 0; j < i; j++)
 		crypto_unregister_ahash(&sha_384_512_algs[j]);
@@ -1344,6 +2667,7 @@ static void atmel_sha_get_cap(struct atmel_sha_dev *dd)
 	dd->caps.has_sha224 = 0;
 	dd->caps.has_sha_384_512 = 0;
 	dd->caps.has_uihv = 0;
+	dd->caps.has_hmac = 0;
 
 	/* keep only major version number */
 	switch (dd->hw_version & 0xff0) {
@@ -1353,6 +2677,7 @@ static void atmel_sha_get_cap(struct atmel_sha_dev *dd)
 		dd->caps.has_sha224 = 1;
 		dd->caps.has_sha_384_512 = 1;
 		dd->caps.has_uihv = 1;
+		dd->caps.has_hmac = 1;
 		break;
 	case 0x420:
 		dd->caps.has_dma = 1;
diff --git a/drivers/crypto/atmel-tdes.c b/drivers/crypto/atmel-tdes.c
index bf467d7be35c..b25f1b3c981f 100644
--- a/drivers/crypto/atmel-tdes.c
+++ b/drivers/crypto/atmel-tdes.c
@@ -150,7 +150,7 @@ static struct atmel_tdes_drv atmel_tdes = {
 static int atmel_tdes_sg_copy(struct scatterlist **sg, size_t *offset,
 			void *buf, size_t buflen, size_t total, int out)
 {
-	unsigned int count, off = 0;
+	size_t count, off = 0;
 
 	while (buflen && total) {
 		count = min((*sg)->length - *offset, total);
@@ -336,7 +336,7 @@ static int atmel_tdes_crypt_pdc_stop(struct atmel_tdes_dev *dd)
 				dd->buf_out, dd->buflen, dd->dma_size, 1);
 		if (count != dd->dma_size) {
 			err = -EINVAL;
-			pr_err("not all data converted: %u\n", count);
+			pr_err("not all data converted: %zu\n", count);
 		}
 	}
 
@@ -361,7 +361,7 @@ static int atmel_tdes_buff_init(struct atmel_tdes_dev *dd)
 	dd->dma_addr_in = dma_map_single(dd->dev, dd->buf_in,
 					dd->buflen, DMA_TO_DEVICE);
 	if (dma_mapping_error(dd->dev, dd->dma_addr_in)) {
-		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		dev_err(dd->dev, "dma %zd bytes error\n", dd->buflen);
 		err = -EINVAL;
 		goto err_map_in;
 	}
@@ -369,7 +369,7 @@ static int atmel_tdes_buff_init(struct atmel_tdes_dev *dd)
 	dd->dma_addr_out = dma_map_single(dd->dev, dd->buf_out,
 					dd->buflen, DMA_FROM_DEVICE);
 	if (dma_mapping_error(dd->dev, dd->dma_addr_out)) {
-		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		dev_err(dd->dev, "dma %zd bytes error\n", dd->buflen);
 		err = -EINVAL;
 		goto err_map_out;
 	}
@@ -525,8 +525,8 @@ static int atmel_tdes_crypt_start(struct atmel_tdes_dev *dd)
 
 
 	if (fast)  {
-		count = min(dd->total, sg_dma_len(dd->in_sg));
-		count = min(count, sg_dma_len(dd->out_sg));
+		count = min_t(size_t, dd->total, sg_dma_len(dd->in_sg));
+		count = min_t(size_t, count, sg_dma_len(dd->out_sg));
 
 		err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
 		if (!err) {
@@ -661,7 +661,7 @@ static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
 				dd->buf_out, dd->buflen, dd->dma_size, 1);
 			if (count != dd->dma_size) {
 				err = -EINVAL;
-				pr_err("not all data converted: %u\n", count);
+				pr_err("not all data converted: %zu\n", count);
 			}
 		}
 	}
diff --git a/drivers/crypto/bcm/Makefile b/drivers/crypto/bcm/Makefile
new file mode 100644
index 000000000000..13cb80eb2665
--- /dev/null
+++ b/drivers/crypto/bcm/Makefile
@@ -0,0 +1,15 @@
+# File: drivers/crypto/bcm/Makefile
+#
+# Makefile for crypto acceleration files for Broadcom SPU driver
+#
+# Uncomment to enable debug tracing in the SPU driver.
+# CFLAGS_util.o := -DDEBUG
+# CFLAGS_cipher.o := -DDEBUG
+# CFLAGS_spu.o := -DDEBUG
+# CFLAGS_spu2.o := -DDEBUG
+
+obj-$(CONFIG_CRYPTO_DEV_BCM_SPU) := bcm_crypto_spu.o
+
+bcm_crypto_spu-objs :=  util.o spu.o spu2.o cipher.o
+
+ccflags-y += -I. -DBCMDRIVER
diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c
new file mode 100644
index 000000000000..cc0d5b98006e
--- /dev/null
+++ b/drivers/crypto/bcm/cipher.c
@@ -0,0 +1,4963 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/kthread.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+
+#include <crypto/algapi.h>
+#include <crypto/aead.h>
+#include <crypto/internal/aead.h>
+#include <crypto/aes.h>
+#include <crypto/des.h>
+#include <crypto/sha.h>
+#include <crypto/md5.h>
+#include <crypto/authenc.h>
+#include <crypto/skcipher.h>
+#include <crypto/hash.h>
+#include <crypto/aes.h>
+#include <crypto/sha3.h>
+
+#include "util.h"
+#include "cipher.h"
+#include "spu.h"
+#include "spum.h"
+#include "spu2.h"
+
+/* ================= Device Structure ================== */
+
+struct device_private iproc_priv;
+
+/* ==================== Parameters ===================== */
+
+int flow_debug_logging;
+module_param(flow_debug_logging, int, 0644);
+MODULE_PARM_DESC(flow_debug_logging, "Enable Flow Debug Logging");
+
+int packet_debug_logging;
+module_param(packet_debug_logging, int, 0644);
+MODULE_PARM_DESC(packet_debug_logging, "Enable Packet Debug Logging");
+
+int debug_logging_sleep;
+module_param(debug_logging_sleep, int, 0644);
+MODULE_PARM_DESC(debug_logging_sleep, "Packet Debug Logging Sleep");
+
+/*
+ * The value of these module parameters is used to set the priority for each
+ * algo type when this driver registers algos with the kernel crypto API.
+ * To use a priority other than the default, set the priority in the insmod or
+ * modprobe. Changing the module priority after init time has no effect.
+ *
+ * The default priorities are chosen to be lower (less preferred) than ARMv8 CE
+ * algos, but more preferred than generic software algos.
+ */
+static int cipher_pri = 150;
+module_param(cipher_pri, int, 0644);
+MODULE_PARM_DESC(cipher_pri, "Priority for cipher algos");
+
+static int hash_pri = 100;
+module_param(hash_pri, int, 0644);
+MODULE_PARM_DESC(hash_pri, "Priority for hash algos");
+
+static int aead_pri = 150;
+module_param(aead_pri, int, 0644);
+MODULE_PARM_DESC(aead_pri, "Priority for AEAD algos");
+
+#define MAX_SPUS 16
+
+/* A type 3 BCM header, expected to precede the SPU header for SPU-M.
+ * Bits 3 and 4 in the first byte encode the channel number (the dma ringset).
+ * 0x60 - ring 0
+ * 0x68 - ring 1
+ * 0x70 - ring 2
+ * 0x78 - ring 3
+ */
+char BCMHEADER[] = { 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28 };
+/*
+ * Some SPU hw does not use BCM header on SPU messages. So BCM_HDR_LEN
+ * is set dynamically after reading SPU type from device tree.
+ */
+#define BCM_HDR_LEN  iproc_priv.bcm_hdr_len
+
+/* min and max time to sleep before retrying when mbox queue is full. usec */
+#define MBOX_SLEEP_MIN  800
+#define MBOX_SLEEP_MAX 1000
+
+/**
+ * select_channel() - Select a SPU channel to handle a crypto request. Selects
+ * channel in round robin order.
+ *
+ * Return:  channel index
+ */
+static u8 select_channel(void)
+{
+	u8 chan_idx = atomic_inc_return(&iproc_priv.next_chan);
+
+	return chan_idx % iproc_priv.spu.num_spu;
+}
+
+/**
+ * spu_ablkcipher_rx_sg_create() - Build up the scatterlist of buffers used to
+ * receive a SPU response message for an ablkcipher request. Includes buffers to
+ * catch SPU message headers and the response data.
+ * @mssg:	mailbox message containing the receive sg
+ * @rctx:	crypto request context
+ * @rx_frag_num: number of scatterlist elements required to hold the
+ *		SPU response message
+ * @chunksize:	Number of bytes of response data expected
+ * @stat_pad_len: Number of bytes required to pad the STAT field to
+ *		a 4-byte boundary
+ *
+ * The scatterlist that gets allocated here is freed in spu_chunk_cleanup()
+ * when the request completes, whether the request is handled successfully or
+ * there is an error.
+ *
+ * Returns:
+ *   0 if successful
+ *   < 0 if an error
+ */
+static int
+spu_ablkcipher_rx_sg_create(struct brcm_message *mssg,
+			    struct iproc_reqctx_s *rctx,
+			    u8 rx_frag_num,
+			    unsigned int chunksize, u32 stat_pad_len)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct scatterlist *sg;	/* used to build sgs in mbox message */
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	u32 datalen;		/* Number of bytes of response data expected */
+
+	mssg->spu.dst = kcalloc(rx_frag_num, sizeof(struct scatterlist),
+				rctx->gfp);
+	if (!mssg->spu.dst)
+		return -ENOMEM;
+
+	sg = mssg->spu.dst;
+	sg_init_table(sg, rx_frag_num);
+	/* Space for SPU message header */
+	sg_set_buf(sg++, rctx->msg_buf.spu_resp_hdr, ctx->spu_resp_hdr_len);
+
+	/* If XTS tweak in payload, add buffer to receive encrypted tweak */
+	if ((ctx->cipher.mode == CIPHER_MODE_XTS) &&
+	    spu->spu_xts_tweak_in_payload())
+		sg_set_buf(sg++, rctx->msg_buf.c.supdt_tweak,
+			   SPU_XTS_TWEAK_SIZE);
+
+	/* Copy in each dst sg entry from request, up to chunksize */
+	datalen = spu_msg_sg_add(&sg, &rctx->dst_sg, &rctx->dst_skip,
+				 rctx->dst_nents, chunksize);
+	if (datalen < chunksize) {
+		pr_err("%s(): failed to copy dst sg to mbox msg. chunksize %u, datalen %u",
+		       __func__, chunksize, datalen);
+		return -EFAULT;
+	}
+
+	if (ctx->cipher.alg == CIPHER_ALG_RC4)
+		/* Add buffer to catch 260-byte SUPDT field for RC4 */
+		sg_set_buf(sg++, rctx->msg_buf.c.supdt_tweak, SPU_SUPDT_LEN);
+
+	if (stat_pad_len)
+		sg_set_buf(sg++, rctx->msg_buf.rx_stat_pad, stat_pad_len);
+
+	memset(rctx->msg_buf.rx_stat, 0, SPU_RX_STATUS_LEN);
+	sg_set_buf(sg, rctx->msg_buf.rx_stat, spu->spu_rx_status_len());
+
+	return 0;
+}
+
+/**
+ * spu_ablkcipher_tx_sg_create() - Build up the scatterlist of buffers used to
+ * send a SPU request message for an ablkcipher request. Includes SPU message
+ * headers and the request data.
+ * @mssg:	mailbox message containing the transmit sg
+ * @rctx:	crypto request context
+ * @tx_frag_num: number of scatterlist elements required to construct the
+ *		SPU request message
+ * @chunksize:	Number of bytes of request data
+ * @pad_len:	Number of pad bytes
+ *
+ * The scatterlist that gets allocated here is freed in spu_chunk_cleanup()
+ * when the request completes, whether the request is handled successfully or
+ * there is an error.
+ *
+ * Returns:
+ *   0 if successful
+ *   < 0 if an error
+ */
+static int
+spu_ablkcipher_tx_sg_create(struct brcm_message *mssg,
+			    struct iproc_reqctx_s *rctx,
+			    u8 tx_frag_num, unsigned int chunksize, u32 pad_len)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct scatterlist *sg;	/* used to build sgs in mbox message */
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	u32 datalen;		/* Number of bytes of response data expected */
+	u32 stat_len;
+
+	mssg->spu.src = kcalloc(tx_frag_num, sizeof(struct scatterlist),
+				rctx->gfp);
+	if (unlikely(!mssg->spu.src))
+		return -ENOMEM;
+
+	sg = mssg->spu.src;
+	sg_init_table(sg, tx_frag_num);
+
+	sg_set_buf(sg++, rctx->msg_buf.bcm_spu_req_hdr,
+		   BCM_HDR_LEN + ctx->spu_req_hdr_len);
+
+	/* if XTS tweak in payload, copy from IV (where crypto API puts it) */
+	if ((ctx->cipher.mode == CIPHER_MODE_XTS) &&
+	    spu->spu_xts_tweak_in_payload())
+		sg_set_buf(sg++, rctx->msg_buf.iv_ctr, SPU_XTS_TWEAK_SIZE);
+
+	/* Copy in each src sg entry from request, up to chunksize */
+	datalen = spu_msg_sg_add(&sg, &rctx->src_sg, &rctx->src_skip,
+				 rctx->src_nents, chunksize);
+	if (unlikely(datalen < chunksize)) {
+		pr_err("%s(): failed to copy src sg to mbox msg",
+		       __func__);
+		return -EFAULT;
+	}
+
+	if (pad_len)
+		sg_set_buf(sg++, rctx->msg_buf.spu_req_pad, pad_len);
+
+	stat_len = spu->spu_tx_status_len();
+	if (stat_len) {
+		memset(rctx->msg_buf.tx_stat, 0, stat_len);
+		sg_set_buf(sg, rctx->msg_buf.tx_stat, stat_len);
+	}
+	return 0;
+}
+
+/**
+ * handle_ablkcipher_req() - Submit as much of a block cipher request as fits in
+ * a single SPU request message, starting at the current position in the request
+ * data.
+ * @rctx:	Crypto request context
+ *
+ * This may be called on the crypto API thread, or, when a request is so large
+ * it must be broken into multiple SPU messages, on the thread used to invoke
+ * the response callback. When requests are broken into multiple SPU
+ * messages, we assume subsequent messages depend on previous results, and
+ * thus always wait for previous results before submitting the next message.
+ * Because requests are submitted in lock step like this, there is no need
+ * to synchronize access to request data structures.
+ *
+ * Return: -EINPROGRESS: request has been accepted and result will be returned
+ *			 asynchronously
+ *         Any other value indicates an error
+ */
+static int handle_ablkcipher_req(struct iproc_reqctx_s *rctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_async_request *areq = rctx->parent;
+	struct ablkcipher_request *req =
+	    container_of(areq, struct ablkcipher_request, base);
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	struct spu_cipher_parms cipher_parms;
+	int err = 0;
+	unsigned int chunksize = 0;	/* Num bytes of request to submit */
+	int remaining = 0;	/* Bytes of request still to process */
+	int chunk_start;	/* Beginning of data for current SPU msg */
+
+	/* IV or ctr value to use in this SPU msg */
+	u8 local_iv_ctr[MAX_IV_SIZE];
+	u32 stat_pad_len;	/* num bytes to align status field */
+	u32 pad_len;		/* total length of all padding */
+	bool update_key = false;
+	struct brcm_message *mssg;	/* mailbox message */
+	int retry_cnt = 0;
+
+	/* number of entries in src and dst sg in mailbox message. */
+	u8 rx_frag_num = 2;	/* response header and STATUS */
+	u8 tx_frag_num = 1;	/* request header */
+
+	flow_log("%s\n", __func__);
+
+	cipher_parms.alg = ctx->cipher.alg;
+	cipher_parms.mode = ctx->cipher.mode;
+	cipher_parms.type = ctx->cipher_type;
+	cipher_parms.key_len = ctx->enckeylen;
+	cipher_parms.key_buf = ctx->enckey;
+	cipher_parms.iv_buf = local_iv_ctr;
+	cipher_parms.iv_len = rctx->iv_ctr_len;
+
+	mssg = &rctx->mb_mssg;
+	chunk_start = rctx->src_sent;
+	remaining = rctx->total_todo - chunk_start;
+
+	/* determine the chunk we are breaking off and update the indexes */
+	if ((ctx->max_payload != SPU_MAX_PAYLOAD_INF) &&
+	    (remaining > ctx->max_payload))
+		chunksize = ctx->max_payload;
+	else
+		chunksize = remaining;
+
+	rctx->src_sent += chunksize;
+	rctx->total_sent = rctx->src_sent;
+
+	/* Count number of sg entries to be included in this request */
+	rctx->src_nents = spu_sg_count(rctx->src_sg, rctx->src_skip, chunksize);
+	rctx->dst_nents = spu_sg_count(rctx->dst_sg, rctx->dst_skip, chunksize);
+
+	if ((ctx->cipher.mode == CIPHER_MODE_CBC) &&
+	    rctx->is_encrypt && chunk_start)
+		/*
+		 * Encrypting non-first first chunk. Copy last block of
+		 * previous result to IV for this chunk.
+		 */
+		sg_copy_part_to_buf(req->dst, rctx->msg_buf.iv_ctr,
+				    rctx->iv_ctr_len,
+				    chunk_start - rctx->iv_ctr_len);
+
+	if (rctx->iv_ctr_len) {
+		/* get our local copy of the iv */
+		__builtin_memcpy(local_iv_ctr, rctx->msg_buf.iv_ctr,
+				 rctx->iv_ctr_len);
+
+		/* generate the next IV if possible */
+		if ((ctx->cipher.mode == CIPHER_MODE_CBC) &&
+		    !rctx->is_encrypt) {
+			/*
+			 * CBC Decrypt: next IV is the last ciphertext block in
+			 * this chunk
+			 */
+			sg_copy_part_to_buf(req->src, rctx->msg_buf.iv_ctr,
+					    rctx->iv_ctr_len,
+					    rctx->src_sent - rctx->iv_ctr_len);
+		} else if (ctx->cipher.mode == CIPHER_MODE_CTR) {
+			/*
+			 * The SPU hardware increments the counter once for
+			 * each AES block of 16 bytes. So update the counter
+			 * for the next chunk, if there is one. Note that for
+			 * this chunk, the counter has already been copied to
+			 * local_iv_ctr. We can assume a block size of 16,
+			 * because we only support CTR mode for AES, not for
+			 * any other cipher alg.
+			 */
+			add_to_ctr(rctx->msg_buf.iv_ctr, chunksize >> 4);
+		}
+	}
+
+	if (ctx->cipher.alg == CIPHER_ALG_RC4) {
+		rx_frag_num++;
+		if (chunk_start) {
+			/*
+			 * for non-first RC4 chunks, use SUPDT from previous
+			 * response as key for this chunk.
+			 */
+			cipher_parms.key_buf = rctx->msg_buf.c.supdt_tweak;
+			update_key = true;
+			cipher_parms.type = CIPHER_TYPE_UPDT;
+		} else if (!rctx->is_encrypt) {
+			/*
+			 * First RC4 chunk. For decrypt, key in pre-built msg
+			 * header may have been changed if encrypt required
+			 * multiple chunks. So revert the key to the
+			 * ctx->enckey value.
+			 */
+			update_key = true;
+			cipher_parms.type = CIPHER_TYPE_INIT;
+		}
+	}
+
+	if (ctx->max_payload == SPU_MAX_PAYLOAD_INF)
+		flow_log("max_payload infinite\n");
+	else
+		flow_log("max_payload %u\n", ctx->max_payload);
+
+	flow_log("sent:%u start:%u remains:%u size:%u\n",
+		 rctx->src_sent, chunk_start, remaining, chunksize);
+
+	/* Copy SPU header template created at setkey time */
+	memcpy(rctx->msg_buf.bcm_spu_req_hdr, ctx->bcm_spu_req_hdr,
+	       sizeof(rctx->msg_buf.bcm_spu_req_hdr));
+
+	/*
+	 * Pass SUPDT field as key. Key field in finish() call is only used
+	 * when update_key has been set above for RC4. Will be ignored in
+	 * all other cases.
+	 */
+	spu->spu_cipher_req_finish(rctx->msg_buf.bcm_spu_req_hdr + BCM_HDR_LEN,
+				   ctx->spu_req_hdr_len, !(rctx->is_encrypt),
+				   &cipher_parms, update_key, chunksize);
+
+	atomic64_add(chunksize, &iproc_priv.bytes_out);
+
+	stat_pad_len = spu->spu_wordalign_padlen(chunksize);
+	if (stat_pad_len)
+		rx_frag_num++;
+	pad_len = stat_pad_len;
+	if (pad_len) {
+		tx_frag_num++;
+		spu->spu_request_pad(rctx->msg_buf.spu_req_pad, 0,
+				     0, ctx->auth.alg, ctx->auth.mode,
+				     rctx->total_sent, stat_pad_len);
+	}
+
+	spu->spu_dump_msg_hdr(rctx->msg_buf.bcm_spu_req_hdr + BCM_HDR_LEN,
+			      ctx->spu_req_hdr_len);
+	packet_log("payload:\n");
+	dump_sg(rctx->src_sg, rctx->src_skip, chunksize);
+	packet_dump("   pad: ", rctx->msg_buf.spu_req_pad, pad_len);
+
+	/*
+	 * Build mailbox message containing SPU request msg and rx buffers
+	 * to catch response message
+	 */
+	memset(mssg, 0, sizeof(*mssg));
+	mssg->type = BRCM_MESSAGE_SPU;
+	mssg->ctx = rctx;	/* Will be returned in response */
+
+	/* Create rx scatterlist to catch result */
+	rx_frag_num += rctx->dst_nents;
+
+	if ((ctx->cipher.mode == CIPHER_MODE_XTS) &&
+	    spu->spu_xts_tweak_in_payload())
+		rx_frag_num++;	/* extra sg to insert tweak */
+
+	err = spu_ablkcipher_rx_sg_create(mssg, rctx, rx_frag_num, chunksize,
+					  stat_pad_len);
+	if (err)
+		return err;
+
+	/* Create tx scatterlist containing SPU request message */
+	tx_frag_num += rctx->src_nents;
+	if (spu->spu_tx_status_len())
+		tx_frag_num++;
+
+	if ((ctx->cipher.mode == CIPHER_MODE_XTS) &&
+	    spu->spu_xts_tweak_in_payload())
+		tx_frag_num++;	/* extra sg to insert tweak */
+
+	err = spu_ablkcipher_tx_sg_create(mssg, rctx, tx_frag_num, chunksize,
+					  pad_len);
+	if (err)
+		return err;
+
+	err = mbox_send_message(iproc_priv.mbox[rctx->chan_idx], mssg);
+	if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) {
+		while ((err == -ENOBUFS) && (retry_cnt < SPU_MB_RETRY_MAX)) {
+			/*
+			 * Mailbox queue is full. Since MAY_SLEEP is set, assume
+			 * not in atomic context and we can wait and try again.
+			 */
+			retry_cnt++;
+			usleep_range(MBOX_SLEEP_MIN, MBOX_SLEEP_MAX);
+			err = mbox_send_message(iproc_priv.mbox[rctx->chan_idx],
+						mssg);
+			atomic_inc(&iproc_priv.mb_no_spc);
+		}
+	}
+	if (unlikely(err < 0)) {
+		atomic_inc(&iproc_priv.mb_send_fail);
+		return err;
+	}
+
+	return -EINPROGRESS;
+}
+
+/**
+ * handle_ablkcipher_resp() - Process a block cipher SPU response. Updates the
+ * total received count for the request and updates global stats.
+ * @rctx:	Crypto request context
+ */
+static void handle_ablkcipher_resp(struct iproc_reqctx_s *rctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+#ifdef DEBUG
+	struct crypto_async_request *areq = rctx->parent;
+	struct ablkcipher_request *req = ablkcipher_request_cast(areq);
+#endif
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	u32 payload_len;
+
+	/* See how much data was returned */
+	payload_len = spu->spu_payload_length(rctx->msg_buf.spu_resp_hdr);
+
+	/*
+	 * In XTS mode, the first SPU_XTS_TWEAK_SIZE bytes may be the
+	 * encrypted tweak ("i") value; we don't count those.
+	 */
+	if ((ctx->cipher.mode == CIPHER_MODE_XTS) &&
+	    spu->spu_xts_tweak_in_payload() &&
+	    (payload_len >= SPU_XTS_TWEAK_SIZE))
+		payload_len -= SPU_XTS_TWEAK_SIZE;
+
+	atomic64_add(payload_len, &iproc_priv.bytes_in);
+
+	flow_log("%s() offset: %u, bd_len: %u BD:\n",
+		 __func__, rctx->total_received, payload_len);
+
+	dump_sg(req->dst, rctx->total_received, payload_len);
+	if (ctx->cipher.alg == CIPHER_ALG_RC4)
+		packet_dump("  supdt ", rctx->msg_buf.c.supdt_tweak,
+			    SPU_SUPDT_LEN);
+
+	rctx->total_received += payload_len;
+	if (rctx->total_received == rctx->total_todo) {
+		atomic_inc(&iproc_priv.op_counts[SPU_OP_CIPHER]);
+		atomic_inc(
+		   &iproc_priv.cipher_cnt[ctx->cipher.alg][ctx->cipher.mode]);
+	}
+}
+
+/**
+ * spu_ahash_rx_sg_create() - Build up the scatterlist of buffers used to
+ * receive a SPU response message for an ahash request.
+ * @mssg:	mailbox message containing the receive sg
+ * @rctx:	crypto request context
+ * @rx_frag_num: number of scatterlist elements required to hold the
+ *		SPU response message
+ * @digestsize: length of hash digest, in bytes
+ * @stat_pad_len: Number of bytes required to pad the STAT field to
+ *		a 4-byte boundary
+ *
+ * The scatterlist that gets allocated here is freed in spu_chunk_cleanup()
+ * when the request completes, whether the request is handled successfully or
+ * there is an error.
+ *
+ * Return:
+ *   0 if successful
+ *   < 0 if an error
+ */
+static int
+spu_ahash_rx_sg_create(struct brcm_message *mssg,
+		       struct iproc_reqctx_s *rctx,
+		       u8 rx_frag_num, unsigned int digestsize,
+		       u32 stat_pad_len)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct scatterlist *sg;	/* used to build sgs in mbox message */
+	struct iproc_ctx_s *ctx = rctx->ctx;
+
+	mssg->spu.dst = kcalloc(rx_frag_num, sizeof(struct scatterlist),
+				rctx->gfp);
+	if (!mssg->spu.dst)
+		return -ENOMEM;
+
+	sg = mssg->spu.dst;
+	sg_init_table(sg, rx_frag_num);
+	/* Space for SPU message header */
+	sg_set_buf(sg++, rctx->msg_buf.spu_resp_hdr, ctx->spu_resp_hdr_len);
+
+	/* Space for digest */
+	sg_set_buf(sg++, rctx->msg_buf.digest, digestsize);
+
+	if (stat_pad_len)
+		sg_set_buf(sg++, rctx->msg_buf.rx_stat_pad, stat_pad_len);
+
+	memset(rctx->msg_buf.rx_stat, 0, SPU_RX_STATUS_LEN);
+	sg_set_buf(sg, rctx->msg_buf.rx_stat, spu->spu_rx_status_len());
+	return 0;
+}
+
+/**
+ * spu_ahash_tx_sg_create() -  Build up the scatterlist of buffers used to send
+ * a SPU request message for an ahash request. Includes SPU message headers and
+ * the request data.
+ * @mssg:	mailbox message containing the transmit sg
+ * @rctx:	crypto request context
+ * @tx_frag_num: number of scatterlist elements required to construct the
+ *		SPU request message
+ * @spu_hdr_len: length in bytes of SPU message header
+ * @hash_carry_len: Number of bytes of data carried over from previous req
+ * @new_data_len: Number of bytes of new request data
+ * @pad_len:	Number of pad bytes
+ *
+ * The scatterlist that gets allocated here is freed in spu_chunk_cleanup()
+ * when the request completes, whether the request is handled successfully or
+ * there is an error.
+ *
+ * Return:
+ *   0 if successful
+ *   < 0 if an error
+ */
+static int
+spu_ahash_tx_sg_create(struct brcm_message *mssg,
+		       struct iproc_reqctx_s *rctx,
+		       u8 tx_frag_num,
+		       u32 spu_hdr_len,
+		       unsigned int hash_carry_len,
+		       unsigned int new_data_len, u32 pad_len)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct scatterlist *sg;	/* used to build sgs in mbox message */
+	u32 datalen;		/* Number of bytes of response data expected */
+	u32 stat_len;
+
+	mssg->spu.src = kcalloc(tx_frag_num, sizeof(struct scatterlist),
+				rctx->gfp);
+	if (!mssg->spu.src)
+		return -ENOMEM;
+
+	sg = mssg->spu.src;
+	sg_init_table(sg, tx_frag_num);
+
+	sg_set_buf(sg++, rctx->msg_buf.bcm_spu_req_hdr,
+		   BCM_HDR_LEN + spu_hdr_len);
+
+	if (hash_carry_len)
+		sg_set_buf(sg++, rctx->hash_carry, hash_carry_len);
+
+	if (new_data_len) {
+		/* Copy in each src sg entry from request, up to chunksize */
+		datalen = spu_msg_sg_add(&sg, &rctx->src_sg, &rctx->src_skip,
+					 rctx->src_nents, new_data_len);
+		if (datalen < new_data_len) {
+			pr_err("%s(): failed to copy src sg to mbox msg",
+			       __func__);
+			return -EFAULT;
+		}
+	}
+
+	if (pad_len)
+		sg_set_buf(sg++, rctx->msg_buf.spu_req_pad, pad_len);
+
+	stat_len = spu->spu_tx_status_len();
+	if (stat_len) {
+		memset(rctx->msg_buf.tx_stat, 0, stat_len);
+		sg_set_buf(sg, rctx->msg_buf.tx_stat, stat_len);
+	}
+
+	return 0;
+}
+
+/**
+ * handle_ahash_req() - Process an asynchronous hash request from the crypto
+ * API.
+ * @rctx:  Crypto request context
+ *
+ * Builds a SPU request message embedded in a mailbox message and submits the
+ * mailbox message on a selected mailbox channel. The SPU request message is
+ * constructed as a scatterlist, including entries from the crypto API's
+ * src scatterlist to avoid copying the data to be hashed. This function is
+ * called either on the thread from the crypto API, or, in the case that the
+ * crypto API request is too large to fit in a single SPU request message,
+ * on the thread that invokes the receive callback with a response message.
+ * Because some operations require the response from one chunk before the next
+ * chunk can be submitted, we always wait for the response for the previous
+ * chunk before submitting the next chunk. Because requests are submitted in
+ * lock step like this, there is no need to synchronize access to request data
+ * structures.
+ *
+ * Return:
+ *   -EINPROGRESS: request has been submitted to SPU and response will be
+ *		   returned asynchronously
+ *   -EAGAIN:      non-final request included a small amount of data, which for
+ *		   efficiency we did not submit to the SPU, but instead stored
+ *		   to be submitted to the SPU with the next part of the request
+ *   other:        an error code
+ */
+static int handle_ahash_req(struct iproc_reqctx_s *rctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_async_request *areq = rctx->parent;
+	struct ahash_request *req = ahash_request_cast(areq);
+	struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
+	struct crypto_tfm *tfm = crypto_ahash_tfm(ahash);
+	unsigned int blocksize = crypto_tfm_alg_blocksize(tfm);
+	struct iproc_ctx_s *ctx = rctx->ctx;
+
+	/* number of bytes still to be hashed in this req */
+	unsigned int nbytes_to_hash = 0;
+	int err = 0;
+	unsigned int chunksize = 0;	/* length of hash carry + new data */
+	/*
+	 * length of new data, not from hash carry, to be submitted in
+	 * this hw request
+	 */
+	unsigned int new_data_len;
+
+	unsigned int chunk_start = 0;
+	u32 db_size;	 /* Length of data field, incl gcm and hash padding */
+	int pad_len = 0; /* total pad len, including gcm, hash, stat padding */
+	u32 data_pad_len = 0;	/* length of GCM/CCM padding */
+	u32 stat_pad_len = 0;	/* length of padding to align STATUS word */
+	struct brcm_message *mssg;	/* mailbox message */
+	struct spu_request_opts req_opts;
+	struct spu_cipher_parms cipher_parms;
+	struct spu_hash_parms hash_parms;
+	struct spu_aead_parms aead_parms;
+	unsigned int local_nbuf;
+	u32 spu_hdr_len;
+	unsigned int digestsize;
+	u16 rem = 0;
+	int retry_cnt = 0;
+
+	/*
+	 * number of entries in src and dst sg. Always includes SPU msg header.
+	 * rx always includes a buffer to catch digest and STATUS.
+	 */
+	u8 rx_frag_num = 3;
+	u8 tx_frag_num = 1;
+
+	flow_log("total_todo %u, total_sent %u\n",
+		 rctx->total_todo, rctx->total_sent);
+
+	memset(&req_opts, 0, sizeof(req_opts));
+	memset(&cipher_parms, 0, sizeof(cipher_parms));
+	memset(&hash_parms, 0, sizeof(hash_parms));
+	memset(&aead_parms, 0, sizeof(aead_parms));
+
+	req_opts.bd_suppress = true;
+	hash_parms.alg = ctx->auth.alg;
+	hash_parms.mode = ctx->auth.mode;
+	hash_parms.type = HASH_TYPE_NONE;
+	hash_parms.key_buf = (u8 *)ctx->authkey;
+	hash_parms.key_len = ctx->authkeylen;
+
+	/*
+	 * For hash algorithms below assignment looks bit odd but
+	 * it's needed for AES-XCBC and AES-CMAC hash algorithms
+	 * to differentiate between 128, 192, 256 bit key values.
+	 * Based on the key values, hash algorithm is selected.
+	 * For example for 128 bit key, hash algorithm is AES-128.
+	 */
+	cipher_parms.type = ctx->cipher_type;
+
+	mssg = &rctx->mb_mssg;
+	chunk_start = rctx->src_sent;
+
+	/*
+	 * Compute the amount remaining to hash. This may include data
+	 * carried over from previous requests.
+	 */
+	nbytes_to_hash = rctx->total_todo - rctx->total_sent;
+	chunksize = nbytes_to_hash;
+	if ((ctx->max_payload != SPU_MAX_PAYLOAD_INF) &&
+	    (chunksize > ctx->max_payload))
+		chunksize = ctx->max_payload;
+
+	/*
+	 * If this is not a final request and the request data is not a multiple
+	 * of a full block, then simply park the extra data and prefix it to the
+	 * data for the next request.
+	 */
+	if (!rctx->is_final) {
+		u8 *dest = rctx->hash_carry + rctx->hash_carry_len;
+		u16 new_len;  /* len of data to add to hash carry */
+
+		rem = chunksize % blocksize;   /* remainder */
+		if (rem) {
+			/* chunksize not a multiple of blocksize */
+			chunksize -= rem;
+			if (chunksize == 0) {
+				/* Don't have a full block to submit to hw */
+				new_len = rem - rctx->hash_carry_len;
+				sg_copy_part_to_buf(req->src, dest, new_len,
+						    rctx->src_sent);
+				rctx->hash_carry_len = rem;
+				flow_log("Exiting with hash carry len: %u\n",
+					 rctx->hash_carry_len);
+				packet_dump("  buf: ",
+					    rctx->hash_carry,
+					    rctx->hash_carry_len);
+				return -EAGAIN;
+			}
+		}
+	}
+
+	/* if we have hash carry, then prefix it to the data in this request */
+	local_nbuf = rctx->hash_carry_len;
+	rctx->hash_carry_len = 0;
+	if (local_nbuf)
+		tx_frag_num++;
+	new_data_len = chunksize - local_nbuf;
+
+	/* Count number of sg entries to be used in this request */
+	rctx->src_nents = spu_sg_count(rctx->src_sg, rctx->src_skip,
+				       new_data_len);
+
+	/* AES hashing keeps key size in type field, so need to copy it here */
+	if (hash_parms.alg == HASH_ALG_AES)
+		hash_parms.type = cipher_parms.type;
+	else
+		hash_parms.type = spu->spu_hash_type(rctx->total_sent);
+
+	digestsize = spu->spu_digest_size(ctx->digestsize, ctx->auth.alg,
+					  hash_parms.type);
+	hash_parms.digestsize =	digestsize;
+
+	/* update the indexes */
+	rctx->total_sent += chunksize;
+	/* if you sent a prebuf then that wasn't from this req->src */
+	rctx->src_sent += new_data_len;
+
+	if ((rctx->total_sent == rctx->total_todo) && rctx->is_final)
+		hash_parms.pad_len = spu->spu_hash_pad_len(hash_parms.alg,
+							   hash_parms.mode,
+							   chunksize,
+							   blocksize);
+
+	/*
+	 * If a non-first chunk, then include the digest returned from the
+	 * previous chunk so that hw can add to it (except for AES types).
+	 */
+	if ((hash_parms.type == HASH_TYPE_UPDT) &&
+	    (hash_parms.alg != HASH_ALG_AES)) {
+		hash_parms.key_buf = rctx->incr_hash;
+		hash_parms.key_len = digestsize;
+	}
+
+	atomic64_add(chunksize, &iproc_priv.bytes_out);
+
+	flow_log("%s() final: %u nbuf: %u ",
+		 __func__, rctx->is_final, local_nbuf);
+
+	if (ctx->max_payload == SPU_MAX_PAYLOAD_INF)
+		flow_log("max_payload infinite\n");
+	else
+		flow_log("max_payload %u\n", ctx->max_payload);
+
+	flow_log("chunk_start: %u chunk_size: %u\n", chunk_start, chunksize);
+
+	/* Prepend SPU header with type 3 BCM header */
+	memcpy(rctx->msg_buf.bcm_spu_req_hdr, BCMHEADER, BCM_HDR_LEN);
+
+	hash_parms.prebuf_len = local_nbuf;
+	spu_hdr_len = spu->spu_create_request(rctx->msg_buf.bcm_spu_req_hdr +
+					      BCM_HDR_LEN,
+					      &req_opts, &cipher_parms,
+					      &hash_parms, &aead_parms,
+					      new_data_len);
+
+	if (spu_hdr_len == 0) {
+		pr_err("Failed to create SPU request header\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * Determine total length of padding required. Put all padding in one
+	 * buffer.
+	 */
+	data_pad_len = spu->spu_gcm_ccm_pad_len(ctx->cipher.mode, chunksize);
+	db_size = spu_real_db_size(0, 0, local_nbuf, new_data_len,
+				   0, 0, hash_parms.pad_len);
+	if (spu->spu_tx_status_len())
+		stat_pad_len = spu->spu_wordalign_padlen(db_size);
+	if (stat_pad_len)
+		rx_frag_num++;
+	pad_len = hash_parms.pad_len + data_pad_len + stat_pad_len;
+	if (pad_len) {
+		tx_frag_num++;
+		spu->spu_request_pad(rctx->msg_buf.spu_req_pad, data_pad_len,
+				     hash_parms.pad_len, ctx->auth.alg,
+				     ctx->auth.mode, rctx->total_sent,
+				     stat_pad_len);
+	}
+
+	spu->spu_dump_msg_hdr(rctx->msg_buf.bcm_spu_req_hdr + BCM_HDR_LEN,
+			      spu_hdr_len);
+	packet_dump("    prebuf: ", rctx->hash_carry, local_nbuf);
+	flow_log("Data:\n");
+	dump_sg(rctx->src_sg, rctx->src_skip, new_data_len);
+	packet_dump("   pad: ", rctx->msg_buf.spu_req_pad, pad_len);
+
+	/*
+	 * Build mailbox message containing SPU request msg and rx buffers
+	 * to catch response message
+	 */
+	memset(mssg, 0, sizeof(*mssg));
+	mssg->type = BRCM_MESSAGE_SPU;
+	mssg->ctx = rctx;	/* Will be returned in response */
+
+	/* Create rx scatterlist to catch result */
+	err = spu_ahash_rx_sg_create(mssg, rctx, rx_frag_num, digestsize,
+				     stat_pad_len);
+	if (err)
+		return err;
+
+	/* Create tx scatterlist containing SPU request message */
+	tx_frag_num += rctx->src_nents;
+	if (spu->spu_tx_status_len())
+		tx_frag_num++;
+	err = spu_ahash_tx_sg_create(mssg, rctx, tx_frag_num, spu_hdr_len,
+				     local_nbuf, new_data_len, pad_len);
+	if (err)
+		return err;
+
+	err = mbox_send_message(iproc_priv.mbox[rctx->chan_idx], mssg);
+	if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) {
+		while ((err == -ENOBUFS) && (retry_cnt < SPU_MB_RETRY_MAX)) {
+			/*
+			 * Mailbox queue is full. Since MAY_SLEEP is set, assume
+			 * not in atomic context and we can wait and try again.
+			 */
+			retry_cnt++;
+			usleep_range(MBOX_SLEEP_MIN, MBOX_SLEEP_MAX);
+			err = mbox_send_message(iproc_priv.mbox[rctx->chan_idx],
+						mssg);
+			atomic_inc(&iproc_priv.mb_no_spc);
+		}
+	}
+	if (err < 0) {
+		atomic_inc(&iproc_priv.mb_send_fail);
+		return err;
+	}
+	return -EINPROGRESS;
+}
+
+/**
+ * spu_hmac_outer_hash() - Request synchonous software compute of the outer hash
+ * for an HMAC request.
+ * @req:  The HMAC request from the crypto API
+ * @ctx:  The session context
+ *
+ * Return: 0 if synchronous hash operation successful
+ *         -EINVAL if the hash algo is unrecognized
+ *         any other value indicates an error
+ */
+static int spu_hmac_outer_hash(struct ahash_request *req,
+			       struct iproc_ctx_s *ctx)
+{
+	struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
+	unsigned int blocksize =
+		crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash));
+	int rc;
+
+	switch (ctx->auth.alg) {
+	case HASH_ALG_MD5:
+		rc = do_shash("md5", req->result, ctx->opad, blocksize,
+			      req->result, ctx->digestsize, NULL, 0);
+		break;
+	case HASH_ALG_SHA1:
+		rc = do_shash("sha1", req->result, ctx->opad, blocksize,
+			      req->result, ctx->digestsize, NULL, 0);
+		break;
+	case HASH_ALG_SHA224:
+		rc = do_shash("sha224", req->result, ctx->opad, blocksize,
+			      req->result, ctx->digestsize, NULL, 0);
+		break;
+	case HASH_ALG_SHA256:
+		rc = do_shash("sha256", req->result, ctx->opad, blocksize,
+			      req->result, ctx->digestsize, NULL, 0);
+		break;
+	case HASH_ALG_SHA384:
+		rc = do_shash("sha384", req->result, ctx->opad, blocksize,
+			      req->result, ctx->digestsize, NULL, 0);
+		break;
+	case HASH_ALG_SHA512:
+		rc = do_shash("sha512", req->result, ctx->opad, blocksize,
+			      req->result, ctx->digestsize, NULL, 0);
+		break;
+	default:
+		pr_err("%s() Error : unknown hmac type\n", __func__);
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+/**
+ * ahash_req_done() - Process a hash result from the SPU hardware.
+ * @rctx: Crypto request context
+ *
+ * Return: 0 if successful
+ *         < 0 if an error
+ */
+static int ahash_req_done(struct iproc_reqctx_s *rctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_async_request *areq = rctx->parent;
+	struct ahash_request *req = ahash_request_cast(areq);
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	int err;
+
+	memcpy(req->result, rctx->msg_buf.digest, ctx->digestsize);
+
+	if (spu->spu_type == SPU_TYPE_SPUM) {
+		/* byte swap the output from the UPDT function to network byte
+		 * order
+		 */
+		if (ctx->auth.alg == HASH_ALG_MD5) {
+			__swab32s((u32 *)req->result);
+			__swab32s(((u32 *)req->result) + 1);
+			__swab32s(((u32 *)req->result) + 2);
+			__swab32s(((u32 *)req->result) + 3);
+			__swab32s(((u32 *)req->result) + 4);
+		}
+	}
+
+	flow_dump("  digest ", req->result, ctx->digestsize);
+
+	/* if this an HMAC then do the outer hash */
+	if (rctx->is_sw_hmac) {
+		err = spu_hmac_outer_hash(req, ctx);
+		if (err < 0)
+			return err;
+		flow_dump("  hmac: ", req->result, ctx->digestsize);
+	}
+
+	if (rctx->is_sw_hmac || ctx->auth.mode == HASH_MODE_HMAC) {
+		atomic_inc(&iproc_priv.op_counts[SPU_OP_HMAC]);
+		atomic_inc(&iproc_priv.hmac_cnt[ctx->auth.alg]);
+	} else {
+		atomic_inc(&iproc_priv.op_counts[SPU_OP_HASH]);
+		atomic_inc(&iproc_priv.hash_cnt[ctx->auth.alg]);
+	}
+
+	return 0;
+}
+
+/**
+ * handle_ahash_resp() - Process a SPU response message for a hash request.
+ * Checks if the entire crypto API request has been processed, and if so,
+ * invokes post processing on the result.
+ * @rctx: Crypto request context
+ */
+static void handle_ahash_resp(struct iproc_reqctx_s *rctx)
+{
+	struct iproc_ctx_s *ctx = rctx->ctx;
+#ifdef DEBUG
+	struct crypto_async_request *areq = rctx->parent;
+	struct ahash_request *req = ahash_request_cast(areq);
+	struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
+	unsigned int blocksize =
+		crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash));
+#endif
+	/*
+	 * Save hash to use as input to next op if incremental. Might be copying
+	 * too much, but that's easier than figuring out actual digest size here
+	 */
+	memcpy(rctx->incr_hash, rctx->msg_buf.digest, MAX_DIGEST_SIZE);
+
+	flow_log("%s() blocksize:%u digestsize:%u\n",
+		 __func__, blocksize, ctx->digestsize);
+
+	atomic64_add(ctx->digestsize, &iproc_priv.bytes_in);
+
+	if (rctx->is_final && (rctx->total_sent == rctx->total_todo))
+		ahash_req_done(rctx);
+}
+
+/**
+ * spu_aead_rx_sg_create() - Build up the scatterlist of buffers used to receive
+ * a SPU response message for an AEAD request. Includes buffers to catch SPU
+ * message headers and the response data.
+ * @mssg:	mailbox message containing the receive sg
+ * @rctx:	crypto request context
+ * @rx_frag_num: number of scatterlist elements required to hold the
+ *		SPU response message
+ * @assoc_len:	Length of associated data included in the crypto request
+ * @ret_iv_len: Length of IV returned in response
+ * @resp_len:	Number of bytes of response data expected to be written to
+ *              dst buffer from crypto API
+ * @digestsize: Length of hash digest, in bytes
+ * @stat_pad_len: Number of bytes required to pad the STAT field to
+ *		a 4-byte boundary
+ *
+ * The scatterlist that gets allocated here is freed in spu_chunk_cleanup()
+ * when the request completes, whether the request is handled successfully or
+ * there is an error.
+ *
+ * Returns:
+ *   0 if successful
+ *   < 0 if an error
+ */
+static int spu_aead_rx_sg_create(struct brcm_message *mssg,
+				 struct aead_request *req,
+				 struct iproc_reqctx_s *rctx,
+				 u8 rx_frag_num,
+				 unsigned int assoc_len,
+				 u32 ret_iv_len, unsigned int resp_len,
+				 unsigned int digestsize, u32 stat_pad_len)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct scatterlist *sg;	/* used to build sgs in mbox message */
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	u32 datalen;		/* Number of bytes of response data expected */
+	u32 assoc_buf_len;
+	u8 data_padlen = 0;
+
+	if (ctx->is_rfc4543) {
+		/* RFC4543: only pad after data, not after AAD */
+		data_padlen = spu->spu_gcm_ccm_pad_len(ctx->cipher.mode,
+							  assoc_len + resp_len);
+		assoc_buf_len = assoc_len;
+	} else {
+		data_padlen = spu->spu_gcm_ccm_pad_len(ctx->cipher.mode,
+							  resp_len);
+		assoc_buf_len = spu->spu_assoc_resp_len(ctx->cipher.mode,
+						assoc_len, ret_iv_len,
+						rctx->is_encrypt);
+	}
+
+	if (ctx->cipher.mode == CIPHER_MODE_CCM)
+		/* ICV (after data) must be in the next 32-bit word for CCM */
+		data_padlen += spu->spu_wordalign_padlen(assoc_buf_len +
+							 resp_len +
+							 data_padlen);
+
+	if (data_padlen)
+		/* have to catch gcm pad in separate buffer */
+		rx_frag_num++;
+
+	mssg->spu.dst = kcalloc(rx_frag_num, sizeof(struct scatterlist),
+				rctx->gfp);
+	if (!mssg->spu.dst)
+		return -ENOMEM;
+
+	sg = mssg->spu.dst;
+	sg_init_table(sg, rx_frag_num);
+
+	/* Space for SPU message header */
+	sg_set_buf(sg++, rctx->msg_buf.spu_resp_hdr, ctx->spu_resp_hdr_len);
+
+	if (assoc_buf_len) {
+		/*
+		 * Don't write directly to req->dst, because SPU may pad the
+		 * assoc data in the response
+		 */
+		memset(rctx->msg_buf.a.resp_aad, 0, assoc_buf_len);
+		sg_set_buf(sg++, rctx->msg_buf.a.resp_aad, assoc_buf_len);
+	}
+
+	if (resp_len) {
+		/*
+		 * Copy in each dst sg entry from request, up to chunksize.
+		 * dst sg catches just the data. digest caught in separate buf.
+		 */
+		datalen = spu_msg_sg_add(&sg, &rctx->dst_sg, &rctx->dst_skip,
+					 rctx->dst_nents, resp_len);
+		if (datalen < (resp_len)) {
+			pr_err("%s(): failed to copy dst sg to mbox msg. expected len %u, datalen %u",
+			       __func__, resp_len, datalen);
+			return -EFAULT;
+		}
+	}
+
+	/* If GCM/CCM data is padded, catch padding in separate buffer */
+	if (data_padlen) {
+		memset(rctx->msg_buf.a.gcmpad, 0, data_padlen);
+		sg_set_buf(sg++, rctx->msg_buf.a.gcmpad, data_padlen);
+	}
+
+	/* Always catch ICV in separate buffer */
+	sg_set_buf(sg++, rctx->msg_buf.digest, digestsize);
+
+	flow_log("stat_pad_len %u\n", stat_pad_len);
+	if (stat_pad_len) {
+		memset(rctx->msg_buf.rx_stat_pad, 0, stat_pad_len);
+		sg_set_buf(sg++, rctx->msg_buf.rx_stat_pad, stat_pad_len);
+	}
+
+	memset(rctx->msg_buf.rx_stat, 0, SPU_RX_STATUS_LEN);
+	sg_set_buf(sg, rctx->msg_buf.rx_stat, spu->spu_rx_status_len());
+
+	return 0;
+}
+
+/**
+ * spu_aead_tx_sg_create() - Build up the scatterlist of buffers used to send a
+ * SPU request message for an AEAD request. Includes SPU message headers and the
+ * request data.
+ * @mssg:	mailbox message containing the transmit sg
+ * @rctx:	crypto request context
+ * @tx_frag_num: number of scatterlist elements required to construct the
+ *		SPU request message
+ * @spu_hdr_len: length of SPU message header in bytes
+ * @assoc:	crypto API associated data scatterlist
+ * @assoc_len:	length of associated data
+ * @assoc_nents: number of scatterlist entries containing assoc data
+ * @aead_iv_len: length of AEAD IV, if included
+ * @chunksize:	Number of bytes of request data
+ * @aad_pad_len: Number of bytes of padding at end of AAD. For GCM/CCM.
+ * @pad_len:	Number of pad bytes
+ * @incl_icv:	If true, write separate ICV buffer after data and
+ *              any padding
+ *
+ * The scatterlist that gets allocated here is freed in spu_chunk_cleanup()
+ * when the request completes, whether the request is handled successfully or
+ * there is an error.
+ *
+ * Return:
+ *   0 if successful
+ *   < 0 if an error
+ */
+static int spu_aead_tx_sg_create(struct brcm_message *mssg,
+				 struct iproc_reqctx_s *rctx,
+				 u8 tx_frag_num,
+				 u32 spu_hdr_len,
+				 struct scatterlist *assoc,
+				 unsigned int assoc_len,
+				 int assoc_nents,
+				 unsigned int aead_iv_len,
+				 unsigned int chunksize,
+				 u32 aad_pad_len, u32 pad_len, bool incl_icv)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct scatterlist *sg;	/* used to build sgs in mbox message */
+	struct scatterlist *assoc_sg = assoc;
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	u32 datalen;		/* Number of bytes of data to write */
+	u32 written;		/* Number of bytes of data written */
+	u32 assoc_offset = 0;
+	u32 stat_len;
+
+	mssg->spu.src = kcalloc(tx_frag_num, sizeof(struct scatterlist),
+				rctx->gfp);
+	if (!mssg->spu.src)
+		return -ENOMEM;
+
+	sg = mssg->spu.src;
+	sg_init_table(sg, tx_frag_num);
+
+	sg_set_buf(sg++, rctx->msg_buf.bcm_spu_req_hdr,
+		   BCM_HDR_LEN + spu_hdr_len);
+
+	if (assoc_len) {
+		/* Copy in each associated data sg entry from request */
+		written = spu_msg_sg_add(&sg, &assoc_sg, &assoc_offset,
+					 assoc_nents, assoc_len);
+		if (written < assoc_len) {
+			pr_err("%s(): failed to copy assoc sg to mbox msg",
+			       __func__);
+			return -EFAULT;
+		}
+	}
+
+	if (aead_iv_len)
+		sg_set_buf(sg++, rctx->msg_buf.iv_ctr, aead_iv_len);
+
+	if (aad_pad_len) {
+		memset(rctx->msg_buf.a.req_aad_pad, 0, aad_pad_len);
+		sg_set_buf(sg++, rctx->msg_buf.a.req_aad_pad, aad_pad_len);
+	}
+
+	datalen = chunksize;
+	if ((chunksize > ctx->digestsize) && incl_icv)
+		datalen -= ctx->digestsize;
+	if (datalen) {
+		/* For aead, a single msg should consume the entire src sg */
+		written = spu_msg_sg_add(&sg, &rctx->src_sg, &rctx->src_skip,
+					 rctx->src_nents, datalen);
+		if (written < datalen) {
+			pr_err("%s(): failed to copy src sg to mbox msg",
+			       __func__);
+			return -EFAULT;
+		}
+	}
+
+	if (pad_len) {
+		memset(rctx->msg_buf.spu_req_pad, 0, pad_len);
+		sg_set_buf(sg++, rctx->msg_buf.spu_req_pad, pad_len);
+	}
+
+	if (incl_icv)
+		sg_set_buf(sg++, rctx->msg_buf.digest, ctx->digestsize);
+
+	stat_len = spu->spu_tx_status_len();
+	if (stat_len) {
+		memset(rctx->msg_buf.tx_stat, 0, stat_len);
+		sg_set_buf(sg, rctx->msg_buf.tx_stat, stat_len);
+	}
+	return 0;
+}
+
+/**
+ * handle_aead_req() - Submit a SPU request message for the next chunk of the
+ * current AEAD request.
+ * @rctx:  Crypto request context
+ *
+ * Unlike other operation types, we assume the length of the request fits in
+ * a single SPU request message. aead_enqueue() makes sure this is true.
+ * Comments for other op types regarding threads applies here as well.
+ *
+ * Unlike incremental hash ops, where the spu returns the entire hash for
+ * truncated algs like sha-224, the SPU returns just the truncated hash in
+ * response to aead requests. So digestsize is always ctx->digestsize here.
+ *
+ * Return: -EINPROGRESS: crypto request has been accepted and result will be
+ *			 returned asynchronously
+ *         Any other value indicates an error
+ */
+static int handle_aead_req(struct iproc_reqctx_s *rctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_async_request *areq = rctx->parent;
+	struct aead_request *req = container_of(areq,
+						struct aead_request, base);
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	int err;
+	unsigned int chunksize;
+	unsigned int resp_len;
+	u32 spu_hdr_len;
+	u32 db_size;
+	u32 stat_pad_len;
+	u32 pad_len;
+	struct brcm_message *mssg;	/* mailbox message */
+	struct spu_request_opts req_opts;
+	struct spu_cipher_parms cipher_parms;
+	struct spu_hash_parms hash_parms;
+	struct spu_aead_parms aead_parms;
+	int assoc_nents = 0;
+	bool incl_icv = false;
+	unsigned int digestsize = ctx->digestsize;
+	int retry_cnt = 0;
+
+	/* number of entries in src and dst sg. Always includes SPU msg header.
+	 */
+	u8 rx_frag_num = 2;	/* and STATUS */
+	u8 tx_frag_num = 1;
+
+	/* doing the whole thing at once */
+	chunksize = rctx->total_todo;
+
+	flow_log("%s: chunksize %u\n", __func__, chunksize);
+
+	memset(&req_opts, 0, sizeof(req_opts));
+	memset(&hash_parms, 0, sizeof(hash_parms));
+	memset(&aead_parms, 0, sizeof(aead_parms));
+
+	req_opts.is_inbound = !(rctx->is_encrypt);
+	req_opts.auth_first = ctx->auth_first;
+	req_opts.is_aead = true;
+	req_opts.is_esp = ctx->is_esp;
+
+	cipher_parms.alg = ctx->cipher.alg;
+	cipher_parms.mode = ctx->cipher.mode;
+	cipher_parms.type = ctx->cipher_type;
+	cipher_parms.key_buf = ctx->enckey;
+	cipher_parms.key_len = ctx->enckeylen;
+	cipher_parms.iv_buf = rctx->msg_buf.iv_ctr;
+	cipher_parms.iv_len = rctx->iv_ctr_len;
+
+	hash_parms.alg = ctx->auth.alg;
+	hash_parms.mode = ctx->auth.mode;
+	hash_parms.type = HASH_TYPE_NONE;
+	hash_parms.key_buf = (u8 *)ctx->authkey;
+	hash_parms.key_len = ctx->authkeylen;
+	hash_parms.digestsize = digestsize;
+
+	if ((ctx->auth.alg == HASH_ALG_SHA224) &&
+	    (ctx->authkeylen < SHA224_DIGEST_SIZE))
+		hash_parms.key_len = SHA224_DIGEST_SIZE;
+
+	aead_parms.assoc_size = req->assoclen;
+	if (ctx->is_esp && !ctx->is_rfc4543) {
+		/*
+		 * 8-byte IV is included assoc data in request. SPU2
+		 * expects AAD to include just SPI and seqno. So
+		 * subtract off the IV len.
+		 */
+		aead_parms.assoc_size -= GCM_ESP_IV_SIZE;
+
+		if (rctx->is_encrypt) {
+			aead_parms.return_iv = true;
+			aead_parms.ret_iv_len = GCM_ESP_IV_SIZE;
+			aead_parms.ret_iv_off = GCM_ESP_SALT_SIZE;
+		}
+	} else {
+		aead_parms.ret_iv_len = 0;
+	}
+
+	/*
+	 * Count number of sg entries from the crypto API request that are to
+	 * be included in this mailbox message. For dst sg, don't count space
+	 * for digest. Digest gets caught in a separate buffer and copied back
+	 * to dst sg when processing response.
+	 */
+	rctx->src_nents = spu_sg_count(rctx->src_sg, rctx->src_skip, chunksize);
+	rctx->dst_nents = spu_sg_count(rctx->dst_sg, rctx->dst_skip, chunksize);
+	if (aead_parms.assoc_size)
+		assoc_nents = spu_sg_count(rctx->assoc, 0,
+					   aead_parms.assoc_size);
+
+	mssg = &rctx->mb_mssg;
+
+	rctx->total_sent = chunksize;
+	rctx->src_sent = chunksize;
+	if (spu->spu_assoc_resp_len(ctx->cipher.mode,
+				    aead_parms.assoc_size,
+				    aead_parms.ret_iv_len,
+				    rctx->is_encrypt))
+		rx_frag_num++;
+
+	aead_parms.iv_len = spu->spu_aead_ivlen(ctx->cipher.mode,
+						rctx->iv_ctr_len);
+
+	if (ctx->auth.alg == HASH_ALG_AES)
+		hash_parms.type = ctx->cipher_type;
+
+	/* General case AAD padding (CCM and RFC4543 special cases below) */
+	aead_parms.aad_pad_len = spu->spu_gcm_ccm_pad_len(ctx->cipher.mode,
+						 aead_parms.assoc_size);
+
+	/* General case data padding (CCM decrypt special case below) */
+	aead_parms.data_pad_len = spu->spu_gcm_ccm_pad_len(ctx->cipher.mode,
+							   chunksize);
+
+	if (ctx->cipher.mode == CIPHER_MODE_CCM) {
+		/*
+		 * for CCM, AAD len + 2 (rather than AAD len) needs to be
+		 * 128-bit aligned
+		 */
+		aead_parms.aad_pad_len = spu->spu_gcm_ccm_pad_len(
+					 ctx->cipher.mode,
+					 aead_parms.assoc_size + 2);
+
+		/*
+		 * And when decrypting CCM, need to pad without including
+		 * size of ICV which is tacked on to end of chunk
+		 */
+		if (!rctx->is_encrypt)
+			aead_parms.data_pad_len =
+				spu->spu_gcm_ccm_pad_len(ctx->cipher.mode,
+							chunksize - digestsize);
+
+		/* CCM also requires software to rewrite portions of IV: */
+		spu->spu_ccm_update_iv(digestsize, &cipher_parms, req->assoclen,
+				       chunksize, rctx->is_encrypt,
+				       ctx->is_esp);
+	}
+
+	if (ctx->is_rfc4543) {
+		/*
+		 * RFC4543: data is included in AAD, so don't pad after AAD
+		 * and pad data based on both AAD + data size
+		 */
+		aead_parms.aad_pad_len = 0;
+		if (!rctx->is_encrypt)
+			aead_parms.data_pad_len = spu->spu_gcm_ccm_pad_len(
+					ctx->cipher.mode,
+					aead_parms.assoc_size + chunksize -
+					digestsize);
+		else
+			aead_parms.data_pad_len = spu->spu_gcm_ccm_pad_len(
+					ctx->cipher.mode,
+					aead_parms.assoc_size + chunksize);
+
+		req_opts.is_rfc4543 = true;
+	}
+
+	if (spu_req_incl_icv(ctx->cipher.mode, rctx->is_encrypt)) {
+		incl_icv = true;
+		tx_frag_num++;
+		/* Copy ICV from end of src scatterlist to digest buf */
+		sg_copy_part_to_buf(req->src, rctx->msg_buf.digest, digestsize,
+				    req->assoclen + rctx->total_sent -
+				    digestsize);
+	}
+
+	atomic64_add(chunksize, &iproc_priv.bytes_out);
+
+	flow_log("%s()-sent chunksize:%u\n", __func__, chunksize);
+
+	/* Prepend SPU header with type 3 BCM header */
+	memcpy(rctx->msg_buf.bcm_spu_req_hdr, BCMHEADER, BCM_HDR_LEN);
+
+	spu_hdr_len = spu->spu_create_request(rctx->msg_buf.bcm_spu_req_hdr +
+					      BCM_HDR_LEN, &req_opts,
+					      &cipher_parms, &hash_parms,
+					      &aead_parms, chunksize);
+
+	/* Determine total length of padding. Put all padding in one buffer. */
+	db_size = spu_real_db_size(aead_parms.assoc_size, aead_parms.iv_len, 0,
+				   chunksize, aead_parms.aad_pad_len,
+				   aead_parms.data_pad_len, 0);
+
+	stat_pad_len = spu->spu_wordalign_padlen(db_size);
+
+	if (stat_pad_len)
+		rx_frag_num++;
+	pad_len = aead_parms.data_pad_len + stat_pad_len;
+	if (pad_len) {
+		tx_frag_num++;
+		spu->spu_request_pad(rctx->msg_buf.spu_req_pad,
+				     aead_parms.data_pad_len, 0,
+				     ctx->auth.alg, ctx->auth.mode,
+				     rctx->total_sent, stat_pad_len);
+	}
+
+	spu->spu_dump_msg_hdr(rctx->msg_buf.bcm_spu_req_hdr + BCM_HDR_LEN,
+			      spu_hdr_len);
+	dump_sg(rctx->assoc, 0, aead_parms.assoc_size);
+	packet_dump("    aead iv: ", rctx->msg_buf.iv_ctr, aead_parms.iv_len);
+	packet_log("BD:\n");
+	dump_sg(rctx->src_sg, rctx->src_skip, chunksize);
+	packet_dump("   pad: ", rctx->msg_buf.spu_req_pad, pad_len);
+
+	/*
+	 * Build mailbox message containing SPU request msg and rx buffers
+	 * to catch response message
+	 */
+	memset(mssg, 0, sizeof(*mssg));
+	mssg->type = BRCM_MESSAGE_SPU;
+	mssg->ctx = rctx;	/* Will be returned in response */
+
+	/* Create rx scatterlist to catch result */
+	rx_frag_num += rctx->dst_nents;
+	resp_len = chunksize;
+
+	/*
+	 * Always catch ICV in separate buffer. Have to for GCM/CCM because of
+	 * padding. Have to for SHA-224 and other truncated SHAs because SPU
+	 * sends entire digest back.
+	 */
+	rx_frag_num++;
+
+	if (((ctx->cipher.mode == CIPHER_MODE_GCM) ||
+	     (ctx->cipher.mode == CIPHER_MODE_CCM)) && !rctx->is_encrypt) {
+		/*
+		 * Input is ciphertxt plus ICV, but ICV not incl
+		 * in output.
+		 */
+		resp_len -= ctx->digestsize;
+		if (resp_len == 0)
+			/* no rx frags to catch output data */
+			rx_frag_num -= rctx->dst_nents;
+	}
+
+	err = spu_aead_rx_sg_create(mssg, req, rctx, rx_frag_num,
+				    aead_parms.assoc_size,
+				    aead_parms.ret_iv_len, resp_len, digestsize,
+				    stat_pad_len);
+	if (err)
+		return err;
+
+	/* Create tx scatterlist containing SPU request message */
+	tx_frag_num += rctx->src_nents;
+	tx_frag_num += assoc_nents;
+	if (aead_parms.aad_pad_len)
+		tx_frag_num++;
+	if (aead_parms.iv_len)
+		tx_frag_num++;
+	if (spu->spu_tx_status_len())
+		tx_frag_num++;
+	err = spu_aead_tx_sg_create(mssg, rctx, tx_frag_num, spu_hdr_len,
+				    rctx->assoc, aead_parms.assoc_size,
+				    assoc_nents, aead_parms.iv_len, chunksize,
+				    aead_parms.aad_pad_len, pad_len, incl_icv);
+	if (err)
+		return err;
+
+	err = mbox_send_message(iproc_priv.mbox[rctx->chan_idx], mssg);
+	if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) {
+		while ((err == -ENOBUFS) && (retry_cnt < SPU_MB_RETRY_MAX)) {
+			/*
+			 * Mailbox queue is full. Since MAY_SLEEP is set, assume
+			 * not in atomic context and we can wait and try again.
+			 */
+			retry_cnt++;
+			usleep_range(MBOX_SLEEP_MIN, MBOX_SLEEP_MAX);
+			err = mbox_send_message(iproc_priv.mbox[rctx->chan_idx],
+						mssg);
+			atomic_inc(&iproc_priv.mb_no_spc);
+		}
+	}
+	if (err < 0) {
+		atomic_inc(&iproc_priv.mb_send_fail);
+		return err;
+	}
+
+	return -EINPROGRESS;
+}
+
+/**
+ * handle_aead_resp() - Process a SPU response message for an AEAD request.
+ * @rctx:  Crypto request context
+ */
+static void handle_aead_resp(struct iproc_reqctx_s *rctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_async_request *areq = rctx->parent;
+	struct aead_request *req = container_of(areq,
+						struct aead_request, base);
+	struct iproc_ctx_s *ctx = rctx->ctx;
+	u32 payload_len;
+	unsigned int icv_offset;
+	u32 result_len;
+
+	/* See how much data was returned */
+	payload_len = spu->spu_payload_length(rctx->msg_buf.spu_resp_hdr);
+	flow_log("payload_len %u\n", payload_len);
+
+	/* only count payload */
+	atomic64_add(payload_len, &iproc_priv.bytes_in);
+
+	if (req->assoclen)
+		packet_dump("  assoc_data ", rctx->msg_buf.a.resp_aad,
+			    req->assoclen);
+
+	/*
+	 * Copy the ICV back to the destination
+	 * buffer. In decrypt case, SPU gives us back the digest, but crypto
+	 * API doesn't expect ICV in dst buffer.
+	 */
+	result_len = req->cryptlen;
+	if (rctx->is_encrypt) {
+		icv_offset = req->assoclen + rctx->total_sent;
+		packet_dump("  ICV: ", rctx->msg_buf.digest, ctx->digestsize);
+		flow_log("copying ICV to dst sg at offset %u\n", icv_offset);
+		sg_copy_part_from_buf(req->dst, rctx->msg_buf.digest,
+				      ctx->digestsize, icv_offset);
+		result_len += ctx->digestsize;
+	}
+
+	packet_log("response data:  ");
+	dump_sg(req->dst, req->assoclen, result_len);
+
+	atomic_inc(&iproc_priv.op_counts[SPU_OP_AEAD]);
+	if (ctx->cipher.alg == CIPHER_ALG_AES) {
+		if (ctx->cipher.mode == CIPHER_MODE_CCM)
+			atomic_inc(&iproc_priv.aead_cnt[AES_CCM]);
+		else if (ctx->cipher.mode == CIPHER_MODE_GCM)
+			atomic_inc(&iproc_priv.aead_cnt[AES_GCM]);
+		else
+			atomic_inc(&iproc_priv.aead_cnt[AUTHENC]);
+	} else {
+		atomic_inc(&iproc_priv.aead_cnt[AUTHENC]);
+	}
+}
+
+/**
+ * spu_chunk_cleanup() - Do cleanup after processing one chunk of a request
+ * @rctx:  request context
+ *
+ * Mailbox scatterlists are allocated for each chunk. So free them after
+ * processing each chunk.
+ */
+static void spu_chunk_cleanup(struct iproc_reqctx_s *rctx)
+{
+	/* mailbox message used to tx request */
+	struct brcm_message *mssg = &rctx->mb_mssg;
+
+	kfree(mssg->spu.src);
+	kfree(mssg->spu.dst);
+	memset(mssg, 0, sizeof(struct brcm_message));
+}
+
+/**
+ * finish_req() - Used to invoke the complete callback from the requester when
+ * a request has been handled asynchronously.
+ * @rctx:  Request context
+ * @err:   Indicates whether the request was successful or not
+ *
+ * Ensures that cleanup has been done for request
+ */
+static void finish_req(struct iproc_reqctx_s *rctx, int err)
+{
+	struct crypto_async_request *areq = rctx->parent;
+
+	flow_log("%s() err:%d\n\n", __func__, err);
+
+	/* No harm done if already called */
+	spu_chunk_cleanup(rctx);
+
+	if (areq)
+		areq->complete(areq, err);
+}
+
+/**
+ * spu_rx_callback() - Callback from mailbox framework with a SPU response.
+ * @cl:		mailbox client structure for SPU driver
+ * @msg:	mailbox message containing SPU response
+ */
+static void spu_rx_callback(struct mbox_client *cl, void *msg)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct brcm_message *mssg = msg;
+	struct iproc_reqctx_s *rctx;
+	struct iproc_ctx_s *ctx;
+	struct crypto_async_request *areq;
+	int err = 0;
+
+	rctx = mssg->ctx;
+	if (unlikely(!rctx)) {
+		/* This is fatal */
+		pr_err("%s(): no request context", __func__);
+		err = -EFAULT;
+		goto cb_finish;
+	}
+	areq = rctx->parent;
+	ctx = rctx->ctx;
+
+	/* process the SPU status */
+	err = spu->spu_status_process(rctx->msg_buf.rx_stat);
+	if (err != 0) {
+		if (err == SPU_INVALID_ICV)
+			atomic_inc(&iproc_priv.bad_icv);
+		err = -EBADMSG;
+		goto cb_finish;
+	}
+
+	/* Process the SPU response message */
+	switch (rctx->ctx->alg->type) {
+	case CRYPTO_ALG_TYPE_ABLKCIPHER:
+		handle_ablkcipher_resp(rctx);
+		break;
+	case CRYPTO_ALG_TYPE_AHASH:
+		handle_ahash_resp(rctx);
+		break;
+	case CRYPTO_ALG_TYPE_AEAD:
+		handle_aead_resp(rctx);
+		break;
+	default:
+		err = -EINVAL;
+		goto cb_finish;
+	}
+
+	/*
+	 * If this response does not complete the request, then send the next
+	 * request chunk.
+	 */
+	if (rctx->total_sent < rctx->total_todo) {
+		/* Deallocate anything specific to previous chunk */
+		spu_chunk_cleanup(rctx);
+
+		switch (rctx->ctx->alg->type) {
+		case CRYPTO_ALG_TYPE_ABLKCIPHER:
+			err = handle_ablkcipher_req(rctx);
+			break;
+		case CRYPTO_ALG_TYPE_AHASH:
+			err = handle_ahash_req(rctx);
+			if (err == -EAGAIN)
+				/*
+				 * we saved data in hash carry, but tell crypto
+				 * API we successfully completed request.
+				 */
+				err = 0;
+			break;
+		case CRYPTO_ALG_TYPE_AEAD:
+			err = handle_aead_req(rctx);
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (err == -EINPROGRESS)
+			/* Successfully submitted request for next chunk */
+			return;
+	}
+
+cb_finish:
+	finish_req(rctx, err);
+}
+
+/* ==================== Kernel Cryptographic API ==================== */
+
+/**
+ * ablkcipher_enqueue() - Handle ablkcipher encrypt or decrypt request.
+ * @req:	Crypto API request
+ * @encrypt:	true if encrypting; false if decrypting
+ *
+ * Return: -EINPROGRESS if request accepted and result will be returned
+ *			asynchronously
+ *	   < 0 if an error
+ */
+static int ablkcipher_enqueue(struct ablkcipher_request *req, bool encrypt)
+{
+	struct iproc_reqctx_s *rctx = ablkcipher_request_ctx(req);
+	struct iproc_ctx_s *ctx =
+	    crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	flow_log("%s() enc:%u\n", __func__, encrypt);
+
+	rctx->gfp = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+	rctx->parent = &req->base;
+	rctx->is_encrypt = encrypt;
+	rctx->bd_suppress = false;
+	rctx->total_todo = req->nbytes;
+	rctx->src_sent = 0;
+	rctx->total_sent = 0;
+	rctx->total_received = 0;
+	rctx->ctx = ctx;
+
+	/* Initialize current position in src and dst scatterlists */
+	rctx->src_sg = req->src;
+	rctx->src_nents = 0;
+	rctx->src_skip = 0;
+	rctx->dst_sg = req->dst;
+	rctx->dst_nents = 0;
+	rctx->dst_skip = 0;
+
+	if (ctx->cipher.mode == CIPHER_MODE_CBC ||
+	    ctx->cipher.mode == CIPHER_MODE_CTR ||
+	    ctx->cipher.mode == CIPHER_MODE_OFB ||
+	    ctx->cipher.mode == CIPHER_MODE_XTS ||
+	    ctx->cipher.mode == CIPHER_MODE_GCM ||
+	    ctx->cipher.mode == CIPHER_MODE_CCM) {
+		rctx->iv_ctr_len =
+		    crypto_ablkcipher_ivsize(crypto_ablkcipher_reqtfm(req));
+		memcpy(rctx->msg_buf.iv_ctr, req->info, rctx->iv_ctr_len);
+	} else {
+		rctx->iv_ctr_len = 0;
+	}
+
+	/* Choose a SPU to process this request */
+	rctx->chan_idx = select_channel();
+	err = handle_ablkcipher_req(rctx);
+	if (err != -EINPROGRESS)
+		/* synchronous result */
+		spu_chunk_cleanup(rctx);
+
+	return err;
+}
+
+static int des_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+		      unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_ablkcipher_ctx(cipher);
+	u32 tmp[DES_EXPKEY_WORDS];
+
+	if (keylen == DES_KEY_SIZE) {
+		if (des_ekey(tmp, key) == 0) {
+			if (crypto_ablkcipher_get_flags(cipher) &
+			    CRYPTO_TFM_REQ_WEAK_KEY) {
+				u32 flags = CRYPTO_TFM_RES_WEAK_KEY;
+
+				crypto_ablkcipher_set_flags(cipher, flags);
+				return -EINVAL;
+			}
+		}
+
+		ctx->cipher_type = CIPHER_TYPE_DES;
+	} else {
+		crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int threedes_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+			   unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_ablkcipher_ctx(cipher);
+
+	if (keylen == (DES_KEY_SIZE * 3)) {
+		const u32 *K = (const u32 *)key;
+		u32 flags = CRYPTO_TFM_RES_BAD_KEY_SCHED;
+
+		if (!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
+		    !((K[2] ^ K[4]) | (K[3] ^ K[5]))) {
+			crypto_ablkcipher_set_flags(cipher, flags);
+			return -EINVAL;
+		}
+
+		ctx->cipher_type = CIPHER_TYPE_3DES;
+	} else {
+		crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int aes_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+		      unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_ablkcipher_ctx(cipher);
+
+	if (ctx->cipher.mode == CIPHER_MODE_XTS)
+		/* XTS includes two keys of equal length */
+		keylen = keylen / 2;
+
+	switch (keylen) {
+	case AES_KEYSIZE_128:
+		ctx->cipher_type = CIPHER_TYPE_AES128;
+		break;
+	case AES_KEYSIZE_192:
+		ctx->cipher_type = CIPHER_TYPE_AES192;
+		break;
+	case AES_KEYSIZE_256:
+		ctx->cipher_type = CIPHER_TYPE_AES256;
+		break;
+	default:
+		crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	WARN_ON((ctx->max_payload != SPU_MAX_PAYLOAD_INF) &&
+		((ctx->max_payload % AES_BLOCK_SIZE) != 0));
+	return 0;
+}
+
+static int rc4_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+		      unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_ablkcipher_ctx(cipher);
+	int i;
+
+	ctx->enckeylen = ARC4_MAX_KEY_SIZE + ARC4_STATE_SIZE;
+
+	ctx->enckey[0] = 0x00;	/* 0x00 */
+	ctx->enckey[1] = 0x00;	/* i    */
+	ctx->enckey[2] = 0x00;	/* 0x00 */
+	ctx->enckey[3] = 0x00;	/* j    */
+	for (i = 0; i < ARC4_MAX_KEY_SIZE; i++)
+		ctx->enckey[i + ARC4_STATE_SIZE] = key[i % keylen];
+
+	ctx->cipher_type = CIPHER_TYPE_INIT;
+
+	return 0;
+}
+
+static int ablkcipher_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+			     unsigned int keylen)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct iproc_ctx_s *ctx = crypto_ablkcipher_ctx(cipher);
+	struct spu_cipher_parms cipher_parms;
+	u32 alloc_len = 0;
+	int err;
+
+	flow_log("ablkcipher_setkey() keylen: %d\n", keylen);
+	flow_dump("  key: ", key, keylen);
+
+	switch (ctx->cipher.alg) {
+	case CIPHER_ALG_DES:
+		err = des_setkey(cipher, key, keylen);
+		break;
+	case CIPHER_ALG_3DES:
+		err = threedes_setkey(cipher, key, keylen);
+		break;
+	case CIPHER_ALG_AES:
+		err = aes_setkey(cipher, key, keylen);
+		break;
+	case CIPHER_ALG_RC4:
+		err = rc4_setkey(cipher, key, keylen);
+		break;
+	default:
+		pr_err("%s() Error: unknown cipher alg\n", __func__);
+		err = -EINVAL;
+	}
+	if (err)
+		return err;
+
+	/* RC4 already populated ctx->enkey */
+	if (ctx->cipher.alg != CIPHER_ALG_RC4) {
+		memcpy(ctx->enckey, key, keylen);
+		ctx->enckeylen = keylen;
+	}
+	/* SPU needs XTS keys in the reverse order the crypto API presents */
+	if ((ctx->cipher.alg == CIPHER_ALG_AES) &&
+	    (ctx->cipher.mode == CIPHER_MODE_XTS)) {
+		unsigned int xts_keylen = keylen / 2;
+
+		memcpy(ctx->enckey, key + xts_keylen, xts_keylen);
+		memcpy(ctx->enckey + xts_keylen, key, xts_keylen);
+	}
+
+	if (spu->spu_type == SPU_TYPE_SPUM)
+		alloc_len = BCM_HDR_LEN + SPU_HEADER_ALLOC_LEN;
+	else if (spu->spu_type == SPU_TYPE_SPU2)
+		alloc_len = BCM_HDR_LEN + SPU2_HEADER_ALLOC_LEN;
+	memset(ctx->bcm_spu_req_hdr, 0, alloc_len);
+	cipher_parms.iv_buf = NULL;
+	cipher_parms.iv_len = crypto_ablkcipher_ivsize(cipher);
+	flow_log("%s: iv_len %u\n", __func__, cipher_parms.iv_len);
+
+	cipher_parms.alg = ctx->cipher.alg;
+	cipher_parms.mode = ctx->cipher.mode;
+	cipher_parms.type = ctx->cipher_type;
+	cipher_parms.key_buf = ctx->enckey;
+	cipher_parms.key_len = ctx->enckeylen;
+
+	/* Prepend SPU request message with BCM header */
+	memcpy(ctx->bcm_spu_req_hdr, BCMHEADER, BCM_HDR_LEN);
+	ctx->spu_req_hdr_len =
+	    spu->spu_cipher_req_init(ctx->bcm_spu_req_hdr + BCM_HDR_LEN,
+				     &cipher_parms);
+
+	ctx->spu_resp_hdr_len = spu->spu_response_hdr_len(ctx->authkeylen,
+							  ctx->enckeylen,
+							  false);
+
+	atomic_inc(&iproc_priv.setkey_cnt[SPU_OP_CIPHER]);
+
+	return 0;
+}
+
+static int ablkcipher_encrypt(struct ablkcipher_request *req)
+{
+	flow_log("ablkcipher_encrypt() nbytes:%u\n", req->nbytes);
+
+	return ablkcipher_enqueue(req, true);
+}
+
+static int ablkcipher_decrypt(struct ablkcipher_request *req)
+{
+	flow_log("ablkcipher_decrypt() nbytes:%u\n", req->nbytes);
+	return ablkcipher_enqueue(req, false);
+}
+
+static int ahash_enqueue(struct ahash_request *req)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	int err = 0;
+	const char *alg_name;
+
+	flow_log("ahash_enqueue() nbytes:%u\n", req->nbytes);
+
+	rctx->gfp = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+	rctx->parent = &req->base;
+	rctx->ctx = ctx;
+	rctx->bd_suppress = true;
+	memset(&rctx->mb_mssg, 0, sizeof(struct brcm_message));
+
+	/* Initialize position in src scatterlist */
+	rctx->src_sg = req->src;
+	rctx->src_skip = 0;
+	rctx->src_nents = 0;
+	rctx->dst_sg = NULL;
+	rctx->dst_skip = 0;
+	rctx->dst_nents = 0;
+
+	/* SPU2 hardware does not compute hash of zero length data */
+	if ((rctx->is_final == 1) && (rctx->total_todo == 0) &&
+	    (iproc_priv.spu.spu_type == SPU_TYPE_SPU2)) {
+		alg_name = crypto_tfm_alg_name(crypto_ahash_tfm(tfm));
+		flow_log("Doing %sfinal %s zero-len hash request in software\n",
+			 rctx->is_final ? "" : "non-", alg_name);
+		err = do_shash((unsigned char *)alg_name, req->result,
+			       NULL, 0, NULL, 0, ctx->authkey,
+			       ctx->authkeylen);
+		if (err < 0)
+			flow_log("Hash request failed with error %d\n", err);
+		return err;
+	}
+	/* Choose a SPU to process this request */
+	rctx->chan_idx = select_channel();
+
+	err = handle_ahash_req(rctx);
+	if (err != -EINPROGRESS)
+		/* synchronous result */
+		spu_chunk_cleanup(rctx);
+
+	if (err == -EAGAIN)
+		/*
+		 * we saved data in hash carry, but tell crypto API
+		 * we successfully completed request.
+		 */
+		err = 0;
+
+	return err;
+}
+
+static int __ahash_init(struct ahash_request *req)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+
+	flow_log("%s()\n", __func__);
+
+	/* Initialize the context */
+	rctx->hash_carry_len = 0;
+	rctx->is_final = 0;
+
+	rctx->total_todo = 0;
+	rctx->src_sent = 0;
+	rctx->total_sent = 0;
+	rctx->total_received = 0;
+
+	ctx->digestsize = crypto_ahash_digestsize(tfm);
+	/* If we add a hash whose digest is larger, catch it here. */
+	WARN_ON(ctx->digestsize > MAX_DIGEST_SIZE);
+
+	rctx->is_sw_hmac = false;
+
+	ctx->spu_resp_hdr_len = spu->spu_response_hdr_len(ctx->authkeylen, 0,
+							  true);
+
+	return 0;
+}
+
+/**
+ * spu_no_incr_hash() - Determine whether incremental hashing is supported.
+ * @ctx:  Crypto session context
+ *
+ * SPU-2 does not support incremental hashing (we'll have to revisit and
+ * condition based on chip revision or device tree entry if future versions do
+ * support incremental hash)
+ *
+ * SPU-M also doesn't support incremental hashing of AES-XCBC
+ *
+ * Return: true if incremental hashing is not supported
+ *         false otherwise
+ */
+bool spu_no_incr_hash(struct iproc_ctx_s *ctx)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+
+	if (spu->spu_type == SPU_TYPE_SPU2)
+		return true;
+
+	if ((ctx->auth.alg == HASH_ALG_AES) &&
+	    (ctx->auth.mode == HASH_MODE_XCBC))
+		return true;
+
+	/* Otherwise, incremental hashing is supported */
+	return false;
+}
+
+static int ahash_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	const char *alg_name;
+	struct crypto_shash *hash;
+	int ret;
+	gfp_t gfp;
+
+	if (spu_no_incr_hash(ctx)) {
+		/*
+		 * If we get an incremental hashing request and it's not
+		 * supported by the hardware, we need to handle it in software
+		 * by calling synchronous hash functions.
+		 */
+		alg_name = crypto_tfm_alg_name(crypto_ahash_tfm(tfm));
+		hash = crypto_alloc_shash(alg_name, 0, 0);
+		if (IS_ERR(hash)) {
+			ret = PTR_ERR(hash);
+			goto err;
+		}
+
+		gfp = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+		ctx->shash = kmalloc(sizeof(*ctx->shash) +
+				     crypto_shash_descsize(hash), gfp);
+		if (!ctx->shash) {
+			ret = -ENOMEM;
+			goto err_hash;
+		}
+		ctx->shash->tfm = hash;
+		ctx->shash->flags = 0;
+
+		/* Set the key using data we already have from setkey */
+		if (ctx->authkeylen > 0) {
+			ret = crypto_shash_setkey(hash, ctx->authkey,
+						  ctx->authkeylen);
+			if (ret)
+				goto err_shash;
+		}
+
+		/* Initialize hash w/ this key and other params */
+		ret = crypto_shash_init(ctx->shash);
+		if (ret)
+			goto err_shash;
+	} else {
+		/* Otherwise call the internal function which uses SPU hw */
+		ret = __ahash_init(req);
+	}
+
+	return ret;
+
+err_shash:
+	kfree(ctx->shash);
+err_hash:
+	crypto_free_shash(hash);
+err:
+	return ret;
+}
+
+static int __ahash_update(struct ahash_request *req)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+
+	flow_log("ahash_update() nbytes:%u\n", req->nbytes);
+
+	if (!req->nbytes)
+		return 0;
+	rctx->total_todo += req->nbytes;
+	rctx->src_sent = 0;
+
+	return ahash_enqueue(req);
+}
+
+static int ahash_update(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	u8 *tmpbuf;
+	int ret;
+	int nents;
+	gfp_t gfp;
+
+	if (spu_no_incr_hash(ctx)) {
+		/*
+		 * If we get an incremental hashing request and it's not
+		 * supported by the hardware, we need to handle it in software
+		 * by calling synchronous hash functions.
+		 */
+		if (req->src)
+			nents = sg_nents(req->src);
+		else
+			return -EINVAL;
+
+		/* Copy data from req scatterlist to tmp buffer */
+		gfp = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+		tmpbuf = kmalloc(req->nbytes, gfp);
+		if (!tmpbuf)
+			return -ENOMEM;
+
+		if (sg_copy_to_buffer(req->src, nents, tmpbuf, req->nbytes) !=
+				req->nbytes) {
+			kfree(tmpbuf);
+			return -EINVAL;
+		}
+
+		/* Call synchronous update */
+		ret = crypto_shash_update(ctx->shash, tmpbuf, req->nbytes);
+		kfree(tmpbuf);
+	} else {
+		/* Otherwise call the internal function which uses SPU hw */
+		ret = __ahash_update(req);
+	}
+
+	return ret;
+}
+
+static int __ahash_final(struct ahash_request *req)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+
+	flow_log("ahash_final() nbytes:%u\n", req->nbytes);
+
+	rctx->is_final = 1;
+
+	return ahash_enqueue(req);
+}
+
+static int ahash_final(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	int ret;
+
+	if (spu_no_incr_hash(ctx)) {
+		/*
+		 * If we get an incremental hashing request and it's not
+		 * supported by the hardware, we need to handle it in software
+		 * by calling synchronous hash functions.
+		 */
+		ret = crypto_shash_final(ctx->shash, req->result);
+
+		/* Done with hash, can deallocate it now */
+		crypto_free_shash(ctx->shash->tfm);
+		kfree(ctx->shash);
+
+	} else {
+		/* Otherwise call the internal function which uses SPU hw */
+		ret = __ahash_final(req);
+	}
+
+	return ret;
+}
+
+static int __ahash_finup(struct ahash_request *req)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+
+	flow_log("ahash_finup() nbytes:%u\n", req->nbytes);
+
+	rctx->total_todo += req->nbytes;
+	rctx->src_sent = 0;
+	rctx->is_final = 1;
+
+	return ahash_enqueue(req);
+}
+
+static int ahash_finup(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	u8 *tmpbuf;
+	int ret;
+	int nents;
+	gfp_t gfp;
+
+	if (spu_no_incr_hash(ctx)) {
+		/*
+		 * If we get an incremental hashing request and it's not
+		 * supported by the hardware, we need to handle it in software
+		 * by calling synchronous hash functions.
+		 */
+		if (req->src) {
+			nents = sg_nents(req->src);
+		} else {
+			ret = -EINVAL;
+			goto ahash_finup_exit;
+		}
+
+		/* Copy data from req scatterlist to tmp buffer */
+		gfp = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+		tmpbuf = kmalloc(req->nbytes, gfp);
+		if (!tmpbuf) {
+			ret = -ENOMEM;
+			goto ahash_finup_exit;
+		}
+
+		if (sg_copy_to_buffer(req->src, nents, tmpbuf, req->nbytes) !=
+				req->nbytes) {
+			ret = -EINVAL;
+			goto ahash_finup_free;
+		}
+
+		/* Call synchronous update */
+		ret = crypto_shash_finup(ctx->shash, tmpbuf, req->nbytes,
+					 req->result);
+	} else {
+		/* Otherwise call the internal function which uses SPU hw */
+		return __ahash_finup(req);
+	}
+ahash_finup_free:
+	kfree(tmpbuf);
+
+ahash_finup_exit:
+	/* Done with hash, can deallocate it now */
+	crypto_free_shash(ctx->shash->tfm);
+	kfree(ctx->shash);
+	return ret;
+}
+
+static int ahash_digest(struct ahash_request *req)
+{
+	int err = 0;
+
+	flow_log("ahash_digest() nbytes:%u\n", req->nbytes);
+
+	/* whole thing at once */
+	err = __ahash_init(req);
+	if (!err)
+		err = __ahash_finup(req);
+
+	return err;
+}
+
+static int ahash_setkey(struct crypto_ahash *ahash, const u8 *key,
+			unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(ahash);
+
+	flow_log("%s() ahash:%p key:%p keylen:%u\n",
+		 __func__, ahash, key, keylen);
+	flow_dump("  key: ", key, keylen);
+
+	if (ctx->auth.alg == HASH_ALG_AES) {
+		switch (keylen) {
+		case AES_KEYSIZE_128:
+			ctx->cipher_type = CIPHER_TYPE_AES128;
+			break;
+		case AES_KEYSIZE_192:
+			ctx->cipher_type = CIPHER_TYPE_AES192;
+			break;
+		case AES_KEYSIZE_256:
+			ctx->cipher_type = CIPHER_TYPE_AES256;
+			break;
+		default:
+			pr_err("%s() Error: Invalid key length\n", __func__);
+			return -EINVAL;
+		}
+	} else {
+		pr_err("%s() Error: unknown hash alg\n", __func__);
+		return -EINVAL;
+	}
+	memcpy(ctx->authkey, key, keylen);
+	ctx->authkeylen = keylen;
+
+	return 0;
+}
+
+static int ahash_export(struct ahash_request *req, void *out)
+{
+	const struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+	struct spu_hash_export_s *spu_exp = (struct spu_hash_export_s *)out;
+
+	spu_exp->total_todo = rctx->total_todo;
+	spu_exp->total_sent = rctx->total_sent;
+	spu_exp->is_sw_hmac = rctx->is_sw_hmac;
+	memcpy(spu_exp->hash_carry, rctx->hash_carry, sizeof(rctx->hash_carry));
+	spu_exp->hash_carry_len = rctx->hash_carry_len;
+	memcpy(spu_exp->incr_hash, rctx->incr_hash, sizeof(rctx->incr_hash));
+
+	return 0;
+}
+
+static int ahash_import(struct ahash_request *req, const void *in)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+	struct spu_hash_export_s *spu_exp = (struct spu_hash_export_s *)in;
+
+	rctx->total_todo = spu_exp->total_todo;
+	rctx->total_sent = spu_exp->total_sent;
+	rctx->is_sw_hmac = spu_exp->is_sw_hmac;
+	memcpy(rctx->hash_carry, spu_exp->hash_carry, sizeof(rctx->hash_carry));
+	rctx->hash_carry_len = spu_exp->hash_carry_len;
+	memcpy(rctx->incr_hash, spu_exp->incr_hash, sizeof(rctx->incr_hash));
+
+	return 0;
+}
+
+static int ahash_hmac_setkey(struct crypto_ahash *ahash, const u8 *key,
+			     unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(ahash);
+	unsigned int blocksize =
+		crypto_tfm_alg_blocksize(crypto_ahash_tfm(ahash));
+	unsigned int digestsize = crypto_ahash_digestsize(ahash);
+	unsigned int index;
+	int rc;
+
+	flow_log("%s() ahash:%p key:%p keylen:%u blksz:%u digestsz:%u\n",
+		 __func__, ahash, key, keylen, blocksize, digestsize);
+	flow_dump("  key: ", key, keylen);
+
+	if (keylen > blocksize) {
+		switch (ctx->auth.alg) {
+		case HASH_ALG_MD5:
+			rc = do_shash("md5", ctx->authkey, key, keylen, NULL,
+				      0, NULL, 0);
+			break;
+		case HASH_ALG_SHA1:
+			rc = do_shash("sha1", ctx->authkey, key, keylen, NULL,
+				      0, NULL, 0);
+			break;
+		case HASH_ALG_SHA224:
+			rc = do_shash("sha224", ctx->authkey, key, keylen, NULL,
+				      0, NULL, 0);
+			break;
+		case HASH_ALG_SHA256:
+			rc = do_shash("sha256", ctx->authkey, key, keylen, NULL,
+				      0, NULL, 0);
+			break;
+		case HASH_ALG_SHA384:
+			rc = do_shash("sha384", ctx->authkey, key, keylen, NULL,
+				      0, NULL, 0);
+			break;
+		case HASH_ALG_SHA512:
+			rc = do_shash("sha512", ctx->authkey, key, keylen, NULL,
+				      0, NULL, 0);
+			break;
+		case HASH_ALG_SHA3_224:
+			rc = do_shash("sha3-224", ctx->authkey, key, keylen,
+				      NULL, 0, NULL, 0);
+			break;
+		case HASH_ALG_SHA3_256:
+			rc = do_shash("sha3-256", ctx->authkey, key, keylen,
+				      NULL, 0, NULL, 0);
+			break;
+		case HASH_ALG_SHA3_384:
+			rc = do_shash("sha3-384", ctx->authkey, key, keylen,
+				      NULL, 0, NULL, 0);
+			break;
+		case HASH_ALG_SHA3_512:
+			rc = do_shash("sha3-512", ctx->authkey, key, keylen,
+				      NULL, 0, NULL, 0);
+			break;
+		default:
+			pr_err("%s() Error: unknown hash alg\n", __func__);
+			return -EINVAL;
+		}
+		if (rc < 0) {
+			pr_err("%s() Error %d computing shash for %s\n",
+			       __func__, rc, hash_alg_name[ctx->auth.alg]);
+			return rc;
+		}
+		ctx->authkeylen = digestsize;
+
+		flow_log("  keylen > digestsize... hashed\n");
+		flow_dump("  newkey: ", ctx->authkey, ctx->authkeylen);
+	} else {
+		memcpy(ctx->authkey, key, keylen);
+		ctx->authkeylen = keylen;
+	}
+
+	/*
+	 * Full HMAC operation in SPUM is not verified,
+	 * So keeping the generation of IPAD, OPAD and
+	 * outer hashing in software.
+	 */
+	if (iproc_priv.spu.spu_type == SPU_TYPE_SPUM) {
+		memcpy(ctx->ipad, ctx->authkey, ctx->authkeylen);
+		memset(ctx->ipad + ctx->authkeylen, 0,
+		       blocksize - ctx->authkeylen);
+		ctx->authkeylen = 0;
+		memcpy(ctx->opad, ctx->ipad, blocksize);
+
+		for (index = 0; index < blocksize; index++) {
+			ctx->ipad[index] ^= 0x36;
+			ctx->opad[index] ^= 0x5c;
+		}
+
+		flow_dump("  ipad: ", ctx->ipad, blocksize);
+		flow_dump("  opad: ", ctx->opad, blocksize);
+	}
+	ctx->digestsize = digestsize;
+	atomic_inc(&iproc_priv.setkey_cnt[SPU_OP_HMAC]);
+
+	return 0;
+}
+
+static int ahash_hmac_init(struct ahash_request *req)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	unsigned int blocksize =
+			crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm));
+
+	flow_log("ahash_hmac_init()\n");
+
+	/* init the context as a hash */
+	ahash_init(req);
+
+	if (!spu_no_incr_hash(ctx)) {
+		/* SPU-M can do incr hashing but needs sw for outer HMAC */
+		rctx->is_sw_hmac = true;
+		ctx->auth.mode = HASH_MODE_HASH;
+		/* start with a prepended ipad */
+		memcpy(rctx->hash_carry, ctx->ipad, blocksize);
+		rctx->hash_carry_len = blocksize;
+		rctx->total_todo += blocksize;
+	}
+
+	return 0;
+}
+
+static int ahash_hmac_update(struct ahash_request *req)
+{
+	flow_log("ahash_hmac_update() nbytes:%u\n", req->nbytes);
+
+	if (!req->nbytes)
+		return 0;
+
+	return ahash_update(req);
+}
+
+static int ahash_hmac_final(struct ahash_request *req)
+{
+	flow_log("ahash_hmac_final() nbytes:%u\n", req->nbytes);
+
+	return ahash_final(req);
+}
+
+static int ahash_hmac_finup(struct ahash_request *req)
+{
+	flow_log("ahash_hmac_finupl() nbytes:%u\n", req->nbytes);
+
+	return ahash_finup(req);
+}
+
+static int ahash_hmac_digest(struct ahash_request *req)
+{
+	struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
+	unsigned int blocksize =
+			crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm));
+
+	flow_log("ahash_hmac_digest() nbytes:%u\n", req->nbytes);
+
+	/* Perform initialization and then call finup */
+	__ahash_init(req);
+
+	if (iproc_priv.spu.spu_type == SPU_TYPE_SPU2) {
+		/*
+		 * SPU2 supports full HMAC implementation in the
+		 * hardware, need not to generate IPAD, OPAD and
+		 * outer hash in software.
+		 * Only for hash key len > hash block size, SPU2
+		 * expects to perform hashing on the key, shorten
+		 * it to digest size and feed it as hash key.
+		 */
+		rctx->is_sw_hmac = false;
+		ctx->auth.mode = HASH_MODE_HMAC;
+	} else {
+		rctx->is_sw_hmac = true;
+		ctx->auth.mode = HASH_MODE_HASH;
+		/* start with a prepended ipad */
+		memcpy(rctx->hash_carry, ctx->ipad, blocksize);
+		rctx->hash_carry_len = blocksize;
+		rctx->total_todo += blocksize;
+	}
+
+	return __ahash_finup(req);
+}
+
+/* aead helpers */
+
+static int aead_need_fallback(struct aead_request *req)
+{
+	struct iproc_reqctx_s *rctx = aead_request_ctx(req);
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(aead);
+	u32 payload_len;
+
+	/*
+	 * SPU hardware cannot handle the AES-GCM/CCM case where plaintext
+	 * and AAD are both 0 bytes long. So use fallback in this case.
+	 */
+	if (((ctx->cipher.mode == CIPHER_MODE_GCM) ||
+	     (ctx->cipher.mode == CIPHER_MODE_CCM)) &&
+	    (req->assoclen == 0)) {
+		if ((rctx->is_encrypt && (req->cryptlen == 0)) ||
+		    (!rctx->is_encrypt && (req->cryptlen == ctx->digestsize))) {
+			flow_log("AES GCM/CCM needs fallback for 0 len req\n");
+			return 1;
+		}
+	}
+
+	/* SPU-M hardware only supports CCM digest size of 8, 12, or 16 bytes */
+	if ((ctx->cipher.mode == CIPHER_MODE_CCM) &&
+	    (spu->spu_type == SPU_TYPE_SPUM) &&
+	    (ctx->digestsize != 8) && (ctx->digestsize != 12) &&
+	    (ctx->digestsize != 16)) {
+		flow_log("%s() AES CCM needs fallbck for digest size %d\n",
+			 __func__, ctx->digestsize);
+		return 1;
+	}
+
+	/*
+	 * SPU-M on NSP has an issue where AES-CCM hash is not correct
+	 * when AAD size is 0
+	 */
+	if ((ctx->cipher.mode == CIPHER_MODE_CCM) &&
+	    (spu->spu_subtype == SPU_SUBTYPE_SPUM_NSP) &&
+	    (req->assoclen == 0)) {
+		flow_log("%s() AES_CCM needs fallback for 0 len AAD on NSP\n",
+			 __func__);
+		return 1;
+	}
+
+	payload_len = req->cryptlen;
+	if (spu->spu_type == SPU_TYPE_SPUM)
+		payload_len += req->assoclen;
+
+	flow_log("%s() payload len: %u\n", __func__, payload_len);
+
+	if (ctx->max_payload == SPU_MAX_PAYLOAD_INF)
+		return 0;
+	else
+		return payload_len > ctx->max_payload;
+}
+
+static void aead_complete(struct crypto_async_request *areq, int err)
+{
+	struct aead_request *req =
+	    container_of(areq, struct aead_request, base);
+	struct iproc_reqctx_s *rctx = aead_request_ctx(req);
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+
+	flow_log("%s() err:%d\n", __func__, err);
+
+	areq->tfm = crypto_aead_tfm(aead);
+
+	areq->complete = rctx->old_complete;
+	areq->data = rctx->old_data;
+
+	areq->complete(areq, err);
+}
+
+static int aead_do_fallback(struct aead_request *req, bool is_encrypt)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+	struct iproc_reqctx_s *rctx = aead_request_ctx(req);
+	struct iproc_ctx_s *ctx = crypto_tfm_ctx(tfm);
+	int err;
+	u32 req_flags;
+
+	flow_log("%s() enc:%u\n", __func__, is_encrypt);
+
+	if (ctx->fallback_cipher) {
+		/* Store the cipher tfm and then use the fallback tfm */
+		rctx->old_tfm = tfm;
+		aead_request_set_tfm(req, ctx->fallback_cipher);
+		/*
+		 * Save the callback and chain ourselves in, so we can restore
+		 * the tfm
+		 */
+		rctx->old_complete = req->base.complete;
+		rctx->old_data = req->base.data;
+		req_flags = aead_request_flags(req);
+		aead_request_set_callback(req, req_flags, aead_complete, req);
+		err = is_encrypt ? crypto_aead_encrypt(req) :
+		    crypto_aead_decrypt(req);
+
+		if (err == 0) {
+			/*
+			 * fallback was synchronous (did not return
+			 * -EINPROGRESS). So restore request state here.
+			 */
+			aead_request_set_callback(req, req_flags,
+						  rctx->old_complete, req);
+			req->base.data = rctx->old_data;
+			aead_request_set_tfm(req, aead);
+			flow_log("%s() fallback completed successfully\n\n",
+				 __func__);
+		}
+	} else {
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int aead_enqueue(struct aead_request *req, bool is_encrypt)
+{
+	struct iproc_reqctx_s *rctx = aead_request_ctx(req);
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(aead);
+	int err;
+
+	flow_log("%s() enc:%u\n", __func__, is_encrypt);
+
+	if (req->assoclen > MAX_ASSOC_SIZE) {
+		pr_err
+		    ("%s() Error: associated data too long. (%u > %u bytes)\n",
+		     __func__, req->assoclen, MAX_ASSOC_SIZE);
+		return -EINVAL;
+	}
+
+	rctx->gfp = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
+		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
+	rctx->parent = &req->base;
+	rctx->is_encrypt = is_encrypt;
+	rctx->bd_suppress = false;
+	rctx->total_todo = req->cryptlen;
+	rctx->src_sent = 0;
+	rctx->total_sent = 0;
+	rctx->total_received = 0;
+	rctx->is_sw_hmac = false;
+	rctx->ctx = ctx;
+	memset(&rctx->mb_mssg, 0, sizeof(struct brcm_message));
+
+	/* assoc data is at start of src sg */
+	rctx->assoc = req->src;
+
+	/*
+	 * Init current position in src scatterlist to be after assoc data.
+	 * src_skip set to buffer offset where data begins. (Assoc data could
+	 * end in the middle of a buffer.)
+	 */
+	if (spu_sg_at_offset(req->src, req->assoclen, &rctx->src_sg,
+			     &rctx->src_skip) < 0) {
+		pr_err("%s() Error: Unable to find start of src data\n",
+		       __func__);
+		return -EINVAL;
+	}
+
+	rctx->src_nents = 0;
+	rctx->dst_nents = 0;
+	if (req->dst == req->src) {
+		rctx->dst_sg = rctx->src_sg;
+		rctx->dst_skip = rctx->src_skip;
+	} else {
+		/*
+		 * Expect req->dst to have room for assoc data followed by
+		 * output data and ICV, if encrypt. So initialize dst_sg
+		 * to point beyond assoc len offset.
+		 */
+		if (spu_sg_at_offset(req->dst, req->assoclen, &rctx->dst_sg,
+				     &rctx->dst_skip) < 0) {
+			pr_err("%s() Error: Unable to find start of dst data\n",
+			       __func__);
+			return -EINVAL;
+		}
+	}
+
+	if (ctx->cipher.mode == CIPHER_MODE_CBC ||
+	    ctx->cipher.mode == CIPHER_MODE_CTR ||
+	    ctx->cipher.mode == CIPHER_MODE_OFB ||
+	    ctx->cipher.mode == CIPHER_MODE_XTS ||
+	    ctx->cipher.mode == CIPHER_MODE_GCM) {
+		rctx->iv_ctr_len =
+			ctx->salt_len +
+			crypto_aead_ivsize(crypto_aead_reqtfm(req));
+	} else if (ctx->cipher.mode == CIPHER_MODE_CCM) {
+		rctx->iv_ctr_len = CCM_AES_IV_SIZE;
+	} else {
+		rctx->iv_ctr_len = 0;
+	}
+
+	rctx->hash_carry_len = 0;
+
+	flow_log("  src sg: %p\n", req->src);
+	flow_log("  rctx->src_sg: %p, src_skip %u\n",
+		 rctx->src_sg, rctx->src_skip);
+	flow_log("  assoc:  %p, assoclen %u\n", rctx->assoc, req->assoclen);
+	flow_log("  dst sg: %p\n", req->dst);
+	flow_log("  rctx->dst_sg: %p, dst_skip %u\n",
+		 rctx->dst_sg, rctx->dst_skip);
+	flow_log("  iv_ctr_len:%u\n", rctx->iv_ctr_len);
+	flow_dump("  iv: ", req->iv, rctx->iv_ctr_len);
+	flow_log("  authkeylen:%u\n", ctx->authkeylen);
+	flow_log("  is_esp: %s\n", ctx->is_esp ? "yes" : "no");
+
+	if (ctx->max_payload == SPU_MAX_PAYLOAD_INF)
+		flow_log("  max_payload infinite");
+	else
+		flow_log("  max_payload: %u\n", ctx->max_payload);
+
+	if (unlikely(aead_need_fallback(req)))
+		return aead_do_fallback(req, is_encrypt);
+
+	/*
+	 * Do memory allocations for request after fallback check, because if we
+	 * do fallback, we won't call finish_req() to dealloc.
+	 */
+	if (rctx->iv_ctr_len) {
+		if (ctx->salt_len)
+			memcpy(rctx->msg_buf.iv_ctr + ctx->salt_offset,
+			       ctx->salt, ctx->salt_len);
+		memcpy(rctx->msg_buf.iv_ctr + ctx->salt_offset + ctx->salt_len,
+		       req->iv,
+		       rctx->iv_ctr_len - ctx->salt_len - ctx->salt_offset);
+	}
+
+	rctx->chan_idx = select_channel();
+	err = handle_aead_req(rctx);
+	if (err != -EINPROGRESS)
+		/* synchronous result */
+		spu_chunk_cleanup(rctx);
+
+	return err;
+}
+
+static int aead_authenc_setkey(struct crypto_aead *cipher,
+			       const u8 *key, unsigned int keylen)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(cipher);
+	struct crypto_tfm *tfm = crypto_aead_tfm(cipher);
+	struct rtattr *rta = (void *)key;
+	struct crypto_authenc_key_param *param;
+	const u8 *origkey = key;
+	const unsigned int origkeylen = keylen;
+
+	int ret = 0;
+
+	flow_log("%s() aead:%p key:%p keylen:%u\n", __func__, cipher, key,
+		 keylen);
+	flow_dump("  key: ", key, keylen);
+
+	if (!RTA_OK(rta, keylen))
+		goto badkey;
+	if (rta->rta_type != CRYPTO_AUTHENC_KEYA_PARAM)
+		goto badkey;
+	if (RTA_PAYLOAD(rta) < sizeof(*param))
+		goto badkey;
+
+	param = RTA_DATA(rta);
+	ctx->enckeylen = be32_to_cpu(param->enckeylen);
+
+	key += RTA_ALIGN(rta->rta_len);
+	keylen -= RTA_ALIGN(rta->rta_len);
+
+	if (keylen < ctx->enckeylen)
+		goto badkey;
+	if (ctx->enckeylen > MAX_KEY_SIZE)
+		goto badkey;
+
+	ctx->authkeylen = keylen - ctx->enckeylen;
+
+	if (ctx->authkeylen > MAX_KEY_SIZE)
+		goto badkey;
+
+	memcpy(ctx->enckey, key + ctx->authkeylen, ctx->enckeylen);
+	/* May end up padding auth key. So make sure it's zeroed. */
+	memset(ctx->authkey, 0, sizeof(ctx->authkey));
+	memcpy(ctx->authkey, key, ctx->authkeylen);
+
+	switch (ctx->alg->cipher_info.alg) {
+	case CIPHER_ALG_DES:
+		if (ctx->enckeylen == DES_KEY_SIZE) {
+			u32 tmp[DES_EXPKEY_WORDS];
+			u32 flags = CRYPTO_TFM_RES_WEAK_KEY;
+
+			if (des_ekey(tmp, key) == 0) {
+				if (crypto_aead_get_flags(cipher) &
+				    CRYPTO_TFM_REQ_WEAK_KEY) {
+					crypto_aead_set_flags(cipher, flags);
+					return -EINVAL;
+				}
+			}
+
+			ctx->cipher_type = CIPHER_TYPE_DES;
+		} else {
+			goto badkey;
+		}
+		break;
+	case CIPHER_ALG_3DES:
+		if (ctx->enckeylen == (DES_KEY_SIZE * 3)) {
+			const u32 *K = (const u32 *)key;
+			u32 flags = CRYPTO_TFM_RES_BAD_KEY_SCHED;
+
+			if (!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
+			    !((K[2] ^ K[4]) | (K[3] ^ K[5]))) {
+				crypto_aead_set_flags(cipher, flags);
+				return -EINVAL;
+			}
+
+			ctx->cipher_type = CIPHER_TYPE_3DES;
+		} else {
+			crypto_aead_set_flags(cipher,
+					      CRYPTO_TFM_RES_BAD_KEY_LEN);
+			return -EINVAL;
+		}
+		break;
+	case CIPHER_ALG_AES:
+		switch (ctx->enckeylen) {
+		case AES_KEYSIZE_128:
+			ctx->cipher_type = CIPHER_TYPE_AES128;
+			break;
+		case AES_KEYSIZE_192:
+			ctx->cipher_type = CIPHER_TYPE_AES192;
+			break;
+		case AES_KEYSIZE_256:
+			ctx->cipher_type = CIPHER_TYPE_AES256;
+			break;
+		default:
+			goto badkey;
+		}
+		break;
+	case CIPHER_ALG_RC4:
+		ctx->cipher_type = CIPHER_TYPE_INIT;
+		break;
+	default:
+		pr_err("%s() Error: Unknown cipher alg\n", __func__);
+		return -EINVAL;
+	}
+
+	flow_log("  enckeylen:%u authkeylen:%u\n", ctx->enckeylen,
+		 ctx->authkeylen);
+	flow_dump("  enc: ", ctx->enckey, ctx->enckeylen);
+	flow_dump("  auth: ", ctx->authkey, ctx->authkeylen);
+
+	/* setkey the fallback just in case we needto use it */
+	if (ctx->fallback_cipher) {
+		flow_log("  running fallback setkey()\n");
+
+		ctx->fallback_cipher->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
+		ctx->fallback_cipher->base.crt_flags |=
+		    tfm->crt_flags & CRYPTO_TFM_REQ_MASK;
+		ret =
+		    crypto_aead_setkey(ctx->fallback_cipher, origkey,
+				       origkeylen);
+		if (ret) {
+			flow_log("  fallback setkey() returned:%d\n", ret);
+			tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+			tfm->crt_flags |=
+			    (ctx->fallback_cipher->base.crt_flags &
+			     CRYPTO_TFM_RES_MASK);
+		}
+	}
+
+	ctx->spu_resp_hdr_len = spu->spu_response_hdr_len(ctx->authkeylen,
+							  ctx->enckeylen,
+							  false);
+
+	atomic_inc(&iproc_priv.setkey_cnt[SPU_OP_AEAD]);
+
+	return ret;
+
+badkey:
+	ctx->enckeylen = 0;
+	ctx->authkeylen = 0;
+	ctx->digestsize = 0;
+
+	crypto_aead_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+static int aead_gcm_ccm_setkey(struct crypto_aead *cipher,
+			       const u8 *key, unsigned int keylen)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(cipher);
+	struct crypto_tfm *tfm = crypto_aead_tfm(cipher);
+
+	int ret = 0;
+
+	flow_log("%s() keylen:%u\n", __func__, keylen);
+	flow_dump("  key: ", key, keylen);
+
+	if (!ctx->is_esp)
+		ctx->digestsize = keylen;
+
+	ctx->enckeylen = keylen;
+	ctx->authkeylen = 0;
+	memcpy(ctx->enckey, key, ctx->enckeylen);
+
+	switch (ctx->enckeylen) {
+	case AES_KEYSIZE_128:
+		ctx->cipher_type = CIPHER_TYPE_AES128;
+		break;
+	case AES_KEYSIZE_192:
+		ctx->cipher_type = CIPHER_TYPE_AES192;
+		break;
+	case AES_KEYSIZE_256:
+		ctx->cipher_type = CIPHER_TYPE_AES256;
+		break;
+	default:
+		goto badkey;
+	}
+
+	flow_log("  enckeylen:%u authkeylen:%u\n", ctx->enckeylen,
+		 ctx->authkeylen);
+	flow_dump("  enc: ", ctx->enckey, ctx->enckeylen);
+	flow_dump("  auth: ", ctx->authkey, ctx->authkeylen);
+
+	/* setkey the fallback just in case we need to use it */
+	if (ctx->fallback_cipher) {
+		flow_log("  running fallback setkey()\n");
+
+		ctx->fallback_cipher->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
+		ctx->fallback_cipher->base.crt_flags |=
+		    tfm->crt_flags & CRYPTO_TFM_REQ_MASK;
+		ret = crypto_aead_setkey(ctx->fallback_cipher, key,
+					 keylen + ctx->salt_len);
+		if (ret) {
+			flow_log("  fallback setkey() returned:%d\n", ret);
+			tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+			tfm->crt_flags |=
+			    (ctx->fallback_cipher->base.crt_flags &
+			     CRYPTO_TFM_RES_MASK);
+		}
+	}
+
+	ctx->spu_resp_hdr_len = spu->spu_response_hdr_len(ctx->authkeylen,
+							  ctx->enckeylen,
+							  false);
+
+	atomic_inc(&iproc_priv.setkey_cnt[SPU_OP_AEAD]);
+
+	flow_log("  enckeylen:%u authkeylen:%u\n", ctx->enckeylen,
+		 ctx->authkeylen);
+
+	return ret;
+
+badkey:
+	ctx->enckeylen = 0;
+	ctx->authkeylen = 0;
+	ctx->digestsize = 0;
+
+	crypto_aead_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+	return -EINVAL;
+}
+
+/**
+ * aead_gcm_esp_setkey() - setkey() operation for ESP variant of GCM AES.
+ * @cipher: AEAD structure
+ * @key:    Key followed by 4 bytes of salt
+ * @keylen: Length of key plus salt, in bytes
+ *
+ * Extracts salt from key and stores it to be prepended to IV on each request.
+ * Digest is always 16 bytes
+ *
+ * Return: Value from generic gcm setkey.
+ */
+static int aead_gcm_esp_setkey(struct crypto_aead *cipher,
+			       const u8 *key, unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(cipher);
+
+	flow_log("%s\n", __func__);
+	ctx->salt_len = GCM_ESP_SALT_SIZE;
+	ctx->salt_offset = GCM_ESP_SALT_OFFSET;
+	memcpy(ctx->salt, key + keylen - GCM_ESP_SALT_SIZE, GCM_ESP_SALT_SIZE);
+	keylen -= GCM_ESP_SALT_SIZE;
+	ctx->digestsize = GCM_ESP_DIGESTSIZE;
+	ctx->is_esp = true;
+	flow_dump("salt: ", ctx->salt, GCM_ESP_SALT_SIZE);
+
+	return aead_gcm_ccm_setkey(cipher, key, keylen);
+}
+
+/**
+ * rfc4543_gcm_esp_setkey() - setkey operation for RFC4543 variant of GCM/GMAC.
+ * cipher: AEAD structure
+ * key:    Key followed by 4 bytes of salt
+ * keylen: Length of key plus salt, in bytes
+ *
+ * Extracts salt from key and stores it to be prepended to IV on each request.
+ * Digest is always 16 bytes
+ *
+ * Return: Value from generic gcm setkey.
+ */
+static int rfc4543_gcm_esp_setkey(struct crypto_aead *cipher,
+				  const u8 *key, unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(cipher);
+
+	flow_log("%s\n", __func__);
+	ctx->salt_len = GCM_ESP_SALT_SIZE;
+	ctx->salt_offset = GCM_ESP_SALT_OFFSET;
+	memcpy(ctx->salt, key + keylen - GCM_ESP_SALT_SIZE, GCM_ESP_SALT_SIZE);
+	keylen -= GCM_ESP_SALT_SIZE;
+	ctx->digestsize = GCM_ESP_DIGESTSIZE;
+	ctx->is_esp = true;
+	ctx->is_rfc4543 = true;
+	flow_dump("salt: ", ctx->salt, GCM_ESP_SALT_SIZE);
+
+	return aead_gcm_ccm_setkey(cipher, key, keylen);
+}
+
+/**
+ * aead_ccm_esp_setkey() - setkey() operation for ESP variant of CCM AES.
+ * @cipher: AEAD structure
+ * @key:    Key followed by 4 bytes of salt
+ * @keylen: Length of key plus salt, in bytes
+ *
+ * Extracts salt from key and stores it to be prepended to IV on each request.
+ * Digest is always 16 bytes
+ *
+ * Return: Value from generic ccm setkey.
+ */
+static int aead_ccm_esp_setkey(struct crypto_aead *cipher,
+			       const u8 *key, unsigned int keylen)
+{
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(cipher);
+
+	flow_log("%s\n", __func__);
+	ctx->salt_len = CCM_ESP_SALT_SIZE;
+	ctx->salt_offset = CCM_ESP_SALT_OFFSET;
+	memcpy(ctx->salt, key + keylen - CCM_ESP_SALT_SIZE, CCM_ESP_SALT_SIZE);
+	keylen -= CCM_ESP_SALT_SIZE;
+	ctx->is_esp = true;
+	flow_dump("salt: ", ctx->salt, CCM_ESP_SALT_SIZE);
+
+	return aead_gcm_ccm_setkey(cipher, key, keylen);
+}
+
+static int aead_setauthsize(struct crypto_aead *cipher, unsigned int authsize)
+{
+	struct iproc_ctx_s *ctx = crypto_aead_ctx(cipher);
+	int ret = 0;
+
+	flow_log("%s() authkeylen:%u authsize:%u\n",
+		 __func__, ctx->authkeylen, authsize);
+
+	ctx->digestsize = authsize;
+
+	/* setkey the fallback just in case we needto use it */
+	if (ctx->fallback_cipher) {
+		flow_log("  running fallback setauth()\n");
+
+		ret = crypto_aead_setauthsize(ctx->fallback_cipher, authsize);
+		if (ret)
+			flow_log("  fallback setauth() returned:%d\n", ret);
+	}
+
+	return ret;
+}
+
+static int aead_encrypt(struct aead_request *req)
+{
+	flow_log("%s() cryptlen:%u %08x\n", __func__, req->cryptlen,
+		 req->cryptlen);
+	dump_sg(req->src, 0, req->cryptlen + req->assoclen);
+	flow_log("  assoc_len:%u\n", req->assoclen);
+
+	return aead_enqueue(req, true);
+}
+
+static int aead_decrypt(struct aead_request *req)
+{
+	flow_log("%s() cryptlen:%u\n", __func__, req->cryptlen);
+	dump_sg(req->src, 0, req->cryptlen + req->assoclen);
+	flow_log("  assoc_len:%u\n", req->assoclen);
+
+	return aead_enqueue(req, false);
+}
+
+/* ==================== Supported Cipher Algorithms ==================== */
+
+static struct iproc_alg_s driver_algs[] = {
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "gcm(aes)",
+			.cra_driver_name = "gcm-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK
+		 },
+		 .setkey = aead_gcm_ccm_setkey,
+		 .ivsize = GCM_AES_IV_SIZE,
+		.maxauthsize = AES_BLOCK_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_GCM,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_GCM,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "ccm(aes)",
+			.cra_driver_name = "ccm-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK
+		 },
+		 .setkey = aead_gcm_ccm_setkey,
+		 .ivsize = CCM_AES_IV_SIZE,
+		.maxauthsize = AES_BLOCK_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CCM,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_CCM,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "rfc4106(gcm(aes))",
+			.cra_driver_name = "gcm-aes-esp-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK
+		 },
+		 .setkey = aead_gcm_esp_setkey,
+		 .ivsize = GCM_ESP_IV_SIZE,
+		 .maxauthsize = AES_BLOCK_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_GCM,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_GCM,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "rfc4309(ccm(aes))",
+			.cra_driver_name = "ccm-aes-esp-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK
+		 },
+		 .setkey = aead_ccm_esp_setkey,
+		 .ivsize = CCM_AES_IV_SIZE,
+		 .maxauthsize = AES_BLOCK_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CCM,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_CCM,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "rfc4543(gcm(aes))",
+			.cra_driver_name = "gmac-aes-esp-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK
+		 },
+		 .setkey = rfc4543_gcm_esp_setkey,
+		 .ivsize = GCM_ESP_IV_SIZE,
+		 .maxauthsize = AES_BLOCK_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_GCM,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_GCM,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(md5),cbc(aes))",
+			.cra_driver_name = "authenc-hmac-md5-cbc-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		.ivsize = AES_BLOCK_SIZE,
+		.maxauthsize = MD5_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_MD5,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha1),cbc(aes))",
+			.cra_driver_name = "authenc-hmac-sha1-cbc-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = AES_BLOCK_SIZE,
+		 .maxauthsize = SHA1_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA1,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha256),cbc(aes))",
+			.cra_driver_name = "authenc-hmac-sha256-cbc-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = AES_BLOCK_SIZE,
+		 .maxauthsize = SHA256_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA256,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(md5),cbc(des))",
+			.cra_driver_name = "authenc-hmac-md5-cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES_BLOCK_SIZE,
+		 .maxauthsize = MD5_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_MD5,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha1),cbc(des))",
+			.cra_driver_name = "authenc-hmac-sha1-cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES_BLOCK_SIZE,
+		 .maxauthsize = SHA1_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA1,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha224),cbc(des))",
+			.cra_driver_name = "authenc-hmac-sha224-cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES_BLOCK_SIZE,
+		 .maxauthsize = SHA224_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA224,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha256),cbc(des))",
+			.cra_driver_name = "authenc-hmac-sha256-cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES_BLOCK_SIZE,
+		 .maxauthsize = SHA256_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA256,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha384),cbc(des))",
+			.cra_driver_name = "authenc-hmac-sha384-cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES_BLOCK_SIZE,
+		 .maxauthsize = SHA384_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA384,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha512),cbc(des))",
+			.cra_driver_name = "authenc-hmac-sha512-cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES_BLOCK_SIZE,
+		 .maxauthsize = SHA512_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA512,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(md5),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-md5-cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES3_EDE_BLOCK_SIZE,
+		 .maxauthsize = MD5_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_MD5,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha1),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha1-cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES3_EDE_BLOCK_SIZE,
+		 .maxauthsize = SHA1_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA1,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha224),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha224-cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES3_EDE_BLOCK_SIZE,
+		 .maxauthsize = SHA224_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA224,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha256),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha256-cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES3_EDE_BLOCK_SIZE,
+		 .maxauthsize = SHA256_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA256,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha384),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha384-cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES3_EDE_BLOCK_SIZE,
+		 .maxauthsize = SHA384_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA384,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AEAD,
+	 .alg.aead = {
+		 .base = {
+			.cra_name = "authenc(hmac(sha512),cbc(des3_ede))",
+			.cra_driver_name = "authenc-hmac-sha512-cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_flags = CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC
+		 },
+		 .setkey = aead_authenc_setkey,
+		 .ivsize = DES3_EDE_BLOCK_SIZE,
+		 .maxauthsize = SHA512_DIGEST_SIZE,
+	 },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA512,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 .auth_first = 0,
+	 },
+
+/* ABLKCIPHER algorithms. */
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ecb(arc4)",
+			.cra_driver_name = "ecb-arc4-iproc",
+			.cra_blocksize = ARC4_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = ARC4_MIN_KEY_SIZE,
+					   .max_keysize = ARC4_MAX_KEY_SIZE,
+					   .ivsize = 0,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_RC4,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ofb(des)",
+			.cra_driver_name = "ofb-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = DES_KEY_SIZE,
+					   .max_keysize = DES_KEY_SIZE,
+					   .ivsize = DES_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_OFB,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "cbc(des)",
+			.cra_driver_name = "cbc-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = DES_KEY_SIZE,
+					   .max_keysize = DES_KEY_SIZE,
+					   .ivsize = DES_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ecb(des)",
+			.cra_driver_name = "ecb-des-iproc",
+			.cra_blocksize = DES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = DES_KEY_SIZE,
+					   .max_keysize = DES_KEY_SIZE,
+					   .ivsize = 0,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_DES,
+			 .mode = CIPHER_MODE_ECB,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ofb(des3_ede)",
+			.cra_driver_name = "ofb-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = DES3_EDE_KEY_SIZE,
+					   .max_keysize = DES3_EDE_KEY_SIZE,
+					   .ivsize = DES3_EDE_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_OFB,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "cbc(des3_ede)",
+			.cra_driver_name = "cbc-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = DES3_EDE_KEY_SIZE,
+					   .max_keysize = DES3_EDE_KEY_SIZE,
+					   .ivsize = DES3_EDE_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ecb(des3_ede)",
+			.cra_driver_name = "ecb-des3-iproc",
+			.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = DES3_EDE_KEY_SIZE,
+					   .max_keysize = DES3_EDE_KEY_SIZE,
+					   .ivsize = 0,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_3DES,
+			 .mode = CIPHER_MODE_ECB,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ofb(aes)",
+			.cra_driver_name = "ofb-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = AES_MIN_KEY_SIZE,
+					   .max_keysize = AES_MAX_KEY_SIZE,
+					   .ivsize = AES_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_OFB,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "cbc(aes)",
+			.cra_driver_name = "cbc-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = AES_MIN_KEY_SIZE,
+					   .max_keysize = AES_MAX_KEY_SIZE,
+					   .ivsize = AES_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CBC,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ecb(aes)",
+			.cra_driver_name = "ecb-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   .min_keysize = AES_MIN_KEY_SIZE,
+					   .max_keysize = AES_MAX_KEY_SIZE,
+					   .ivsize = 0,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_ECB,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "ctr(aes)",
+			.cra_driver_name = "ctr-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+					   /* .geniv = "chainiv", */
+					   .min_keysize = AES_MIN_KEY_SIZE,
+					   .max_keysize = AES_MAX_KEY_SIZE,
+					   .ivsize = AES_BLOCK_SIZE,
+					}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_CTR,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+{
+	 .type = CRYPTO_ALG_TYPE_ABLKCIPHER,
+	 .alg.crypto = {
+			.cra_name = "xts(aes)",
+			.cra_driver_name = "xts-aes-iproc",
+			.cra_blocksize = AES_BLOCK_SIZE,
+			.cra_ablkcipher = {
+				.min_keysize = 2 * AES_MIN_KEY_SIZE,
+				.max_keysize = 2 * AES_MAX_KEY_SIZE,
+				.ivsize = AES_BLOCK_SIZE,
+				}
+			},
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_AES,
+			 .mode = CIPHER_MODE_XTS,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_NONE,
+		       .mode = HASH_MODE_NONE,
+		       },
+	 },
+
+/* AHASH algorithms. */
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = MD5_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "md5",
+				    .cra_driver_name = "md5-iproc",
+				    .cra_blocksize = MD5_BLOCK_WORDS * 4,
+				    .cra_flags = CRYPTO_ALG_TYPE_AHASH |
+					     CRYPTO_ALG_ASYNC,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_MD5,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = MD5_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(md5)",
+				    .cra_driver_name = "hmac-md5-iproc",
+				    .cra_blocksize = MD5_BLOCK_WORDS * 4,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_MD5,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA1_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha1",
+				    .cra_driver_name = "sha1-iproc",
+				    .cra_blocksize = SHA1_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA1,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA1_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha1)",
+				    .cra_driver_name = "hmac-sha1-iproc",
+				    .cra_blocksize = SHA1_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA1,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+			.halg.digestsize = SHA224_DIGEST_SIZE,
+			.halg.base = {
+				    .cra_name = "sha224",
+				    .cra_driver_name = "sha224-iproc",
+				    .cra_blocksize = SHA224_BLOCK_SIZE,
+			}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA224,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA224_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha224)",
+				    .cra_driver_name = "hmac-sha224-iproc",
+				    .cra_blocksize = SHA224_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA224,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA256_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha256",
+				    .cra_driver_name = "sha256-iproc",
+				    .cra_blocksize = SHA256_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA256,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA256_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha256)",
+				    .cra_driver_name = "hmac-sha256-iproc",
+				    .cra_blocksize = SHA256_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA256,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{
+	.type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA384_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha384",
+				    .cra_driver_name = "sha384-iproc",
+				    .cra_blocksize = SHA384_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA384,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA384_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha384)",
+				    .cra_driver_name = "hmac-sha384-iproc",
+				    .cra_blocksize = SHA384_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA384,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA512_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha512",
+				    .cra_driver_name = "sha512-iproc",
+				    .cra_blocksize = SHA512_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA512,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA512_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha512)",
+				    .cra_driver_name = "hmac-sha512-iproc",
+				    .cra_blocksize = SHA512_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA512,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_224_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha3-224",
+				    .cra_driver_name = "sha3-224-iproc",
+				    .cra_blocksize = SHA3_224_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_224,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_224_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha3-224)",
+				    .cra_driver_name = "hmac-sha3-224-iproc",
+				    .cra_blocksize = SHA3_224_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_224,
+		       .mode = HASH_MODE_HMAC
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_256_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha3-256",
+				    .cra_driver_name = "sha3-256-iproc",
+				    .cra_blocksize = SHA3_256_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_256,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_256_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha3-256)",
+				    .cra_driver_name = "hmac-sha3-256-iproc",
+				    .cra_blocksize = SHA3_256_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_256,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_384_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha3-384",
+				    .cra_driver_name = "sha3-384-iproc",
+				    .cra_blocksize = SHA3_224_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_384,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_384_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha3-384)",
+				    .cra_driver_name = "hmac-sha3-384-iproc",
+				    .cra_blocksize = SHA3_384_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_384,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_512_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "sha3-512",
+				    .cra_driver_name = "sha3-512-iproc",
+				    .cra_blocksize = SHA3_512_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_512,
+		       .mode = HASH_MODE_HASH,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = SHA3_512_DIGEST_SIZE,
+		      .halg.base = {
+				    .cra_name = "hmac(sha3-512)",
+				    .cra_driver_name = "hmac-sha3-512-iproc",
+				    .cra_blocksize = SHA3_512_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_SHA3_512,
+		       .mode = HASH_MODE_HMAC,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = AES_BLOCK_SIZE,
+		      .halg.base = {
+				    .cra_name = "xcbc(aes)",
+				    .cra_driver_name = "xcbc-aes-iproc",
+				    .cra_blocksize = AES_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_XCBC,
+		       },
+	 },
+	{
+	 .type = CRYPTO_ALG_TYPE_AHASH,
+	 .alg.hash = {
+		      .halg.digestsize = AES_BLOCK_SIZE,
+		      .halg.base = {
+				    .cra_name = "cmac(aes)",
+				    .cra_driver_name = "cmac-aes-iproc",
+				    .cra_blocksize = AES_BLOCK_SIZE,
+				}
+		      },
+	 .cipher_info = {
+			 .alg = CIPHER_ALG_NONE,
+			 .mode = CIPHER_MODE_NONE,
+			 },
+	 .auth_info = {
+		       .alg = HASH_ALG_AES,
+		       .mode = HASH_MODE_CMAC,
+		       },
+	 },
+};
+
+static int generic_cra_init(struct crypto_tfm *tfm,
+			    struct iproc_alg_s *cipher_alg)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct iproc_ctx_s *ctx = crypto_tfm_ctx(tfm);
+	unsigned int blocksize = crypto_tfm_alg_blocksize(tfm);
+
+	flow_log("%s()\n", __func__);
+
+	ctx->alg = cipher_alg;
+	ctx->cipher = cipher_alg->cipher_info;
+	ctx->auth = cipher_alg->auth_info;
+	ctx->auth_first = cipher_alg->auth_first;
+	ctx->max_payload = spu->spu_ctx_max_payload(ctx->cipher.alg,
+						    ctx->cipher.mode,
+						    blocksize);
+	ctx->fallback_cipher = NULL;
+
+	ctx->enckeylen = 0;
+	ctx->authkeylen = 0;
+
+	atomic_inc(&iproc_priv.stream_count);
+	atomic_inc(&iproc_priv.session_count);
+
+	return 0;
+}
+
+static int ablkcipher_cra_init(struct crypto_tfm *tfm)
+{
+	struct crypto_alg *alg = tfm->__crt_alg;
+	struct iproc_alg_s *cipher_alg;
+
+	flow_log("%s()\n", __func__);
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct iproc_reqctx_s);
+
+	cipher_alg = container_of(alg, struct iproc_alg_s, alg.crypto);
+	return generic_cra_init(tfm, cipher_alg);
+}
+
+static int ahash_cra_init(struct crypto_tfm *tfm)
+{
+	int err;
+	struct crypto_alg *alg = tfm->__crt_alg;
+	struct iproc_alg_s *cipher_alg;
+
+	cipher_alg = container_of(__crypto_ahash_alg(alg), struct iproc_alg_s,
+				  alg.hash);
+
+	err = generic_cra_init(tfm, cipher_alg);
+	flow_log("%s()\n", __func__);
+
+	/*
+	 * export state size has to be < 512 bytes. So don't include msg bufs
+	 * in state size.
+	 */
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct iproc_reqctx_s));
+
+	return err;
+}
+
+static int aead_cra_init(struct crypto_aead *aead)
+{
+	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+	struct iproc_ctx_s *ctx = crypto_tfm_ctx(tfm);
+	struct crypto_alg *alg = tfm->__crt_alg;
+	struct aead_alg *aalg = container_of(alg, struct aead_alg, base);
+	struct iproc_alg_s *cipher_alg = container_of(aalg, struct iproc_alg_s,
+						      alg.aead);
+
+	int err = generic_cra_init(tfm, cipher_alg);
+
+	flow_log("%s()\n", __func__);
+
+	crypto_aead_set_reqsize(aead, sizeof(struct iproc_reqctx_s));
+	ctx->is_esp = false;
+	ctx->salt_len = 0;
+	ctx->salt_offset = 0;
+
+	/* random first IV */
+	get_random_bytes(ctx->iv, MAX_IV_SIZE);
+	flow_dump("  iv: ", ctx->iv, MAX_IV_SIZE);
+
+	if (!err) {
+		if (alg->cra_flags & CRYPTO_ALG_NEED_FALLBACK) {
+			flow_log("%s() creating fallback cipher\n", __func__);
+
+			ctx->fallback_cipher =
+			    crypto_alloc_aead(alg->cra_name, 0,
+					      CRYPTO_ALG_ASYNC |
+					      CRYPTO_ALG_NEED_FALLBACK);
+			if (IS_ERR(ctx->fallback_cipher)) {
+				pr_err("%s() Error: failed to allocate fallback for %s\n",
+				       __func__, alg->cra_name);
+				return PTR_ERR(ctx->fallback_cipher);
+			}
+		}
+	}
+
+	return err;
+}
+
+static void generic_cra_exit(struct crypto_tfm *tfm)
+{
+	atomic_dec(&iproc_priv.session_count);
+}
+
+static void aead_cra_exit(struct crypto_aead *aead)
+{
+	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+	struct iproc_ctx_s *ctx = crypto_tfm_ctx(tfm);
+
+	generic_cra_exit(tfm);
+
+	if (ctx->fallback_cipher) {
+		crypto_free_aead(ctx->fallback_cipher);
+		ctx->fallback_cipher = NULL;
+	}
+}
+
+/**
+ * spu_functions_register() - Specify hardware-specific SPU functions based on
+ * SPU type read from device tree.
+ * @dev:	device structure
+ * @spu_type:	SPU hardware generation
+ * @spu_subtype: SPU hardware version
+ */
+static void spu_functions_register(struct device *dev,
+				   enum spu_spu_type spu_type,
+				   enum spu_spu_subtype spu_subtype)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+
+	if (spu_type == SPU_TYPE_SPUM) {
+		dev_dbg(dev, "Registering SPUM functions");
+		spu->spu_dump_msg_hdr = spum_dump_msg_hdr;
+		spu->spu_payload_length = spum_payload_length;
+		spu->spu_response_hdr_len = spum_response_hdr_len;
+		spu->spu_hash_pad_len = spum_hash_pad_len;
+		spu->spu_gcm_ccm_pad_len = spum_gcm_ccm_pad_len;
+		spu->spu_assoc_resp_len = spum_assoc_resp_len;
+		spu->spu_aead_ivlen = spum_aead_ivlen;
+		spu->spu_hash_type = spum_hash_type;
+		spu->spu_digest_size = spum_digest_size;
+		spu->spu_create_request = spum_create_request;
+		spu->spu_cipher_req_init = spum_cipher_req_init;
+		spu->spu_cipher_req_finish = spum_cipher_req_finish;
+		spu->spu_request_pad = spum_request_pad;
+		spu->spu_tx_status_len = spum_tx_status_len;
+		spu->spu_rx_status_len = spum_rx_status_len;
+		spu->spu_status_process = spum_status_process;
+		spu->spu_xts_tweak_in_payload = spum_xts_tweak_in_payload;
+		spu->spu_ccm_update_iv = spum_ccm_update_iv;
+		spu->spu_wordalign_padlen = spum_wordalign_padlen;
+		if (spu_subtype == SPU_SUBTYPE_SPUM_NS2)
+			spu->spu_ctx_max_payload = spum_ns2_ctx_max_payload;
+		else
+			spu->spu_ctx_max_payload = spum_nsp_ctx_max_payload;
+	} else {
+		dev_dbg(dev, "Registering SPU2 functions");
+		spu->spu_dump_msg_hdr = spu2_dump_msg_hdr;
+		spu->spu_ctx_max_payload = spu2_ctx_max_payload;
+		spu->spu_payload_length = spu2_payload_length;
+		spu->spu_response_hdr_len = spu2_response_hdr_len;
+		spu->spu_hash_pad_len = spu2_hash_pad_len;
+		spu->spu_gcm_ccm_pad_len = spu2_gcm_ccm_pad_len;
+		spu->spu_assoc_resp_len = spu2_assoc_resp_len;
+		spu->spu_aead_ivlen = spu2_aead_ivlen;
+		spu->spu_hash_type = spu2_hash_type;
+		spu->spu_digest_size = spu2_digest_size;
+		spu->spu_create_request = spu2_create_request;
+		spu->spu_cipher_req_init = spu2_cipher_req_init;
+		spu->spu_cipher_req_finish = spu2_cipher_req_finish;
+		spu->spu_request_pad = spu2_request_pad;
+		spu->spu_tx_status_len = spu2_tx_status_len;
+		spu->spu_rx_status_len = spu2_rx_status_len;
+		spu->spu_status_process = spu2_status_process;
+		spu->spu_xts_tweak_in_payload = spu2_xts_tweak_in_payload;
+		spu->spu_ccm_update_iv = spu2_ccm_update_iv;
+		spu->spu_wordalign_padlen = spu2_wordalign_padlen;
+	}
+}
+
+/**
+ * spu_mb_init() - Initialize mailbox client. Request ownership of a mailbox
+ * channel for the SPU being probed.
+ * @dev:  SPU driver device structure
+ *
+ * Return: 0 if successful
+ *	   < 0 otherwise
+ */
+static int spu_mb_init(struct device *dev)
+{
+	struct mbox_client *mcl = &iproc_priv.mcl[iproc_priv.spu.num_spu];
+	int err;
+
+	mcl->dev = dev;
+	mcl->tx_block = false;
+	mcl->tx_tout = 0;
+	mcl->knows_txdone = false;
+	mcl->rx_callback = spu_rx_callback;
+	mcl->tx_done = NULL;
+
+	iproc_priv.mbox[iproc_priv.spu.num_spu] =
+			mbox_request_channel(mcl, 0);
+	if (IS_ERR(iproc_priv.mbox[iproc_priv.spu.num_spu])) {
+		err = (int)PTR_ERR(iproc_priv.mbox[iproc_priv.spu.num_spu]);
+		dev_err(dev,
+			"Mbox channel %d request failed with err %d",
+			iproc_priv.spu.num_spu, err);
+		iproc_priv.mbox[iproc_priv.spu.num_spu] = NULL;
+		return err;
+	}
+
+	return 0;
+}
+
+static void spu_mb_release(struct platform_device *pdev)
+{
+	int i;
+
+	for (i = 0; i < iproc_priv.spu.num_spu; i++)
+		mbox_free_channel(iproc_priv.mbox[i]);
+}
+
+static void spu_counters_init(void)
+{
+	int i;
+	int j;
+
+	atomic_set(&iproc_priv.session_count, 0);
+	atomic_set(&iproc_priv.stream_count, 0);
+	atomic_set(&iproc_priv.next_chan, (int)iproc_priv.spu.num_spu);
+	atomic64_set(&iproc_priv.bytes_in, 0);
+	atomic64_set(&iproc_priv.bytes_out, 0);
+	for (i = 0; i < SPU_OP_NUM; i++) {
+		atomic_set(&iproc_priv.op_counts[i], 0);
+		atomic_set(&iproc_priv.setkey_cnt[i], 0);
+	}
+	for (i = 0; i < CIPHER_ALG_LAST; i++)
+		for (j = 0; j < CIPHER_MODE_LAST; j++)
+			atomic_set(&iproc_priv.cipher_cnt[i][j], 0);
+
+	for (i = 0; i < HASH_ALG_LAST; i++) {
+		atomic_set(&iproc_priv.hash_cnt[i], 0);
+		atomic_set(&iproc_priv.hmac_cnt[i], 0);
+	}
+	for (i = 0; i < AEAD_TYPE_LAST; i++)
+		atomic_set(&iproc_priv.aead_cnt[i], 0);
+
+	atomic_set(&iproc_priv.mb_no_spc, 0);
+	atomic_set(&iproc_priv.mb_send_fail, 0);
+	atomic_set(&iproc_priv.bad_icv, 0);
+}
+
+static int spu_register_ablkcipher(struct iproc_alg_s *driver_alg)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct crypto_alg *crypto = &driver_alg->alg.crypto;
+	int err;
+
+	/* SPU2 does not support RC4 */
+	if ((driver_alg->cipher_info.alg == CIPHER_ALG_RC4) &&
+	    (spu->spu_type == SPU_TYPE_SPU2))
+		return 0;
+
+	crypto->cra_module = THIS_MODULE;
+	crypto->cra_priority = cipher_pri;
+	crypto->cra_alignmask = 0;
+	crypto->cra_ctxsize = sizeof(struct iproc_ctx_s);
+	INIT_LIST_HEAD(&crypto->cra_list);
+
+	crypto->cra_init = ablkcipher_cra_init;
+	crypto->cra_exit = generic_cra_exit;
+	crypto->cra_type = &crypto_ablkcipher_type;
+	crypto->cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC |
+				CRYPTO_ALG_KERN_DRIVER_ONLY;
+
+	crypto->cra_ablkcipher.setkey = ablkcipher_setkey;
+	crypto->cra_ablkcipher.encrypt = ablkcipher_encrypt;
+	crypto->cra_ablkcipher.decrypt = ablkcipher_decrypt;
+
+	err = crypto_register_alg(crypto);
+	/* Mark alg as having been registered, if successful */
+	if (err == 0)
+		driver_alg->registered = true;
+	pr_debug("  registered ablkcipher %s\n", crypto->cra_driver_name);
+	return err;
+}
+
+static int spu_register_ahash(struct iproc_alg_s *driver_alg)
+{
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct ahash_alg *hash = &driver_alg->alg.hash;
+	int err;
+
+	/* AES-XCBC is the only AES hash type currently supported on SPU-M */
+	if ((driver_alg->auth_info.alg == HASH_ALG_AES) &&
+	    (driver_alg->auth_info.mode != HASH_MODE_XCBC) &&
+	    (spu->spu_type == SPU_TYPE_SPUM))
+		return 0;
+
+	/* SHA3 algorithm variants are not registered for SPU-M or SPU2. */
+	if ((driver_alg->auth_info.alg >= HASH_ALG_SHA3_224) &&
+	    (spu->spu_subtype != SPU_SUBTYPE_SPU2_V2))
+		return 0;
+
+	hash->halg.base.cra_module = THIS_MODULE;
+	hash->halg.base.cra_priority = hash_pri;
+	hash->halg.base.cra_alignmask = 0;
+	hash->halg.base.cra_ctxsize = sizeof(struct iproc_ctx_s);
+	hash->halg.base.cra_init = ahash_cra_init;
+	hash->halg.base.cra_exit = generic_cra_exit;
+	hash->halg.base.cra_type = &crypto_ahash_type;
+	hash->halg.base.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC;
+	hash->halg.statesize = sizeof(struct spu_hash_export_s);
+
+	if (driver_alg->auth_info.mode != HASH_MODE_HMAC) {
+		hash->setkey = ahash_setkey;
+		hash->init = ahash_init;
+		hash->update = ahash_update;
+		hash->final = ahash_final;
+		hash->finup = ahash_finup;
+		hash->digest = ahash_digest;
+	} else {
+		hash->setkey = ahash_hmac_setkey;
+		hash->init = ahash_hmac_init;
+		hash->update = ahash_hmac_update;
+		hash->final = ahash_hmac_final;
+		hash->finup = ahash_hmac_finup;
+		hash->digest = ahash_hmac_digest;
+	}
+	hash->export = ahash_export;
+	hash->import = ahash_import;
+
+	err = crypto_register_ahash(hash);
+	/* Mark alg as having been registered, if successful */
+	if (err == 0)
+		driver_alg->registered = true;
+	pr_debug("  registered ahash %s\n",
+		 hash->halg.base.cra_driver_name);
+	return err;
+}
+
+static int spu_register_aead(struct iproc_alg_s *driver_alg)
+{
+	struct aead_alg *aead = &driver_alg->alg.aead;
+	int err;
+
+	aead->base.cra_module = THIS_MODULE;
+	aead->base.cra_priority = aead_pri;
+	aead->base.cra_alignmask = 0;
+	aead->base.cra_ctxsize = sizeof(struct iproc_ctx_s);
+	INIT_LIST_HEAD(&aead->base.cra_list);
+
+	aead->base.cra_flags |= CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC;
+	/* setkey set in alg initialization */
+	aead->setauthsize = aead_setauthsize;
+	aead->encrypt = aead_encrypt;
+	aead->decrypt = aead_decrypt;
+	aead->init = aead_cra_init;
+	aead->exit = aead_cra_exit;
+
+	err = crypto_register_aead(aead);
+	/* Mark alg as having been registered, if successful */
+	if (err == 0)
+		driver_alg->registered = true;
+	pr_debug("  registered aead %s\n", aead->base.cra_driver_name);
+	return err;
+}
+
+/* register crypto algorithms the device supports */
+static int spu_algs_register(struct device *dev)
+{
+	int i, j;
+	int err;
+
+	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
+		switch (driver_algs[i].type) {
+		case CRYPTO_ALG_TYPE_ABLKCIPHER:
+			err = spu_register_ablkcipher(&driver_algs[i]);
+			break;
+		case CRYPTO_ALG_TYPE_AHASH:
+			err = spu_register_ahash(&driver_algs[i]);
+			break;
+		case CRYPTO_ALG_TYPE_AEAD:
+			err = spu_register_aead(&driver_algs[i]);
+			break;
+		default:
+			dev_err(dev,
+				"iproc-crypto: unknown alg type: %d",
+				driver_algs[i].type);
+			err = -EINVAL;
+		}
+
+		if (err) {
+			dev_err(dev, "alg registration failed with error %d\n",
+				err);
+			goto err_algs;
+		}
+	}
+
+	return 0;
+
+err_algs:
+	for (j = 0; j < i; j++) {
+		/* Skip any algorithm not registered */
+		if (!driver_algs[j].registered)
+			continue;
+		switch (driver_algs[j].type) {
+		case CRYPTO_ALG_TYPE_ABLKCIPHER:
+			crypto_unregister_alg(&driver_algs[j].alg.crypto);
+			driver_algs[j].registered = false;
+			break;
+		case CRYPTO_ALG_TYPE_AHASH:
+			crypto_unregister_ahash(&driver_algs[j].alg.hash);
+			driver_algs[j].registered = false;
+			break;
+		case CRYPTO_ALG_TYPE_AEAD:
+			crypto_unregister_aead(&driver_algs[j].alg.aead);
+			driver_algs[j].registered = false;
+			break;
+		}
+	}
+	return err;
+}
+
+/* ==================== Kernel Platform API ==================== */
+
+static struct spu_type_subtype spum_ns2_types = {
+	SPU_TYPE_SPUM, SPU_SUBTYPE_SPUM_NS2
+};
+
+static struct spu_type_subtype spum_nsp_types = {
+	SPU_TYPE_SPUM, SPU_SUBTYPE_SPUM_NSP
+};
+
+static struct spu_type_subtype spu2_types = {
+	SPU_TYPE_SPU2, SPU_SUBTYPE_SPU2_V1
+};
+
+static struct spu_type_subtype spu2_v2_types = {
+	SPU_TYPE_SPU2, SPU_SUBTYPE_SPU2_V2
+};
+
+static const struct of_device_id bcm_spu_dt_ids[] = {
+	{
+		.compatible = "brcm,spum-crypto",
+		.data = &spum_ns2_types,
+	},
+	{
+		.compatible = "brcm,spum-nsp-crypto",
+		.data = &spum_nsp_types,
+	},
+	{
+		.compatible = "brcm,spu2-crypto",
+		.data = &spu2_types,
+	},
+	{
+		.compatible = "brcm,spu2-v2-crypto",
+		.data = &spu2_v2_types,
+	},
+	{ /* sentinel */ }
+};
+
+MODULE_DEVICE_TABLE(of, bcm_spu_dt_ids);
+
+static int spu_dt_read(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct spu_hw *spu = &iproc_priv.spu;
+	struct resource *spu_ctrl_regs;
+	const struct of_device_id *match;
+	const struct spu_type_subtype *matched_spu_type;
+	void __iomem *spu_reg_vbase[MAX_SPUS];
+	int err;
+
+	match = of_match_device(of_match_ptr(bcm_spu_dt_ids), dev);
+	matched_spu_type = match->data;
+
+	if (iproc_priv.spu.num_spu > 1) {
+		/* If this is 2nd or later SPU, make sure it's same type */
+		if ((spu->spu_type != matched_spu_type->type) ||
+		    (spu->spu_subtype != matched_spu_type->subtype)) {
+			err = -EINVAL;
+			dev_err(&pdev->dev, "Multiple SPU types not allowed");
+			return err;
+		}
+	} else {
+		/* Record type of first SPU */
+		spu->spu_type = matched_spu_type->type;
+		spu->spu_subtype = matched_spu_type->subtype;
+	}
+
+	/* Get and map SPU registers */
+	spu_ctrl_regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!spu_ctrl_regs) {
+		err = -EINVAL;
+		dev_err(&pdev->dev, "Invalid/missing registers for SPU\n");
+		return err;
+	}
+
+	spu_reg_vbase[iproc_priv.spu.num_spu] =
+				devm_ioremap_resource(dev, spu_ctrl_regs);
+	if (IS_ERR(spu_reg_vbase[iproc_priv.spu.num_spu])) {
+		err = PTR_ERR(spu_reg_vbase[iproc_priv.spu.num_spu]);
+		dev_err(&pdev->dev, "Failed to map registers: %d\n",
+			err);
+		spu_reg_vbase[iproc_priv.spu.num_spu] = NULL;
+		return err;
+	}
+
+	dev_dbg(dev, "SPU %d detected.", iproc_priv.spu.num_spu);
+
+	spu->reg_vbase[iproc_priv.spu.num_spu] = spu_reg_vbase;
+
+	return 0;
+}
+
+int bcm_spu_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct spu_hw *spu = &iproc_priv.spu;
+	int err = 0;
+
+	iproc_priv.pdev[iproc_priv.spu.num_spu] = pdev;
+	platform_set_drvdata(iproc_priv.pdev[iproc_priv.spu.num_spu],
+			     &iproc_priv);
+
+	err = spu_dt_read(pdev);
+	if (err < 0)
+		goto failure;
+
+	err = spu_mb_init(&pdev->dev);
+	if (err < 0)
+		goto failure;
+
+	iproc_priv.spu.num_spu++;
+
+	/* If already initialized, we've just added another SPU and are done */
+	if (iproc_priv.inited)
+		return 0;
+
+	if (spu->spu_type == SPU_TYPE_SPUM)
+		iproc_priv.bcm_hdr_len = 8;
+	else if (spu->spu_type == SPU_TYPE_SPU2)
+		iproc_priv.bcm_hdr_len = 0;
+
+	spu_functions_register(&pdev->dev, spu->spu_type, spu->spu_subtype);
+
+	spu_counters_init();
+
+	spu_setup_debugfs();
+
+	err = spu_algs_register(dev);
+	if (err < 0)
+		goto fail_reg;
+
+	iproc_priv.inited = true;
+
+	return 0;
+
+fail_reg:
+	spu_free_debugfs();
+failure:
+	spu_mb_release(pdev);
+	dev_err(dev, "%s failed with error %d.\n", __func__, err);
+
+	return err;
+}
+
+int bcm_spu_remove(struct platform_device *pdev)
+{
+	int i;
+	struct device *dev = &pdev->dev;
+	char *cdn;
+
+	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
+		/*
+		 * Not all algorithms were registered, depending on whether
+		 * hardware is SPU or SPU2.  So here we make sure to skip
+		 * those algorithms that were not previously registered.
+		 */
+		if (!driver_algs[i].registered)
+			continue;
+
+		switch (driver_algs[i].type) {
+		case CRYPTO_ALG_TYPE_ABLKCIPHER:
+			crypto_unregister_alg(&driver_algs[i].alg.crypto);
+			dev_dbg(dev, "  unregistered cipher %s\n",
+				driver_algs[i].alg.crypto.cra_driver_name);
+			driver_algs[i].registered = false;
+			break;
+		case CRYPTO_ALG_TYPE_AHASH:
+			crypto_unregister_ahash(&driver_algs[i].alg.hash);
+			cdn = driver_algs[i].alg.hash.halg.base.cra_driver_name;
+			dev_dbg(dev, "  unregistered hash %s\n", cdn);
+			driver_algs[i].registered = false;
+			break;
+		case CRYPTO_ALG_TYPE_AEAD:
+			crypto_unregister_aead(&driver_algs[i].alg.aead);
+			dev_dbg(dev, "  unregistered aead %s\n",
+				driver_algs[i].alg.aead.base.cra_driver_name);
+			driver_algs[i].registered = false;
+			break;
+		}
+	}
+	spu_free_debugfs();
+	spu_mb_release(pdev);
+	return 0;
+}
+
+/* ===== Kernel Module API ===== */
+
+static struct platform_driver bcm_spu_pdriver = {
+	.driver = {
+		   .name = "brcm-spu-crypto",
+		   .of_match_table = of_match_ptr(bcm_spu_dt_ids),
+		   },
+	.probe = bcm_spu_probe,
+	.remove = bcm_spu_remove,
+};
+module_platform_driver(bcm_spu_pdriver);
+
+MODULE_AUTHOR("Rob Rice <rob.rice@broadcom.com>");
+MODULE_DESCRIPTION("Broadcom symmetric crypto offload driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/crypto/bcm/cipher.h b/drivers/crypto/bcm/cipher.h
new file mode 100644
index 000000000000..51dca529ce8f
--- /dev/null
+++ b/drivers/crypto/bcm/cipher.h
@@ -0,0 +1,483 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#ifndef _CIPHER_H
+#define _CIPHER_H
+
+#include <linux/atomic.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/mailbox_client.h>
+#include <crypto/aes.h>
+#include <crypto/internal/hash.h>
+#include <crypto/aead.h>
+#include <crypto/sha.h>
+#include <crypto/sha3.h>
+
+#include "spu.h"
+#include "spum.h"
+#include "spu2.h"
+
+/* Driver supports up to MAX_SPUS SPU blocks */
+#define MAX_SPUS 16
+
+#define ARC4_MIN_KEY_SIZE   1
+#define ARC4_MAX_KEY_SIZE   256
+#define ARC4_BLOCK_SIZE     1
+#define ARC4_STATE_SIZE     4
+
+#define CCM_AES_IV_SIZE    16
+#define GCM_AES_IV_SIZE    12
+#define GCM_ESP_IV_SIZE     8
+#define CCM_ESP_IV_SIZE     8
+#define RFC4543_ICV_SIZE   16
+
+#define MAX_KEY_SIZE	ARC4_MAX_KEY_SIZE
+#define MAX_IV_SIZE	AES_BLOCK_SIZE
+#define MAX_DIGEST_SIZE	SHA3_512_DIGEST_SIZE
+#define MAX_ASSOC_SIZE	512
+
+/* size of salt value for AES-GCM-ESP and AES-CCM-ESP */
+#define GCM_ESP_SALT_SIZE   4
+#define CCM_ESP_SALT_SIZE   3
+#define MAX_SALT_SIZE       GCM_ESP_SALT_SIZE
+#define GCM_ESP_SALT_OFFSET 0
+#define CCM_ESP_SALT_OFFSET 1
+
+#define GCM_ESP_DIGESTSIZE 16
+
+#define MAX_HASH_BLOCK_SIZE SHA512_BLOCK_SIZE
+
+/*
+ * Maximum number of bytes from a non-final hash request that can be deferred
+ * until more data is available. With new crypto API framework, this
+ * can be no more than one block of data.
+ */
+#define HASH_CARRY_MAX  MAX_HASH_BLOCK_SIZE
+
+/* Force at least 4-byte alignment of all SPU message fields */
+#define SPU_MSG_ALIGN  4
+
+/* Number of times to resend mailbox message if mb queue is full */
+#define SPU_MB_RETRY_MAX  1000
+
+/* op_counts[] indexes */
+enum op_type {
+	SPU_OP_CIPHER,
+	SPU_OP_HASH,
+	SPU_OP_HMAC,
+	SPU_OP_AEAD,
+	SPU_OP_NUM
+};
+
+enum spu_spu_type {
+	SPU_TYPE_SPUM,
+	SPU_TYPE_SPU2,
+};
+
+/*
+ * SPUM_NS2 and SPUM_NSP are the SPU-M block on Northstar 2 and Northstar Plus,
+ * respectively.
+ */
+enum spu_spu_subtype {
+	SPU_SUBTYPE_SPUM_NS2,
+	SPU_SUBTYPE_SPUM_NSP,
+	SPU_SUBTYPE_SPU2_V1,
+	SPU_SUBTYPE_SPU2_V2
+};
+
+struct spu_type_subtype {
+	enum spu_spu_type type;
+	enum spu_spu_subtype subtype;
+};
+
+struct cipher_op {
+	enum spu_cipher_alg alg;
+	enum spu_cipher_mode mode;
+};
+
+struct auth_op {
+	enum hash_alg alg;
+	enum hash_mode mode;
+};
+
+struct iproc_alg_s {
+	u32 type;
+	union {
+		struct crypto_alg crypto;
+		struct ahash_alg hash;
+		struct aead_alg aead;
+	} alg;
+	struct cipher_op cipher_info;
+	struct auth_op auth_info;
+	bool auth_first;
+	bool registered;
+};
+
+/*
+ * Buffers for a SPU request/reply message pair. All part of one structure to
+ * allow a single alloc per request.
+ */
+struct spu_msg_buf {
+	/* Request message fragments */
+
+	/*
+	 * SPU request message header. For SPU-M, holds MH, EMH, SCTX, BDESC,
+	 * and BD header. For SPU2, holds FMD, OMD.
+	 */
+	u8 bcm_spu_req_hdr[ALIGN(SPU2_HEADER_ALLOC_LEN, SPU_MSG_ALIGN)];
+
+	/* IV or counter. Size to include salt. Also used for XTS tweek. */
+	u8 iv_ctr[ALIGN(2 * AES_BLOCK_SIZE, SPU_MSG_ALIGN)];
+
+	/* Hash digest. request and response. */
+	u8 digest[ALIGN(MAX_DIGEST_SIZE, SPU_MSG_ALIGN)];
+
+	/* SPU request message padding */
+	u8 spu_req_pad[ALIGN(SPU_PAD_LEN_MAX, SPU_MSG_ALIGN)];
+
+	/* SPU-M request message STATUS field */
+	u8 tx_stat[ALIGN(SPU_TX_STATUS_LEN, SPU_MSG_ALIGN)];
+
+	/* Response message fragments */
+
+	/* SPU response message header */
+	u8 spu_resp_hdr[ALIGN(SPU2_HEADER_ALLOC_LEN, SPU_MSG_ALIGN)];
+
+	/* SPU response message STATUS field padding */
+	u8 rx_stat_pad[ALIGN(SPU_STAT_PAD_MAX, SPU_MSG_ALIGN)];
+
+	/* SPU response message STATUS field */
+	u8 rx_stat[ALIGN(SPU_RX_STATUS_LEN, SPU_MSG_ALIGN)];
+
+	union {
+		/* Buffers only used for ablkcipher */
+		struct {
+			/*
+			 * Field used for either SUPDT when RC4 is used
+			 * -OR- tweak value when XTS/AES is used
+			 */
+			u8 supdt_tweak[ALIGN(SPU_SUPDT_LEN, SPU_MSG_ALIGN)];
+		} c;
+
+		/* Buffers only used for aead */
+		struct {
+			/* SPU response pad for GCM data */
+			u8 gcmpad[ALIGN(AES_BLOCK_SIZE, SPU_MSG_ALIGN)];
+
+			/* SPU request msg padding for GCM AAD */
+			u8 req_aad_pad[ALIGN(SPU_PAD_LEN_MAX, SPU_MSG_ALIGN)];
+
+			/* SPU response data to be discarded */
+			u8 resp_aad[ALIGN(MAX_ASSOC_SIZE + MAX_IV_SIZE,
+					  SPU_MSG_ALIGN)];
+		} a;
+	};
+};
+
+struct iproc_ctx_s {
+	u8 enckey[MAX_KEY_SIZE + ARC4_STATE_SIZE];
+	unsigned int enckeylen;
+
+	u8 authkey[MAX_KEY_SIZE + ARC4_STATE_SIZE];
+	unsigned int authkeylen;
+
+	u8 salt[MAX_SALT_SIZE];
+	unsigned int salt_len;
+	unsigned int salt_offset;
+	u8 iv[MAX_IV_SIZE];
+
+	unsigned int digestsize;
+
+	struct iproc_alg_s *alg;
+	bool is_esp;
+
+	struct cipher_op cipher;
+	enum spu_cipher_type cipher_type;
+
+	struct auth_op auth;
+	bool auth_first;
+
+	/*
+	 * The maximum length in bytes of the payload in a SPU message for this
+	 * context. For SPU-M, the payload is the combination of AAD and data.
+	 * For SPU2, the payload is just data. A value of SPU_MAX_PAYLOAD_INF
+	 * indicates that there is no limit to the length of the SPU message
+	 * payload.
+	 */
+	unsigned int max_payload;
+
+	struct crypto_aead *fallback_cipher;
+
+	/* auth_type is determined during processing of request */
+
+	u8 ipad[MAX_HASH_BLOCK_SIZE];
+	u8 opad[MAX_HASH_BLOCK_SIZE];
+
+	/*
+	 * Buffer to hold SPU message header template. Template is created at
+	 * setkey time for ablkcipher requests, since most of the fields in the
+	 * header are known at that time. At request time, just fill in a few
+	 * missing pieces related to length of data in the request and IVs, etc.
+	 */
+	u8 bcm_spu_req_hdr[ALIGN(SPU2_HEADER_ALLOC_LEN, SPU_MSG_ALIGN)];
+
+	/* Length of SPU request header */
+	u16 spu_req_hdr_len;
+
+	/* Expected length of SPU response header */
+	u16 spu_resp_hdr_len;
+
+	/*
+	 * shash descriptor - needed to perform incremental hashing in
+	 * in software, when hw doesn't support it.
+	 */
+	struct shash_desc *shash;
+
+	bool is_rfc4543;	/* RFC 4543 style of GMAC */
+};
+
+/* state from iproc_reqctx_s necessary for hash state export/import */
+struct spu_hash_export_s {
+	unsigned int total_todo;
+	unsigned int total_sent;
+	u8 hash_carry[HASH_CARRY_MAX];
+	unsigned int hash_carry_len;
+	u8 incr_hash[MAX_DIGEST_SIZE];
+	bool is_sw_hmac;
+};
+
+struct iproc_reqctx_s {
+	/* general context */
+	struct crypto_async_request *parent;
+
+	/* only valid after enqueue() */
+	struct iproc_ctx_s *ctx;
+
+	u8 chan_idx;   /* Mailbox channel to be used to submit this request */
+
+	/* total todo, rx'd, and sent for this request */
+	unsigned int total_todo;
+	unsigned int total_received;	/* only valid for ablkcipher */
+	unsigned int total_sent;
+
+	/*
+	 * num bytes sent to hw from the src sg in this request. This can differ
+	 * from total_sent for incremental hashing. total_sent includes previous
+	 * init() and update() data. src_sent does not.
+	 */
+	unsigned int src_sent;
+
+	/*
+	 * For AEAD requests, start of associated data. This will typically
+	 * point to the beginning of the src scatterlist from the request,
+	 * since assoc data is at the beginning of the src scatterlist rather
+	 * than in its own sg.
+	 */
+	struct scatterlist *assoc;
+
+	/*
+	 * scatterlist entry and offset to start of data for next chunk. Crypto
+	 * API src scatterlist for AEAD starts with AAD, if present. For first
+	 * chunk, src_sg is sg entry at beginning of input data (after AAD).
+	 * src_skip begins at the offset in that sg entry where data begins.
+	 */
+	struct scatterlist *src_sg;
+	int src_nents;		/* Number of src entries with data */
+	u32 src_skip;		/* bytes of current sg entry already used */
+
+	/*
+	 * Same for destination. For AEAD, if there is AAD, output data must
+	 * be written at offset following AAD.
+	 */
+	struct scatterlist *dst_sg;
+	int dst_nents;		/* Number of dst entries with data */
+	u32 dst_skip;		/* bytes of current sg entry already written */
+
+	/* Mailbox message used to send this request to PDC driver */
+	struct brcm_message mb_mssg;
+
+	bool bd_suppress;	/* suppress BD field in SPU response? */
+
+	/* cipher context */
+	bool is_encrypt;
+
+	/*
+	 * CBC mode: IV.  CTR mode: counter.  Else empty. Used as a DMA
+	 * buffer for AEAD requests. So allocate as DMAable memory. If IV
+	 * concatenated with salt, includes the salt.
+	 */
+	u8 *iv_ctr;
+	/* Length of IV or counter, in bytes */
+	unsigned int iv_ctr_len;
+
+	/*
+	 * Hash requests can be of any size, whether initial, update, or final.
+	 * A non-final request must be submitted to the SPU as an integral
+	 * number of blocks. This may leave data at the end of the request
+	 * that is not a full block. Since the request is non-final, it cannot
+	 * be padded. So, we write the remainder to this hash_carry buffer and
+	 * hold it until the next request arrives. The carry data is then
+	 * submitted at the beginning of the data in the next SPU msg.
+	 * hash_carry_len is the number of bytes currently in hash_carry. These
+	 * fields are only used for ahash requests.
+	 */
+	u8 hash_carry[HASH_CARRY_MAX];
+	unsigned int hash_carry_len;
+	unsigned int is_final;	/* is this the final for the hash op? */
+
+	/*
+	 * Digest from incremental hash is saved here to include in next hash
+	 * operation. Cannot be stored in req->result for truncated hashes,
+	 * since result may be sized for final digest. Cannot be saved in
+	 * msg_buf because that gets deleted between incremental hash ops
+	 * and is not saved as part of export().
+	 */
+	u8 incr_hash[MAX_DIGEST_SIZE];
+
+	/* hmac context */
+	bool is_sw_hmac;
+
+	/* aead context */
+	struct crypto_tfm *old_tfm;
+	crypto_completion_t old_complete;
+	void *old_data;
+
+	gfp_t gfp;
+
+	/* Buffers used to build SPU request and response messages */
+	struct spu_msg_buf msg_buf;
+};
+
+/*
+ * Structure encapsulates a set of function pointers specific to the type of
+ * SPU hardware running. These functions handling creation and parsing of
+ * SPU request messages and SPU response messages. Includes hardware-specific
+ * values read from device tree.
+ */
+struct spu_hw {
+	void (*spu_dump_msg_hdr)(u8 *buf, unsigned int buf_len);
+	u32 (*spu_ctx_max_payload)(enum spu_cipher_alg cipher_alg,
+				   enum spu_cipher_mode cipher_mode,
+				   unsigned int blocksize);
+	u32 (*spu_payload_length)(u8 *spu_hdr);
+	u16 (*spu_response_hdr_len)(u16 auth_key_len, u16 enc_key_len,
+				    bool is_hash);
+	u16 (*spu_hash_pad_len)(enum hash_alg hash_alg,
+				enum hash_mode hash_mode, u32 chunksize,
+				u16 hash_block_size);
+	u32 (*spu_gcm_ccm_pad_len)(enum spu_cipher_mode cipher_mode,
+				   unsigned int data_size);
+	u32 (*spu_assoc_resp_len)(enum spu_cipher_mode cipher_mode,
+				  unsigned int assoc_len,
+				  unsigned int iv_len, bool is_encrypt);
+	u8 (*spu_aead_ivlen)(enum spu_cipher_mode cipher_mode,
+			     u16 iv_len);
+	enum hash_type (*spu_hash_type)(u32 src_sent);
+	u32 (*spu_digest_size)(u32 digest_size, enum hash_alg alg,
+			       enum hash_type);
+	u32 (*spu_create_request)(u8 *spu_hdr,
+				  struct spu_request_opts *req_opts,
+				  struct spu_cipher_parms *cipher_parms,
+				  struct spu_hash_parms *hash_parms,
+				  struct spu_aead_parms *aead_parms,
+				  unsigned int data_size);
+	u16 (*spu_cipher_req_init)(u8 *spu_hdr,
+				   struct spu_cipher_parms *cipher_parms);
+	void (*spu_cipher_req_finish)(u8 *spu_hdr,
+				      u16 spu_req_hdr_len,
+				      unsigned int is_inbound,
+				      struct spu_cipher_parms *cipher_parms,
+				      bool update_key,
+				      unsigned int data_size);
+	void (*spu_request_pad)(u8 *pad_start, u32 gcm_padding,
+				u32 hash_pad_len, enum hash_alg auth_alg,
+				enum hash_mode auth_mode,
+				unsigned int total_sent, u32 status_padding);
+	u8 (*spu_xts_tweak_in_payload)(void);
+	u8 (*spu_tx_status_len)(void);
+	u8 (*spu_rx_status_len)(void);
+	int (*spu_status_process)(u8 *statp);
+	void (*spu_ccm_update_iv)(unsigned int digestsize,
+				  struct spu_cipher_parms *cipher_parms,
+				  unsigned int assoclen, unsigned int chunksize,
+				  bool is_encrypt, bool is_esp);
+	u32 (*spu_wordalign_padlen)(u32 data_size);
+
+	/* The base virtual address of the SPU hw registers */
+	void __iomem *reg_vbase[MAX_SPUS];
+
+	/* Version of the SPU hardware */
+	enum spu_spu_type spu_type;
+
+	/* Sub-version of the SPU hardware */
+	enum spu_spu_subtype spu_subtype;
+
+	/* The number of SPUs on this platform */
+	u32 num_spu;
+};
+
+struct device_private {
+	struct platform_device *pdev[MAX_SPUS];
+
+	struct spu_hw spu;
+
+	atomic_t session_count;	/* number of streams active */
+	atomic_t stream_count;	/* monotonic counter for streamID's */
+
+	/* Length of BCM header. Set to 0 when hw does not expect BCM HEADER. */
+	u8 bcm_hdr_len;
+
+	/* The index of the channel to use for the next crypto request */
+	atomic_t next_chan;
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_stats;
+
+	/* Number of request bytes processed and result bytes returned */
+	atomic64_t bytes_in;
+	atomic64_t bytes_out;
+
+	/* Number of operations of each type */
+	atomic_t op_counts[SPU_OP_NUM];
+
+	atomic_t cipher_cnt[CIPHER_ALG_LAST][CIPHER_MODE_LAST];
+	atomic_t hash_cnt[HASH_ALG_LAST];
+	atomic_t hmac_cnt[HASH_ALG_LAST];
+	atomic_t aead_cnt[AEAD_TYPE_LAST];
+
+	/* Number of calls to setkey() for each operation type */
+	atomic_t setkey_cnt[SPU_OP_NUM];
+
+	/* Number of times request was resubmitted because mb was full */
+	atomic_t mb_no_spc;
+
+	/* Number of mailbox send failures */
+	atomic_t mb_send_fail;
+
+	/* Number of ICV check failures for AEAD messages */
+	atomic_t bad_icv;
+
+	struct mbox_client mcl[MAX_SPUS];
+	/* Array of mailbox channel pointers, one for each channel */
+	struct mbox_chan *mbox[MAX_SPUS];
+
+	/* Driver initialized */
+	bool inited;
+};
+
+extern struct device_private iproc_priv;
+
+#endif
diff --git a/drivers/crypto/bcm/spu.c b/drivers/crypto/bcm/spu.c
new file mode 100644
index 000000000000..dbb5c03dde49
--- /dev/null
+++ b/drivers/crypto/bcm/spu.c
@@ -0,0 +1,1251 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include "util.h"
+#include "spu.h"
+#include "spum.h"
+#include "cipher.h"
+
+/* This array is based on the hash algo type supported in spu.h */
+char *tag_to_hash_idx[] = { "none", "md5", "sha1", "sha224", "sha256" };
+
+char *hash_alg_name[] = { "None", "md5", "sha1", "sha224", "sha256", "aes",
+	"sha384", "sha512", "sha3_224", "sha3_256", "sha3_384", "sha3_512" };
+
+char *aead_alg_name[] = { "ccm(aes)", "gcm(aes)", "authenc" };
+
+/* Assumes SPU-M messages are in big endian */
+void spum_dump_msg_hdr(u8 *buf, unsigned int buf_len)
+{
+	u8 *ptr = buf;
+	struct SPUHEADER *spuh = (struct SPUHEADER *)buf;
+	unsigned int hash_key_len = 0;
+	unsigned int hash_state_len = 0;
+	unsigned int cipher_key_len = 0;
+	unsigned int iv_len;
+	u32 pflags;
+	u32 cflags;
+	u32 ecf;
+	u32 cipher_alg;
+	u32 cipher_mode;
+	u32 cipher_type;
+	u32 hash_alg;
+	u32 hash_mode;
+	u32 hash_type;
+	u32 sctx_size;   /* SCTX length in words */
+	u32 sctx_pl_len; /* SCTX payload length in bytes */
+
+	packet_log("\n");
+	packet_log("SPU Message header %p len: %u\n", buf, buf_len);
+
+	/* ========== Decode MH ========== */
+	packet_log("  MH 0x%08x\n", be32_to_cpu(*((u32 *)ptr)));
+	if (spuh->mh.flags & MH_SCTX_PRES)
+		packet_log("    SCTX  present\n");
+	if (spuh->mh.flags & MH_BDESC_PRES)
+		packet_log("    BDESC present\n");
+	if (spuh->mh.flags & MH_MFM_PRES)
+		packet_log("    MFM   present\n");
+	if (spuh->mh.flags & MH_BD_PRES)
+		packet_log("    BD    present\n");
+	if (spuh->mh.flags & MH_HASH_PRES)
+		packet_log("    HASH  present\n");
+	if (spuh->mh.flags & MH_SUPDT_PRES)
+		packet_log("    SUPDT present\n");
+	packet_log("    Opcode 0x%02x\n", spuh->mh.op_code);
+
+	ptr += sizeof(spuh->mh) + sizeof(spuh->emh);  /* skip emh. unused */
+
+	/* ========== Decode SCTX ========== */
+	if (spuh->mh.flags & MH_SCTX_PRES) {
+		pflags = be32_to_cpu(spuh->sa.proto_flags);
+		packet_log("  SCTX[0] 0x%08x\n", pflags);
+		sctx_size = pflags & SCTX_SIZE;
+		packet_log("    Size %u words\n", sctx_size);
+
+		cflags = be32_to_cpu(spuh->sa.cipher_flags);
+		packet_log("  SCTX[1] 0x%08x\n", cflags);
+		packet_log("    Inbound:%lu (1:decrypt/vrfy 0:encrypt/auth)\n",
+			   (cflags & CIPHER_INBOUND) >> CIPHER_INBOUND_SHIFT);
+		packet_log("    Order:%lu (1:AuthFirst 0:EncFirst)\n",
+			   (cflags & CIPHER_ORDER) >> CIPHER_ORDER_SHIFT);
+		packet_log("    ICV_IS_512:%lx\n",
+			   (cflags & ICV_IS_512) >> ICV_IS_512_SHIFT);
+		cipher_alg = (cflags & CIPHER_ALG) >> CIPHER_ALG_SHIFT;
+		cipher_mode = (cflags & CIPHER_MODE) >> CIPHER_MODE_SHIFT;
+		cipher_type = (cflags & CIPHER_TYPE) >> CIPHER_TYPE_SHIFT;
+		packet_log("    Crypto Alg:%u Mode:%u Type:%u\n",
+			   cipher_alg, cipher_mode, cipher_type);
+		hash_alg = (cflags & HASH_ALG) >> HASH_ALG_SHIFT;
+		hash_mode = (cflags & HASH_MODE) >> HASH_MODE_SHIFT;
+		hash_type = (cflags & HASH_TYPE) >> HASH_TYPE_SHIFT;
+		packet_log("    Hash   Alg:%x Mode:%x Type:%x\n",
+			   hash_alg, hash_mode, hash_type);
+		packet_log("    UPDT_Offset:%u\n", cflags & UPDT_OFST);
+
+		ecf = be32_to_cpu(spuh->sa.ecf);
+		packet_log("  SCTX[2] 0x%08x\n", ecf);
+		packet_log("    WriteICV:%lu CheckICV:%lu ICV_SIZE:%u ",
+			   (ecf & INSERT_ICV) >> INSERT_ICV_SHIFT,
+			   (ecf & CHECK_ICV) >> CHECK_ICV_SHIFT,
+			   (ecf & ICV_SIZE) >> ICV_SIZE_SHIFT);
+		packet_log("BD_SUPPRESS:%lu\n",
+			   (ecf & BD_SUPPRESS) >> BD_SUPPRESS_SHIFT);
+		packet_log("    SCTX_IV:%lu ExplicitIV:%lu GenIV:%lu ",
+			   (ecf & SCTX_IV) >> SCTX_IV_SHIFT,
+			   (ecf & EXPLICIT_IV) >> EXPLICIT_IV_SHIFT,
+			   (ecf & GEN_IV) >> GEN_IV_SHIFT);
+		packet_log("IV_OV_OFST:%lu EXP_IV_SIZE:%u\n",
+			   (ecf & IV_OFFSET) >> IV_OFFSET_SHIFT,
+			   ecf & EXP_IV_SIZE);
+
+		ptr += sizeof(struct SCTX);
+
+		if (hash_alg && hash_mode) {
+			char *name = "NONE";
+
+			switch (hash_alg) {
+			case HASH_ALG_MD5:
+				hash_key_len = 16;
+				name = "MD5";
+				break;
+			case HASH_ALG_SHA1:
+				hash_key_len = 20;
+				name = "SHA1";
+				break;
+			case HASH_ALG_SHA224:
+				hash_key_len = 28;
+				name = "SHA224";
+				break;
+			case HASH_ALG_SHA256:
+				hash_key_len = 32;
+				name = "SHA256";
+				break;
+			case HASH_ALG_SHA384:
+				hash_key_len = 48;
+				name = "SHA384";
+				break;
+			case HASH_ALG_SHA512:
+				hash_key_len = 64;
+				name = "SHA512";
+				break;
+			case HASH_ALG_AES:
+				hash_key_len = 0;
+				name = "AES";
+				break;
+			case HASH_ALG_NONE:
+				break;
+			}
+
+			packet_log("    Auth Key Type:%s Length:%u Bytes\n",
+				   name, hash_key_len);
+			packet_dump("    KEY: ", ptr, hash_key_len);
+			ptr += hash_key_len;
+		} else if ((hash_alg == HASH_ALG_AES) &&
+			   (hash_mode == HASH_MODE_XCBC)) {
+			char *name = "NONE";
+
+			switch (cipher_type) {
+			case CIPHER_TYPE_AES128:
+				hash_key_len = 16;
+				name = "AES128-XCBC";
+				break;
+			case CIPHER_TYPE_AES192:
+				hash_key_len = 24;
+				name = "AES192-XCBC";
+				break;
+			case CIPHER_TYPE_AES256:
+				hash_key_len = 32;
+				name = "AES256-XCBC";
+				break;
+			}
+			packet_log("    Auth Key Type:%s Length:%u Bytes\n",
+				   name, hash_key_len);
+			packet_dump("    KEY: ", ptr, hash_key_len);
+			ptr += hash_key_len;
+		}
+
+		if (hash_alg && (hash_mode == HASH_MODE_NONE) &&
+		    (hash_type == HASH_TYPE_UPDT)) {
+			char *name = "NONE";
+
+			switch (hash_alg) {
+			case HASH_ALG_MD5:
+				hash_state_len = 16;
+				name = "MD5";
+				break;
+			case HASH_ALG_SHA1:
+				hash_state_len = 20;
+				name = "SHA1";
+				break;
+			case HASH_ALG_SHA224:
+				hash_state_len = 32;
+				name = "SHA224";
+				break;
+			case HASH_ALG_SHA256:
+				hash_state_len = 32;
+				name = "SHA256";
+				break;
+			case HASH_ALG_SHA384:
+				hash_state_len = 48;
+				name = "SHA384";
+				break;
+			case HASH_ALG_SHA512:
+				hash_state_len = 64;
+				name = "SHA512";
+				break;
+			case HASH_ALG_AES:
+				hash_state_len = 0;
+				name = "AES";
+				break;
+			case HASH_ALG_NONE:
+				break;
+			}
+
+			packet_log("    Auth State Type:%s Length:%u Bytes\n",
+				   name, hash_state_len);
+			packet_dump("    State: ", ptr, hash_state_len);
+			ptr += hash_state_len;
+		}
+
+		if (cipher_alg) {
+			char *name = "NONE";
+
+			switch (cipher_alg) {
+			case CIPHER_ALG_DES:
+				cipher_key_len = 8;
+				name = "DES";
+				break;
+			case CIPHER_ALG_3DES:
+				cipher_key_len = 24;
+				name = "3DES";
+				break;
+			case CIPHER_ALG_RC4:
+				cipher_key_len = 260;
+				name = "ARC4";
+				break;
+			case CIPHER_ALG_AES:
+				switch (cipher_type) {
+				case CIPHER_TYPE_AES128:
+					cipher_key_len = 16;
+					name = "AES128";
+					break;
+				case CIPHER_TYPE_AES192:
+					cipher_key_len = 24;
+					name = "AES192";
+					break;
+				case CIPHER_TYPE_AES256:
+					cipher_key_len = 32;
+					name = "AES256";
+					break;
+				}
+				break;
+			case CIPHER_ALG_NONE:
+				break;
+			}
+
+			packet_log("    Cipher Key Type:%s Length:%u Bytes\n",
+				   name, cipher_key_len);
+
+			/* XTS has two keys */
+			if (cipher_mode == CIPHER_MODE_XTS) {
+				packet_dump("    KEY2: ", ptr, cipher_key_len);
+				ptr += cipher_key_len;
+				packet_dump("    KEY1: ", ptr, cipher_key_len);
+				ptr += cipher_key_len;
+
+				cipher_key_len *= 2;
+			} else {
+				packet_dump("    KEY: ", ptr, cipher_key_len);
+				ptr += cipher_key_len;
+			}
+
+			if (ecf & SCTX_IV) {
+				sctx_pl_len = sctx_size * sizeof(u32) -
+					sizeof(struct SCTX);
+				iv_len = sctx_pl_len -
+					(hash_key_len + hash_state_len +
+					 cipher_key_len);
+				packet_log("    IV Length:%u Bytes\n", iv_len);
+				packet_dump("    IV: ", ptr, iv_len);
+				ptr += iv_len;
+			}
+		}
+	}
+
+	/* ========== Decode BDESC ========== */
+	if (spuh->mh.flags & MH_BDESC_PRES) {
+#ifdef DEBUG
+		struct BDESC_HEADER *bdesc = (struct BDESC_HEADER *)ptr;
+#endif
+		packet_log("  BDESC[0] 0x%08x\n", be32_to_cpu(*((u32 *)ptr)));
+		packet_log("    OffsetMAC:%u LengthMAC:%u\n",
+			   be16_to_cpu(bdesc->offset_mac),
+			   be16_to_cpu(bdesc->length_mac));
+		ptr += sizeof(u32);
+
+		packet_log("  BDESC[1] 0x%08x\n", be32_to_cpu(*((u32 *)ptr)));
+		packet_log("    OffsetCrypto:%u LengthCrypto:%u\n",
+			   be16_to_cpu(bdesc->offset_crypto),
+			   be16_to_cpu(bdesc->length_crypto));
+		ptr += sizeof(u32);
+
+		packet_log("  BDESC[2] 0x%08x\n", be32_to_cpu(*((u32 *)ptr)));
+		packet_log("    OffsetICV:%u OffsetIV:%u\n",
+			   be16_to_cpu(bdesc->offset_icv),
+			   be16_to_cpu(bdesc->offset_iv));
+		ptr += sizeof(u32);
+	}
+
+	/* ========== Decode BD ========== */
+	if (spuh->mh.flags & MH_BD_PRES) {
+#ifdef DEBUG
+		struct BD_HEADER *bd = (struct BD_HEADER *)ptr;
+#endif
+		packet_log("  BD[0] 0x%08x\n", be32_to_cpu(*((u32 *)ptr)));
+		packet_log("    Size:%ubytes PrevLength:%u\n",
+			   be16_to_cpu(bd->size), be16_to_cpu(bd->prev_length));
+		ptr += 4;
+	}
+
+	/* Double check sanity */
+	if (buf + buf_len != ptr) {
+		packet_log(" Packet parsed incorrectly. ");
+		packet_log("buf:%p buf_len:%u buf+buf_len:%p ptr:%p\n",
+			   buf, buf_len, buf + buf_len, ptr);
+	}
+
+	packet_log("\n");
+}
+
+/**
+ * spum_ns2_ctx_max_payload() - Determine the max length of the payload for a
+ * SPU message for a given cipher and hash alg context.
+ * @cipher_alg:		The cipher algorithm
+ * @cipher_mode:	The cipher mode
+ * @blocksize:		The size of a block of data for this algo
+ *
+ * The max payload must be a multiple of the blocksize so that if a request is
+ * too large to fit in a single SPU message, the request can be broken into
+ * max_payload sized chunks. Each chunk must be a multiple of blocksize.
+ *
+ * Return: Max payload length in bytes
+ */
+u32 spum_ns2_ctx_max_payload(enum spu_cipher_alg cipher_alg,
+			     enum spu_cipher_mode cipher_mode,
+			     unsigned int blocksize)
+{
+	u32 max_payload = SPUM_NS2_MAX_PAYLOAD;
+	u32 excess;
+
+	/* In XTS on SPU-M, we'll need to insert tweak before input data */
+	if (cipher_mode == CIPHER_MODE_XTS)
+		max_payload -= SPU_XTS_TWEAK_SIZE;
+
+	excess = max_payload % blocksize;
+
+	return max_payload - excess;
+}
+
+/**
+ * spum_nsp_ctx_max_payload() - Determine the max length of the payload for a
+ * SPU message for a given cipher and hash alg context.
+ * @cipher_alg:		The cipher algorithm
+ * @cipher_mode:	The cipher mode
+ * @blocksize:		The size of a block of data for this algo
+ *
+ * The max payload must be a multiple of the blocksize so that if a request is
+ * too large to fit in a single SPU message, the request can be broken into
+ * max_payload sized chunks. Each chunk must be a multiple of blocksize.
+ *
+ * Return: Max payload length in bytes
+ */
+u32 spum_nsp_ctx_max_payload(enum spu_cipher_alg cipher_alg,
+			     enum spu_cipher_mode cipher_mode,
+			     unsigned int blocksize)
+{
+	u32 max_payload = SPUM_NSP_MAX_PAYLOAD;
+	u32 excess;
+
+	/* In XTS on SPU-M, we'll need to insert tweak before input data */
+	if (cipher_mode == CIPHER_MODE_XTS)
+		max_payload -= SPU_XTS_TWEAK_SIZE;
+
+	excess = max_payload % blocksize;
+
+	return max_payload - excess;
+}
+
+/** spum_payload_length() - Given a SPU-M message header, extract the payload
+ * length.
+ * @spu_hdr:	Start of SPU header
+ *
+ * Assumes just MH, EMH, BD (no SCTX, BDESC. Works for response frames.
+ *
+ * Return: payload length in bytes
+ */
+u32 spum_payload_length(u8 *spu_hdr)
+{
+	struct BD_HEADER *bd;
+	u32 pl_len;
+
+	/* Find BD header.  skip MH, EMH */
+	bd = (struct BD_HEADER *)(spu_hdr + 8);
+	pl_len = be16_to_cpu(bd->size);
+
+	return pl_len;
+}
+
+/**
+ * spum_response_hdr_len() - Given the length of the hash key and encryption
+ * key, determine the expected length of a SPU response header.
+ * @auth_key_len:	authentication key length (bytes)
+ * @enc_key_len:	encryption key length (bytes)
+ * @is_hash:		true if response message is for a hash operation
+ *
+ * Return: length of SPU response header (bytes)
+ */
+u16 spum_response_hdr_len(u16 auth_key_len, u16 enc_key_len, bool is_hash)
+{
+	if (is_hash)
+		return SPU_HASH_RESP_HDR_LEN;
+	else
+		return SPU_RESP_HDR_LEN;
+}
+
+/**
+ * spum_hash_pad_len() - Calculate the length of hash padding required to extend
+ * data to a full block size.
+ * @hash_alg:   hash algorithm
+ * @hash_mode:       hash mode
+ * @chunksize:  length of data, in bytes
+ * @hash_block_size:  size of a block of data for hash algorithm
+ *
+ * Reserve space for 1 byte (0x80) start of pad and the total length as u64
+ *
+ * Return:  length of hash pad in bytes
+ */
+u16 spum_hash_pad_len(enum hash_alg hash_alg, enum hash_mode hash_mode,
+		      u32 chunksize, u16 hash_block_size)
+{
+	unsigned int length_len;
+	unsigned int used_space_last_block;
+	int hash_pad_len;
+
+	/* AES-XCBC hash requires just padding to next block boundary */
+	if ((hash_alg == HASH_ALG_AES) && (hash_mode == HASH_MODE_XCBC)) {
+		used_space_last_block = chunksize % hash_block_size;
+		hash_pad_len = hash_block_size - used_space_last_block;
+		if (hash_pad_len >= hash_block_size)
+			hash_pad_len -= hash_block_size;
+		return hash_pad_len;
+	}
+
+	used_space_last_block = chunksize % hash_block_size + 1;
+	if ((hash_alg == HASH_ALG_SHA384) || (hash_alg == HASH_ALG_SHA512))
+		length_len = 2 * sizeof(u64);
+	else
+		length_len = sizeof(u64);
+
+	used_space_last_block += length_len;
+	hash_pad_len = hash_block_size - used_space_last_block;
+	if (hash_pad_len < 0)
+		hash_pad_len += hash_block_size;
+
+	hash_pad_len += 1 + length_len;
+	return hash_pad_len;
+}
+
+/**
+ * spum_gcm_ccm_pad_len() - Determine the required length of GCM or CCM padding.
+ * @cipher_mode:	Algo type
+ * @data_size:		Length of plaintext (bytes)
+ *
+ * @Return: Length of padding, in bytes
+ */
+u32 spum_gcm_ccm_pad_len(enum spu_cipher_mode cipher_mode,
+			 unsigned int data_size)
+{
+	u32 pad_len = 0;
+	u32 m1 = SPU_GCM_CCM_ALIGN - 1;
+
+	if ((cipher_mode == CIPHER_MODE_GCM) ||
+	    (cipher_mode == CIPHER_MODE_CCM))
+		pad_len = ((data_size + m1) & ~m1) - data_size;
+
+	return pad_len;
+}
+
+/**
+ * spum_assoc_resp_len() - Determine the size of the receive buffer required to
+ * catch associated data.
+ * @cipher_mode:	cipher mode
+ * @assoc_len:		length of associated data (bytes)
+ * @iv_len:		length of IV (bytes)
+ * @is_encrypt:		true if encrypting. false if decrypting.
+ *
+ * Return: length of associated data in response message (bytes)
+ */
+u32 spum_assoc_resp_len(enum spu_cipher_mode cipher_mode,
+			unsigned int assoc_len, unsigned int iv_len,
+			bool is_encrypt)
+{
+	u32 buflen = 0;
+	u32 pad;
+
+	if (assoc_len)
+		buflen = assoc_len;
+
+	if (cipher_mode == CIPHER_MODE_GCM) {
+		/* AAD needs to be padded in responses too */
+		pad = spum_gcm_ccm_pad_len(cipher_mode, buflen);
+		buflen += pad;
+	}
+	if (cipher_mode == CIPHER_MODE_CCM) {
+		/*
+		 * AAD needs to be padded in responses too
+		 * for CCM, len + 2 needs to be 128-bit aligned.
+		 */
+		pad = spum_gcm_ccm_pad_len(cipher_mode, buflen + 2);
+		buflen += pad;
+	}
+
+	return buflen;
+}
+
+/**
+ * spu_aead_ivlen() - Calculate the length of the AEAD IV to be included
+ * in a SPU request after the AAD and before the payload.
+ * @cipher_mode:  cipher mode
+ * @iv_ctr_len:   initialization vector length in bytes
+ *
+ * In Linux ~4.2 and later, the assoc_data sg includes the IV. So no need
+ * to include the IV as a separate field in the SPU request msg.
+ *
+ * Return: Length of AEAD IV in bytes
+ */
+u8 spum_aead_ivlen(enum spu_cipher_mode cipher_mode, u16 iv_len)
+{
+	return 0;
+}
+
+/**
+ * spum_hash_type() - Determine the type of hash operation.
+ * @src_sent:  The number of bytes in the current request that have already
+ *             been sent to the SPU to be hashed.
+ *
+ * We do not use HASH_TYPE_FULL for requests that fit in a single SPU message.
+ * Using FULL causes failures (such as when the string to be hashed is empty).
+ * For similar reasons, we never use HASH_TYPE_FIN. Instead, submit messages
+ * as INIT or UPDT and do the hash padding in sw.
+ */
+enum hash_type spum_hash_type(u32 src_sent)
+{
+	return src_sent ? HASH_TYPE_UPDT : HASH_TYPE_INIT;
+}
+
+/**
+ * spum_digest_size() - Determine the size of a hash digest to expect the SPU to
+ * return.
+ * alg_digest_size: Number of bytes in the final digest for the given algo
+ * alg:             The hash algorithm
+ * htype:           Type of hash operation (init, update, full, etc)
+ *
+ * When doing incremental hashing for an algorithm with a truncated hash
+ * (e.g., SHA224), the SPU returns the full digest so that it can be fed back as
+ * a partial result for the next chunk.
+ */
+u32 spum_digest_size(u32 alg_digest_size, enum hash_alg alg,
+		     enum hash_type htype)
+{
+	u32 digestsize = alg_digest_size;
+
+	/* SPU returns complete digest when doing incremental hash and truncated
+	 * hash algo.
+	 */
+	if ((htype == HASH_TYPE_INIT) || (htype == HASH_TYPE_UPDT)) {
+		if (alg == HASH_ALG_SHA224)
+			digestsize = SHA256_DIGEST_SIZE;
+		else if (alg == HASH_ALG_SHA384)
+			digestsize = SHA512_DIGEST_SIZE;
+	}
+	return digestsize;
+}
+
+/**
+ * spum_create_request() - Build a SPU request message header, up to and
+ * including the BD header. Construct the message starting at spu_hdr. Caller
+ * should allocate this buffer in DMA-able memory at least SPU_HEADER_ALLOC_LEN
+ * bytes long.
+ * @spu_hdr: Start of buffer where SPU request header is to be written
+ * @req_opts: SPU request message options
+ * @cipher_parms: Parameters related to cipher algorithm
+ * @hash_parms:   Parameters related to hash algorithm
+ * @aead_parms:   Parameters related to AEAD operation
+ * @data_size:    Length of data to be encrypted or authenticated. If AEAD, does
+ *		  not include length of AAD.
+
+ * Return: the length of the SPU header in bytes. 0 if an error occurs.
+ */
+u32 spum_create_request(u8 *spu_hdr,
+			struct spu_request_opts *req_opts,
+			struct spu_cipher_parms *cipher_parms,
+			struct spu_hash_parms *hash_parms,
+			struct spu_aead_parms *aead_parms,
+			unsigned int data_size)
+{
+	struct SPUHEADER *spuh;
+	struct BDESC_HEADER *bdesc;
+	struct BD_HEADER *bd;
+
+	u8 *ptr;
+	u32 protocol_bits = 0;
+	u32 cipher_bits = 0;
+	u32 ecf_bits = 0;
+	u8 sctx_words = 0;
+	unsigned int buf_len = 0;
+
+	/* size of the cipher payload */
+	unsigned int cipher_len = hash_parms->prebuf_len + data_size +
+				hash_parms->pad_len;
+
+	/* offset of prebuf or data from end of BD header */
+	unsigned int cipher_offset = aead_parms->assoc_size +
+		aead_parms->iv_len + aead_parms->aad_pad_len;
+
+	/* total size of the DB data (without STAT word padding) */
+	unsigned int real_db_size = spu_real_db_size(aead_parms->assoc_size,
+						 aead_parms->iv_len,
+						 hash_parms->prebuf_len,
+						 data_size,
+						 aead_parms->aad_pad_len,
+						 aead_parms->data_pad_len,
+						 hash_parms->pad_len);
+
+	unsigned int auth_offset = 0;
+	unsigned int offset_iv = 0;
+
+	/* size/offset of the auth payload */
+	unsigned int auth_len;
+
+	auth_len = real_db_size;
+
+	if (req_opts->is_aead && req_opts->is_inbound)
+		cipher_len -= hash_parms->digestsize;
+
+	if (req_opts->is_aead && req_opts->is_inbound)
+		auth_len -= hash_parms->digestsize;
+
+	if ((hash_parms->alg == HASH_ALG_AES) &&
+	    (hash_parms->mode == HASH_MODE_XCBC)) {
+		auth_len -= hash_parms->pad_len;
+		cipher_len -= hash_parms->pad_len;
+	}
+
+	flow_log("%s()\n", __func__);
+	flow_log("  in:%u authFirst:%u\n",
+		 req_opts->is_inbound, req_opts->auth_first);
+	flow_log("  %s. cipher alg:%u mode:%u type %u\n",
+		 spu_alg_name(cipher_parms->alg, cipher_parms->mode),
+		 cipher_parms->alg, cipher_parms->mode, cipher_parms->type);
+	flow_log("    key: %d\n", cipher_parms->key_len);
+	flow_dump("    key: ", cipher_parms->key_buf, cipher_parms->key_len);
+	flow_log("    iv: %d\n", cipher_parms->iv_len);
+	flow_dump("    iv: ", cipher_parms->iv_buf, cipher_parms->iv_len);
+	flow_log("  auth alg:%u mode:%u type %u\n",
+		 hash_parms->alg, hash_parms->mode, hash_parms->type);
+	flow_log("  digestsize: %u\n", hash_parms->digestsize);
+	flow_log("  authkey: %d\n", hash_parms->key_len);
+	flow_dump("  authkey: ", hash_parms->key_buf, hash_parms->key_len);
+	flow_log("  assoc_size:%u\n", aead_parms->assoc_size);
+	flow_log("  prebuf_len:%u\n", hash_parms->prebuf_len);
+	flow_log("  data_size:%u\n", data_size);
+	flow_log("  hash_pad_len:%u\n", hash_parms->pad_len);
+	flow_log("  real_db_size:%u\n", real_db_size);
+	flow_log(" auth_offset:%u auth_len:%u cipher_offset:%u cipher_len:%u\n",
+		 auth_offset, auth_len, cipher_offset, cipher_len);
+	flow_log("  aead_iv: %u\n", aead_parms->iv_len);
+
+	/* starting out: zero the header (plus some) */
+	ptr = spu_hdr;
+	memset(ptr, 0, sizeof(struct SPUHEADER));
+
+	/* format master header word */
+	/* Do not set the next bit even though the datasheet says to */
+	spuh = (struct SPUHEADER *)ptr;
+	ptr += sizeof(struct SPUHEADER);
+	buf_len += sizeof(struct SPUHEADER);
+
+	spuh->mh.op_code = SPU_CRYPTO_OPERATION_GENERIC;
+	spuh->mh.flags |= (MH_SCTX_PRES | MH_BDESC_PRES | MH_BD_PRES);
+
+	/* Format sctx word 0 (protocol_bits) */
+	sctx_words = 3;		/* size in words */
+
+	/* Format sctx word 1 (cipher_bits) */
+	if (req_opts->is_inbound)
+		cipher_bits |= CIPHER_INBOUND;
+	if (req_opts->auth_first)
+		cipher_bits |= CIPHER_ORDER;
+
+	/* Set the crypto parameters in the cipher.flags */
+	cipher_bits |= cipher_parms->alg << CIPHER_ALG_SHIFT;
+	cipher_bits |= cipher_parms->mode << CIPHER_MODE_SHIFT;
+	cipher_bits |= cipher_parms->type << CIPHER_TYPE_SHIFT;
+
+	/* Set the auth parameters in the cipher.flags */
+	cipher_bits |= hash_parms->alg << HASH_ALG_SHIFT;
+	cipher_bits |= hash_parms->mode << HASH_MODE_SHIFT;
+	cipher_bits |= hash_parms->type << HASH_TYPE_SHIFT;
+
+	/*
+	 * Format sctx extensions if required, and update main fields if
+	 * required)
+	 */
+	if (hash_parms->alg) {
+		/* Write the authentication key material if present */
+		if (hash_parms->key_len) {
+			memcpy(ptr, hash_parms->key_buf, hash_parms->key_len);
+			ptr += hash_parms->key_len;
+			buf_len += hash_parms->key_len;
+			sctx_words += hash_parms->key_len / 4;
+		}
+
+		if ((cipher_parms->mode == CIPHER_MODE_GCM) ||
+		    (cipher_parms->mode == CIPHER_MODE_CCM))
+			/* unpadded length */
+			offset_iv = aead_parms->assoc_size;
+
+		/* if GCM/CCM we need to write ICV into the payload */
+		if (!req_opts->is_inbound) {
+			if ((cipher_parms->mode == CIPHER_MODE_GCM) ||
+			    (cipher_parms->mode == CIPHER_MODE_CCM))
+				ecf_bits |= 1 << INSERT_ICV_SHIFT;
+		} else {
+			ecf_bits |= CHECK_ICV;
+		}
+
+		/* Inform the SPU of the ICV size (in words) */
+		if (hash_parms->digestsize == 64)
+			cipher_bits |= ICV_IS_512;
+		else
+			ecf_bits |=
+			(hash_parms->digestsize / 4) << ICV_SIZE_SHIFT;
+	}
+
+	if (req_opts->bd_suppress)
+		ecf_bits |= BD_SUPPRESS;
+
+	/* copy the encryption keys in the SAD entry */
+	if (cipher_parms->alg) {
+		if (cipher_parms->key_len) {
+			memcpy(ptr, cipher_parms->key_buf,
+			       cipher_parms->key_len);
+			ptr += cipher_parms->key_len;
+			buf_len += cipher_parms->key_len;
+			sctx_words += cipher_parms->key_len / 4;
+		}
+
+		/*
+		 * if encrypting then set IV size, use SCTX IV unless no IV
+		 * given here
+		 */
+		if (cipher_parms->iv_buf && cipher_parms->iv_len) {
+			/* Use SCTX IV */
+			ecf_bits |= SCTX_IV;
+
+			/* cipher iv provided so put it in here */
+			memcpy(ptr, cipher_parms->iv_buf, cipher_parms->iv_len);
+
+			ptr += cipher_parms->iv_len;
+			buf_len += cipher_parms->iv_len;
+			sctx_words += cipher_parms->iv_len / 4;
+		}
+	}
+
+	/*
+	 * RFC4543 (GMAC/ESP) requires data to be sent as part of AAD
+	 * so we need to override the BDESC parameters.
+	 */
+	if (req_opts->is_rfc4543) {
+		if (req_opts->is_inbound)
+			data_size -= hash_parms->digestsize;
+		offset_iv = aead_parms->assoc_size + data_size;
+		cipher_len = 0;
+		cipher_offset = offset_iv;
+		auth_len = cipher_offset + aead_parms->data_pad_len;
+	}
+
+	/* write in the total sctx length now that we know it */
+	protocol_bits |= sctx_words;
+
+	/* Endian adjust the SCTX */
+	spuh->sa.proto_flags = cpu_to_be32(protocol_bits);
+	spuh->sa.cipher_flags = cpu_to_be32(cipher_bits);
+	spuh->sa.ecf = cpu_to_be32(ecf_bits);
+
+	/* === create the BDESC section === */
+	bdesc = (struct BDESC_HEADER *)ptr;
+
+	bdesc->offset_mac = cpu_to_be16(auth_offset);
+	bdesc->length_mac = cpu_to_be16(auth_len);
+	bdesc->offset_crypto = cpu_to_be16(cipher_offset);
+	bdesc->length_crypto = cpu_to_be16(cipher_len);
+
+	/*
+	 * CCM in SPU-M requires that ICV not be in same 32-bit word as data or
+	 * padding.  So account for padding as necessary.
+	 */
+	if (cipher_parms->mode == CIPHER_MODE_CCM)
+		auth_len += spum_wordalign_padlen(auth_len);
+
+	bdesc->offset_icv = cpu_to_be16(auth_len);
+	bdesc->offset_iv = cpu_to_be16(offset_iv);
+
+	ptr += sizeof(struct BDESC_HEADER);
+	buf_len += sizeof(struct BDESC_HEADER);
+
+	/* === no MFM section === */
+
+	/* === create the BD section === */
+
+	/* add the BD header */
+	bd = (struct BD_HEADER *)ptr;
+	bd->size = cpu_to_be16(real_db_size);
+	bd->prev_length = 0;
+
+	ptr += sizeof(struct BD_HEADER);
+	buf_len += sizeof(struct BD_HEADER);
+
+	packet_dump("  SPU request header: ", spu_hdr, buf_len);
+
+	return buf_len;
+}
+
+/**
+ * spum_cipher_req_init() - Build a SPU request message header, up to and
+ * including the BD header.
+ * @spu_hdr:      Start of SPU request header (MH)
+ * @cipher_parms: Parameters that describe the cipher request
+ *
+ * Construct the message starting at spu_hdr. Caller should allocate this buffer
+ * in DMA-able memory at least SPU_HEADER_ALLOC_LEN bytes long.
+ *
+ * Return: the length of the SPU header in bytes. 0 if an error occurs.
+ */
+u16 spum_cipher_req_init(u8 *spu_hdr, struct spu_cipher_parms *cipher_parms)
+{
+	struct SPUHEADER *spuh;
+	u32 protocol_bits = 0;
+	u32 cipher_bits = 0;
+	u32 ecf_bits = 0;
+	u8 sctx_words = 0;
+	u8 *ptr = spu_hdr;
+
+	flow_log("%s()\n", __func__);
+	flow_log("  cipher alg:%u mode:%u type %u\n", cipher_parms->alg,
+		 cipher_parms->mode, cipher_parms->type);
+	flow_log("  cipher_iv_len: %u\n", cipher_parms->iv_len);
+	flow_log("    key: %d\n", cipher_parms->key_len);
+	flow_dump("    key: ", cipher_parms->key_buf, cipher_parms->key_len);
+
+	/* starting out: zero the header (plus some) */
+	memset(spu_hdr, 0, sizeof(struct SPUHEADER));
+	ptr += sizeof(struct SPUHEADER);
+
+	/* format master header word */
+	/* Do not set the next bit even though the datasheet says to */
+	spuh = (struct SPUHEADER *)spu_hdr;
+
+	spuh->mh.op_code = SPU_CRYPTO_OPERATION_GENERIC;
+	spuh->mh.flags |= (MH_SCTX_PRES | MH_BDESC_PRES | MH_BD_PRES);
+
+	/* Format sctx word 0 (protocol_bits) */
+	sctx_words = 3;		/* size in words */
+
+	/* copy the encryption keys in the SAD entry */
+	if (cipher_parms->alg) {
+		if (cipher_parms->key_len) {
+			ptr += cipher_parms->key_len;
+			sctx_words += cipher_parms->key_len / 4;
+		}
+
+		/*
+		 * if encrypting then set IV size, use SCTX IV unless no IV
+		 * given here
+		 */
+		if (cipher_parms->iv_len) {
+			/* Use SCTX IV */
+			ecf_bits |= SCTX_IV;
+			ptr += cipher_parms->iv_len;
+			sctx_words += cipher_parms->iv_len / 4;
+		}
+	}
+
+	/* Set the crypto parameters in the cipher.flags */
+	cipher_bits |= cipher_parms->alg << CIPHER_ALG_SHIFT;
+	cipher_bits |= cipher_parms->mode << CIPHER_MODE_SHIFT;
+	cipher_bits |= cipher_parms->type << CIPHER_TYPE_SHIFT;
+
+	/* copy the encryption keys in the SAD entry */
+	if (cipher_parms->alg && cipher_parms->key_len)
+		memcpy(spuh + 1, cipher_parms->key_buf, cipher_parms->key_len);
+
+	/* write in the total sctx length now that we know it */
+	protocol_bits |= sctx_words;
+
+	/* Endian adjust the SCTX */
+	spuh->sa.proto_flags = cpu_to_be32(protocol_bits);
+
+	/* Endian adjust the SCTX */
+	spuh->sa.cipher_flags = cpu_to_be32(cipher_bits);
+	spuh->sa.ecf = cpu_to_be32(ecf_bits);
+
+	packet_dump("  SPU request header: ", spu_hdr,
+		    sizeof(struct SPUHEADER));
+
+	return sizeof(struct SPUHEADER) + cipher_parms->key_len +
+		cipher_parms->iv_len + sizeof(struct BDESC_HEADER) +
+		sizeof(struct BD_HEADER);
+}
+
+/**
+ * spum_cipher_req_finish() - Finish building a SPU request message header for a
+ * block cipher request. Assumes much of the header was already filled in at
+ * setkey() time in spu_cipher_req_init().
+ * @spu_hdr:         Start of the request message header (MH field)
+ * @spu_req_hdr_len: Length in bytes of the SPU request header
+ * @isInbound:       0 encrypt, 1 decrypt
+ * @cipher_parms:    Parameters describing cipher operation to be performed
+ * @update_key:      If true, rewrite the cipher key in SCTX
+ * @data_size:       Length of the data in the BD field
+ *
+ * Assumes much of the header was already filled in at setkey() time in
+ * spum_cipher_req_init().
+ * spum_cipher_req_init() fills in the encryption key. For RC4, when submitting
+ * a request for a non-first chunk, we use the 260-byte SUPDT field from the
+ * previous response as the key. update_key is true for this case. Unused in all
+ * other cases.
+ */
+void spum_cipher_req_finish(u8 *spu_hdr,
+			    u16 spu_req_hdr_len,
+			    unsigned int is_inbound,
+			    struct spu_cipher_parms *cipher_parms,
+			    bool update_key,
+			    unsigned int data_size)
+{
+	struct SPUHEADER *spuh;
+	struct BDESC_HEADER *bdesc;
+	struct BD_HEADER *bd;
+	u8 *bdesc_ptr = spu_hdr + spu_req_hdr_len -
+	    (sizeof(struct BD_HEADER) + sizeof(struct BDESC_HEADER));
+
+	u32 cipher_bits;
+
+	flow_log("%s()\n", __func__);
+	flow_log(" in: %u\n", is_inbound);
+	flow_log(" cipher alg: %u, cipher_type: %u\n", cipher_parms->alg,
+		 cipher_parms->type);
+	if (update_key) {
+		flow_log(" cipher key len: %u\n", cipher_parms->key_len);
+		flow_dump("  key: ", cipher_parms->key_buf,
+			  cipher_parms->key_len);
+	}
+
+	/*
+	 * In XTS mode, API puts "i" parameter (block tweak) in IV.  For
+	 * SPU-M, should be in start of the BD; tx_sg_create() copies it there.
+	 * IV in SPU msg for SPU-M should be 0, since that's the "j" parameter
+	 * (block ctr within larger data unit) - given we can send entire disk
+	 * block (<= 4KB) in 1 SPU msg, don't need to use this parameter.
+	 */
+	if (cipher_parms->mode == CIPHER_MODE_XTS)
+		memset(cipher_parms->iv_buf, 0, cipher_parms->iv_len);
+
+	flow_log(" iv len: %d\n", cipher_parms->iv_len);
+	flow_dump("    iv: ", cipher_parms->iv_buf, cipher_parms->iv_len);
+	flow_log(" data_size: %u\n", data_size);
+
+	/* format master header word */
+	/* Do not set the next bit even though the datasheet says to */
+	spuh = (struct SPUHEADER *)spu_hdr;
+
+	/* cipher_bits was initialized at setkey time */
+	cipher_bits = be32_to_cpu(spuh->sa.cipher_flags);
+
+	/* Format sctx word 1 (cipher_bits) */
+	if (is_inbound)
+		cipher_bits |= CIPHER_INBOUND;
+	else
+		cipher_bits &= ~CIPHER_INBOUND;
+
+	/* update encryption key for RC4 on non-first chunk */
+	if (update_key) {
+		spuh->sa.cipher_flags |=
+			cipher_parms->type << CIPHER_TYPE_SHIFT;
+		memcpy(spuh + 1, cipher_parms->key_buf, cipher_parms->key_len);
+	}
+
+	if (cipher_parms->alg && cipher_parms->iv_buf && cipher_parms->iv_len)
+		/* cipher iv provided so put it in here */
+		memcpy(bdesc_ptr - cipher_parms->iv_len, cipher_parms->iv_buf,
+		       cipher_parms->iv_len);
+
+	spuh->sa.cipher_flags = cpu_to_be32(cipher_bits);
+
+	/* === create the BDESC section === */
+	bdesc = (struct BDESC_HEADER *)bdesc_ptr;
+	bdesc->offset_mac = 0;
+	bdesc->length_mac = 0;
+	bdesc->offset_crypto = 0;
+
+	/* XTS mode, data_size needs to include tweak parameter */
+	if (cipher_parms->mode == CIPHER_MODE_XTS)
+		bdesc->length_crypto = cpu_to_be16(data_size +
+						  SPU_XTS_TWEAK_SIZE);
+	else
+		bdesc->length_crypto = cpu_to_be16(data_size);
+
+	bdesc->offset_icv = 0;
+	bdesc->offset_iv = 0;
+
+	/* === no MFM section === */
+
+	/* === create the BD section === */
+	/* add the BD header */
+	bd = (struct BD_HEADER *)(bdesc_ptr + sizeof(struct BDESC_HEADER));
+	bd->size = cpu_to_be16(data_size);
+
+	/* XTS mode, data_size needs to include tweak parameter */
+	if (cipher_parms->mode == CIPHER_MODE_XTS)
+		bd->size = cpu_to_be16(data_size + SPU_XTS_TWEAK_SIZE);
+	else
+		bd->size = cpu_to_be16(data_size);
+
+	bd->prev_length = 0;
+
+	packet_dump("  SPU request header: ", spu_hdr, spu_req_hdr_len);
+}
+
+/**
+ * spum_request_pad() - Create pad bytes at the end of the data.
+ * @pad_start:		Start of buffer where pad bytes are to be written
+ * @gcm_ccm_padding:	length of GCM/CCM padding, in bytes
+ * @hash_pad_len:	Number of bytes of padding extend data to full block
+ * @auth_alg:		authentication algorithm
+ * @auth_mode:		authentication mode
+ * @total_sent:		length inserted at end of hash pad
+ * @status_padding:	Number of bytes of padding to align STATUS word
+ *
+ * There may be three forms of pad:
+ *  1. GCM/CCM pad - for GCM/CCM mode ciphers, pad to 16-byte alignment
+ *  2. hash pad - pad to a block length, with 0x80 data terminator and
+ *                size at the end
+ *  3. STAT pad - to ensure the STAT field is 4-byte aligned
+ */
+void spum_request_pad(u8 *pad_start,
+		      u32 gcm_ccm_padding,
+		      u32 hash_pad_len,
+		      enum hash_alg auth_alg,
+		      enum hash_mode auth_mode,
+		      unsigned int total_sent, u32 status_padding)
+{
+	u8 *ptr = pad_start;
+
+	/* fix data alignent for GCM/CCM */
+	if (gcm_ccm_padding > 0) {
+		flow_log("  GCM: padding to 16 byte alignment: %u bytes\n",
+			 gcm_ccm_padding);
+		memset(ptr, 0, gcm_ccm_padding);
+		ptr += gcm_ccm_padding;
+	}
+
+	if (hash_pad_len > 0) {
+		/* clear the padding section */
+		memset(ptr, 0, hash_pad_len);
+
+		if ((auth_alg == HASH_ALG_AES) &&
+		    (auth_mode == HASH_MODE_XCBC)) {
+			/* AES/XCBC just requires padding to be 0s */
+			ptr += hash_pad_len;
+		} else {
+			/* terminate the data */
+			*ptr = 0x80;
+			ptr += (hash_pad_len - sizeof(u64));
+
+			/* add the size at the end as required per alg */
+			if (auth_alg == HASH_ALG_MD5)
+				*(u64 *)ptr = cpu_to_le64((u64)total_sent * 8);
+			else		/* SHA1, SHA2-224, SHA2-256 */
+				*(u64 *)ptr = cpu_to_be64((u64)total_sent * 8);
+			ptr += sizeof(u64);
+		}
+	}
+
+	/* pad to a 4byte alignment for STAT */
+	if (status_padding > 0) {
+		flow_log("  STAT: padding to 4 byte alignment: %u bytes\n",
+			 status_padding);
+
+		memset(ptr, 0, status_padding);
+		ptr += status_padding;
+	}
+}
+
+/**
+ * spum_xts_tweak_in_payload() - Indicate that SPUM DOES place the XTS tweak
+ * field in the packet payload (rather than using IV)
+ *
+ * Return: 1
+ */
+u8 spum_xts_tweak_in_payload(void)
+{
+	return 1;
+}
+
+/**
+ * spum_tx_status_len() - Return the length of the STATUS field in a SPU
+ * response message.
+ *
+ * Return: Length of STATUS field in bytes.
+ */
+u8 spum_tx_status_len(void)
+{
+	return SPU_TX_STATUS_LEN;
+}
+
+/**
+ * spum_rx_status_len() - Return the length of the STATUS field in a SPU
+ * response message.
+ *
+ * Return: Length of STATUS field in bytes.
+ */
+u8 spum_rx_status_len(void)
+{
+	return SPU_RX_STATUS_LEN;
+}
+
+/**
+ * spum_status_process() - Process the status from a SPU response message.
+ * @statp:  start of STATUS word
+ * Return:
+ *   0 - if status is good and response should be processed
+ *   !0 - status indicates an error and response is invalid
+ */
+int spum_status_process(u8 *statp)
+{
+	u32 status;
+
+	status = __be32_to_cpu(*(__be32 *)statp);
+	flow_log("SPU response STATUS %#08x\n", status);
+	if (status & SPU_STATUS_ERROR_FLAG) {
+		pr_err("%s() Warning: Error result from SPU: %#08x\n",
+		       __func__, status);
+		if (status & SPU_STATUS_INVALID_ICV)
+			return SPU_INVALID_ICV;
+		return -EBADMSG;
+	}
+	return 0;
+}
+
+/**
+ * spum_ccm_update_iv() - Update the IV as per the requirements for CCM mode.
+ *
+ * @digestsize:		Digest size of this request
+ * @cipher_parms:	(pointer to) cipher parmaeters, includes IV buf & IV len
+ * @assoclen:		Length of AAD data
+ * @chunksize:		length of input data to be sent in this req
+ * @is_encrypt:		true if this is an output/encrypt operation
+ * @is_esp:		true if this is an ESP / RFC4309 operation
+ *
+ */
+void spum_ccm_update_iv(unsigned int digestsize,
+			struct spu_cipher_parms *cipher_parms,
+			unsigned int assoclen,
+			unsigned int chunksize,
+			bool is_encrypt,
+			bool is_esp)
+{
+	u8 L;		/* L from CCM algorithm, length of plaintext data */
+	u8 mprime;	/* M' from CCM algo, (M - 2) / 2, where M=authsize */
+	u8 adata;
+
+	if (cipher_parms->iv_len != CCM_AES_IV_SIZE) {
+		pr_err("%s(): Invalid IV len %d for CCM mode, should be %d\n",
+		       __func__, cipher_parms->iv_len, CCM_AES_IV_SIZE);
+		return;
+	}
+
+	/*
+	 * IV needs to be formatted as follows:
+	 *
+	 * |          Byte 0               | Bytes 1 - N | Bytes (N+1) - 15 |
+	 * | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | Bits 7 - 0  |    Bits 7 - 0    |
+	 * | 0 |Ad?|(M - 2) / 2|   L - 1   |    Nonce    | Plaintext Length |
+	 *
+	 * Ad? = 1 if AAD present, 0 if not present
+	 * M = size of auth field, 8, 12, or 16 bytes (SPU-M) -or-
+	 *                         4, 6, 8, 10, 12, 14, 16 bytes (SPU2)
+	 * L = Size of Plaintext Length field; Nonce size = 15 - L
+	 *
+	 * It appears that the crypto API already expects the L-1 portion
+	 * to be set in the first byte of the IV, which implicitly determines
+	 * the nonce size, and also fills in the nonce.  But the other bits
+	 * in byte 0 as well as the plaintext length need to be filled in.
+	 *
+	 * In rfc4309/esp mode, L is not already in the supplied IV and
+	 * we need to fill it in, as well as move the IV data to be after
+	 * the salt
+	 */
+	if (is_esp) {
+		L = CCM_ESP_L_VALUE;	/* RFC4309 has fixed L */
+	} else {
+		/* L' = plaintext length - 1 so Plaintext length is L' + 1 */
+		L = ((cipher_parms->iv_buf[0] & CCM_B0_L_PRIME) >>
+		      CCM_B0_L_PRIME_SHIFT) + 1;
+	}
+
+	mprime = (digestsize - 2) >> 1;  /* M' = (M - 2) / 2 */
+	adata = (assoclen > 0);  /* adata = 1 if any associated data */
+
+	cipher_parms->iv_buf[0] = (adata << CCM_B0_ADATA_SHIFT) |
+				  (mprime << CCM_B0_M_PRIME_SHIFT) |
+				  ((L - 1) << CCM_B0_L_PRIME_SHIFT);
+
+	/* Nonce is already filled in by crypto API, and is 15 - L bytes */
+
+	/* Don't include digest in plaintext size when decrypting */
+	if (!is_encrypt)
+		chunksize -= digestsize;
+
+	/* Fill in length of plaintext, formatted to be L bytes long */
+	format_value_ccm(chunksize, &cipher_parms->iv_buf[15 - L + 1], L);
+}
+
+/**
+ * spum_wordalign_padlen() - Given the length of a data field, determine the
+ * padding required to align the data following this field on a 4-byte boundary.
+ * @data_size: length of data field in bytes
+ *
+ * Return: length of status field padding, in bytes
+ */
+u32 spum_wordalign_padlen(u32 data_size)
+{
+	return ((data_size + 3) & ~3) - data_size;
+}
diff --git a/drivers/crypto/bcm/spu.h b/drivers/crypto/bcm/spu.h
new file mode 100644
index 000000000000..aa6fc38db263
--- /dev/null
+++ b/drivers/crypto/bcm/spu.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+/*
+ * This file contains the definition of SPU messages. There are currently two
+ * SPU message formats: SPU-M and SPU2. The hardware uses different values to
+ * identify the same things in SPU-M vs SPU2. So this file defines values that
+ * are hardware independent. Software can use these values for any version of
+ * SPU hardware. These values are used in APIs in spu.c. Functions internal to
+ * spu.c and spu2.c convert these to hardware-specific values.
+ */
+
+#ifndef _SPU_H
+#define _SPU_H
+
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+#include <crypto/sha.h>
+
+enum spu_cipher_alg {
+	CIPHER_ALG_NONE = 0x0,
+	CIPHER_ALG_RC4 = 0x1,
+	CIPHER_ALG_DES = 0x2,
+	CIPHER_ALG_3DES = 0x3,
+	CIPHER_ALG_AES = 0x4,
+	CIPHER_ALG_LAST = 0x5
+};
+
+enum spu_cipher_mode {
+	CIPHER_MODE_NONE = 0x0,
+	CIPHER_MODE_ECB = 0x0,
+	CIPHER_MODE_CBC = 0x1,
+	CIPHER_MODE_OFB = 0x2,
+	CIPHER_MODE_CFB = 0x3,
+	CIPHER_MODE_CTR = 0x4,
+	CIPHER_MODE_CCM = 0x5,
+	CIPHER_MODE_GCM = 0x6,
+	CIPHER_MODE_XTS = 0x7,
+	CIPHER_MODE_LAST = 0x8
+};
+
+enum spu_cipher_type {
+	CIPHER_TYPE_NONE = 0x0,
+	CIPHER_TYPE_DES = 0x0,
+	CIPHER_TYPE_3DES = 0x0,
+	CIPHER_TYPE_INIT = 0x0,	/* used for ARC4 */
+	CIPHER_TYPE_AES128 = 0x0,
+	CIPHER_TYPE_AES192 = 0x1,
+	CIPHER_TYPE_UPDT = 0x1,	/* used for ARC4 */
+	CIPHER_TYPE_AES256 = 0x2,
+};
+
+enum hash_alg {
+	HASH_ALG_NONE = 0x0,
+	HASH_ALG_MD5 = 0x1,
+	HASH_ALG_SHA1 = 0x2,
+	HASH_ALG_SHA224 = 0x3,
+	HASH_ALG_SHA256 = 0x4,
+	HASH_ALG_AES = 0x5,
+	HASH_ALG_SHA384 = 0x6,
+	HASH_ALG_SHA512 = 0x7,
+	/* Keep SHA3 algorithms at the end always */
+	HASH_ALG_SHA3_224 = 0x8,
+	HASH_ALG_SHA3_256 = 0x9,
+	HASH_ALG_SHA3_384 = 0xa,
+	HASH_ALG_SHA3_512 = 0xb,
+	HASH_ALG_LAST
+};
+
+enum hash_mode {
+	HASH_MODE_NONE = 0x0,
+	HASH_MODE_HASH = 0x0,
+	HASH_MODE_XCBC = 0x0,
+	HASH_MODE_CMAC = 0x1,
+	HASH_MODE_CTXT = 0x1,
+	HASH_MODE_HMAC = 0x2,
+	HASH_MODE_RABIN = 0x4,
+	HASH_MODE_FHMAC = 0x6,
+	HASH_MODE_CCM = 0x5,
+	HASH_MODE_GCM = 0x6,
+};
+
+enum hash_type {
+	HASH_TYPE_NONE = 0x0,
+	HASH_TYPE_FULL = 0x0,
+	HASH_TYPE_INIT = 0x1,
+	HASH_TYPE_UPDT = 0x2,
+	HASH_TYPE_FIN = 0x3,
+	HASH_TYPE_AES128 = 0x0,
+	HASH_TYPE_AES192 = 0x1,
+	HASH_TYPE_AES256 = 0x2
+};
+
+enum aead_type {
+	AES_CCM,
+	AES_GCM,
+	AUTHENC,
+	AEAD_TYPE_LAST
+};
+
+extern char *hash_alg_name[HASH_ALG_LAST];
+extern char *aead_alg_name[AEAD_TYPE_LAST];
+
+struct spu_request_opts {
+	bool is_inbound;
+	bool auth_first;
+	bool is_aead;
+	bool is_esp;
+	bool bd_suppress;
+	bool is_rfc4543;
+};
+
+struct spu_cipher_parms {
+	enum spu_cipher_alg  alg;
+	enum spu_cipher_mode mode;
+	enum spu_cipher_type type;
+	u8                  *key_buf;
+	u16                  key_len;
+	/* iv_buf and iv_len include salt, if applicable */
+	u8                  *iv_buf;
+	u16                  iv_len;
+};
+
+struct spu_hash_parms {
+	enum hash_alg  alg;
+	enum hash_mode mode;
+	enum hash_type type;
+	u8             digestsize;
+	u8            *key_buf;
+	u16            key_len;
+	u16            prebuf_len;
+	/* length of hash pad. signed, needs to handle roll-overs */
+	int            pad_len;
+};
+
+struct spu_aead_parms {
+	u32 assoc_size;
+	u16 iv_len;      /* length of IV field between assoc data and data */
+	u8  aad_pad_len; /* For AES GCM/CCM, length of padding after AAD */
+	u8  data_pad_len;/* For AES GCM/CCM, length of padding after data */
+	bool return_iv;  /* True if SPU should return an IV */
+	u32 ret_iv_len;  /* Length in bytes of returned IV */
+	u32 ret_iv_off;  /* Offset into full IV if partial IV returned */
+};
+
+/************** SPU sizes ***************/
+
+#define SPU_RX_STATUS_LEN  4
+
+/* Max length of padding for 4-byte alignment of STATUS field */
+#define SPU_STAT_PAD_MAX  4
+
+/* Max length of pad fragment. 4 is for 4-byte alignment of STATUS field */
+#define SPU_PAD_LEN_MAX (SPU_GCM_CCM_ALIGN + MAX_HASH_BLOCK_SIZE + \
+			 SPU_STAT_PAD_MAX)
+
+/* GCM and CCM require 16-byte alignment */
+#define SPU_GCM_CCM_ALIGN 16
+
+/* Length up SUPDT field in SPU response message for RC4 */
+#define SPU_SUPDT_LEN 260
+
+/* SPU status error codes. These used as common error codes across all
+ * SPU variants.
+ */
+#define SPU_INVALID_ICV  1
+
+/* Indicates no limit to the length of the payload in a SPU message */
+#define SPU_MAX_PAYLOAD_INF  0xFFFFFFFF
+
+/* Size of XTS tweak ("i" parameter), in bytes */
+#define SPU_XTS_TWEAK_SIZE 16
+
+/* CCM B_0 field definitions, common for SPU-M and SPU2 */
+#define CCM_B0_ADATA		0x40
+#define CCM_B0_ADATA_SHIFT	   6
+#define CCM_B0_M_PRIME		0x38
+#define CCM_B0_M_PRIME_SHIFT	   3
+#define CCM_B0_L_PRIME		0x07
+#define CCM_B0_L_PRIME_SHIFT	   0
+#define CCM_ESP_L_VALUE		   4
+
+/**
+ * spu_req_incl_icv() - Return true if SPU request message should include the
+ * ICV as a separate buffer.
+ * @cipher_mode:  the cipher mode being requested
+ * @is_encrypt:   true if encrypting. false if decrypting.
+ *
+ * Return:  true if ICV to be included as separate buffer
+ */
+static __always_inline  bool spu_req_incl_icv(enum spu_cipher_mode cipher_mode,
+					      bool is_encrypt)
+{
+	if ((cipher_mode == CIPHER_MODE_GCM) && !is_encrypt)
+		return true;
+	if ((cipher_mode == CIPHER_MODE_CCM) && !is_encrypt)
+		return true;
+
+	return false;
+}
+
+static __always_inline u32 spu_real_db_size(u32 assoc_size,
+					    u32 aead_iv_buf_len,
+					    u32 prebuf_len,
+					    u32 data_size,
+					    u32 aad_pad_len,
+					    u32 gcm_pad_len,
+					    u32 hash_pad_len)
+{
+	return assoc_size + aead_iv_buf_len + prebuf_len + data_size +
+	    aad_pad_len + gcm_pad_len + hash_pad_len;
+}
+
+/************** SPU Functions Prototypes **************/
+
+void spum_dump_msg_hdr(u8 *buf, unsigned int buf_len);
+
+u32 spum_ns2_ctx_max_payload(enum spu_cipher_alg cipher_alg,
+			     enum spu_cipher_mode cipher_mode,
+			     unsigned int blocksize);
+u32 spum_nsp_ctx_max_payload(enum spu_cipher_alg cipher_alg,
+			     enum spu_cipher_mode cipher_mode,
+			     unsigned int blocksize);
+u32 spum_payload_length(u8 *spu_hdr);
+u16 spum_response_hdr_len(u16 auth_key_len, u16 enc_key_len, bool is_hash);
+u16 spum_hash_pad_len(enum hash_alg hash_alg, enum hash_mode hash_mode,
+		      u32 chunksize, u16 hash_block_size);
+u32 spum_gcm_ccm_pad_len(enum spu_cipher_mode cipher_mode,
+			 unsigned int data_size);
+u32 spum_assoc_resp_len(enum spu_cipher_mode cipher_mode,
+			unsigned int assoc_len, unsigned int iv_len,
+			bool is_encrypt);
+u8 spum_aead_ivlen(enum spu_cipher_mode cipher_mode, u16 iv_len);
+bool spu_req_incl_icv(enum spu_cipher_mode cipher_mode, bool is_encrypt);
+enum hash_type spum_hash_type(u32 src_sent);
+u32 spum_digest_size(u32 alg_digest_size, enum hash_alg alg,
+		     enum hash_type htype);
+
+u32 spum_create_request(u8 *spu_hdr,
+			struct spu_request_opts *req_opts,
+			struct spu_cipher_parms *cipher_parms,
+			struct spu_hash_parms *hash_parms,
+			struct spu_aead_parms *aead_parms,
+			unsigned int data_size);
+
+u16 spum_cipher_req_init(u8 *spu_hdr, struct spu_cipher_parms *cipher_parms);
+
+void spum_cipher_req_finish(u8 *spu_hdr,
+			    u16 spu_req_hdr_len,
+			    unsigned int is_inbound,
+			    struct spu_cipher_parms *cipher_parms,
+			    bool update_key,
+			    unsigned int data_size);
+
+void spum_request_pad(u8 *pad_start,
+		      u32 gcm_padding,
+		      u32 hash_pad_len,
+		      enum hash_alg auth_alg,
+		      enum hash_mode auth_mode,
+		      unsigned int total_sent, u32 status_padding);
+
+u8 spum_xts_tweak_in_payload(void);
+u8 spum_tx_status_len(void);
+u8 spum_rx_status_len(void);
+int spum_status_process(u8 *statp);
+
+void spum_ccm_update_iv(unsigned int digestsize,
+			struct spu_cipher_parms *cipher_parms,
+			unsigned int assoclen,
+			unsigned int chunksize,
+			bool is_encrypt,
+			bool is_esp);
+u32 spum_wordalign_padlen(u32 data_size);
+#endif
diff --git a/drivers/crypto/bcm/spu2.c b/drivers/crypto/bcm/spu2.c
new file mode 100644
index 000000000000..ef04c9748317
--- /dev/null
+++ b/drivers/crypto/bcm/spu2.c
@@ -0,0 +1,1401 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+/*
+ * This file works with the SPU2 version of the SPU. SPU2 has different message
+ * formats than the previous version of the SPU. All SPU message format
+ * differences should be hidden in the spux.c,h files.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include "util.h"
+#include "spu.h"
+#include "spu2.h"
+
+#define SPU2_TX_STATUS_LEN  0	/* SPU2 has no STATUS in input packet */
+
+/*
+ * Controlled by pkt_stat_cnt field in CRYPTO_SS_SPU0_CORE_SPU2_CONTROL0
+ * register. Defaults to 2.
+ */
+#define SPU2_RX_STATUS_LEN  2
+
+enum spu2_proto_sel {
+	SPU2_PROTO_RESV = 0,
+	SPU2_MACSEC_SECTAG8_ECB = 1,
+	SPU2_MACSEC_SECTAG8_SCB = 2,
+	SPU2_MACSEC_SECTAG16 = 3,
+	SPU2_MACSEC_SECTAG16_8_XPN = 4,
+	SPU2_IPSEC = 5,
+	SPU2_IPSEC_ESN = 6,
+	SPU2_TLS_CIPHER = 7,
+	SPU2_TLS_AEAD = 8,
+	SPU2_DTLS_CIPHER = 9,
+	SPU2_DTLS_AEAD = 10
+};
+
+char *spu2_cipher_type_names[] = { "None", "AES128", "AES192", "AES256",
+	"DES", "3DES"
+};
+
+char *spu2_cipher_mode_names[] = { "ECB", "CBC", "CTR", "CFB", "OFB", "XTS",
+	"CCM", "GCM"
+};
+
+char *spu2_hash_type_names[] = { "None", "AES128", "AES192", "AES256",
+	"Reserved", "Reserved", "MD5", "SHA1", "SHA224", "SHA256", "SHA384",
+	"SHA512", "SHA512/224", "SHA512/256", "SHA3-224", "SHA3-256",
+	"SHA3-384", "SHA3-512"
+};
+
+char *spu2_hash_mode_names[] = { "CMAC", "CBC-MAC", "XCBC-MAC", "HMAC",
+	"Rabin", "CCM", "GCM", "Reserved"
+};
+
+static char *spu2_ciph_type_name(enum spu2_cipher_type cipher_type)
+{
+	if (cipher_type >= SPU2_CIPHER_TYPE_LAST)
+		return "Reserved";
+	return spu2_cipher_type_names[cipher_type];
+}
+
+static char *spu2_ciph_mode_name(enum spu2_cipher_mode cipher_mode)
+{
+	if (cipher_mode >= SPU2_CIPHER_MODE_LAST)
+		return "Reserved";
+	return spu2_cipher_mode_names[cipher_mode];
+}
+
+static char *spu2_hash_type_name(enum spu2_hash_type hash_type)
+{
+	if (hash_type >= SPU2_HASH_TYPE_LAST)
+		return "Reserved";
+	return spu2_hash_type_names[hash_type];
+}
+
+static char *spu2_hash_mode_name(enum spu2_hash_mode hash_mode)
+{
+	if (hash_mode >= SPU2_HASH_MODE_LAST)
+		return "Reserved";
+	return spu2_hash_mode_names[hash_mode];
+}
+
+/*
+ * Convert from a software cipher mode value to the corresponding value
+ * for SPU2.
+ */
+static int spu2_cipher_mode_xlate(enum spu_cipher_mode cipher_mode,
+				  enum spu2_cipher_mode *spu2_mode)
+{
+	switch (cipher_mode) {
+	case CIPHER_MODE_ECB:
+		*spu2_mode = SPU2_CIPHER_MODE_ECB;
+		break;
+	case CIPHER_MODE_CBC:
+		*spu2_mode = SPU2_CIPHER_MODE_CBC;
+		break;
+	case CIPHER_MODE_OFB:
+		*spu2_mode = SPU2_CIPHER_MODE_OFB;
+		break;
+	case CIPHER_MODE_CFB:
+		*spu2_mode = SPU2_CIPHER_MODE_CFB;
+		break;
+	case CIPHER_MODE_CTR:
+		*spu2_mode = SPU2_CIPHER_MODE_CTR;
+		break;
+	case CIPHER_MODE_CCM:
+		*spu2_mode = SPU2_CIPHER_MODE_CCM;
+		break;
+	case CIPHER_MODE_GCM:
+		*spu2_mode = SPU2_CIPHER_MODE_GCM;
+		break;
+	case CIPHER_MODE_XTS:
+		*spu2_mode = SPU2_CIPHER_MODE_XTS;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * spu2_cipher_xlate() - Convert a cipher {alg/mode/type} triple to a SPU2
+ * cipher type and mode.
+ * @cipher_alg:  [in]  cipher algorithm value from software enumeration
+ * @cipher_mode: [in]  cipher mode value from software enumeration
+ * @cipher_type: [in]  cipher type value from software enumeration
+ * @spu2_type:   [out] cipher type value used by spu2 hardware
+ * @spu2_mode:   [out] cipher mode value used by spu2 hardware
+ *
+ * Return:  0 if successful
+ */
+static int spu2_cipher_xlate(enum spu_cipher_alg cipher_alg,
+			     enum spu_cipher_mode cipher_mode,
+			     enum spu_cipher_type cipher_type,
+			     enum spu2_cipher_type *spu2_type,
+			     enum spu2_cipher_mode *spu2_mode)
+{
+	int err;
+
+	err = spu2_cipher_mode_xlate(cipher_mode, spu2_mode);
+	if (err) {
+		flow_log("Invalid cipher mode %d\n", cipher_mode);
+		return err;
+	}
+
+	switch (cipher_alg) {
+	case CIPHER_ALG_NONE:
+		*spu2_type = SPU2_CIPHER_TYPE_NONE;
+		break;
+	case CIPHER_ALG_RC4:
+		/* SPU2 does not support RC4 */
+		err = -EINVAL;
+		*spu2_type = SPU2_CIPHER_TYPE_NONE;
+		break;
+	case CIPHER_ALG_DES:
+		*spu2_type = SPU2_CIPHER_TYPE_DES;
+		break;
+	case CIPHER_ALG_3DES:
+		*spu2_type = SPU2_CIPHER_TYPE_3DES;
+		break;
+	case CIPHER_ALG_AES:
+		switch (cipher_type) {
+		case CIPHER_TYPE_AES128:
+			*spu2_type = SPU2_CIPHER_TYPE_AES128;
+			break;
+		case CIPHER_TYPE_AES192:
+			*spu2_type = SPU2_CIPHER_TYPE_AES192;
+			break;
+		case CIPHER_TYPE_AES256:
+			*spu2_type = SPU2_CIPHER_TYPE_AES256;
+			break;
+		default:
+			err = -EINVAL;
+		}
+		break;
+	case CIPHER_ALG_LAST:
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	if (err)
+		flow_log("Invalid cipher alg %d or type %d\n",
+			 cipher_alg, cipher_type);
+	return err;
+}
+
+/*
+ * Convert from a software hash mode value to the corresponding value
+ * for SPU2. Note that HASH_MODE_NONE and HASH_MODE_XCBC have the same value.
+ */
+static int spu2_hash_mode_xlate(enum hash_mode hash_mode,
+				enum spu2_hash_mode *spu2_mode)
+{
+	switch (hash_mode) {
+	case HASH_MODE_XCBC:
+		*spu2_mode = SPU2_HASH_MODE_XCBC_MAC;
+		break;
+	case HASH_MODE_CMAC:
+		*spu2_mode = SPU2_HASH_MODE_CMAC;
+		break;
+	case HASH_MODE_HMAC:
+		*spu2_mode = SPU2_HASH_MODE_HMAC;
+		break;
+	case HASH_MODE_CCM:
+		*spu2_mode = SPU2_HASH_MODE_CCM;
+		break;
+	case HASH_MODE_GCM:
+		*spu2_mode = SPU2_HASH_MODE_GCM;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * spu2_hash_xlate() - Convert a hash {alg/mode/type} triple to a SPU2 hash type
+ * and mode.
+ * @hash_alg:  [in] hash algorithm value from software enumeration
+ * @hash_mode: [in] hash mode value from software enumeration
+ * @hash_type: [in] hash type value from software enumeration
+ * @ciph_type: [in] cipher type value from software enumeration
+ * @spu2_type: [out] hash type value used by SPU2 hardware
+ * @spu2_mode: [out] hash mode value used by SPU2 hardware
+ *
+ * Return:  0 if successful
+ */
+static int
+spu2_hash_xlate(enum hash_alg hash_alg, enum hash_mode hash_mode,
+		enum hash_type hash_type, enum spu_cipher_type ciph_type,
+		enum spu2_hash_type *spu2_type, enum spu2_hash_mode *spu2_mode)
+{
+	int err;
+
+	err = spu2_hash_mode_xlate(hash_mode, spu2_mode);
+	if (err) {
+		flow_log("Invalid hash mode %d\n", hash_mode);
+		return err;
+	}
+
+	switch (hash_alg) {
+	case HASH_ALG_NONE:
+		*spu2_type = SPU2_HASH_TYPE_NONE;
+		break;
+	case HASH_ALG_MD5:
+		*spu2_type = SPU2_HASH_TYPE_MD5;
+		break;
+	case HASH_ALG_SHA1:
+		*spu2_type = SPU2_HASH_TYPE_SHA1;
+		break;
+	case HASH_ALG_SHA224:
+		*spu2_type = SPU2_HASH_TYPE_SHA224;
+		break;
+	case HASH_ALG_SHA256:
+		*spu2_type = SPU2_HASH_TYPE_SHA256;
+		break;
+	case HASH_ALG_SHA384:
+		*spu2_type = SPU2_HASH_TYPE_SHA384;
+		break;
+	case HASH_ALG_SHA512:
+		*spu2_type = SPU2_HASH_TYPE_SHA512;
+		break;
+	case HASH_ALG_AES:
+		switch (ciph_type) {
+		case CIPHER_TYPE_AES128:
+			*spu2_type = SPU2_HASH_TYPE_AES128;
+			break;
+		case CIPHER_TYPE_AES192:
+			*spu2_type = SPU2_HASH_TYPE_AES192;
+			break;
+		case CIPHER_TYPE_AES256:
+			*spu2_type = SPU2_HASH_TYPE_AES256;
+			break;
+		default:
+			err = -EINVAL;
+		}
+		break;
+	case HASH_ALG_SHA3_224:
+		*spu2_type = SPU2_HASH_TYPE_SHA3_224;
+		break;
+	case HASH_ALG_SHA3_256:
+		*spu2_type = SPU2_HASH_TYPE_SHA3_256;
+		break;
+	case HASH_ALG_SHA3_384:
+		*spu2_type = SPU2_HASH_TYPE_SHA3_384;
+		break;
+	case HASH_ALG_SHA3_512:
+		*spu2_type = SPU2_HASH_TYPE_SHA3_512;
+	case HASH_ALG_LAST:
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	if (err)
+		flow_log("Invalid hash alg %d or type %d\n",
+			 hash_alg, hash_type);
+	return err;
+}
+
+/* Dump FMD ctrl0. The ctrl0 input is in host byte order */
+static void spu2_dump_fmd_ctrl0(u64 ctrl0)
+{
+	enum spu2_cipher_type ciph_type;
+	enum spu2_cipher_mode ciph_mode;
+	enum spu2_hash_type hash_type;
+	enum spu2_hash_mode hash_mode;
+	char *ciph_name;
+	char *ciph_mode_name;
+	char *hash_name;
+	char *hash_mode_name;
+	u8 cfb;
+	u8 proto;
+
+	packet_log(" FMD CTRL0 %#16llx\n", ctrl0);
+	if (ctrl0 & SPU2_CIPH_ENCRYPT_EN)
+		packet_log("  encrypt\n");
+	else
+		packet_log("  decrypt\n");
+
+	ciph_type = (ctrl0 & SPU2_CIPH_TYPE) >> SPU2_CIPH_TYPE_SHIFT;
+	ciph_name = spu2_ciph_type_name(ciph_type);
+	packet_log("  Cipher type: %s\n", ciph_name);
+
+	if (ciph_type != SPU2_CIPHER_TYPE_NONE) {
+		ciph_mode = (ctrl0 & SPU2_CIPH_MODE) >> SPU2_CIPH_MODE_SHIFT;
+		ciph_mode_name = spu2_ciph_mode_name(ciph_mode);
+		packet_log("  Cipher mode: %s\n", ciph_mode_name);
+	}
+
+	cfb = (ctrl0 & SPU2_CFB_MASK) >> SPU2_CFB_MASK_SHIFT;
+	packet_log("  CFB %#x\n", cfb);
+
+	proto = (ctrl0 & SPU2_PROTO_SEL) >> SPU2_PROTO_SEL_SHIFT;
+	packet_log("  protocol %#x\n", proto);
+
+	if (ctrl0 & SPU2_HASH_FIRST)
+		packet_log("  hash first\n");
+	else
+		packet_log("  cipher first\n");
+
+	if (ctrl0 & SPU2_CHK_TAG)
+		packet_log("  check tag\n");
+
+	hash_type = (ctrl0 & SPU2_HASH_TYPE) >> SPU2_HASH_TYPE_SHIFT;
+	hash_name = spu2_hash_type_name(hash_type);
+	packet_log("  Hash type: %s\n", hash_name);
+
+	if (hash_type != SPU2_HASH_TYPE_NONE) {
+		hash_mode = (ctrl0 & SPU2_HASH_MODE) >> SPU2_HASH_MODE_SHIFT;
+		hash_mode_name = spu2_hash_mode_name(hash_mode);
+		packet_log("  Hash mode: %s\n", hash_mode_name);
+	}
+
+	if (ctrl0 & SPU2_CIPH_PAD_EN) {
+		packet_log("  Cipher pad: %#2llx\n",
+			   (ctrl0 & SPU2_CIPH_PAD) >> SPU2_CIPH_PAD_SHIFT);
+	}
+}
+
+/* Dump FMD ctrl1. The ctrl1 input is in host byte order */
+static void spu2_dump_fmd_ctrl1(u64 ctrl1)
+{
+	u8 hash_key_len;
+	u8 ciph_key_len;
+	u8 ret_iv_len;
+	u8 iv_offset;
+	u8 iv_len;
+	u8 hash_tag_len;
+	u8 ret_md;
+
+	packet_log(" FMD CTRL1 %#16llx\n", ctrl1);
+	if (ctrl1 & SPU2_TAG_LOC)
+		packet_log("  Tag after payload\n");
+
+	packet_log("  Msg includes ");
+	if (ctrl1 & SPU2_HAS_FR_DATA)
+		packet_log("FD ");
+	if (ctrl1 & SPU2_HAS_AAD1)
+		packet_log("AAD1 ");
+	if (ctrl1 & SPU2_HAS_NAAD)
+		packet_log("NAAD ");
+	if (ctrl1 & SPU2_HAS_AAD2)
+		packet_log("AAD2 ");
+	if (ctrl1 & SPU2_HAS_ESN)
+		packet_log("ESN ");
+	packet_log("\n");
+
+	hash_key_len = (ctrl1 & SPU2_HASH_KEY_LEN) >> SPU2_HASH_KEY_LEN_SHIFT;
+	packet_log("  Hash key len %u\n", hash_key_len);
+
+	ciph_key_len = (ctrl1 & SPU2_CIPH_KEY_LEN) >> SPU2_CIPH_KEY_LEN_SHIFT;
+	packet_log("  Cipher key len %u\n", ciph_key_len);
+
+	if (ctrl1 & SPU2_GENIV)
+		packet_log("  Generate IV\n");
+
+	if (ctrl1 & SPU2_HASH_IV)
+		packet_log("  IV included in hash\n");
+
+	if (ctrl1 & SPU2_RET_IV)
+		packet_log("  Return IV in output before payload\n");
+
+	ret_iv_len = (ctrl1 & SPU2_RET_IV_LEN) >> SPU2_RET_IV_LEN_SHIFT;
+	packet_log("  Length of returned IV %u bytes\n",
+		   ret_iv_len ? ret_iv_len : 16);
+
+	iv_offset = (ctrl1 & SPU2_IV_OFFSET) >> SPU2_IV_OFFSET_SHIFT;
+	packet_log("  IV offset %u\n", iv_offset);
+
+	iv_len = (ctrl1 & SPU2_IV_LEN) >> SPU2_IV_LEN_SHIFT;
+	packet_log("  Input IV len %u bytes\n", iv_len);
+
+	hash_tag_len = (ctrl1 & SPU2_HASH_TAG_LEN) >> SPU2_HASH_TAG_LEN_SHIFT;
+	packet_log("  Hash tag length %u bytes\n", hash_tag_len);
+
+	packet_log("  Return ");
+	ret_md = (ctrl1 & SPU2_RETURN_MD) >> SPU2_RETURN_MD_SHIFT;
+	if (ret_md)
+		packet_log("FMD ");
+	if (ret_md == SPU2_RET_FMD_OMD)
+		packet_log("OMD ");
+	else if (ret_md == SPU2_RET_FMD_OMD_IV)
+		packet_log("OMD IV ");
+	if (ctrl1 & SPU2_RETURN_FD)
+		packet_log("FD ");
+	if (ctrl1 & SPU2_RETURN_AAD1)
+		packet_log("AAD1 ");
+	if (ctrl1 & SPU2_RETURN_NAAD)
+		packet_log("NAAD ");
+	if (ctrl1 & SPU2_RETURN_AAD2)
+		packet_log("AAD2 ");
+	if (ctrl1 & SPU2_RETURN_PAY)
+		packet_log("Payload");
+	packet_log("\n");
+}
+
+/* Dump FMD ctrl2. The ctrl2 input is in host byte order */
+static void spu2_dump_fmd_ctrl2(u64 ctrl2)
+{
+	packet_log(" FMD CTRL2 %#16llx\n", ctrl2);
+
+	packet_log("  AAD1 offset %llu length %llu bytes\n",
+		   ctrl2 & SPU2_AAD1_OFFSET,
+		   (ctrl2 & SPU2_AAD1_LEN) >> SPU2_AAD1_LEN_SHIFT);
+	packet_log("  AAD2 offset %llu\n",
+		   (ctrl2 & SPU2_AAD2_OFFSET) >> SPU2_AAD2_OFFSET_SHIFT);
+	packet_log("  Payload offset %llu\n",
+		   (ctrl2 & SPU2_PL_OFFSET) >> SPU2_PL_OFFSET_SHIFT);
+}
+
+/* Dump FMD ctrl3. The ctrl3 input is in host byte order */
+static void spu2_dump_fmd_ctrl3(u64 ctrl3)
+{
+	packet_log(" FMD CTRL3 %#16llx\n", ctrl3);
+
+	packet_log("  Payload length %llu bytes\n", ctrl3 & SPU2_PL_LEN);
+	packet_log("  TLS length %llu bytes\n",
+		   (ctrl3 & SPU2_TLS_LEN) >> SPU2_TLS_LEN_SHIFT);
+}
+
+static void spu2_dump_fmd(struct SPU2_FMD *fmd)
+{
+	spu2_dump_fmd_ctrl0(le64_to_cpu(fmd->ctrl0));
+	spu2_dump_fmd_ctrl1(le64_to_cpu(fmd->ctrl1));
+	spu2_dump_fmd_ctrl2(le64_to_cpu(fmd->ctrl2));
+	spu2_dump_fmd_ctrl3(le64_to_cpu(fmd->ctrl3));
+}
+
+static void spu2_dump_omd(u8 *omd, u16 hash_key_len, u16 ciph_key_len,
+			  u16 hash_iv_len, u16 ciph_iv_len)
+{
+	u8 *ptr = omd;
+
+	packet_log(" OMD:\n");
+
+	if (hash_key_len) {
+		packet_log("  Hash Key Length %u bytes\n", hash_key_len);
+		packet_dump("  KEY: ", ptr, hash_key_len);
+		ptr += hash_key_len;
+	}
+
+	if (ciph_key_len) {
+		packet_log("  Cipher Key Length %u bytes\n", ciph_key_len);
+		packet_dump("  KEY: ", ptr, ciph_key_len);
+		ptr += ciph_key_len;
+	}
+
+	if (hash_iv_len) {
+		packet_log("  Hash IV Length %u bytes\n", hash_iv_len);
+		packet_dump("  hash IV: ", ptr, hash_iv_len);
+		ptr += ciph_key_len;
+	}
+
+	if (ciph_iv_len) {
+		packet_log("  Cipher IV Length %u bytes\n", ciph_iv_len);
+		packet_dump("  cipher IV: ", ptr, ciph_iv_len);
+	}
+}
+
+/* Dump a SPU2 header for debug */
+void spu2_dump_msg_hdr(u8 *buf, unsigned int buf_len)
+{
+	struct SPU2_FMD *fmd = (struct SPU2_FMD *)buf;
+	u8 *omd;
+	u64 ctrl1;
+	u16 hash_key_len;
+	u16 ciph_key_len;
+	u16 hash_iv_len;
+	u16 ciph_iv_len;
+	u16 omd_len;
+
+	packet_log("\n");
+	packet_log("SPU2 message header %p len: %u\n", buf, buf_len);
+
+	spu2_dump_fmd(fmd);
+	omd = (u8 *)(fmd + 1);
+
+	ctrl1 = le64_to_cpu(fmd->ctrl1);
+	hash_key_len = (ctrl1 & SPU2_HASH_KEY_LEN) >> SPU2_HASH_KEY_LEN_SHIFT;
+	ciph_key_len = (ctrl1 & SPU2_CIPH_KEY_LEN) >> SPU2_CIPH_KEY_LEN_SHIFT;
+	hash_iv_len = 0;
+	ciph_iv_len = (ctrl1 & SPU2_IV_LEN) >> SPU2_IV_LEN_SHIFT;
+	spu2_dump_omd(omd, hash_key_len, ciph_key_len, hash_iv_len,
+		      ciph_iv_len);
+
+	/* Double check sanity */
+	omd_len = hash_key_len + ciph_key_len + hash_iv_len + ciph_iv_len;
+	if (FMD_SIZE + omd_len != buf_len) {
+		packet_log
+		    (" Packet parsed incorrectly. buf_len %u, sum of MD %zu\n",
+		     buf_len, FMD_SIZE + omd_len);
+	}
+	packet_log("\n");
+}
+
+/**
+ * spu2_fmd_init() - At setkey time, initialize the fixed meta data for
+ * subsequent ablkcipher requests for this context.
+ * @spu2_cipher_type:  Cipher algorithm
+ * @spu2_mode:         Cipher mode
+ * @cipher_key_len:    Length of cipher key, in bytes
+ * @cipher_iv_len:     Length of cipher initialization vector, in bytes
+ *
+ * Return:  0 (success)
+ */
+static int spu2_fmd_init(struct SPU2_FMD *fmd,
+			 enum spu2_cipher_type spu2_type,
+			 enum spu2_cipher_mode spu2_mode,
+			 u32 cipher_key_len, u32 cipher_iv_len)
+{
+	u64 ctrl0;
+	u64 ctrl1;
+	u64 ctrl2;
+	u64 ctrl3;
+	u32 aad1_offset;
+	u32 aad2_offset;
+	u16 aad1_len = 0;
+	u64 payload_offset;
+
+	ctrl0 = (spu2_type << SPU2_CIPH_TYPE_SHIFT) |
+	    (spu2_mode << SPU2_CIPH_MODE_SHIFT);
+
+	ctrl1 = (cipher_key_len << SPU2_CIPH_KEY_LEN_SHIFT) |
+	    ((u64)cipher_iv_len << SPU2_IV_LEN_SHIFT) |
+	    ((u64)SPU2_RET_FMD_ONLY << SPU2_RETURN_MD_SHIFT) | SPU2_RETURN_PAY;
+
+	/*
+	 * AAD1 offset is from start of FD. FD length is always 0 for this
+	 * driver. So AAD1_offset is always 0.
+	 */
+	aad1_offset = 0;
+	aad2_offset = aad1_offset;
+	payload_offset = 0;
+	ctrl2 = aad1_offset |
+	    (aad1_len << SPU2_AAD1_LEN_SHIFT) |
+	    (aad2_offset << SPU2_AAD2_OFFSET_SHIFT) |
+	    (payload_offset << SPU2_PL_OFFSET_SHIFT);
+
+	ctrl3 = 0;
+
+	fmd->ctrl0 = cpu_to_le64(ctrl0);
+	fmd->ctrl1 = cpu_to_le64(ctrl1);
+	fmd->ctrl2 = cpu_to_le64(ctrl2);
+	fmd->ctrl3 = cpu_to_le64(ctrl3);
+
+	return 0;
+}
+
+/**
+ * spu2_fmd_ctrl0_write() - Write ctrl0 field in fixed metadata (FMD) field of
+ * SPU request packet.
+ * @fmd:            Start of FMD field to be written
+ * @is_inbound:     true if decrypting. false if encrypting.
+ * @authFirst:      true if alg authenticates before encrypting
+ * @protocol:       protocol selector
+ * @cipher_type:    cipher algorithm
+ * @cipher_mode:    cipher mode
+ * @auth_type:      authentication type
+ * @auth_mode:      authentication mode
+ */
+static void spu2_fmd_ctrl0_write(struct SPU2_FMD *fmd,
+				 bool is_inbound, bool auth_first,
+				 enum spu2_proto_sel protocol,
+				 enum spu2_cipher_type cipher_type,
+				 enum spu2_cipher_mode cipher_mode,
+				 enum spu2_hash_type auth_type,
+				 enum spu2_hash_mode auth_mode)
+{
+	u64 ctrl0 = 0;
+
+	if ((cipher_type != SPU2_CIPHER_TYPE_NONE) && !is_inbound)
+		ctrl0 |= SPU2_CIPH_ENCRYPT_EN;
+
+	ctrl0 |= ((u64)cipher_type << SPU2_CIPH_TYPE_SHIFT) |
+	    ((u64)cipher_mode << SPU2_CIPH_MODE_SHIFT);
+
+	if (protocol)
+		ctrl0 |= (u64)protocol << SPU2_PROTO_SEL_SHIFT;
+
+	if (auth_first)
+		ctrl0 |= SPU2_HASH_FIRST;
+
+	if (is_inbound && (auth_type != SPU2_HASH_TYPE_NONE))
+		ctrl0 |= SPU2_CHK_TAG;
+
+	ctrl0 |= (((u64)auth_type << SPU2_HASH_TYPE_SHIFT) |
+		  ((u64)auth_mode << SPU2_HASH_MODE_SHIFT));
+
+	fmd->ctrl0 = cpu_to_le64(ctrl0);
+}
+
+/**
+ * spu2_fmd_ctrl1_write() - Write ctrl1 field in fixed metadata (FMD) field of
+ * SPU request packet.
+ * @fmd:            Start of FMD field to be written
+ * @assoc_size:     Length of additional associated data, in bytes
+ * @auth_key_len:   Length of authentication key, in bytes
+ * @cipher_key_len: Length of cipher key, in bytes
+ * @gen_iv:         If true, hw generates IV and returns in response
+ * @hash_iv:        IV participates in hash. Used for IPSEC and TLS.
+ * @return_iv:      Return IV in output packet before payload
+ * @ret_iv_len:     Length of IV returned from SPU, in bytes
+ * @ret_iv_offset:  Offset into full IV of start of returned IV
+ * @cipher_iv_len:  Length of input cipher IV, in bytes
+ * @digest_size:    Length of digest (aka, hash tag or ICV), in bytes
+ * @return_payload: Return payload in SPU response
+ * @return_md : return metadata in SPU response
+ *
+ * Packet can have AAD2 w/o AAD1. For algorithms currently supported,
+ * associated data goes in AAD2.
+ */
+static void spu2_fmd_ctrl1_write(struct SPU2_FMD *fmd, bool is_inbound,
+				 u64 assoc_size,
+				 u64 auth_key_len, u64 cipher_key_len,
+				 bool gen_iv, bool hash_iv, bool return_iv,
+				 u64 ret_iv_len, u64 ret_iv_offset,
+				 u64 cipher_iv_len, u64 digest_size,
+				 bool return_payload, bool return_md)
+{
+	u64 ctrl1 = 0;
+
+	if (is_inbound && digest_size)
+		ctrl1 |= SPU2_TAG_LOC;
+
+	if (assoc_size) {
+		ctrl1 |= SPU2_HAS_AAD2;
+		ctrl1 |= SPU2_RETURN_AAD2;  /* need aad2 for gcm aes esp */
+	}
+
+	if (auth_key_len)
+		ctrl1 |= ((auth_key_len << SPU2_HASH_KEY_LEN_SHIFT) &
+			  SPU2_HASH_KEY_LEN);
+
+	if (cipher_key_len)
+		ctrl1 |= ((cipher_key_len << SPU2_CIPH_KEY_LEN_SHIFT) &
+			  SPU2_CIPH_KEY_LEN);
+
+	if (gen_iv)
+		ctrl1 |= SPU2_GENIV;
+
+	if (hash_iv)
+		ctrl1 |= SPU2_HASH_IV;
+
+	if (return_iv) {
+		ctrl1 |= SPU2_RET_IV;
+		ctrl1 |= ret_iv_len << SPU2_RET_IV_LEN_SHIFT;
+		ctrl1 |= ret_iv_offset << SPU2_IV_OFFSET_SHIFT;
+	}
+
+	ctrl1 |= ((cipher_iv_len << SPU2_IV_LEN_SHIFT) & SPU2_IV_LEN);
+
+	if (digest_size)
+		ctrl1 |= ((digest_size << SPU2_HASH_TAG_LEN_SHIFT) &
+			  SPU2_HASH_TAG_LEN);
+
+	/* Let's ask for the output pkt to include FMD, but don't need to
+	 * get keys and IVs back in OMD.
+	 */
+	if (return_md)
+		ctrl1 |= ((u64)SPU2_RET_FMD_ONLY << SPU2_RETURN_MD_SHIFT);
+	else
+		ctrl1 |= ((u64)SPU2_RET_NO_MD << SPU2_RETURN_MD_SHIFT);
+
+	/* Crypto API does not get assoc data back. So no need for AAD2. */
+
+	if (return_payload)
+		ctrl1 |= SPU2_RETURN_PAY;
+
+	fmd->ctrl1 = cpu_to_le64(ctrl1);
+}
+
+/**
+ * spu2_fmd_ctrl2_write() - Set the ctrl2 field in the fixed metadata field of
+ * SPU2 header.
+ * @fmd:            Start of FMD field to be written
+ * @cipher_offset:  Number of bytes from Start of Packet (end of FD field) where
+ *                  data to be encrypted or decrypted begins
+ * @auth_key_len:   Length of authentication key, in bytes
+ * @auth_iv_len:    Length of authentication initialization vector, in bytes
+ * @cipher_key_len: Length of cipher key, in bytes
+ * @cipher_iv_len:  Length of cipher IV, in bytes
+ */
+static void spu2_fmd_ctrl2_write(struct SPU2_FMD *fmd, u64 cipher_offset,
+				 u64 auth_key_len, u64 auth_iv_len,
+				 u64 cipher_key_len, u64 cipher_iv_len)
+{
+	u64 ctrl2;
+	u64 aad1_offset;
+	u64 aad2_offset;
+	u16 aad1_len = 0;
+	u64 payload_offset;
+
+	/* AAD1 offset is from start of FD. FD length always 0. */
+	aad1_offset = 0;
+
+	aad2_offset = aad1_offset;
+	payload_offset = cipher_offset;
+	ctrl2 = aad1_offset |
+	    (aad1_len << SPU2_AAD1_LEN_SHIFT) |
+	    (aad2_offset << SPU2_AAD2_OFFSET_SHIFT) |
+	    (payload_offset << SPU2_PL_OFFSET_SHIFT);
+
+	fmd->ctrl2 = cpu_to_le64(ctrl2);
+}
+
+/**
+ * spu2_fmd_ctrl3_write() - Set the ctrl3 field in FMD
+ * @fmd:          Fixed meta data. First field in SPU2 msg header.
+ * @payload_len:  Length of payload, in bytes
+ */
+static void spu2_fmd_ctrl3_write(struct SPU2_FMD *fmd, u64 payload_len)
+{
+	u64 ctrl3;
+
+	ctrl3 = payload_len & SPU2_PL_LEN;
+
+	fmd->ctrl3 = cpu_to_le64(ctrl3);
+}
+
+/**
+ * spu2_ctx_max_payload() - Determine the maximum length of the payload for a
+ * SPU message for a given cipher and hash alg context.
+ * @cipher_alg:		The cipher algorithm
+ * @cipher_mode:	The cipher mode
+ * @blocksize:		The size of a block of data for this algo
+ *
+ * For SPU2, the hardware generally ignores the PayloadLen field in ctrl3 of
+ * FMD and just keeps computing until it receives a DMA descriptor with the EOF
+ * flag set. So we consider the max payload to be infinite. AES CCM is an
+ * exception.
+ *
+ * Return: Max payload length in bytes
+ */
+u32 spu2_ctx_max_payload(enum spu_cipher_alg cipher_alg,
+			 enum spu_cipher_mode cipher_mode,
+			 unsigned int blocksize)
+{
+	if ((cipher_alg == CIPHER_ALG_AES) &&
+	    (cipher_mode == CIPHER_MODE_CCM)) {
+		u32 excess = SPU2_MAX_PAYLOAD % blocksize;
+
+		return SPU2_MAX_PAYLOAD - excess;
+	} else {
+		return SPU_MAX_PAYLOAD_INF;
+	}
+}
+
+/**
+ * spu_payload_length() -  Given a SPU2 message header, extract the payload
+ * length.
+ * @spu_hdr:  Start of SPU message header (FMD)
+ *
+ * Return: payload length, in bytes
+ */
+u32 spu2_payload_length(u8 *spu_hdr)
+{
+	struct SPU2_FMD *fmd = (struct SPU2_FMD *)spu_hdr;
+	u32 pl_len;
+	u64 ctrl3;
+
+	ctrl3 = le64_to_cpu(fmd->ctrl3);
+	pl_len = ctrl3 & SPU2_PL_LEN;
+
+	return pl_len;
+}
+
+/**
+ * spu_response_hdr_len() - Determine the expected length of a SPU response
+ * header.
+ * @auth_key_len:  Length of authentication key, in bytes
+ * @enc_key_len:   Length of encryption key, in bytes
+ *
+ * For SPU2, includes just FMD. OMD is never requested.
+ *
+ * Return: Length of FMD, in bytes
+ */
+u16 spu2_response_hdr_len(u16 auth_key_len, u16 enc_key_len, bool is_hash)
+{
+	return FMD_SIZE;
+}
+
+/**
+ * spu_hash_pad_len() - Calculate the length of hash padding required to extend
+ * data to a full block size.
+ * @hash_alg:        hash algorithm
+ * @hash_mode:       hash mode
+ * @chunksize:       length of data, in bytes
+ * @hash_block_size: size of a hash block, in bytes
+ *
+ * SPU2 hardware does all hash padding
+ *
+ * Return:  length of hash pad in bytes
+ */
+u16 spu2_hash_pad_len(enum hash_alg hash_alg, enum hash_mode hash_mode,
+		      u32 chunksize, u16 hash_block_size)
+{
+	return 0;
+}
+
+/**
+ * spu2_gcm_ccm_padlen() -  Determine the length of GCM/CCM padding for either
+ * the AAD field or the data.
+ *
+ * Return:  0. Unlike SPU-M, SPU2 hardware does any GCM/CCM padding required.
+ */
+u32 spu2_gcm_ccm_pad_len(enum spu_cipher_mode cipher_mode,
+			 unsigned int data_size)
+{
+	return 0;
+}
+
+/**
+ * spu_assoc_resp_len() - Determine the size of the AAD2 buffer needed to catch
+ * associated data in a SPU2 output packet.
+ * @cipher_mode:   cipher mode
+ * @assoc_len:     length of additional associated data, in bytes
+ * @iv_len:        length of initialization vector, in bytes
+ * @is_encrypt:    true if encrypting. false if decrypt.
+ *
+ * Return: Length of buffer to catch associated data in response
+ */
+u32 spu2_assoc_resp_len(enum spu_cipher_mode cipher_mode,
+			unsigned int assoc_len, unsigned int iv_len,
+			bool is_encrypt)
+{
+	u32 resp_len = assoc_len;
+
+	if (is_encrypt)
+		/* gcm aes esp has to write 8-byte IV in response */
+		resp_len += iv_len;
+	return resp_len;
+}
+
+/*
+ * spu_aead_ivlen() - Calculate the length of the AEAD IV to be included
+ * in a SPU request after the AAD and before the payload.
+ * @cipher_mode:  cipher mode
+ * @iv_ctr_len:   initialization vector length in bytes
+ *
+ * For SPU2, AEAD IV is included in OMD and does not need to be repeated
+ * prior to the payload.
+ *
+ * Return: Length of AEAD IV in bytes
+ */
+u8 spu2_aead_ivlen(enum spu_cipher_mode cipher_mode, u16 iv_len)
+{
+	return 0;
+}
+
+/**
+ * spu2_hash_type() - Determine the type of hash operation.
+ * @src_sent:  The number of bytes in the current request that have already
+ *             been sent to the SPU to be hashed.
+ *
+ * SPU2 always does a FULL hash operation
+ */
+enum hash_type spu2_hash_type(u32 src_sent)
+{
+	return HASH_TYPE_FULL;
+}
+
+/**
+ * spu2_digest_size() - Determine the size of a hash digest to expect the SPU to
+ * return.
+ * alg_digest_size: Number of bytes in the final digest for the given algo
+ * alg:             The hash algorithm
+ * htype:           Type of hash operation (init, update, full, etc)
+ *
+ */
+u32 spu2_digest_size(u32 alg_digest_size, enum hash_alg alg,
+		     enum hash_type htype)
+{
+	return alg_digest_size;
+}
+
+/**
+ * spu_create_request() - Build a SPU2 request message header, includint FMD and
+ * OMD.
+ * @spu_hdr: Start of buffer where SPU request header is to be written
+ * @req_opts: SPU request message options
+ * @cipher_parms: Parameters related to cipher algorithm
+ * @hash_parms:   Parameters related to hash algorithm
+ * @aead_parms:   Parameters related to AEAD operation
+ * @data_size:    Length of data to be encrypted or authenticated. If AEAD, does
+ *		  not include length of AAD.
+ *
+ * Construct the message starting at spu_hdr. Caller should allocate this buffer
+ * in DMA-able memory at least SPU_HEADER_ALLOC_LEN bytes long.
+ *
+ * Return: the length of the SPU header in bytes. 0 if an error occurs.
+ */
+u32 spu2_create_request(u8 *spu_hdr,
+			struct spu_request_opts *req_opts,
+			struct spu_cipher_parms *cipher_parms,
+			struct spu_hash_parms *hash_parms,
+			struct spu_aead_parms *aead_parms,
+			unsigned int data_size)
+{
+	struct SPU2_FMD *fmd;
+	u8 *ptr;
+	unsigned int buf_len;
+	int err;
+	enum spu2_cipher_type spu2_ciph_type = SPU2_CIPHER_TYPE_NONE;
+	enum spu2_cipher_mode spu2_ciph_mode;
+	enum spu2_hash_type spu2_auth_type = SPU2_HASH_TYPE_NONE;
+	enum spu2_hash_mode spu2_auth_mode;
+	bool return_md = true;
+	enum spu2_proto_sel proto = SPU2_PROTO_RESV;
+
+	/* size of the payload */
+	unsigned int payload_len =
+	    hash_parms->prebuf_len + data_size + hash_parms->pad_len -
+	    ((req_opts->is_aead && req_opts->is_inbound) ?
+	     hash_parms->digestsize : 0);
+
+	/* offset of prebuf or data from start of AAD2 */
+	unsigned int cipher_offset = aead_parms->assoc_size +
+			aead_parms->aad_pad_len + aead_parms->iv_len;
+
+#ifdef DEBUG
+	/* total size of the data following OMD (without STAT word padding) */
+	unsigned int real_db_size = spu_real_db_size(aead_parms->assoc_size,
+						 aead_parms->iv_len,
+						 hash_parms->prebuf_len,
+						 data_size,
+						 aead_parms->aad_pad_len,
+						 aead_parms->data_pad_len,
+						 hash_parms->pad_len);
+#endif
+	unsigned int assoc_size = aead_parms->assoc_size;
+
+	if (req_opts->is_aead &&
+	    (cipher_parms->alg == CIPHER_ALG_AES) &&
+	    (cipher_parms->mode == CIPHER_MODE_GCM))
+		/*
+		 * On SPU 2, aes gcm cipher first on encrypt, auth first on
+		 * decrypt
+		 */
+		req_opts->auth_first = req_opts->is_inbound;
+
+	/* and do opposite for ccm (auth 1st on encrypt) */
+	if (req_opts->is_aead &&
+	    (cipher_parms->alg == CIPHER_ALG_AES) &&
+	    (cipher_parms->mode == CIPHER_MODE_CCM))
+		req_opts->auth_first = !req_opts->is_inbound;
+
+	flow_log("%s()\n", __func__);
+	flow_log("  in:%u authFirst:%u\n",
+		 req_opts->is_inbound, req_opts->auth_first);
+	flow_log("  cipher alg:%u mode:%u type %u\n", cipher_parms->alg,
+		 cipher_parms->mode, cipher_parms->type);
+	flow_log("  is_esp: %s\n", req_opts->is_esp ? "yes" : "no");
+	flow_log("    key: %d\n", cipher_parms->key_len);
+	flow_dump("    key: ", cipher_parms->key_buf, cipher_parms->key_len);
+	flow_log("    iv: %d\n", cipher_parms->iv_len);
+	flow_dump("    iv: ", cipher_parms->iv_buf, cipher_parms->iv_len);
+	flow_log("  auth alg:%u mode:%u type %u\n",
+		 hash_parms->alg, hash_parms->mode, hash_parms->type);
+	flow_log("  digestsize: %u\n", hash_parms->digestsize);
+	flow_log("  authkey: %d\n", hash_parms->key_len);
+	flow_dump("  authkey: ", hash_parms->key_buf, hash_parms->key_len);
+	flow_log("  assoc_size:%u\n", assoc_size);
+	flow_log("  prebuf_len:%u\n", hash_parms->prebuf_len);
+	flow_log("  data_size:%u\n", data_size);
+	flow_log("  hash_pad_len:%u\n", hash_parms->pad_len);
+	flow_log("  real_db_size:%u\n", real_db_size);
+	flow_log("  cipher_offset:%u payload_len:%u\n",
+		 cipher_offset, payload_len);
+	flow_log("  aead_iv: %u\n", aead_parms->iv_len);
+
+	/* Convert to spu2 values for cipher alg, hash alg */
+	err = spu2_cipher_xlate(cipher_parms->alg, cipher_parms->mode,
+				cipher_parms->type,
+				&spu2_ciph_type, &spu2_ciph_mode);
+
+	/* If we are doing GCM hashing only - either via rfc4543 transform
+	 * or because we happen to do GCM with AAD only and no payload - we
+	 * need to configure hardware to use hash key rather than cipher key
+	 * and put data into payload.  This is because unlike SPU-M, running
+	 * GCM cipher with 0 size payload is not permitted.
+	 */
+	if ((req_opts->is_rfc4543) ||
+	    ((spu2_ciph_mode == SPU2_CIPHER_MODE_GCM) &&
+	    (payload_len == 0))) {
+		/* Use hashing (only) and set up hash key */
+		spu2_ciph_type = SPU2_CIPHER_TYPE_NONE;
+		hash_parms->key_len = cipher_parms->key_len;
+		memcpy(hash_parms->key_buf, cipher_parms->key_buf,
+		       cipher_parms->key_len);
+		cipher_parms->key_len = 0;
+
+		if (req_opts->is_rfc4543)
+			payload_len += assoc_size;
+		else
+			payload_len = assoc_size;
+		cipher_offset = 0;
+		assoc_size = 0;
+	}
+
+	if (err)
+		return 0;
+
+	flow_log("spu2 cipher type %s, cipher mode %s\n",
+		 spu2_ciph_type_name(spu2_ciph_type),
+		 spu2_ciph_mode_name(spu2_ciph_mode));
+
+	err = spu2_hash_xlate(hash_parms->alg, hash_parms->mode,
+			      hash_parms->type,
+			      cipher_parms->type,
+			      &spu2_auth_type, &spu2_auth_mode);
+	if (err)
+		return 0;
+
+	flow_log("spu2 hash type %s, hash mode %s\n",
+		 spu2_hash_type_name(spu2_auth_type),
+		 spu2_hash_mode_name(spu2_auth_mode));
+
+	fmd = (struct SPU2_FMD *)spu_hdr;
+
+	spu2_fmd_ctrl0_write(fmd, req_opts->is_inbound, req_opts->auth_first,
+			     proto, spu2_ciph_type, spu2_ciph_mode,
+			     spu2_auth_type, spu2_auth_mode);
+
+	spu2_fmd_ctrl1_write(fmd, req_opts->is_inbound, assoc_size,
+			     hash_parms->key_len, cipher_parms->key_len,
+			     false, false,
+			     aead_parms->return_iv, aead_parms->ret_iv_len,
+			     aead_parms->ret_iv_off,
+			     cipher_parms->iv_len, hash_parms->digestsize,
+			     !req_opts->bd_suppress, return_md);
+
+	spu2_fmd_ctrl2_write(fmd, cipher_offset, hash_parms->key_len, 0,
+			     cipher_parms->key_len, cipher_parms->iv_len);
+
+	spu2_fmd_ctrl3_write(fmd, payload_len);
+
+	ptr = (u8 *)(fmd + 1);
+	buf_len = sizeof(struct SPU2_FMD);
+
+	/* Write OMD */
+	if (hash_parms->key_len) {
+		memcpy(ptr, hash_parms->key_buf, hash_parms->key_len);
+		ptr += hash_parms->key_len;
+		buf_len += hash_parms->key_len;
+	}
+	if (cipher_parms->key_len) {
+		memcpy(ptr, cipher_parms->key_buf, cipher_parms->key_len);
+		ptr += cipher_parms->key_len;
+		buf_len += cipher_parms->key_len;
+	}
+	if (cipher_parms->iv_len) {
+		memcpy(ptr, cipher_parms->iv_buf, cipher_parms->iv_len);
+		ptr += cipher_parms->iv_len;
+		buf_len += cipher_parms->iv_len;
+	}
+
+	packet_dump("  SPU request header: ", spu_hdr, buf_len);
+
+	return buf_len;
+}
+
+/**
+ * spu_cipher_req_init() - Build an ablkcipher SPU2 request message header,
+ * including FMD and OMD.
+ * @spu_hdr:       Location of start of SPU request (FMD field)
+ * @cipher_parms:  Parameters describing cipher request
+ *
+ * Called at setkey time to initialize a msg header that can be reused for all
+ * subsequent ablkcipher requests. Construct the message starting at spu_hdr.
+ * Caller should allocate this buffer in DMA-able memory at least
+ * SPU_HEADER_ALLOC_LEN bytes long.
+ *
+ * Return: the total length of the SPU header (FMD and OMD) in bytes. 0 if an
+ * error occurs.
+ */
+u16 spu2_cipher_req_init(u8 *spu_hdr, struct spu_cipher_parms *cipher_parms)
+{
+	struct SPU2_FMD *fmd;
+	u8 *omd;
+	enum spu2_cipher_type spu2_type = SPU2_CIPHER_TYPE_NONE;
+	enum spu2_cipher_mode spu2_mode;
+	int err;
+
+	flow_log("%s()\n", __func__);
+	flow_log("  cipher alg:%u mode:%u type %u\n", cipher_parms->alg,
+		 cipher_parms->mode, cipher_parms->type);
+	flow_log("  cipher_iv_len: %u\n", cipher_parms->iv_len);
+	flow_log("    key: %d\n", cipher_parms->key_len);
+	flow_dump("    key: ", cipher_parms->key_buf, cipher_parms->key_len);
+
+	/* Convert to spu2 values */
+	err = spu2_cipher_xlate(cipher_parms->alg, cipher_parms->mode,
+				cipher_parms->type, &spu2_type, &spu2_mode);
+	if (err)
+		return 0;
+
+	flow_log("spu2 cipher type %s, cipher mode %s\n",
+		 spu2_ciph_type_name(spu2_type),
+		 spu2_ciph_mode_name(spu2_mode));
+
+	/* Construct the FMD header */
+	fmd = (struct SPU2_FMD *)spu_hdr;
+	err = spu2_fmd_init(fmd, spu2_type, spu2_mode, cipher_parms->key_len,
+			    cipher_parms->iv_len);
+	if (err)
+		return 0;
+
+	/* Write cipher key to OMD */
+	omd = (u8 *)(fmd + 1);
+	if (cipher_parms->key_buf && cipher_parms->key_len)
+		memcpy(omd, cipher_parms->key_buf, cipher_parms->key_len);
+
+	packet_dump("  SPU request header: ", spu_hdr,
+		    FMD_SIZE + cipher_parms->key_len + cipher_parms->iv_len);
+
+	return FMD_SIZE + cipher_parms->key_len + cipher_parms->iv_len;
+}
+
+/**
+ * spu_cipher_req_finish() - Finish building a SPU request message header for a
+ * block cipher request.
+ * @spu_hdr:         Start of the request message header (MH field)
+ * @spu_req_hdr_len: Length in bytes of the SPU request header
+ * @isInbound:       0 encrypt, 1 decrypt
+ * @cipher_parms:    Parameters describing cipher operation to be performed
+ * @update_key:      If true, rewrite the cipher key in SCTX
+ * @data_size:       Length of the data in the BD field
+ *
+ * Assumes much of the header was already filled in at setkey() time in
+ * spu_cipher_req_init().
+ * spu_cipher_req_init() fills in the encryption key. For RC4, when submitting a
+ * request for a non-first chunk, we use the 260-byte SUPDT field from the
+ * previous response as the key. update_key is true for this case. Unused in all
+ * other cases.
+ */
+void spu2_cipher_req_finish(u8 *spu_hdr,
+			    u16 spu_req_hdr_len,
+			    unsigned int is_inbound,
+			    struct spu_cipher_parms *cipher_parms,
+			    bool update_key,
+			    unsigned int data_size)
+{
+	struct SPU2_FMD *fmd;
+	u8 *omd;		/* start of optional metadata */
+	u64 ctrl0;
+	u64 ctrl3;
+
+	flow_log("%s()\n", __func__);
+	flow_log(" in: %u\n", is_inbound);
+	flow_log(" cipher alg: %u, cipher_type: %u\n", cipher_parms->alg,
+		 cipher_parms->type);
+	if (update_key) {
+		flow_log(" cipher key len: %u\n", cipher_parms->key_len);
+		flow_dump("  key: ", cipher_parms->key_buf,
+			  cipher_parms->key_len);
+	}
+	flow_log(" iv len: %d\n", cipher_parms->iv_len);
+	flow_dump("    iv: ", cipher_parms->iv_buf, cipher_parms->iv_len);
+	flow_log(" data_size: %u\n", data_size);
+
+	fmd = (struct SPU2_FMD *)spu_hdr;
+	omd = (u8 *)(fmd + 1);
+
+	/*
+	 * FMD ctrl0 was initialized at setkey time. update it to indicate
+	 * whether we are encrypting or decrypting.
+	 */
+	ctrl0 = le64_to_cpu(fmd->ctrl0);
+	if (is_inbound)
+		ctrl0 &= ~SPU2_CIPH_ENCRYPT_EN;	/* decrypt */
+	else
+		ctrl0 |= SPU2_CIPH_ENCRYPT_EN;	/* encrypt */
+	fmd->ctrl0 = cpu_to_le64(ctrl0);
+
+	if (cipher_parms->alg && cipher_parms->iv_buf && cipher_parms->iv_len) {
+		/* cipher iv provided so put it in here */
+		memcpy(omd + cipher_parms->key_len, cipher_parms->iv_buf,
+		       cipher_parms->iv_len);
+	}
+
+	ctrl3 = le64_to_cpu(fmd->ctrl3);
+	data_size &= SPU2_PL_LEN;
+	ctrl3 |= data_size;
+	fmd->ctrl3 = cpu_to_le64(ctrl3);
+
+	packet_dump("  SPU request header: ", spu_hdr, spu_req_hdr_len);
+}
+
+/**
+ * spu_request_pad() - Create pad bytes at the end of the data.
+ * @pad_start:      Start of buffer where pad bytes are to be written
+ * @gcm_padding:    Length of GCM padding, in bytes
+ * @hash_pad_len:   Number of bytes of padding extend data to full block
+ * @auth_alg:       Authentication algorithm
+ * @auth_mode:      Authentication mode
+ * @total_sent:     Length inserted at end of hash pad
+ * @status_padding: Number of bytes of padding to align STATUS word
+ *
+ * There may be three forms of pad:
+ *  1. GCM pad - for GCM mode ciphers, pad to 16-byte alignment
+ *  2. hash pad - pad to a block length, with 0x80 data terminator and
+ *                size at the end
+ *  3. STAT pad - to ensure the STAT field is 4-byte aligned
+ */
+void spu2_request_pad(u8 *pad_start, u32 gcm_padding, u32 hash_pad_len,
+		      enum hash_alg auth_alg, enum hash_mode auth_mode,
+		      unsigned int total_sent, u32 status_padding)
+{
+	u8 *ptr = pad_start;
+
+	/* fix data alignent for GCM */
+	if (gcm_padding > 0) {
+		flow_log("  GCM: padding to 16 byte alignment: %u bytes\n",
+			 gcm_padding);
+		memset(ptr, 0, gcm_padding);
+		ptr += gcm_padding;
+	}
+
+	if (hash_pad_len > 0) {
+		/* clear the padding section */
+		memset(ptr, 0, hash_pad_len);
+
+		/* terminate the data */
+		*ptr = 0x80;
+		ptr += (hash_pad_len - sizeof(u64));
+
+		/* add the size at the end as required per alg */
+		if (auth_alg == HASH_ALG_MD5)
+			*(u64 *)ptr = cpu_to_le64((u64)total_sent * 8);
+		else		/* SHA1, SHA2-224, SHA2-256 */
+			*(u64 *)ptr = cpu_to_be64((u64)total_sent * 8);
+		ptr += sizeof(u64);
+	}
+
+	/* pad to a 4byte alignment for STAT */
+	if (status_padding > 0) {
+		flow_log("  STAT: padding to 4 byte alignment: %u bytes\n",
+			 status_padding);
+
+		memset(ptr, 0, status_padding);
+		ptr += status_padding;
+	}
+}
+
+/**
+ * spu2_xts_tweak_in_payload() - Indicate that SPU2 does NOT place the XTS
+ * tweak field in the packet payload (it uses IV instead)
+ *
+ * Return: 0
+ */
+u8 spu2_xts_tweak_in_payload(void)
+{
+	return 0;
+}
+
+/**
+ * spu2_tx_status_len() - Return the length of the STATUS field in a SPU
+ * response message.
+ *
+ * Return: Length of STATUS field in bytes.
+ */
+u8 spu2_tx_status_len(void)
+{
+	return SPU2_TX_STATUS_LEN;
+}
+
+/**
+ * spu2_rx_status_len() - Return the length of the STATUS field in a SPU
+ * response message.
+ *
+ * Return: Length of STATUS field in bytes.
+ */
+u8 spu2_rx_status_len(void)
+{
+	return SPU2_RX_STATUS_LEN;
+}
+
+/**
+ * spu_status_process() - Process the status from a SPU response message.
+ * @statp:  start of STATUS word
+ *
+ * Return:  0 - if status is good and response should be processed
+ *         !0 - status indicates an error and response is invalid
+ */
+int spu2_status_process(u8 *statp)
+{
+	/* SPU2 status is 2 bytes by default - SPU_RX_STATUS_LEN */
+	u16 status = le16_to_cpu(*(__le16 *)statp);
+
+	if (status == 0)
+		return 0;
+
+	flow_log("rx status is %#x\n", status);
+	if (status == SPU2_INVALID_ICV)
+		return SPU_INVALID_ICV;
+
+	return -EBADMSG;
+}
+
+/**
+ * spu2_ccm_update_iv() - Update the IV as per the requirements for CCM mode.
+ *
+ * @digestsize:		Digest size of this request
+ * @cipher_parms:	(pointer to) cipher parmaeters, includes IV buf & IV len
+ * @assoclen:		Length of AAD data
+ * @chunksize:		length of input data to be sent in this req
+ * @is_encrypt:		true if this is an output/encrypt operation
+ * @is_esp:		true if this is an ESP / RFC4309 operation
+ *
+ */
+void spu2_ccm_update_iv(unsigned int digestsize,
+			struct spu_cipher_parms *cipher_parms,
+			unsigned int assoclen, unsigned int chunksize,
+			bool is_encrypt, bool is_esp)
+{
+	int L;  /* size of length field, in bytes */
+
+	/*
+	 * In RFC4309 mode, L is fixed at 4 bytes; otherwise, IV from
+	 * testmgr contains (L-1) in bottom 3 bits of first byte,
+	 * per RFC 3610.
+	 */
+	if (is_esp)
+		L = CCM_ESP_L_VALUE;
+	else
+		L = ((cipher_parms->iv_buf[0] & CCM_B0_L_PRIME) >>
+		      CCM_B0_L_PRIME_SHIFT) + 1;
+
+	/* SPU2 doesn't want these length bytes nor the first byte... */
+	cipher_parms->iv_len -= (1 + L);
+	memmove(cipher_parms->iv_buf, &cipher_parms->iv_buf[1],
+		cipher_parms->iv_len);
+}
+
+/**
+ * spu2_wordalign_padlen() - SPU2 does not require padding.
+ * @data_size: length of data field in bytes
+ *
+ * Return: length of status field padding, in bytes (always 0 on SPU2)
+ */
+u32 spu2_wordalign_padlen(u32 data_size)
+{
+	return 0;
+}
diff --git a/drivers/crypto/bcm/spu2.h b/drivers/crypto/bcm/spu2.h
new file mode 100644
index 000000000000..ab1f59934828
--- /dev/null
+++ b/drivers/crypto/bcm/spu2.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+/*
+ * This file contains SPU message definitions specific to SPU2.
+ */
+
+#ifndef _SPU2_H
+#define _SPU2_H
+
+enum spu2_cipher_type {
+	SPU2_CIPHER_TYPE_NONE = 0x0,
+	SPU2_CIPHER_TYPE_AES128 = 0x1,
+	SPU2_CIPHER_TYPE_AES192 = 0x2,
+	SPU2_CIPHER_TYPE_AES256 = 0x3,
+	SPU2_CIPHER_TYPE_DES = 0x4,
+	SPU2_CIPHER_TYPE_3DES = 0x5,
+	SPU2_CIPHER_TYPE_LAST
+};
+
+enum spu2_cipher_mode {
+	SPU2_CIPHER_MODE_ECB = 0x0,
+	SPU2_CIPHER_MODE_CBC = 0x1,
+	SPU2_CIPHER_MODE_CTR = 0x2,
+	SPU2_CIPHER_MODE_CFB = 0x3,
+	SPU2_CIPHER_MODE_OFB = 0x4,
+	SPU2_CIPHER_MODE_XTS = 0x5,
+	SPU2_CIPHER_MODE_CCM = 0x6,
+	SPU2_CIPHER_MODE_GCM = 0x7,
+	SPU2_CIPHER_MODE_LAST
+};
+
+enum spu2_hash_type {
+	SPU2_HASH_TYPE_NONE = 0x0,
+	SPU2_HASH_TYPE_AES128 = 0x1,
+	SPU2_HASH_TYPE_AES192 = 0x2,
+	SPU2_HASH_TYPE_AES256 = 0x3,
+	SPU2_HASH_TYPE_MD5 = 0x6,
+	SPU2_HASH_TYPE_SHA1 = 0x7,
+	SPU2_HASH_TYPE_SHA224 = 0x8,
+	SPU2_HASH_TYPE_SHA256 = 0x9,
+	SPU2_HASH_TYPE_SHA384 = 0xa,
+	SPU2_HASH_TYPE_SHA512 = 0xb,
+	SPU2_HASH_TYPE_SHA512_224 = 0xc,
+	SPU2_HASH_TYPE_SHA512_256 = 0xd,
+	SPU2_HASH_TYPE_SHA3_224 = 0xe,
+	SPU2_HASH_TYPE_SHA3_256 = 0xf,
+	SPU2_HASH_TYPE_SHA3_384 = 0x10,
+	SPU2_HASH_TYPE_SHA3_512 = 0x11,
+	SPU2_HASH_TYPE_LAST
+};
+
+enum spu2_hash_mode {
+	SPU2_HASH_MODE_CMAC = 0x0,
+	SPU2_HASH_MODE_CBC_MAC = 0x1,
+	SPU2_HASH_MODE_XCBC_MAC = 0x2,
+	SPU2_HASH_MODE_HMAC = 0x3,
+	SPU2_HASH_MODE_RABIN = 0x4,
+	SPU2_HASH_MODE_CCM = 0x5,
+	SPU2_HASH_MODE_GCM = 0x6,
+	SPU2_HASH_MODE_RESERVED = 0x7,
+	SPU2_HASH_MODE_LAST
+};
+
+enum spu2_ret_md_opts {
+	SPU2_RET_NO_MD = 0,	/* return no metadata */
+	SPU2_RET_FMD_OMD = 1,	/* return both FMD and OMD */
+	SPU2_RET_FMD_ONLY = 2,	/* return only FMD */
+	SPU2_RET_FMD_OMD_IV = 3,	/* return FMD and OMD with just IVs */
+};
+
+/* Fixed Metadata format */
+struct SPU2_FMD {
+	u64 ctrl0;
+	u64 ctrl1;
+	u64 ctrl2;
+	u64 ctrl3;
+};
+
+#define FMD_SIZE  sizeof(struct SPU2_FMD)
+
+/* Fixed part of request message header length in bytes. Just FMD. */
+#define SPU2_REQ_FIXED_LEN FMD_SIZE
+#define SPU2_HEADER_ALLOC_LEN (SPU_REQ_FIXED_LEN + \
+				2 * MAX_KEY_SIZE + 2 * MAX_IV_SIZE)
+
+/* FMD ctrl0 field masks */
+#define SPU2_CIPH_ENCRYPT_EN            0x1 /* 0: decrypt, 1: encrypt */
+#define SPU2_CIPH_TYPE                 0xF0 /* one of spu2_cipher_type */
+#define SPU2_CIPH_TYPE_SHIFT              4
+#define SPU2_CIPH_MODE                0xF00 /* one of spu2_cipher_mode */
+#define SPU2_CIPH_MODE_SHIFT              8
+#define SPU2_CFB_MASK                0x7000 /* cipher feedback mask */
+#define SPU2_CFB_MASK_SHIFT              12
+#define SPU2_PROTO_SEL             0xF00000 /* MACsec, IPsec, TLS... */
+#define SPU2_PROTO_SEL_SHIFT             20
+#define SPU2_HASH_FIRST           0x1000000 /* 1: hash input is input pkt
+					     * data
+					     */
+#define SPU2_CHK_TAG              0x2000000 /* 1: check digest provided */
+#define SPU2_HASH_TYPE          0x1F0000000 /* one of spu2_hash_type */
+#define SPU2_HASH_TYPE_SHIFT             28
+#define SPU2_HASH_MODE         0xF000000000 /* one of spu2_hash_mode */
+#define SPU2_HASH_MODE_SHIFT             36
+#define SPU2_CIPH_PAD_EN     0x100000000000 /* 1: Add pad to end of payload for
+					     *    enc
+					     */
+#define SPU2_CIPH_PAD      0xFF000000000000 /* cipher pad value */
+#define SPU2_CIPH_PAD_SHIFT              48
+
+/* FMD ctrl1 field masks */
+#define SPU2_TAG_LOC                    0x1 /* 1: end of payload, 0: undef */
+#define SPU2_HAS_FR_DATA                0x2 /* 1: msg has frame data */
+#define SPU2_HAS_AAD1                   0x4 /* 1: msg has AAD1 field */
+#define SPU2_HAS_NAAD                   0x8 /* 1: msg has NAAD field */
+#define SPU2_HAS_AAD2                  0x10 /* 1: msg has AAD2 field */
+#define SPU2_HAS_ESN                   0x20 /* 1: msg has ESN field */
+#define SPU2_HASH_KEY_LEN            0xFF00 /* len of hash key in bytes.
+					     * HMAC only.
+					     */
+#define SPU2_HASH_KEY_LEN_SHIFT           8
+#define SPU2_CIPH_KEY_LEN         0xFF00000 /* len of cipher key in bytes */
+#define SPU2_CIPH_KEY_LEN_SHIFT          20
+#define SPU2_GENIV               0x10000000 /* 1: hw generates IV */
+#define SPU2_HASH_IV             0x20000000 /* 1: IV incl in hash */
+#define SPU2_RET_IV              0x40000000 /* 1: return IV in output msg
+					     *    b4 payload
+					     */
+#define SPU2_RET_IV_LEN         0xF00000000 /* length in bytes of IV returned.
+					     * 0 = 16 bytes
+					     */
+#define SPU2_RET_IV_LEN_SHIFT            32
+#define SPU2_IV_OFFSET         0xF000000000 /* gen IV offset */
+#define SPU2_IV_OFFSET_SHIFT             36
+#define SPU2_IV_LEN          0x1F0000000000 /* length of input IV in bytes */
+#define SPU2_IV_LEN_SHIFT                40
+#define SPU2_HASH_TAG_LEN  0x7F000000000000 /* hash tag length in bytes */
+#define SPU2_HASH_TAG_LEN_SHIFT          48
+#define SPU2_RETURN_MD    0x300000000000000 /* return metadata */
+#define SPU2_RETURN_MD_SHIFT             56
+#define SPU2_RETURN_FD    0x400000000000000
+#define SPU2_RETURN_AAD1  0x800000000000000
+#define SPU2_RETURN_NAAD 0x1000000000000000
+#define SPU2_RETURN_AAD2 0x2000000000000000
+#define SPU2_RETURN_PAY  0x4000000000000000 /* return payload */
+
+/* FMD ctrl2 field masks */
+#define SPU2_AAD1_OFFSET              0xFFF /* byte offset of AAD1 field */
+#define SPU2_AAD1_LEN               0xFF000 /* length of AAD1 in bytes */
+#define SPU2_AAD1_LEN_SHIFT              12
+#define SPU2_AAD2_OFFSET         0xFFF00000 /* byte offset of AAD2 field */
+#define SPU2_AAD2_OFFSET_SHIFT           20
+#define SPU2_PL_OFFSET   0xFFFFFFFF00000000 /* payload offset from AAD2 */
+#define SPU2_PL_OFFSET_SHIFT             32
+
+/* FMD ctrl3 field masks */
+#define SPU2_PL_LEN              0xFFFFFFFF /* payload length in bytes */
+#define SPU2_TLS_LEN         0xFFFF00000000 /* TLS encrypt: cipher len
+					     * TLS decrypt: compressed len
+					     */
+#define SPU2_TLS_LEN_SHIFT               32
+
+/*
+ * Max value that can be represented in the Payload Length field of the
+ * ctrl3 word of FMD.
+ */
+#define SPU2_MAX_PAYLOAD  SPU2_PL_LEN
+
+/* Error values returned in STATUS field of response messages */
+#define SPU2_INVALID_ICV  1
+
+void spu2_dump_msg_hdr(u8 *buf, unsigned int buf_len);
+u32 spu2_ctx_max_payload(enum spu_cipher_alg cipher_alg,
+			 enum spu_cipher_mode cipher_mode,
+			 unsigned int blocksize);
+u32 spu2_payload_length(u8 *spu_hdr);
+u16 spu2_response_hdr_len(u16 auth_key_len, u16 enc_key_len, bool is_hash);
+u16 spu2_hash_pad_len(enum hash_alg hash_alg, enum hash_mode hash_mode,
+		      u32 chunksize, u16 hash_block_size);
+u32 spu2_gcm_ccm_pad_len(enum spu_cipher_mode cipher_mode,
+			 unsigned int data_size);
+u32 spu2_assoc_resp_len(enum spu_cipher_mode cipher_mode,
+			unsigned int assoc_len, unsigned int iv_len,
+			bool is_encrypt);
+u8 spu2_aead_ivlen(enum spu_cipher_mode cipher_mode,
+		   u16 iv_len);
+enum hash_type spu2_hash_type(u32 src_sent);
+u32 spu2_digest_size(u32 alg_digest_size, enum hash_alg alg,
+		     enum hash_type htype);
+u32 spu2_create_request(u8 *spu_hdr,
+			struct spu_request_opts *req_opts,
+			struct spu_cipher_parms *cipher_parms,
+			struct spu_hash_parms *hash_parms,
+			struct spu_aead_parms *aead_parms,
+			unsigned int data_size);
+u16 spu2_cipher_req_init(u8 *spu_hdr, struct spu_cipher_parms *cipher_parms);
+void spu2_cipher_req_finish(u8 *spu_hdr,
+			    u16 spu_req_hdr_len,
+			    unsigned int is_inbound,
+			    struct spu_cipher_parms *cipher_parms,
+			    bool update_key,
+			    unsigned int data_size);
+void spu2_request_pad(u8 *pad_start, u32 gcm_padding, u32 hash_pad_len,
+		      enum hash_alg auth_alg, enum hash_mode auth_mode,
+		      unsigned int total_sent, u32 status_padding);
+u8 spu2_xts_tweak_in_payload(void);
+u8 spu2_tx_status_len(void);
+u8 spu2_rx_status_len(void);
+int spu2_status_process(u8 *statp);
+void spu2_ccm_update_iv(unsigned int digestsize,
+			struct spu_cipher_parms *cipher_parms,
+			unsigned int assoclen, unsigned int chunksize,
+			bool is_encrypt, bool is_esp);
+u32 spu2_wordalign_padlen(u32 data_size);
+#endif
diff --git a/drivers/crypto/bcm/spum.h b/drivers/crypto/bcm/spum.h
new file mode 100644
index 000000000000..d0a5b5828638
--- /dev/null
+++ b/drivers/crypto/bcm/spum.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+/*
+ * This file contains SPU message definitions specific to SPU-M.
+ */
+
+#ifndef _SPUM_H_
+#define _SPUM_H_
+
+#define SPU_CRYPTO_OPERATION_GENERIC	0x1
+
+/* Length of STATUS field in tx and rx packets */
+#define SPU_TX_STATUS_LEN  4
+
+/* SPU-M error codes */
+#define SPU_STATUS_MASK                 0x0000FF00
+#define SPU_STATUS_SUCCESS              0x00000000
+#define SPU_STATUS_INVALID_ICV          0x00000100
+
+#define SPU_STATUS_ERROR_FLAG           0x00020000
+
+/* Request message. MH + EMH + BDESC + BD header */
+#define SPU_REQ_FIXED_LEN 24
+
+/*
+ * Max length of a SPU message header. Used to allocate a buffer where
+ * the SPU message header is constructed. Can be used for either a SPU-M
+ * header or a SPU2 header.
+ * For SPU-M, sum of the following:
+ *    MH - 4 bytes
+ *    EMH - 4
+ *    SCTX - 3 +
+ *      max auth key len - 64
+ *      max cipher key len - 264 (RC4)
+ *      max IV len - 16
+ *    BDESC - 12
+ *    BD header - 4
+ * Total:  371
+ *
+ * For SPU2, FMD_SIZE (32) plus lengths of hash and cipher keys,
+ * hash and cipher IVs. If SPU2 does not support RC4, then
+ */
+#define SPU_HEADER_ALLOC_LEN  (SPU_REQ_FIXED_LEN + MAX_KEY_SIZE + \
+				MAX_KEY_SIZE + MAX_IV_SIZE)
+
+/*
+ * Response message header length. Normally MH, EMH, BD header, but when
+ * BD_SUPPRESS is used for hash requests, there is no BD header.
+ */
+#define SPU_RESP_HDR_LEN 12
+#define SPU_HASH_RESP_HDR_LEN 8
+
+/*
+ * Max value that can be represented in the Payload Length field of the BD
+ * header. This is a 16-bit field.
+ */
+#define SPUM_NS2_MAX_PAYLOAD  (BIT(16) - 1)
+
+/*
+ * NSP SPU is limited to ~9KB because of FA2 FIFO size limitations;
+ * Set MAX_PAYLOAD to 8k to allow for addition of header, digest, etc.
+ * and stay within limitation.
+ */
+
+#define SPUM_NSP_MAX_PAYLOAD	8192
+
+/* Buffer Descriptor Header [BDESC]. SPU in big-endian mode. */
+struct BDESC_HEADER {
+	u16 offset_mac;		/* word 0 [31-16] */
+	u16 length_mac;		/* word 0 [15-0]  */
+	u16 offset_crypto;	/* word 1 [31-16] */
+	u16 length_crypto;	/* word 1 [15-0]  */
+	u16 offset_icv;		/* word 2 [31-16] */
+	u16 offset_iv;		/* word 2 [15-0]  */
+};
+
+/* Buffer Data Header [BD]. SPU in big-endian mode. */
+struct BD_HEADER {
+	u16 size;
+	u16 prev_length;
+};
+
+/* Command Context Header. SPU-M in big endian mode. */
+struct MHEADER {
+	u8 flags;	/* [31:24] */
+	u8 op_code;	/* [23:16] */
+	u16 reserved;	/* [15:0] */
+};
+
+/* MH header flags bits */
+#define MH_SUPDT_PRES   BIT(0)
+#define MH_HASH_PRES    BIT(2)
+#define MH_BD_PRES      BIT(3)
+#define MH_MFM_PRES     BIT(4)
+#define MH_BDESC_PRES   BIT(5)
+#define MH_SCTX_PRES	BIT(7)
+
+/* SCTX word 0 bit offsets and fields masks */
+#define SCTX_SIZE               0x000000FF
+
+/* SCTX word 1 bit shifts and field masks */
+#define  UPDT_OFST              0x000000FF   /* offset of SCTX updateable fld */
+#define  HASH_TYPE              0x00000300   /* hash alg operation type */
+#define  HASH_TYPE_SHIFT                 8
+#define  HASH_MODE              0x00001C00   /* one of spu2_hash_mode */
+#define  HASH_MODE_SHIFT                10
+#define  HASH_ALG               0x0000E000   /* hash algorithm */
+#define  HASH_ALG_SHIFT                 13
+#define  CIPHER_TYPE            0x00030000   /* encryption operation type */
+#define  CIPHER_TYPE_SHIFT              16
+#define  CIPHER_MODE            0x001C0000   /* encryption mode */
+#define  CIPHER_MODE_SHIFT              18
+#define  CIPHER_ALG             0x00E00000   /* encryption algo */
+#define  CIPHER_ALG_SHIFT               21
+#define  ICV_IS_512                BIT(27)
+#define  ICV_IS_512_SHIFT		27
+#define  CIPHER_ORDER               BIT(30)
+#define  CIPHER_ORDER_SHIFT             30
+#define  CIPHER_INBOUND             BIT(31)
+#define  CIPHER_INBOUND_SHIFT           31
+
+/* SCTX word 2 bit shifts and field masks */
+#define  EXP_IV_SIZE                   0x7
+#define  IV_OFFSET                   BIT(3)
+#define  IV_OFFSET_SHIFT                 3
+#define  GEN_IV                      BIT(5)
+#define  GEN_IV_SHIFT                    5
+#define  EXPLICIT_IV                 BIT(6)
+#define  EXPLICIT_IV_SHIFT               6
+#define  SCTX_IV                     BIT(7)
+#define  SCTX_IV_SHIFT                   7
+#define  ICV_SIZE                   0x0F00
+#define  ICV_SIZE_SHIFT                  8
+#define  CHECK_ICV                  BIT(12)
+#define  CHECK_ICV_SHIFT                12
+#define  INSERT_ICV                 BIT(13)
+#define  INSERT_ICV_SHIFT               13
+#define  BD_SUPPRESS                BIT(19)
+#define  BD_SUPPRESS_SHIFT              19
+
+/* Generic Mode Security Context Structure [SCTX] */
+struct SCTX {
+/* word 0: protocol flags */
+	u32 proto_flags;
+
+/* word 1: cipher flags */
+	u32 cipher_flags;
+
+/* word 2: Extended cipher flags */
+	u32 ecf;
+
+};
+
+struct SPUHEADER {
+	struct MHEADER mh;
+	u32 emh;
+	struct SCTX sa;
+};
+
+#endif /* _SPUM_H_ */
diff --git a/drivers/crypto/bcm/util.c b/drivers/crypto/bcm/util.c
new file mode 100644
index 000000000000..0502f460dacd
--- /dev/null
+++ b/drivers/crypto/bcm/util.c
@@ -0,0 +1,581 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#include <linux/debugfs.h>
+
+#include "cipher.h"
+#include "util.h"
+
+/* offset of SPU_OFIFO_CTRL register */
+#define SPU_OFIFO_CTRL      0x40
+#define SPU_FIFO_WATERMARK  0x1FF
+
+/**
+ * spu_sg_at_offset() - Find the scatterlist entry at a given distance from the
+ * start of a scatterlist.
+ * @sg:         [in]  Start of a scatterlist
+ * @skip:       [in]  Distance from the start of the scatterlist, in bytes
+ * @sge:        [out] Scatterlist entry at skip bytes from start
+ * @sge_offset: [out] Number of bytes from start of sge buffer to get to
+ *                    requested distance.
+ *
+ * Return: 0 if entry found at requested distance
+ *         < 0 otherwise
+ */
+int spu_sg_at_offset(struct scatterlist *sg, unsigned int skip,
+		     struct scatterlist **sge, unsigned int *sge_offset)
+{
+	/* byte index from start of sg to the end of the previous entry */
+	unsigned int index = 0;
+	/* byte index from start of sg to the end of the current entry */
+	unsigned int next_index;
+
+	next_index = sg->length;
+	while (next_index <= skip) {
+		sg = sg_next(sg);
+		index = next_index;
+		if (!sg)
+			return -EINVAL;
+		next_index += sg->length;
+	}
+
+	*sge_offset = skip - index;
+	*sge = sg;
+	return 0;
+}
+
+/* Copy len bytes of sg data, starting at offset skip, to a dest buffer */
+void sg_copy_part_to_buf(struct scatterlist *src, u8 *dest,
+			 unsigned int len, unsigned int skip)
+{
+	size_t copied;
+	unsigned int nents = sg_nents(src);
+
+	copied = sg_pcopy_to_buffer(src, nents, dest, len, skip);
+	if (copied != len) {
+		flow_log("%s copied %u bytes of %u requested. ",
+			 __func__, (u32)copied, len);
+		flow_log("sg with %u entries and skip %u\n", nents, skip);
+	}
+}
+
+/*
+ * Copy data into a scatterlist starting at a specified offset in the
+ * scatterlist. Specifically, copy len bytes of data in the buffer src
+ * into the scatterlist dest, starting skip bytes into the scatterlist.
+ */
+void sg_copy_part_from_buf(struct scatterlist *dest, u8 *src,
+			   unsigned int len, unsigned int skip)
+{
+	size_t copied;
+	unsigned int nents = sg_nents(dest);
+
+	copied = sg_pcopy_from_buffer(dest, nents, src, len, skip);
+	if (copied != len) {
+		flow_log("%s copied %u bytes of %u requested. ",
+			 __func__, (u32)copied, len);
+		flow_log("sg with %u entries and skip %u\n", nents, skip);
+	}
+}
+
+/**
+ * spu_sg_count() - Determine number of elements in scatterlist to provide a
+ * specified number of bytes.
+ * @sg_list:  scatterlist to examine
+ * @skip:     index of starting point
+ * @nbytes:   consider elements of scatterlist until reaching this number of
+ *	      bytes
+ *
+ * Return: the number of sg entries contributing to nbytes of data
+ */
+int spu_sg_count(struct scatterlist *sg_list, unsigned int skip, int nbytes)
+{
+	struct scatterlist *sg;
+	int sg_nents = 0;
+	unsigned int offset;
+
+	if (!sg_list)
+		return 0;
+
+	if (spu_sg_at_offset(sg_list, skip, &sg, &offset) < 0)
+		return 0;
+
+	while (sg && (nbytes > 0)) {
+		sg_nents++;
+		nbytes -= (sg->length - offset);
+		offset = 0;
+		sg = sg_next(sg);
+	}
+	return sg_nents;
+}
+
+/**
+ * spu_msg_sg_add() - Copy scatterlist entries from one sg to another, up to a
+ * given length.
+ * @to_sg:       scatterlist to copy to
+ * @from_sg:     scatterlist to copy from
+ * @from_skip:   number of bytes to skip in from_sg. Non-zero when previous
+ *		 request included part of the buffer in entry in from_sg.
+ *		 Assumes from_skip < from_sg->length.
+ * @from_nents   number of entries in from_sg
+ * @length       number of bytes to copy. may reach this limit before exhausting
+ *		 from_sg.
+ *
+ * Copies the entries themselves, not the data in the entries. Assumes to_sg has
+ * enough entries. Does not limit the size of an individual buffer in to_sg.
+ *
+ * to_sg, from_sg, skip are all updated to end of copy
+ *
+ * Return: Number of bytes copied
+ */
+u32 spu_msg_sg_add(struct scatterlist **to_sg,
+		   struct scatterlist **from_sg, u32 *from_skip,
+		   u8 from_nents, u32 length)
+{
+	struct scatterlist *sg;	/* an entry in from_sg */
+	struct scatterlist *to = *to_sg;
+	struct scatterlist *from = *from_sg;
+	u32 skip = *from_skip;
+	u32 offset;
+	int i;
+	u32 entry_len = 0;
+	u32 frag_len = 0;	/* length of entry added to to_sg */
+	u32 copied = 0;		/* number of bytes copied so far */
+
+	if (length == 0)
+		return 0;
+
+	for_each_sg(from, sg, from_nents, i) {
+		/* number of bytes in this from entry not yet used */
+		entry_len = sg->length - skip;
+		frag_len = min(entry_len, length - copied);
+		offset = sg->offset + skip;
+		if (frag_len)
+			sg_set_page(to++, sg_page(sg), frag_len, offset);
+		copied += frag_len;
+		if (copied == entry_len) {
+			/* used up all of from entry */
+			skip = 0;	/* start at beginning of next entry */
+		}
+		if (copied == length)
+			break;
+	}
+	*to_sg = to;
+	*from_sg = sg;
+	if (frag_len < entry_len)
+		*from_skip = skip + frag_len;
+	else
+		*from_skip = 0;
+
+	return copied;
+}
+
+void add_to_ctr(u8 *ctr_pos, unsigned int increment)
+{
+	__be64 *high_be = (__be64 *)ctr_pos;
+	__be64 *low_be = high_be + 1;
+	u64 orig_low = __be64_to_cpu(*low_be);
+	u64 new_low = orig_low + (u64)increment;
+
+	*low_be = __cpu_to_be64(new_low);
+	if (new_low < orig_low)
+		/* there was a carry from the low 8 bytes */
+		*high_be = __cpu_to_be64(__be64_to_cpu(*high_be) + 1);
+}
+
+struct sdesc {
+	struct shash_desc shash;
+	char ctx[];
+};
+
+/* do a synchronous decrypt operation */
+int do_decrypt(char *alg_name,
+	       void *key_ptr, unsigned int key_len,
+	       void *iv_ptr, void *src_ptr, void *dst_ptr,
+	       unsigned int block_len)
+{
+	struct scatterlist sg_in[1], sg_out[1];
+	struct crypto_blkcipher *tfm =
+	    crypto_alloc_blkcipher(alg_name, 0, CRYPTO_ALG_ASYNC);
+	struct blkcipher_desc desc = {.tfm = tfm, .flags = 0 };
+	int ret = 0;
+	void *iv;
+	int ivsize;
+
+	flow_log("%s() name:%s block_len:%u\n", __func__, alg_name, block_len);
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	crypto_blkcipher_setkey((void *)tfm, key_ptr, key_len);
+
+	sg_init_table(sg_in, 1);
+	sg_set_buf(sg_in, src_ptr, block_len);
+
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst_ptr, block_len);
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+	memcpy(iv, iv_ptr, ivsize);
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, block_len);
+	crypto_free_blkcipher(tfm);
+
+	if (ret < 0)
+		pr_err("aes_decrypt failed %d\n", ret);
+
+	return ret;
+}
+
+/**
+ * do_shash() - Do a synchronous hash operation in software
+ * @name:       The name of the hash algorithm
+ * @result:     Buffer where digest is to be written
+ * @data1:      First part of data to hash. May be NULL.
+ * @data1_len:  Length of data1, in bytes
+ * @data2:      Second part of data to hash. May be NULL.
+ * @data2_len:  Length of data2, in bytes
+ * @key:	Key (if keyed hash)
+ * @key_len:	Length of key, in bytes (or 0 if non-keyed hash)
+ *
+ * Note that the crypto API will not select this driver's own transform because
+ * this driver only registers asynchronous algos.
+ *
+ * Return: 0 if hash successfully stored in result
+ *         < 0 otherwise
+ */
+int do_shash(unsigned char *name, unsigned char *result,
+	     const u8 *data1, unsigned int data1_len,
+	     const u8 *data2, unsigned int data2_len,
+	     const u8 *key, unsigned int key_len)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *hash;
+	struct sdesc *sdesc;
+
+	hash = crypto_alloc_shash(name, 0, 0);
+	if (IS_ERR(hash)) {
+		rc = PTR_ERR(hash);
+		pr_err("%s: Crypto %s allocation error %d", __func__, name, rc);
+		return rc;
+	}
+
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(hash);
+	sdesc = kmalloc(size, GFP_KERNEL);
+	if (!sdesc) {
+		rc = -ENOMEM;
+		pr_err("%s: Memory allocation failure", __func__);
+		goto do_shash_err;
+	}
+	sdesc->shash.tfm = hash;
+	sdesc->shash.flags = 0x0;
+
+	if (key_len > 0) {
+		rc = crypto_shash_setkey(hash, key, key_len);
+		if (rc) {
+			pr_err("%s: Could not setkey %s shash", __func__, name);
+			goto do_shash_err;
+		}
+	}
+
+	rc = crypto_shash_init(&sdesc->shash);
+	if (rc) {
+		pr_err("%s: Could not init %s shash", __func__, name);
+		goto do_shash_err;
+	}
+	rc = crypto_shash_update(&sdesc->shash, data1, data1_len);
+	if (rc) {
+		pr_err("%s: Could not update1", __func__);
+		goto do_shash_err;
+	}
+	if (data2 && data2_len) {
+		rc = crypto_shash_update(&sdesc->shash, data2, data2_len);
+		if (rc) {
+			pr_err("%s: Could not update2", __func__);
+			goto do_shash_err;
+		}
+	}
+	rc = crypto_shash_final(&sdesc->shash, result);
+	if (rc)
+		pr_err("%s: Could not genereate %s hash", __func__, name);
+
+do_shash_err:
+	crypto_free_shash(hash);
+	kfree(sdesc);
+
+	return rc;
+}
+
+/* Dump len bytes of a scatterlist starting at skip bytes into the sg */
+void __dump_sg(struct scatterlist *sg, unsigned int skip, unsigned int len)
+{
+	u8 dbuf[16];
+	unsigned int idx = skip;
+	unsigned int num_out = 0;	/* number of bytes dumped so far */
+	unsigned int count;
+
+	if (packet_debug_logging) {
+		while (num_out < len) {
+			count = (len - num_out > 16) ? 16 : len - num_out;
+			sg_copy_part_to_buf(sg, dbuf, count, idx);
+			num_out += count;
+			print_hex_dump(KERN_ALERT, "  sg: ", DUMP_PREFIX_NONE,
+				       4, 1, dbuf, count, false);
+			idx += 16;
+		}
+	}
+	if (debug_logging_sleep)
+		msleep(debug_logging_sleep);
+}
+
+/* Returns the name for a given cipher alg/mode */
+char *spu_alg_name(enum spu_cipher_alg alg, enum spu_cipher_mode mode)
+{
+	switch (alg) {
+	case CIPHER_ALG_RC4:
+		return "rc4";
+	case CIPHER_ALG_AES:
+		switch (mode) {
+		case CIPHER_MODE_CBC:
+			return "cbc(aes)";
+		case CIPHER_MODE_ECB:
+			return "ecb(aes)";
+		case CIPHER_MODE_OFB:
+			return "ofb(aes)";
+		case CIPHER_MODE_CFB:
+			return "cfb(aes)";
+		case CIPHER_MODE_CTR:
+			return "ctr(aes)";
+		case CIPHER_MODE_XTS:
+			return "xts(aes)";
+		case CIPHER_MODE_GCM:
+			return "gcm(aes)";
+		default:
+			return "aes";
+		}
+		break;
+	case CIPHER_ALG_DES:
+		switch (mode) {
+		case CIPHER_MODE_CBC:
+			return "cbc(des)";
+		case CIPHER_MODE_ECB:
+			return "ecb(des)";
+		case CIPHER_MODE_CTR:
+			return "ctr(des)";
+		default:
+			return "des";
+		}
+		break;
+	case CIPHER_ALG_3DES:
+		switch (mode) {
+		case CIPHER_MODE_CBC:
+			return "cbc(des3_ede)";
+		case CIPHER_MODE_ECB:
+			return "ecb(des3_ede)";
+		case CIPHER_MODE_CTR:
+			return "ctr(des3_ede)";
+		default:
+			return "3des";
+		}
+		break;
+	default:
+		return "other";
+	}
+}
+
+static ssize_t spu_debugfs_read(struct file *filp, char __user *ubuf,
+				size_t count, loff_t *offp)
+{
+	struct device_private *ipriv;
+	char *buf;
+	ssize_t ret, out_offset, out_count;
+	int i;
+	u32 fifo_len;
+	u32 spu_ofifo_ctrl;
+	u32 alg;
+	u32 mode;
+	u32 op_cnt;
+
+	out_count = 2048;
+
+	buf = kmalloc(out_count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ipriv = filp->private_data;
+	out_offset = 0;
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Number of SPUs.........%u\n",
+			       ipriv->spu.num_spu);
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Current sessions.......%u\n",
+			       atomic_read(&ipriv->session_count));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Session count..........%u\n",
+			       atomic_read(&ipriv->stream_count));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Cipher setkey..........%u\n",
+			       atomic_read(&ipriv->setkey_cnt[SPU_OP_CIPHER]));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Cipher Ops.............%u\n",
+			       atomic_read(&ipriv->op_counts[SPU_OP_CIPHER]));
+	for (alg = 0; alg < CIPHER_ALG_LAST; alg++) {
+		for (mode = 0; mode < CIPHER_MODE_LAST; mode++) {
+			op_cnt = atomic_read(&ipriv->cipher_cnt[alg][mode]);
+			if (op_cnt) {
+				out_offset += snprintf(buf + out_offset,
+						       out_count - out_offset,
+			       "  %-13s%11u\n",
+			       spu_alg_name(alg, mode), op_cnt);
+			}
+		}
+	}
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Hash Ops...............%u\n",
+			       atomic_read(&ipriv->op_counts[SPU_OP_HASH]));
+	for (alg = 0; alg < HASH_ALG_LAST; alg++) {
+		op_cnt = atomic_read(&ipriv->hash_cnt[alg]);
+		if (op_cnt) {
+			out_offset += snprintf(buf + out_offset,
+					       out_count - out_offset,
+		       "  %-13s%11u\n",
+		       hash_alg_name[alg], op_cnt);
+		}
+	}
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "HMAC setkey............%u\n",
+			       atomic_read(&ipriv->setkey_cnt[SPU_OP_HMAC]));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "HMAC Ops...............%u\n",
+			       atomic_read(&ipriv->op_counts[SPU_OP_HMAC]));
+	for (alg = 0; alg < HASH_ALG_LAST; alg++) {
+		op_cnt = atomic_read(&ipriv->hmac_cnt[alg]);
+		if (op_cnt) {
+			out_offset += snprintf(buf + out_offset,
+					       out_count - out_offset,
+		       "  %-13s%11u\n",
+		       hash_alg_name[alg], op_cnt);
+		}
+	}
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "AEAD setkey............%u\n",
+			       atomic_read(&ipriv->setkey_cnt[SPU_OP_AEAD]));
+
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "AEAD Ops...............%u\n",
+			       atomic_read(&ipriv->op_counts[SPU_OP_AEAD]));
+	for (alg = 0; alg < AEAD_TYPE_LAST; alg++) {
+		op_cnt = atomic_read(&ipriv->aead_cnt[alg]);
+		if (op_cnt) {
+			out_offset += snprintf(buf + out_offset,
+					       out_count - out_offset,
+		       "  %-13s%11u\n",
+		       aead_alg_name[alg], op_cnt);
+		}
+	}
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Bytes of req data......%llu\n",
+			       (u64)atomic64_read(&ipriv->bytes_out));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Bytes of resp data.....%llu\n",
+			       (u64)atomic64_read(&ipriv->bytes_in));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Mailbox full...........%u\n",
+			       atomic_read(&ipriv->mb_no_spc));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Mailbox send failures..%u\n",
+			       atomic_read(&ipriv->mb_send_fail));
+	out_offset += snprintf(buf + out_offset, out_count - out_offset,
+			       "Check ICV errors.......%u\n",
+			       atomic_read(&ipriv->bad_icv));
+	if (ipriv->spu.spu_type == SPU_TYPE_SPUM)
+		for (i = 0; i < ipriv->spu.num_spu; i++) {
+			spu_ofifo_ctrl = ioread32(ipriv->spu.reg_vbase[i] +
+						  SPU_OFIFO_CTRL);
+			fifo_len = spu_ofifo_ctrl & SPU_FIFO_WATERMARK;
+			out_offset += snprintf(buf + out_offset,
+					       out_count - out_offset,
+				       "SPU %d output FIFO high water.....%u\n",
+				       i, fifo_len);
+		}
+
+	if (out_offset > out_count)
+		out_offset = out_count;
+
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+	kfree(buf);
+	return ret;
+}
+
+static const struct file_operations spu_debugfs_stats = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = spu_debugfs_read,
+};
+
+/*
+ * Create the debug FS directories. If the top-level directory has not yet
+ * been created, create it now. Create a stats file in this directory for
+ * a SPU.
+ */
+void spu_setup_debugfs(void)
+{
+	if (!debugfs_initialized())
+		return;
+
+	if (!iproc_priv.debugfs_dir)
+		iproc_priv.debugfs_dir = debugfs_create_dir(KBUILD_MODNAME,
+							    NULL);
+
+	if (!iproc_priv.debugfs_stats)
+		/* Create file with permissions S_IRUSR */
+		debugfs_create_file("stats", 0400, iproc_priv.debugfs_dir,
+				    &iproc_priv, &spu_debugfs_stats);
+}
+
+void spu_free_debugfs(void)
+{
+	debugfs_remove_recursive(iproc_priv.debugfs_dir);
+	iproc_priv.debugfs_dir = NULL;
+}
+
+/**
+ * format_value_ccm() - Format a value into a buffer, using a specified number
+ *			of bytes (i.e. maybe writing value X into a 4 byte
+ *			buffer, or maybe into a 12 byte buffer), as per the
+ *			SPU CCM spec.
+ *
+ * @val:		value to write (up to max of unsigned int)
+ * @buf:		(pointer to) buffer to write the value
+ * @len:		number of bytes to use (0 to 255)
+ *
+ */
+void format_value_ccm(unsigned int val, u8 *buf, u8 len)
+{
+	int i;
+
+	/* First clear full output buffer */
+	memset(buf, 0, len);
+
+	/* Then, starting from right side, fill in with data */
+	for (i = 0; i < len; i++) {
+		buf[len - i - 1] = (val >> (8 * i)) & 0xff;
+		if (i >= 3)
+			break;  /* Only handle up to 32 bits of 'val' */
+	}
+}
diff --git a/drivers/crypto/bcm/util.h b/drivers/crypto/bcm/util.h
new file mode 100644
index 000000000000..712e029795f8
--- /dev/null
+++ b/drivers/crypto/bcm/util.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#ifndef _UTIL_H
+#define _UTIL_H
+
+#include <linux/kernel.h>
+#include <linux/delay.h>
+
+#include "spu.h"
+
+extern int flow_debug_logging;
+extern int packet_debug_logging;
+extern int debug_logging_sleep;
+
+#ifdef DEBUG
+#define flow_log(...)	                \
+	do {	                              \
+		if (flow_debug_logging) {	        \
+			printk(__VA_ARGS__);	          \
+			if (debug_logging_sleep)	      \
+				msleep(debug_logging_sleep);	\
+		}	                                \
+	} while (0)
+#define flow_dump(msg, var, var_len)	   \
+	do {	                                 \
+		if (flow_debug_logging) {	           \
+			print_hex_dump(KERN_ALERT, msg, DUMP_PREFIX_NONE,  \
+					16, 1, var, var_len, false); \
+				if (debug_logging_sleep)	       \
+					msleep(debug_logging_sleep);   \
+		}                                    \
+	} while (0)
+
+#define packet_log(...)               \
+	do {                                \
+		if (packet_debug_logging) {       \
+			printk(__VA_ARGS__);            \
+			if (debug_logging_sleep)        \
+				msleep(debug_logging_sleep);  \
+		}                                 \
+	} while (0)
+#define packet_dump(msg, var, var_len)   \
+	do {                                   \
+		if (packet_debug_logging) {          \
+			print_hex_dump(KERN_ALERT, msg, DUMP_PREFIX_NONE,  \
+					16, 1, var, var_len, false); \
+			if (debug_logging_sleep)           \
+				msleep(debug_logging_sleep);     \
+		}                                    \
+	} while (0)
+
+void __dump_sg(struct scatterlist *sg, unsigned int skip, unsigned int len);
+
+#define dump_sg(sg, skip, len)     __dump_sg(sg, skip, len)
+
+#else /* !DEBUG_ON */
+
+#define flow_log(...) do {} while (0)
+#define flow_dump(msg, var, var_len) do {} while (0)
+#define packet_log(...) do {} while (0)
+#define packet_dump(msg, var, var_len) do {} while (0)
+
+#define dump_sg(sg, skip, len) do {} while (0)
+
+#endif /* DEBUG_ON */
+
+int spu_sg_at_offset(struct scatterlist *sg, unsigned int skip,
+		     struct scatterlist **sge, unsigned int *sge_offset);
+
+/* Copy sg data, from skip, length len, to dest */
+void sg_copy_part_to_buf(struct scatterlist *src, u8 *dest,
+			 unsigned int len, unsigned int skip);
+/* Copy src into scatterlist from offset, length len */
+void sg_copy_part_from_buf(struct scatterlist *dest, u8 *src,
+			   unsigned int len, unsigned int skip);
+
+int spu_sg_count(struct scatterlist *sg_list, unsigned int skip, int nbytes);
+u32 spu_msg_sg_add(struct scatterlist **to_sg,
+		   struct scatterlist **from_sg, u32 *skip,
+		   u8 from_nents, u32 tot_len);
+
+void add_to_ctr(u8 *ctr_pos, unsigned int increment);
+
+/* do a synchronous decrypt operation */
+int do_decrypt(char *alg_name,
+	       void *key_ptr, unsigned int key_len,
+	       void *iv_ptr, void *src_ptr, void *dst_ptr,
+	       unsigned int block_len);
+
+/* produce a message digest from data of length n bytes */
+int do_shash(unsigned char *name, unsigned char *result,
+	     const u8 *data1, unsigned int data1_len,
+	     const u8 *data2, unsigned int data2_len,
+	     const u8 *key, unsigned int key_len);
+
+char *spu_alg_name(enum spu_cipher_alg alg, enum spu_cipher_mode mode);
+
+void spu_setup_debugfs(void);
+void spu_free_debugfs(void);
+void format_value_ccm(unsigned int val, u8 *buf, u8 len);
+
+#endif
diff --git a/drivers/crypto/bfin_crc.c b/drivers/crypto/bfin_crc.c
index 10db7df366c8..a118b9bed669 100644
--- a/drivers/crypto/bfin_crc.c
+++ b/drivers/crypto/bfin_crc.c
@@ -203,7 +203,7 @@ static void bfin_crypto_crc_config_dma(struct bfin_crypto_crc *crc)
 			crc->sg_cpu[i].x_count = 1;
 			crc->sg_cpu[i].x_modify = CHKSUM_DIGEST_SIZE;
 			dev_dbg(crc->dev, "%d: crc_dma: start_addr:0x%lx, "
-				"cfg:0x%lx, x_count:0x%lx, x_modify:0x%lx\n",
+				"cfg:0x%x, x_count:0x%x, x_modify:0x%x\n",
 				i, crc->sg_cpu[i].start_addr,
 				crc->sg_cpu[i].cfg, crc->sg_cpu[i].x_count,
 				crc->sg_cpu[i].x_modify);
@@ -233,7 +233,7 @@ static void bfin_crypto_crc_config_dma(struct bfin_crypto_crc *crc)
 		crc->sg_cpu[i].x_count = dma_count;
 		crc->sg_cpu[i].x_modify = dma_mod;
 		dev_dbg(crc->dev, "%d: crc_dma: start_addr:0x%lx, "
-			"cfg:0x%lx, x_count:0x%lx, x_modify:0x%lx\n",
+			"cfg:0x%x, x_count:0x%x, x_modify:0x%x\n",
 			i, crc->sg_cpu[i].start_addr,
 			crc->sg_cpu[i].cfg, crc->sg_cpu[i].x_count,
 			crc->sg_cpu[i].x_modify);
@@ -257,7 +257,7 @@ static void bfin_crypto_crc_config_dma(struct bfin_crypto_crc *crc)
 		crc->sg_cpu[i].x_count = 1;
 		crc->sg_cpu[i].x_modify = CHKSUM_DIGEST_SIZE;
 		dev_dbg(crc->dev, "%d: crc_dma: start_addr:0x%lx, "
-			"cfg:0x%lx, x_count:0x%lx, x_modify:0x%lx\n",
+			"cfg:0x%x, x_count:0x%x, x_modify:0x%x\n",
 			i, crc->sg_cpu[i].start_addr,
 			crc->sg_cpu[i].cfg, crc->sg_cpu[i].x_count,
 			crc->sg_cpu[i].x_modify);
diff --git a/drivers/crypto/bfin_crc.h b/drivers/crypto/bfin_crc.h
index 75cef4dc85a1..786ef746d109 100644
--- a/drivers/crypto/bfin_crc.h
+++ b/drivers/crypto/bfin_crc.h
@@ -55,7 +55,6 @@ struct crc_info {
 
 #include <linux/types.h>
 #include <linux/spinlock.h>
-#include <linux/miscdevice.h>
 
 struct crc_register {
 	u32 control;
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 662fe94cb2f8..9bc80eb06934 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -134,15 +134,15 @@ struct caam_aead_alg {
  * per-session context
  */
 struct caam_ctx {
-	struct device *jrdev;
 	u32 sh_desc_enc[DESC_MAX_USED_LEN];
 	u32 sh_desc_dec[DESC_MAX_USED_LEN];
 	u32 sh_desc_givenc[DESC_MAX_USED_LEN];
+	u8 key[CAAM_MAX_KEY_SIZE];
 	dma_addr_t sh_desc_enc_dma;
 	dma_addr_t sh_desc_dec_dma;
 	dma_addr_t sh_desc_givenc_dma;
-	u8 key[CAAM_MAX_KEY_SIZE];
 	dma_addr_t key_dma;
+	struct device *jrdev;
 	struct alginfo adata;
 	struct alginfo cdata;
 	unsigned int authsize;
@@ -171,13 +171,8 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	/* aead_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_aead_null_encap(desc, &ctx->adata, ctx->authsize);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/*
 	 * Job Descriptor and Shared Descriptors
@@ -194,13 +189,8 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	/* aead_decrypt shared descriptor */
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_aead_null_decap(desc, &ctx->adata, ctx->authsize);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	return 0;
 }
@@ -278,13 +268,8 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_aead_encap(desc, &ctx->cdata, &ctx->adata, ctx->authsize,
 			       is_rfc3686, nonce, ctx1_iv_off);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 skip_enc:
 	/*
@@ -315,13 +300,8 @@ skip_enc:
 	cnstr_shdsc_aead_decap(desc, &ctx->cdata, &ctx->adata, ivsize,
 			       ctx->authsize, alg->caam.geniv, is_rfc3686,
 			       nonce, ctx1_iv_off);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	if (!alg->caam.geniv)
 		goto skip_givenc;
@@ -354,13 +334,8 @@ skip_enc:
 	cnstr_shdsc_aead_givencap(desc, &ctx->cdata, &ctx->adata, ivsize,
 				  ctx->authsize, is_rfc3686, nonce,
 				  ctx1_iv_off);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 skip_givenc:
 	return 0;
@@ -403,13 +378,8 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_gcm_encap(desc, &ctx->cdata, ctx->authsize);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/*
 	 * Job Descriptor and Shared Descriptors
@@ -425,13 +395,8 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_gcm_decap(desc, &ctx->cdata, ctx->authsize);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	return 0;
 }
@@ -472,13 +437,8 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_rfc4106_encap(desc, &ctx->cdata, ctx->authsize);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/*
 	 * Job Descriptor and Shared Descriptors
@@ -494,13 +454,8 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_rfc4106_decap(desc, &ctx->cdata, ctx->authsize);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	return 0;
 }
@@ -542,13 +497,8 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_rfc4543_encap(desc, &ctx->cdata, ctx->authsize);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/*
 	 * Job Descriptor and Shared Descriptors
@@ -564,13 +514,8 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_rfc4543_decap(desc, &ctx->cdata, ctx->authsize);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	return 0;
 }
@@ -614,28 +559,15 @@ static int aead_setkey(struct crypto_aead *aead,
 
 	/* postpend encryption key to auth split key */
 	memcpy(ctx->key + ctx->adata.keylen_pad, keys.enckey, keys.enckeylen);
-
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->adata.keylen_pad +
-				      keys.enckeylen, DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->key_dma, ctx->adata.keylen_pad +
+				   keys.enckeylen, DMA_TO_DEVICE);
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "ctx.key@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, ctx->key,
 		       ctx->adata.keylen_pad + keys.enckeylen, 1);
 #endif
-
 	ctx->cdata.keylen = keys.enckeylen;
-
-	ret = aead_set_sh_desc(aead);
-	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->adata.keylen_pad +
-				 keys.enckeylen, DMA_TO_DEVICE);
-	}
-
-	return ret;
+	return aead_set_sh_desc(aead);
 badkey:
 	crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
 	return -EINVAL;
@@ -646,7 +578,6 @@ static int gcm_setkey(struct crypto_aead *aead,
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	int ret = 0;
 
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "key in @"__stringify(__LINE__)": ",
@@ -654,21 +585,10 @@ static int gcm_setkey(struct crypto_aead *aead,
 #endif
 
 	memcpy(ctx->key, key, keylen);
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, keylen,
-				      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->key_dma, keylen, DMA_TO_DEVICE);
 	ctx->cdata.keylen = keylen;
 
-	ret = gcm_set_sh_desc(aead);
-	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->cdata.keylen,
-				 DMA_TO_DEVICE);
-	}
-
-	return ret;
+	return gcm_set_sh_desc(aead);
 }
 
 static int rfc4106_setkey(struct crypto_aead *aead,
@@ -676,7 +596,6 @@ static int rfc4106_setkey(struct crypto_aead *aead,
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	int ret = 0;
 
 	if (keylen < 4)
 		return -EINVAL;
@@ -693,21 +612,9 @@ static int rfc4106_setkey(struct crypto_aead *aead,
 	 * in the nonce. Update the AES key length.
 	 */
 	ctx->cdata.keylen = keylen - 4;
-
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->cdata.keylen,
-				      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		return -ENOMEM;
-	}
-
-	ret = rfc4106_set_sh_desc(aead);
-	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->cdata.keylen,
-				 DMA_TO_DEVICE);
-	}
-
-	return ret;
+	dma_sync_single_for_device(jrdev, ctx->key_dma, ctx->cdata.keylen,
+				   DMA_TO_DEVICE);
+	return rfc4106_set_sh_desc(aead);
 }
 
 static int rfc4543_setkey(struct crypto_aead *aead,
@@ -715,7 +622,6 @@ static int rfc4543_setkey(struct crypto_aead *aead,
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	int ret = 0;
 
 	if (keylen < 4)
 		return -EINVAL;
@@ -732,21 +638,9 @@ static int rfc4543_setkey(struct crypto_aead *aead,
 	 * in the nonce. Update the AES key length.
 	 */
 	ctx->cdata.keylen = keylen - 4;
-
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->cdata.keylen,
-				      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		return -ENOMEM;
-	}
-
-	ret = rfc4543_set_sh_desc(aead);
-	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->cdata.keylen,
-				 DMA_TO_DEVICE);
-	}
-
-	return ret;
+	dma_sync_single_for_device(jrdev, ctx->key_dma, ctx->cdata.keylen,
+				   DMA_TO_DEVICE);
+	return rfc4543_set_sh_desc(aead);
 }
 
 static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
@@ -787,12 +681,7 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 		keylen -= CTR_RFC3686_NONCE_SIZE;
 	}
 
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, keylen,
-				      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->key_dma, keylen, DMA_TO_DEVICE);
 	ctx->cdata.keylen = keylen;
 	ctx->cdata.key_virt = ctx->key;
 	ctx->cdata.key_inline = true;
@@ -801,37 +690,22 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_ablkcipher_encap(desc, &ctx->cdata, ivsize, is_rfc3686,
 				     ctx1_iv_off);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/* ablkcipher_decrypt shared descriptor */
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_ablkcipher_decap(desc, &ctx->cdata, ivsize, is_rfc3686,
 				     ctx1_iv_off);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc,
-					      desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/* ablkcipher_givencrypt shared descriptor */
 	desc = ctx->sh_desc_givenc;
 	cnstr_shdsc_ablkcipher_givencap(desc, &ctx->cdata, ivsize, is_rfc3686,
 					ctx1_iv_off);
-	ctx->sh_desc_givenc_dma = dma_map_single(jrdev, desc,
-						 desc_bytes(desc),
-						 DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_givenc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_givenc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	return 0;
 }
@@ -851,11 +725,7 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	}
 
 	memcpy(ctx->key, key, keylen);
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, keylen, DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->key_dma, keylen, DMA_TO_DEVICE);
 	ctx->cdata.keylen = keylen;
 	ctx->cdata.key_virt = ctx->key;
 	ctx->cdata.key_inline = true;
@@ -863,32 +733,22 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	/* xts_ablkcipher_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
 	cnstr_shdsc_xts_ablkcipher_encap(desc, &ctx->cdata);
-	ctx->sh_desc_enc_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_enc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_enc_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	/* xts_ablkcipher_decrypt shared descriptor */
 	desc = ctx->sh_desc_dec;
 	cnstr_shdsc_xts_ablkcipher_decap(desc, &ctx->cdata);
-	ctx->sh_desc_dec_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dec_dma)) {
-		dma_unmap_single(jrdev, ctx->sh_desc_enc_dma,
-				 desc_bytes(ctx->sh_desc_enc), DMA_TO_DEVICE);
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_dec_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 
 	return 0;
 }
 
 /*
  * aead_edesc - s/w-extended aead descriptor
- * @src_nents: number of segments in input scatterlist
- * @dst_nents: number of segments in output scatterlist
+ * @src_nents: number of segments in input s/w scatterlist
+ * @dst_nents: number of segments in output s/w scatterlist
  * @sec4_sg_bytes: length of dma mapped sec4_sg space
  * @sec4_sg_dma: bus physical mapped address of h/w link table
  * @sec4_sg: pointer to h/w link table
@@ -905,8 +765,8 @@ struct aead_edesc {
 
 /*
  * ablkcipher_edesc - s/w-extended ablkcipher descriptor
- * @src_nents: number of segments in input scatterlist
- * @dst_nents: number of segments in output scatterlist
+ * @src_nents: number of segments in input s/w scatterlist
+ * @dst_nents: number of segments in output s/w scatterlist
  * @iv_dma: dma address of iv for checking continuity and link table
  * @sec4_sg_bytes: length of dma mapped sec4_sg space
  * @sec4_sg_dma: bus physical mapped address of h/w link table
@@ -930,10 +790,11 @@ static void caam_unmap(struct device *dev, struct scatterlist *src,
 		       int sec4_sg_bytes)
 {
 	if (dst != src) {
-		dma_unmap_sg(dev, src, src_nents ? : 1, DMA_TO_DEVICE);
-		dma_unmap_sg(dev, dst, dst_nents ? : 1, DMA_FROM_DEVICE);
+		if (src_nents)
+			dma_unmap_sg(dev, src, src_nents, DMA_TO_DEVICE);
+		dma_unmap_sg(dev, dst, dst_nents, DMA_FROM_DEVICE);
 	} else {
-		dma_unmap_sg(dev, src, src_nents ? : 1, DMA_BIDIRECTIONAL);
+		dma_unmap_sg(dev, src, src_nents, DMA_BIDIRECTIONAL);
 	}
 
 	if (iv_dma)
@@ -1102,7 +963,7 @@ static void init_aead_job(struct aead_request *req,
 	init_job_desc_shared(desc, ptr, len, HDR_SHARE_DEFER | HDR_REVERSE);
 
 	if (all_contig) {
-		src_dma = sg_dma_address(req->src);
+		src_dma = edesc->src_nents ? sg_dma_address(req->src) : 0;
 		in_options = 0;
 	} else {
 		src_dma = edesc->sec4_sg_dma;
@@ -1117,7 +978,7 @@ static void init_aead_job(struct aead_request *req,
 	out_options = in_options;
 
 	if (unlikely(req->src != req->dst)) {
-		if (!edesc->dst_nents) {
+		if (edesc->dst_nents == 1) {
 			dst_dma = sg_dma_address(req->dst);
 		} else {
 			dst_dma = edesc->sec4_sg_dma +
@@ -1227,10 +1088,11 @@ static void init_ablkcipher_job(u32 *sh_desc, dma_addr_t ptr,
 	print_hex_dump(KERN_ERR, "presciv@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, req->info,
 		       ivsize, 1);
-	printk(KERN_ERR "asked=%d, nbytes%d\n", (int)edesc->src_nents ? 100 : req->nbytes, req->nbytes);
+	pr_err("asked=%d, nbytes%d\n",
+	       (int)edesc->src_nents > 1 ? 100 : req->nbytes, req->nbytes);
 	dbg_dump_sg(KERN_ERR, "src    @"__stringify(__LINE__)": ",
 		    DUMP_PREFIX_ADDRESS, 16, 4, req->src,
-		    edesc->src_nents ? 100 : req->nbytes, 1);
+		    edesc->src_nents > 1 ? 100 : req->nbytes, 1);
 #endif
 
 	len = desc_len(sh_desc);
@@ -1247,7 +1109,7 @@ static void init_ablkcipher_job(u32 *sh_desc, dma_addr_t ptr,
 	append_seq_in_ptr(desc, src_dma, req->nbytes + ivsize, in_options);
 
 	if (likely(req->src == req->dst)) {
-		if (!edesc->src_nents && iv_contig) {
+		if (edesc->src_nents == 1 && iv_contig) {
 			dst_dma = sg_dma_address(req->src);
 		} else {
 			dst_dma = edesc->sec4_sg_dma +
@@ -1255,7 +1117,7 @@ static void init_ablkcipher_job(u32 *sh_desc, dma_addr_t ptr,
 			out_options = LDST_SGF;
 		}
 	} else {
-		if (!edesc->dst_nents) {
+		if (edesc->dst_nents == 1) {
 			dst_dma = sg_dma_address(req->dst);
 		} else {
 			dst_dma = edesc->sec4_sg_dma +
@@ -1287,13 +1149,13 @@ static void init_ablkcipher_giv_job(u32 *sh_desc, dma_addr_t ptr,
 		       ivsize, 1);
 	dbg_dump_sg(KERN_ERR, "src    @" __stringify(__LINE__) ": ",
 		    DUMP_PREFIX_ADDRESS, 16, 4, req->src,
-		    edesc->src_nents ? 100 : req->nbytes, 1);
+		    edesc->src_nents > 1 ? 100 : req->nbytes, 1);
 #endif
 
 	len = desc_len(sh_desc);
 	init_job_desc_shared(desc, ptr, len, HDR_SHARE_DEFER | HDR_REVERSE);
 
-	if (!edesc->src_nents) {
+	if (edesc->src_nents == 1) {
 		src_dma = sg_dma_address(req->src);
 		in_options = 0;
 	} else {
@@ -1326,83 +1188,98 @@ static struct aead_edesc *aead_edesc_alloc(struct aead_request *req,
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	int src_nents, dst_nents = 0;
+	int src_nents, mapped_src_nents, dst_nents = 0, mapped_dst_nents = 0;
 	struct aead_edesc *edesc;
-	int sgc;
-	bool all_contig = true;
-	int sec4_sg_index, sec4_sg_len = 0, sec4_sg_bytes;
+	int sec4_sg_index, sec4_sg_len, sec4_sg_bytes;
 	unsigned int authsize = ctx->authsize;
 
 	if (unlikely(req->dst != req->src)) {
-		src_nents = sg_count(req->src, req->assoclen + req->cryptlen);
-		dst_nents = sg_count(req->dst,
-				     req->assoclen + req->cryptlen +
-					(encrypt ? authsize : (-authsize)));
+		src_nents = sg_nents_for_len(req->src, req->assoclen +
+					     req->cryptlen);
+		if (unlikely(src_nents < 0)) {
+			dev_err(jrdev, "Insufficient bytes (%d) in src S/G\n",
+				req->assoclen + req->cryptlen);
+			return ERR_PTR(src_nents);
+		}
+
+		dst_nents = sg_nents_for_len(req->dst, req->assoclen +
+					     req->cryptlen +
+						(encrypt ? authsize :
+							   (-authsize)));
+		if (unlikely(dst_nents < 0)) {
+			dev_err(jrdev, "Insufficient bytes (%d) in dst S/G\n",
+				req->assoclen + req->cryptlen +
+				(encrypt ? authsize : (-authsize)));
+			return ERR_PTR(dst_nents);
+		}
 	} else {
-		src_nents = sg_count(req->src,
-				     req->assoclen + req->cryptlen +
-					(encrypt ? authsize : 0));
+		src_nents = sg_nents_for_len(req->src, req->assoclen +
+					     req->cryptlen +
+					     (encrypt ? authsize : 0));
+		if (unlikely(src_nents < 0)) {
+			dev_err(jrdev, "Insufficient bytes (%d) in src S/G\n",
+				req->assoclen + req->cryptlen +
+				(encrypt ? authsize : 0));
+			return ERR_PTR(src_nents);
+		}
 	}
 
-	/* Check if data are contiguous. */
-	all_contig = !src_nents;
-	if (!all_contig)
-		sec4_sg_len = src_nents;
+	if (likely(req->src == req->dst)) {
+		mapped_src_nents = dma_map_sg(jrdev, req->src, src_nents,
+					      DMA_BIDIRECTIONAL);
+		if (unlikely(!mapped_src_nents)) {
+			dev_err(jrdev, "unable to map source\n");
+			return ERR_PTR(-ENOMEM);
+		}
+	} else {
+		/* Cover also the case of null (zero length) input data */
+		if (src_nents) {
+			mapped_src_nents = dma_map_sg(jrdev, req->src,
+						      src_nents, DMA_TO_DEVICE);
+			if (unlikely(!mapped_src_nents)) {
+				dev_err(jrdev, "unable to map source\n");
+				return ERR_PTR(-ENOMEM);
+			}
+		} else {
+			mapped_src_nents = 0;
+		}
 
-	sec4_sg_len += dst_nents;
+		mapped_dst_nents = dma_map_sg(jrdev, req->dst, dst_nents,
+					      DMA_FROM_DEVICE);
+		if (unlikely(!mapped_dst_nents)) {
+			dev_err(jrdev, "unable to map destination\n");
+			dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
 
+	sec4_sg_len = mapped_src_nents > 1 ? mapped_src_nents : 0;
+	sec4_sg_len += mapped_dst_nents > 1 ? mapped_dst_nents : 0;
 	sec4_sg_bytes = sec4_sg_len * sizeof(struct sec4_sg_entry);
 
 	/* allocate space for base edesc and hw desc commands, link tables */
 	edesc = kzalloc(sizeof(*edesc) + desc_bytes + sec4_sg_bytes,
 			GFP_DMA | flags);
 	if (!edesc) {
-		dev_err(jrdev, "could not allocate extended descriptor\n");
+		caam_unmap(jrdev, req->src, req->dst, src_nents, dst_nents, 0,
+			   0, 0, 0);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (likely(req->src == req->dst)) {
-		sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
-				 DMA_BIDIRECTIONAL);
-		if (unlikely(!sgc)) {
-			dev_err(jrdev, "unable to map source\n");
-			kfree(edesc);
-			return ERR_PTR(-ENOMEM);
-		}
-	} else {
-		sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
-				 DMA_TO_DEVICE);
-		if (unlikely(!sgc)) {
-			dev_err(jrdev, "unable to map source\n");
-			kfree(edesc);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		sgc = dma_map_sg(jrdev, req->dst, dst_nents ? : 1,
-				 DMA_FROM_DEVICE);
-		if (unlikely(!sgc)) {
-			dev_err(jrdev, "unable to map destination\n");
-			dma_unmap_sg(jrdev, req->src, src_nents ? : 1,
-				     DMA_TO_DEVICE);
-			kfree(edesc);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-
 	edesc->src_nents = src_nents;
 	edesc->dst_nents = dst_nents;
 	edesc->sec4_sg = (void *)edesc + sizeof(struct aead_edesc) +
 			 desc_bytes;
-	*all_contig_ptr = all_contig;
+	*all_contig_ptr = !(mapped_src_nents > 1);
 
 	sec4_sg_index = 0;
-	if (!all_contig) {
-		sg_to_sec4_sg_last(req->src, src_nents,
-			      edesc->sec4_sg + sec4_sg_index, 0);
-		sec4_sg_index += src_nents;
+	if (mapped_src_nents > 1) {
+		sg_to_sec4_sg_last(req->src, mapped_src_nents,
+				   edesc->sec4_sg + sec4_sg_index, 0);
+		sec4_sg_index += mapped_src_nents;
 	}
-	if (dst_nents) {
-		sg_to_sec4_sg_last(req->dst, dst_nents,
+	if (mapped_dst_nents > 1) {
+		sg_to_sec4_sg_last(req->dst, mapped_dst_nents,
 				   edesc->sec4_sg + sec4_sg_index, 0);
 	}
 
@@ -1600,40 +1477,49 @@ static struct ablkcipher_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 					  CRYPTO_TFM_REQ_MAY_SLEEP)) ?
 		       GFP_KERNEL : GFP_ATOMIC;
-	int src_nents, dst_nents = 0, sec4_sg_bytes;
+	int src_nents, mapped_src_nents, dst_nents = 0, mapped_dst_nents = 0;
 	struct ablkcipher_edesc *edesc;
 	dma_addr_t iv_dma = 0;
-	bool iv_contig = false;
-	int sgc;
+	bool in_contig;
 	int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
-	int sec4_sg_index;
+	int dst_sg_idx, sec4_sg_ents, sec4_sg_bytes;
 
-	src_nents = sg_count(req->src, req->nbytes);
+	src_nents = sg_nents_for_len(req->src, req->nbytes);
+	if (unlikely(src_nents < 0)) {
+		dev_err(jrdev, "Insufficient bytes (%d) in src S/G\n",
+			req->nbytes);
+		return ERR_PTR(src_nents);
+	}
 
-	if (req->dst != req->src)
-		dst_nents = sg_count(req->dst, req->nbytes);
+	if (req->dst != req->src) {
+		dst_nents = sg_nents_for_len(req->dst, req->nbytes);
+		if (unlikely(dst_nents < 0)) {
+			dev_err(jrdev, "Insufficient bytes (%d) in dst S/G\n",
+				req->nbytes);
+			return ERR_PTR(dst_nents);
+		}
+	}
 
 	if (likely(req->src == req->dst)) {
-		sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
-				 DMA_BIDIRECTIONAL);
-		if (unlikely(!sgc)) {
+		mapped_src_nents = dma_map_sg(jrdev, req->src, src_nents,
+					      DMA_BIDIRECTIONAL);
+		if (unlikely(!mapped_src_nents)) {
 			dev_err(jrdev, "unable to map source\n");
 			return ERR_PTR(-ENOMEM);
 		}
 	} else {
-		sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
-				 DMA_TO_DEVICE);
-		if (unlikely(!sgc)) {
+		mapped_src_nents = dma_map_sg(jrdev, req->src, src_nents,
+					      DMA_TO_DEVICE);
+		if (unlikely(!mapped_src_nents)) {
 			dev_err(jrdev, "unable to map source\n");
 			return ERR_PTR(-ENOMEM);
 		}
 
-		sgc = dma_map_sg(jrdev, req->dst, dst_nents ? : 1,
-				 DMA_FROM_DEVICE);
-		if (unlikely(!sgc)) {
+		mapped_dst_nents = dma_map_sg(jrdev, req->dst, dst_nents,
+					      DMA_FROM_DEVICE);
+		if (unlikely(!mapped_dst_nents)) {
 			dev_err(jrdev, "unable to map destination\n");
-			dma_unmap_sg(jrdev, req->src, src_nents ? : 1,
-				     DMA_TO_DEVICE);
+			dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 			return ERR_PTR(-ENOMEM);
 		}
 	}
@@ -1646,16 +1532,17 @@ static struct ablkcipher_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request
 		return ERR_PTR(-ENOMEM);
 	}
 
-	/*
-	 * Check if iv can be contiguous with source and destination.
-	 * If so, include it. If not, create scatterlist.
-	 */
-	if (!src_nents && iv_dma + ivsize == sg_dma_address(req->src))
-		iv_contig = true;
-	else
-		src_nents = src_nents ? : 1;
-	sec4_sg_bytes = ((iv_contig ? 0 : 1) + src_nents + dst_nents) *
-			sizeof(struct sec4_sg_entry);
+	if (mapped_src_nents == 1 &&
+	    iv_dma + ivsize == sg_dma_address(req->src)) {
+		in_contig = true;
+		sec4_sg_ents = 0;
+	} else {
+		in_contig = false;
+		sec4_sg_ents = 1 + mapped_src_nents;
+	}
+	dst_sg_idx = sec4_sg_ents;
+	sec4_sg_ents += mapped_dst_nents > 1 ? mapped_dst_nents : 0;
+	sec4_sg_bytes = sec4_sg_ents * sizeof(struct sec4_sg_entry);
 
 	/* allocate space for base edesc and hw desc commands, link tables */
 	edesc = kzalloc(sizeof(*edesc) + desc_bytes + sec4_sg_bytes,
@@ -1673,17 +1560,15 @@ static struct ablkcipher_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request
 	edesc->sec4_sg = (void *)edesc + sizeof(struct ablkcipher_edesc) +
 			 desc_bytes;
 
-	sec4_sg_index = 0;
-	if (!iv_contig) {
+	if (!in_contig) {
 		dma_to_sec4_sg_one(edesc->sec4_sg, iv_dma, ivsize, 0);
-		sg_to_sec4_sg_last(req->src, src_nents,
+		sg_to_sec4_sg_last(req->src, mapped_src_nents,
 				   edesc->sec4_sg + 1, 0);
-		sec4_sg_index += 1 + src_nents;
 	}
 
-	if (dst_nents) {
-		sg_to_sec4_sg_last(req->dst, dst_nents,
-			edesc->sec4_sg + sec4_sg_index, 0);
+	if (mapped_dst_nents > 1) {
+		sg_to_sec4_sg_last(req->dst, mapped_dst_nents,
+				   edesc->sec4_sg + dst_sg_idx, 0);
 	}
 
 	edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
@@ -1704,7 +1589,7 @@ static struct ablkcipher_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request
 		       sec4_sg_bytes, 1);
 #endif
 
-	*iv_contig_out = iv_contig;
+	*iv_contig_out = in_contig;
 	return edesc;
 }
 
@@ -1798,40 +1683,50 @@ static struct ablkcipher_edesc *ablkcipher_giv_edesc_alloc(
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 					  CRYPTO_TFM_REQ_MAY_SLEEP)) ?
 		       GFP_KERNEL : GFP_ATOMIC;
-	int src_nents, dst_nents = 0, sec4_sg_bytes;
+	int src_nents, mapped_src_nents, dst_nents, mapped_dst_nents;
 	struct ablkcipher_edesc *edesc;
 	dma_addr_t iv_dma = 0;
-	bool iv_contig = false;
-	int sgc;
+	bool out_contig;
 	int ivsize = crypto_ablkcipher_ivsize(ablkcipher);
-	int sec4_sg_index;
+	int dst_sg_idx, sec4_sg_ents, sec4_sg_bytes;
 
-	src_nents = sg_count(req->src, req->nbytes);
-
-	if (unlikely(req->dst != req->src))
-		dst_nents = sg_count(req->dst, req->nbytes);
+	src_nents = sg_nents_for_len(req->src, req->nbytes);
+	if (unlikely(src_nents < 0)) {
+		dev_err(jrdev, "Insufficient bytes (%d) in src S/G\n",
+			req->nbytes);
+		return ERR_PTR(src_nents);
+	}
 
 	if (likely(req->src == req->dst)) {
-		sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
-				 DMA_BIDIRECTIONAL);
-		if (unlikely(!sgc)) {
-			dev_err(jrdev, "unable to map source\n");
-			return ERR_PTR(-ENOMEM);
-		}
-	} else {
-		sgc = dma_map_sg(jrdev, req->src, src_nents ? : 1,
-				 DMA_TO_DEVICE);
-		if (unlikely(!sgc)) {
+		mapped_src_nents = dma_map_sg(jrdev, req->src, src_nents,
+					      DMA_BIDIRECTIONAL);
+		if (unlikely(!mapped_src_nents)) {
 			dev_err(jrdev, "unable to map source\n");
 			return ERR_PTR(-ENOMEM);
 		}
 
-		sgc = dma_map_sg(jrdev, req->dst, dst_nents ? : 1,
-				 DMA_FROM_DEVICE);
-		if (unlikely(!sgc)) {
+		dst_nents = src_nents;
+		mapped_dst_nents = src_nents;
+	} else {
+		mapped_src_nents = dma_map_sg(jrdev, req->src, src_nents,
+					      DMA_TO_DEVICE);
+		if (unlikely(!mapped_src_nents)) {
+			dev_err(jrdev, "unable to map source\n");
+			return ERR_PTR(-ENOMEM);
+		}
+
+		dst_nents = sg_nents_for_len(req->dst, req->nbytes);
+		if (unlikely(dst_nents < 0)) {
+			dev_err(jrdev, "Insufficient bytes (%d) in dst S/G\n",
+				req->nbytes);
+			return ERR_PTR(dst_nents);
+		}
+
+		mapped_dst_nents = dma_map_sg(jrdev, req->dst, dst_nents,
+					      DMA_FROM_DEVICE);
+		if (unlikely(!mapped_dst_nents)) {
 			dev_err(jrdev, "unable to map destination\n");
-			dma_unmap_sg(jrdev, req->src, src_nents ? : 1,
-				     DMA_TO_DEVICE);
+			dma_unmap_sg(jrdev, req->src, src_nents, DMA_TO_DEVICE);
 			return ERR_PTR(-ENOMEM);
 		}
 	}
@@ -1848,14 +1743,18 @@ static struct ablkcipher_edesc *ablkcipher_giv_edesc_alloc(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (!dst_nents && iv_dma + ivsize == sg_dma_address(req->dst))
-		iv_contig = true;
-	else
-		dst_nents = dst_nents ? : 1;
-	sec4_sg_bytes = ((iv_contig ? 0 : 1) + src_nents + dst_nents) *
-			sizeof(struct sec4_sg_entry);
+	sec4_sg_ents = mapped_src_nents > 1 ? mapped_src_nents : 0;
+	dst_sg_idx = sec4_sg_ents;
+	if (mapped_dst_nents == 1 &&
+	    iv_dma + ivsize == sg_dma_address(req->dst)) {
+		out_contig = true;
+	} else {
+		out_contig = false;
+		sec4_sg_ents += 1 + mapped_dst_nents;
+	}
 
 	/* allocate space for base edesc and hw desc commands, link tables */
+	sec4_sg_bytes = sec4_sg_ents * sizeof(struct sec4_sg_entry);
 	edesc = kzalloc(sizeof(*edesc) + desc_bytes + sec4_sg_bytes,
 			GFP_DMA | flags);
 	if (!edesc) {
@@ -1871,18 +1770,15 @@ static struct ablkcipher_edesc *ablkcipher_giv_edesc_alloc(
 	edesc->sec4_sg = (void *)edesc + sizeof(struct ablkcipher_edesc) +
 			 desc_bytes;
 
-	sec4_sg_index = 0;
-	if (src_nents) {
-		sg_to_sec4_sg_last(req->src, src_nents, edesc->sec4_sg, 0);
-		sec4_sg_index += src_nents;
-	}
+	if (mapped_src_nents > 1)
+		sg_to_sec4_sg_last(req->src, mapped_src_nents, edesc->sec4_sg,
+				   0);
 
-	if (!iv_contig) {
-		dma_to_sec4_sg_one(edesc->sec4_sg + sec4_sg_index,
+	if (!out_contig) {
+		dma_to_sec4_sg_one(edesc->sec4_sg + dst_sg_idx,
 				   iv_dma, ivsize, 0);
-		sec4_sg_index += 1;
-		sg_to_sec4_sg_last(req->dst, dst_nents,
-				   edesc->sec4_sg + sec4_sg_index, 0);
+		sg_to_sec4_sg_last(req->dst, mapped_dst_nents,
+				   edesc->sec4_sg + dst_sg_idx + 1, 0);
 	}
 
 	edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
@@ -1903,7 +1799,7 @@ static struct ablkcipher_edesc *ablkcipher_giv_edesc_alloc(
 		       sec4_sg_bytes, 1);
 #endif
 
-	*iv_contig_out = iv_contig;
+	*iv_contig_out = out_contig;
 	return edesc;
 }
 
@@ -1914,7 +1810,7 @@ static int ablkcipher_givencrypt(struct skcipher_givcrypt_request *creq)
 	struct crypto_ablkcipher *ablkcipher = crypto_ablkcipher_reqtfm(req);
 	struct caam_ctx *ctx = crypto_ablkcipher_ctx(ablkcipher);
 	struct device *jrdev = ctx->jrdev;
-	bool iv_contig;
+	bool iv_contig = false;
 	u32 *desc;
 	int ret = 0;
 
@@ -3355,12 +3251,31 @@ struct caam_crypto_alg {
 
 static int caam_init_common(struct caam_ctx *ctx, struct caam_alg_entry *caam)
 {
+	dma_addr_t dma_addr;
+
 	ctx->jrdev = caam_jr_alloc();
 	if (IS_ERR(ctx->jrdev)) {
 		pr_err("Job Ring Device allocation for transform failed\n");
 		return PTR_ERR(ctx->jrdev);
 	}
 
+	dma_addr = dma_map_single_attrs(ctx->jrdev, ctx->sh_desc_enc,
+					offsetof(struct caam_ctx,
+						 sh_desc_enc_dma),
+					DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+	if (dma_mapping_error(ctx->jrdev, dma_addr)) {
+		dev_err(ctx->jrdev, "unable to map key, shared descriptors\n");
+		caam_jr_free(ctx->jrdev);
+		return -ENOMEM;
+	}
+
+	ctx->sh_desc_enc_dma = dma_addr;
+	ctx->sh_desc_dec_dma = dma_addr + offsetof(struct caam_ctx,
+						   sh_desc_dec);
+	ctx->sh_desc_givenc_dma = dma_addr + offsetof(struct caam_ctx,
+						      sh_desc_givenc);
+	ctx->key_dma = dma_addr + offsetof(struct caam_ctx, key);
+
 	/* copy descriptor header template value */
 	ctx->cdata.algtype = OP_TYPE_CLASS1_ALG | caam->class1_alg_type;
 	ctx->adata.algtype = OP_TYPE_CLASS2_ALG | caam->class2_alg_type;
@@ -3390,25 +3305,9 @@ static int caam_aead_init(struct crypto_aead *tfm)
 
 static void caam_exit_common(struct caam_ctx *ctx)
 {
-	if (ctx->sh_desc_enc_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_enc_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_enc_dma,
-				 desc_bytes(ctx->sh_desc_enc), DMA_TO_DEVICE);
-	if (ctx->sh_desc_dec_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_dec_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_dec_dma,
-				 desc_bytes(ctx->sh_desc_dec), DMA_TO_DEVICE);
-	if (ctx->sh_desc_givenc_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_givenc_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_givenc_dma,
-				 desc_bytes(ctx->sh_desc_givenc),
-				 DMA_TO_DEVICE);
-	if (ctx->key_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->key_dma))
-		dma_unmap_single(ctx->jrdev, ctx->key_dma,
-				 ctx->cdata.keylen + ctx->adata.keylen_pad,
-				 DMA_TO_DEVICE);
-
+	dma_unmap_single_attrs(ctx->jrdev, ctx->sh_desc_enc_dma,
+			       offsetof(struct caam_ctx, sh_desc_enc_dma),
+			       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 	caam_jr_free(ctx->jrdev);
 }
 
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index e58639ea53b1..da4f94eab3da 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -109,7 +109,6 @@ struct caam_hash_ctx {
 	dma_addr_t sh_desc_digest_dma;
 	struct device *jrdev;
 	u8 key[CAAM_MAX_HASH_KEY_SIZE];
-	dma_addr_t key_dma;
 	int ctx_len;
 	struct alginfo adata;
 };
@@ -138,6 +137,31 @@ struct caam_export_state {
 	int (*finup)(struct ahash_request *req);
 };
 
+static inline void switch_buf(struct caam_hash_state *state)
+{
+	state->current_buf ^= 1;
+}
+
+static inline u8 *current_buf(struct caam_hash_state *state)
+{
+	return state->current_buf ? state->buf_1 : state->buf_0;
+}
+
+static inline u8 *alt_buf(struct caam_hash_state *state)
+{
+	return state->current_buf ? state->buf_0 : state->buf_1;
+}
+
+static inline int *current_buflen(struct caam_hash_state *state)
+{
+	return state->current_buf ? &state->buflen_1 : &state->buflen_0;
+}
+
+static inline int *alt_buflen(struct caam_hash_state *state)
+{
+	return state->current_buf ? &state->buflen_0 : &state->buflen_1;
+}
+
 /* Common job descriptor seq in/out ptr routines */
 
 /* Map state->caam_ctx, and append seq_out_ptr command that points to it */
@@ -149,6 +173,7 @@ static inline int map_seq_out_ptr_ctx(u32 *desc, struct device *jrdev,
 					ctx_len, DMA_FROM_DEVICE);
 	if (dma_mapping_error(jrdev, state->ctx_dma)) {
 		dev_err(jrdev, "unable to map ctx\n");
+		state->ctx_dma = 0;
 		return -ENOMEM;
 	}
 
@@ -169,36 +194,27 @@ static inline dma_addr_t map_seq_out_ptr_result(u32 *desc, struct device *jrdev,
 	return dst_dma;
 }
 
-/* Map current buffer in state and put it in link table */
-static inline dma_addr_t buf_map_to_sec4_sg(struct device *jrdev,
-					    struct sec4_sg_entry *sec4_sg,
-					    u8 *buf, int buflen)
+/* Map current buffer in state (if length > 0) and put it in link table */
+static inline int buf_map_to_sec4_sg(struct device *jrdev,
+				     struct sec4_sg_entry *sec4_sg,
+				     struct caam_hash_state *state)
 {
-	dma_addr_t buf_dma;
+	int buflen = *current_buflen(state);
 
-	buf_dma = dma_map_single(jrdev, buf, buflen, DMA_TO_DEVICE);
-	dma_to_sec4_sg_one(sec4_sg, buf_dma, buflen, 0);
+	if (!buflen)
+		return 0;
 
-	return buf_dma;
-}
+	state->buf_dma = dma_map_single(jrdev, current_buf(state), buflen,
+					DMA_TO_DEVICE);
+	if (dma_mapping_error(jrdev, state->buf_dma)) {
+		dev_err(jrdev, "unable to map buf\n");
+		state->buf_dma = 0;
+		return -ENOMEM;
+	}
 
-/*
- * Only put buffer in link table if it contains data, which is possible,
- * since a buffer has previously been used, and needs to be unmapped,
- */
-static inline dma_addr_t
-try_buf_map_to_sec4_sg(struct device *jrdev, struct sec4_sg_entry *sec4_sg,
-		       u8 *buf, dma_addr_t buf_dma, int buflen,
-		       int last_buflen)
-{
-	if (buf_dma && !dma_mapping_error(jrdev, buf_dma))
-		dma_unmap_single(jrdev, buf_dma, last_buflen, DMA_TO_DEVICE);
-	if (buflen)
-		buf_dma = buf_map_to_sec4_sg(jrdev, sec4_sg, buf, buflen);
-	else
-		buf_dma = 0;
+	dma_to_sec4_sg_one(sec4_sg, state->buf_dma, buflen, 0);
 
-	return buf_dma;
+	return 0;
 }
 
 /* Map state->caam_ctx, and add it to link table */
@@ -209,6 +225,7 @@ static inline int ctx_map_to_sec4_sg(u32 *desc, struct device *jrdev,
 	state->ctx_dma = dma_map_single(jrdev, state->caam_ctx, ctx_len, flag);
 	if (dma_mapping_error(jrdev, state->ctx_dma)) {
 		dev_err(jrdev, "unable to map ctx\n");
+		state->ctx_dma = 0;
 		return -ENOMEM;
 	}
 
@@ -277,12 +294,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_update shared descriptor */
 	desc = ctx->sh_desc_update;
 	ahash_gen_sh_desc(desc, OP_ALG_AS_UPDATE, ctx->ctx_len, ctx, true);
-	ctx->sh_desc_update_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
-						 DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_update_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_update_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR,
 		       "ahash update shdesc@"__stringify(__LINE__)": ",
@@ -292,13 +305,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_update_first shared descriptor */
 	desc = ctx->sh_desc_update_first;
 	ahash_gen_sh_desc(desc, OP_ALG_AS_INIT, ctx->ctx_len, ctx, false);
-	ctx->sh_desc_update_first_dma = dma_map_single(jrdev, desc,
-						       desc_bytes(desc),
-						       DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_update_first_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_update_first_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR,
 		       "ahash update first shdesc@"__stringify(__LINE__)": ",
@@ -308,12 +316,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_final shared descriptor */
 	desc = ctx->sh_desc_fin;
 	ahash_gen_sh_desc(desc, OP_ALG_AS_FINALIZE, digestsize, ctx, true);
-	ctx->sh_desc_fin_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
-					      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_fin_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_fin_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "ahash final shdesc@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, desc,
@@ -323,13 +327,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_digest shared descriptor */
 	desc = ctx->sh_desc_digest;
 	ahash_gen_sh_desc(desc, OP_ALG_AS_INITFINAL, digestsize, ctx, false);
-	ctx->sh_desc_digest_dma = dma_map_single(jrdev, desc,
-						 desc_bytes(desc),
-						 DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_digest_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
+	dma_sync_single_for_device(jrdev, ctx->sh_desc_digest_dma,
+				   desc_bytes(desc), DMA_TO_DEVICE);
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR,
 		       "ahash digest shdesc@"__stringify(__LINE__)": ",
@@ -420,7 +419,6 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 			const u8 *key, unsigned int keylen)
 {
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
-	struct device *jrdev = ctx->jrdev;
 	int blocksize = crypto_tfm_alg_blocksize(&ahash->base);
 	int digestsize = crypto_ahash_digestsize(ahash);
 	int ret;
@@ -448,28 +446,14 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 	if (ret)
 		goto bad_free_key;
 
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->adata.keylen_pad,
-				      DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->key_dma)) {
-		dev_err(jrdev, "unable to map key i/o memory\n");
-		ret = -ENOMEM;
-		goto error_free_key;
-	}
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "ctx.key@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, ctx->key,
 		       ctx->adata.keylen_pad, 1);
 #endif
 
-	ret = ahash_set_sh_desc(ahash);
-	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->adata.keylen_pad,
-				 DMA_TO_DEVICE);
-	}
-
- error_free_key:
 	kfree(hashed_key);
-	return ret;
+	return ahash_set_sh_desc(ahash);
  bad_free_key:
 	kfree(hashed_key);
 	crypto_ahash_set_flags(ahash, CRYPTO_TFM_RES_BAD_KEY_LEN);
@@ -498,6 +482,8 @@ static inline void ahash_unmap(struct device *dev,
 			struct ahash_edesc *edesc,
 			struct ahash_request *req, int dst_len)
 {
+	struct caam_hash_state *state = ahash_request_ctx(req);
+
 	if (edesc->src_nents)
 		dma_unmap_sg(dev, req->src, edesc->src_nents, DMA_TO_DEVICE);
 	if (edesc->dst_dma)
@@ -506,6 +492,12 @@ static inline void ahash_unmap(struct device *dev,
 	if (edesc->sec4_sg_bytes)
 		dma_unmap_single(dev, edesc->sec4_sg_dma,
 				 edesc->sec4_sg_bytes, DMA_TO_DEVICE);
+
+	if (state->buf_dma) {
+		dma_unmap_single(dev, state->buf_dma, *current_buflen(state),
+				 DMA_TO_DEVICE);
+		state->buf_dma = 0;
+	}
 }
 
 static inline void ahash_unmap_ctx(struct device *dev,
@@ -516,8 +508,10 @@ static inline void ahash_unmap_ctx(struct device *dev,
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
 	struct caam_hash_state *state = ahash_request_ctx(req);
 
-	if (state->ctx_dma)
+	if (state->ctx_dma) {
 		dma_unmap_single(dev, state->ctx_dma, ctx->ctx_len, flag);
+		state->ctx_dma = 0;
+	}
 	ahash_unmap(dev, edesc, req, dst_len);
 }
 
@@ -562,8 +556,8 @@ static void ahash_done_bi(struct device *jrdev, u32 *desc, u32 err,
 	struct ahash_edesc *edesc;
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
-#ifdef DEBUG
 	struct caam_hash_state *state = ahash_request_ctx(req);
+#ifdef DEBUG
 	int digestsize = crypto_ahash_digestsize(ahash);
 
 	dev_err(jrdev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
@@ -574,6 +568,7 @@ static void ahash_done_bi(struct device *jrdev, u32 *desc, u32 err,
 		caam_jr_strstatus(jrdev, err);
 
 	ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len, DMA_BIDIRECTIONAL);
+	switch_buf(state);
 	kfree(edesc);
 
 #ifdef DEBUG
@@ -630,8 +625,8 @@ static void ahash_done_ctx_dst(struct device *jrdev, u32 *desc, u32 err,
 	struct ahash_edesc *edesc;
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
-#ifdef DEBUG
 	struct caam_hash_state *state = ahash_request_ctx(req);
+#ifdef DEBUG
 	int digestsize = crypto_ahash_digestsize(ahash);
 
 	dev_err(jrdev, "%s %d: err 0x%x\n", __func__, __LINE__, err);
@@ -642,6 +637,7 @@ static void ahash_done_ctx_dst(struct device *jrdev, u32 *desc, u32 err,
 		caam_jr_strstatus(jrdev, err);
 
 	ahash_unmap_ctx(jrdev, edesc, req, ctx->ctx_len, DMA_FROM_DEVICE);
+	switch_buf(state);
 	kfree(edesc);
 
 #ifdef DEBUG
@@ -725,11 +721,10 @@ static int ahash_update_ctx(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int *buflen = state->current_buf ? &state->buflen_1 : &state->buflen_0;
-	u8 *next_buf = state->current_buf ? state->buf_0 : state->buf_1;
-	int *next_buflen = state->current_buf ? &state->buflen_0 :
-			   &state->buflen_1, last_buflen;
+	u8 *buf = current_buf(state);
+	int *buflen = current_buflen(state);
+	u8 *next_buf = alt_buf(state);
+	int *next_buflen = alt_buflen(state), last_buflen;
 	int in_len = *buflen + req->nbytes, to_hash;
 	u32 *desc;
 	int src_nents, mapped_nents, sec4_sg_bytes, sec4_sg_src_index;
@@ -783,10 +778,9 @@ static int ahash_update_ctx(struct ahash_request *req)
 		if (ret)
 			goto unmap_ctx;
 
-		state->buf_dma = try_buf_map_to_sec4_sg(jrdev,
-							edesc->sec4_sg + 1,
-							buf, state->buf_dma,
-							*buflen, last_buflen);
+		ret = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1, state);
+		if (ret)
+			goto unmap_ctx;
 
 		if (mapped_nents) {
 			sg_to_sec4_sg_last(req->src, mapped_nents,
@@ -801,8 +795,6 @@ static int ahash_update_ctx(struct ahash_request *req)
 				cpu_to_caam32(SEC4_SG_LEN_FIN);
 		}
 
-		state->current_buf = !state->current_buf;
-
 		desc = edesc->hw_desc;
 
 		edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
@@ -859,10 +851,7 @@ static int ahash_final_ctx(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
-	int last_buflen = state->current_buf ? state->buflen_0 :
-			  state->buflen_1;
+	int buflen = *current_buflen(state);
 	u32 *desc;
 	int sec4_sg_bytes, sec4_sg_src_index;
 	int digestsize = crypto_ahash_digestsize(ahash);
@@ -889,9 +878,10 @@ static int ahash_final_ctx(struct ahash_request *req)
 	if (ret)
 		goto unmap_ctx;
 
-	state->buf_dma = try_buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1,
-						buf, state->buf_dma, buflen,
-						last_buflen);
+	ret = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1, state);
+	if (ret)
+		goto unmap_ctx;
+
 	(edesc->sec4_sg + sec4_sg_src_index - 1)->len |=
 		cpu_to_caam32(SEC4_SG_LEN_FIN);
 
@@ -938,10 +928,7 @@ static int ahash_finup_ctx(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
-	int last_buflen = state->current_buf ? state->buflen_0 :
-			  state->buflen_1;
+	int buflen = *current_buflen(state);
 	u32 *desc;
 	int sec4_sg_src_index;
 	int src_nents, mapped_nents;
@@ -986,9 +973,9 @@ static int ahash_finup_ctx(struct ahash_request *req)
 	if (ret)
 		goto unmap_ctx;
 
-	state->buf_dma = try_buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1,
-						buf, state->buf_dma, buflen,
-						last_buflen);
+	ret = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg + 1, state);
+	if (ret)
+		goto unmap_ctx;
 
 	ret = ahash_edesc_add_src(ctx, edesc, req, mapped_nents,
 				  sec4_sg_src_index, ctx->ctx_len + buflen,
@@ -1024,6 +1011,7 @@ static int ahash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *ahash = crypto_ahash_reqtfm(req);
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
+	struct caam_hash_state *state = ahash_request_ctx(req);
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
@@ -1033,6 +1021,8 @@ static int ahash_digest(struct ahash_request *req)
 	struct ahash_edesc *edesc;
 	int ret;
 
+	state->buf_dma = 0;
+
 	src_nents = sg_nents_for_len(req->src, req->nbytes);
 	if (src_nents < 0) {
 		dev_err(jrdev, "Invalid number of src SG.\n");
@@ -1105,8 +1095,8 @@ static int ahash_final_no_ctx(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
+	u8 *buf = current_buf(state);
+	int buflen = *current_buflen(state);
 	u32 *desc;
 	int digestsize = crypto_ahash_digestsize(ahash);
 	struct ahash_edesc *edesc;
@@ -1166,11 +1156,10 @@ static int ahash_update_no_ctx(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int *buflen = state->current_buf ? &state->buflen_1 : &state->buflen_0;
-	u8 *next_buf = state->current_buf ? state->buf_0 : state->buf_1;
-	int *next_buflen = state->current_buf ? &state->buflen_0 :
-			   &state->buflen_1;
+	u8 *buf = current_buf(state);
+	int *buflen = current_buflen(state);
+	u8 *next_buf = alt_buf(state);
+	int *next_buflen = alt_buflen(state);
 	int in_len = *buflen + req->nbytes, to_hash;
 	int sec4_sg_bytes, src_nents, mapped_nents;
 	struct ahash_edesc *edesc;
@@ -1219,8 +1208,10 @@ static int ahash_update_no_ctx(struct ahash_request *req)
 		edesc->sec4_sg_bytes = sec4_sg_bytes;
 		edesc->dst_dma = 0;
 
-		state->buf_dma = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg,
-						    buf, *buflen);
+		ret = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg, state);
+		if (ret)
+			goto unmap_ctx;
+
 		sg_to_sec4_sg_last(req->src, mapped_nents,
 				   edesc->sec4_sg + 1, 0);
 
@@ -1230,8 +1221,6 @@ static int ahash_update_no_ctx(struct ahash_request *req)
 						 *next_buflen, 0);
 		}
 
-		state->current_buf = !state->current_buf;
-
 		desc = edesc->hw_desc;
 
 		edesc->sec4_sg_dma = dma_map_single(jrdev, edesc->sec4_sg,
@@ -1293,10 +1282,7 @@ static int ahash_finup_no_ctx(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int buflen = state->current_buf ? state->buflen_1 : state->buflen_0;
-	int last_buflen = state->current_buf ? state->buflen_0 :
-			  state->buflen_1;
+	int buflen = *current_buflen(state);
 	u32 *desc;
 	int sec4_sg_bytes, sec4_sg_src_index, src_nents, mapped_nents;
 	int digestsize = crypto_ahash_digestsize(ahash);
@@ -1338,9 +1324,9 @@ static int ahash_finup_no_ctx(struct ahash_request *req)
 	edesc->src_nents = src_nents;
 	edesc->sec4_sg_bytes = sec4_sg_bytes;
 
-	state->buf_dma = try_buf_map_to_sec4_sg(jrdev, edesc->sec4_sg, buf,
-						state->buf_dma, buflen,
-						last_buflen);
+	ret = buf_map_to_sec4_sg(jrdev, edesc->sec4_sg, state);
+	if (ret)
+		goto unmap;
 
 	ret = ahash_edesc_add_src(ctx, edesc, req, mapped_nents, 1, buflen,
 				  req->nbytes);
@@ -1386,9 +1372,8 @@ static int ahash_update_first(struct ahash_request *req)
 	struct device *jrdev = ctx->jrdev;
 	gfp_t flags = (req->base.flags & (CRYPTO_TFM_REQ_MAY_BACKLOG |
 		       CRYPTO_TFM_REQ_MAY_SLEEP)) ? GFP_KERNEL : GFP_ATOMIC;
-	u8 *next_buf = state->current_buf ? state->buf_1 : state->buf_0;
-	int *next_buflen = state->current_buf ?
-		&state->buflen_1 : &state->buflen_0;
+	u8 *next_buf = alt_buf(state);
+	int *next_buflen = alt_buflen(state);
 	int to_hash;
 	u32 *desc;
 	int src_nents, mapped_nents;
@@ -1470,6 +1455,7 @@ static int ahash_update_first(struct ahash_request *req)
 		state->final = ahash_final_no_ctx;
 		scatterwalk_map_and_copy(next_buf, req->src, 0,
 					 req->nbytes, 0);
+		switch_buf(state);
 	}
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "next buf@"__stringify(__LINE__)": ",
@@ -1497,6 +1483,7 @@ static int ahash_init(struct ahash_request *req)
 	state->finup = ahash_finup_first;
 	state->final = ahash_final_no_ctx;
 
+	state->ctx_dma = 0;
 	state->current_buf = 0;
 	state->buf_dma = 0;
 	state->buflen_0 = 0;
@@ -1732,6 +1719,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
 					 HASH_MSG_LEN + SHA256_DIGEST_SIZE,
 					 HASH_MSG_LEN + 64,
 					 HASH_MSG_LEN + SHA512_DIGEST_SIZE };
+	dma_addr_t dma_addr;
 
 	/*
 	 * Get a Job ring from Job Ring driver to ensure in-order
@@ -1742,6 +1730,26 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
 		pr_err("Job Ring Device allocation for transform failed\n");
 		return PTR_ERR(ctx->jrdev);
 	}
+
+	dma_addr = dma_map_single_attrs(ctx->jrdev, ctx->sh_desc_update,
+					offsetof(struct caam_hash_ctx,
+						 sh_desc_update_dma),
+					DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+	if (dma_mapping_error(ctx->jrdev, dma_addr)) {
+		dev_err(ctx->jrdev, "unable to map shared descriptors\n");
+		caam_jr_free(ctx->jrdev);
+		return -ENOMEM;
+	}
+
+	ctx->sh_desc_update_dma = dma_addr;
+	ctx->sh_desc_update_first_dma = dma_addr +
+					offsetof(struct caam_hash_ctx,
+						 sh_desc_update_first);
+	ctx->sh_desc_fin_dma = dma_addr + offsetof(struct caam_hash_ctx,
+						   sh_desc_fin);
+	ctx->sh_desc_digest_dma = dma_addr + offsetof(struct caam_hash_ctx,
+						      sh_desc_digest);
+
 	/* copy descriptor header template value */
 	ctx->adata.algtype = OP_TYPE_CLASS2_ALG | caam_hash->alg_type;
 
@@ -1758,26 +1766,10 @@ static void caam_hash_cra_exit(struct crypto_tfm *tfm)
 {
 	struct caam_hash_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	if (ctx->sh_desc_update_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_update_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_update_dma,
-				 desc_bytes(ctx->sh_desc_update),
-				 DMA_TO_DEVICE);
-	if (ctx->sh_desc_update_first_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_update_first_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_update_first_dma,
-				 desc_bytes(ctx->sh_desc_update_first),
-				 DMA_TO_DEVICE);
-	if (ctx->sh_desc_fin_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_fin_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_fin_dma,
-				 desc_bytes(ctx->sh_desc_fin), DMA_TO_DEVICE);
-	if (ctx->sh_desc_digest_dma &&
-	    !dma_mapping_error(ctx->jrdev, ctx->sh_desc_digest_dma))
-		dma_unmap_single(ctx->jrdev, ctx->sh_desc_digest_dma,
-				 desc_bytes(ctx->sh_desc_digest),
-				 DMA_TO_DEVICE);
-
+	dma_unmap_single_attrs(ctx->jrdev, ctx->sh_desc_update_dma,
+			       offsetof(struct caam_hash_ctx,
+					sh_desc_update_dma),
+			       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 	caam_jr_free(ctx->jrdev);
 }
 
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 755109841cfd..579f8263c479 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -13,7 +13,6 @@
 #include "intern.h"
 #include "jr.h"
 #include "desc_constr.h"
-#include "error.h"
 #include "ctrl.h"
 
 bool caam_little_end;
@@ -309,10 +308,8 @@ static int caam_remove(struct platform_device *pdev)
 	ctrl = (struct caam_ctrl __iomem *)ctrlpriv->ctrl;
 
 	/* Remove platform devices for JobRs */
-	for (ring = 0; ring < ctrlpriv->total_jobrs; ring++) {
-		if (ctrlpriv->jrpdev[ring])
-			of_device_unregister(ctrlpriv->jrpdev[ring]);
-	}
+	for (ring = 0; ring < ctrlpriv->total_jobrs; ring++)
+		of_device_unregister(ctrlpriv->jrpdev[ring]);
 
 	/* De-initialize RNG state handles initialized by this driver. */
 	if (ctrlpriv->rng4_sh_init)
@@ -424,7 +421,7 @@ DEFINE_SIMPLE_ATTRIBUTE(caam_fops_u64_ro, caam_debugfs_u64_get, NULL, "%llu\n");
 /* Probe routine for CAAM top (controller) level */
 static int caam_probe(struct platform_device *pdev)
 {
-	int ret, ring, rspec, gen_sk, ent_delay = RTSDCTL_ENT_DLY_MIN;
+	int ret, ring, ridx, rspec, gen_sk, ent_delay = RTSDCTL_ENT_DLY_MIN;
 	u64 caam_id;
 	struct device *dev;
 	struct device_node *nprop, *np;
@@ -587,13 +584,18 @@ static int caam_probe(struct platform_device *pdev)
 			      JRSTART_JR1_START | JRSTART_JR2_START |
 			      JRSTART_JR3_START);
 
-	if (sizeof(dma_addr_t) == sizeof(u64))
+	if (sizeof(dma_addr_t) == sizeof(u64)) {
 		if (of_device_is_compatible(nprop, "fsl,sec-v5.0"))
-			dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40));
+			ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40));
 		else
-			dma_set_mask_and_coherent(dev, DMA_BIT_MASK(36));
-	else
-		dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+			ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(36));
+	} else {
+		ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+	}
+	if (ret) {
+		dev_err(dev, "dma_set_mask_and_coherent failed (%d)\n", ret);
+		goto iounmap_ctrl;
+	}
 
 	/*
 	 * Detect and enable JobRs
@@ -614,6 +616,7 @@ static int caam_probe(struct platform_device *pdev)
 	}
 
 	ring = 0;
+	ridx = 0;
 	ctrlpriv->total_jobrs = 0;
 	for_each_available_child_of_node(nprop, np)
 		if (of_device_is_compatible(np, "fsl,sec-v4.0-job-ring") ||
@@ -621,17 +624,19 @@ static int caam_probe(struct platform_device *pdev)
 			ctrlpriv->jrpdev[ring] =
 				of_platform_device_create(np, NULL, dev);
 			if (!ctrlpriv->jrpdev[ring]) {
-				pr_warn("JR%d Platform device creation error\n",
-					ring);
+				pr_warn("JR physical index %d: Platform device creation error\n",
+					ridx);
+				ridx++;
 				continue;
 			}
 			ctrlpriv->jr[ring] = (struct caam_job_ring __iomem __force *)
 					     ((__force uint8_t *)ctrl +
-					     (ring + JR_BLOCK_NUMBER) *
+					     (ridx + JR_BLOCK_NUMBER) *
 					      BLOCK_OFFSET
 					     );
 			ctrlpriv->total_jobrs++;
 			ring++;
+			ridx++;
 	}
 
 	/* Check to see if QI present. If so, enable */
diff --git a/drivers/crypto/caam/error.c b/drivers/crypto/caam/error.c
index 79a0cc70717f..6f44ccb55c63 100644
--- a/drivers/crypto/caam/error.c
+++ b/drivers/crypto/caam/error.c
@@ -6,9 +6,7 @@
 
 #include "compat.h"
 #include "regs.h"
-#include "intern.h"
 #include "desc.h"
-#include "jr.h"
 #include "error.h"
 
 static const struct {
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index c8604dfadbf5..27631000b9f8 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -498,13 +498,22 @@ static int caam_jr_probe(struct platform_device *pdev)
 
 	jrpriv->rregs = (struct caam_job_ring __iomem __force *)ctrl;
 
-	if (sizeof(dma_addr_t) == sizeof(u64))
+	if (sizeof(dma_addr_t) == sizeof(u64)) {
 		if (of_device_is_compatible(nprop, "fsl,sec-v5.0-job-ring"))
-			dma_set_mask_and_coherent(jrdev, DMA_BIT_MASK(40));
+			error = dma_set_mask_and_coherent(jrdev,
+							  DMA_BIT_MASK(40));
 		else
-			dma_set_mask_and_coherent(jrdev, DMA_BIT_MASK(36));
-	else
-		dma_set_mask_and_coherent(jrdev, DMA_BIT_MASK(32));
+			error = dma_set_mask_and_coherent(jrdev,
+							  DMA_BIT_MASK(36));
+	} else {
+		error = dma_set_mask_and_coherent(jrdev, DMA_BIT_MASK(32));
+	}
+	if (error) {
+		dev_err(jrdev, "dma_set_mask_and_coherent failed (%d)\n",
+			error);
+		iounmap(ctrl);
+		return error;
+	}
 
 	/* Identify the interrupt */
 	jrpriv->irq = irq_of_parse_and_map(nprop, 0);
diff --git a/drivers/crypto/caam/sg_sw_sec4.h b/drivers/crypto/caam/sg_sw_sec4.h
index 6afa20c4a013..c6adad09c972 100644
--- a/drivers/crypto/caam/sg_sw_sec4.h
+++ b/drivers/crypto/caam/sg_sw_sec4.h
@@ -73,14 +73,3 @@ static inline struct sec4_sg_entry *sg_to_sec4_sg_len(
 	} while (total);
 	return sec4_sg_ptr - 1;
 }
-
-/* derive number of elements in scatterlist, but return 0 for 1 */
-static inline int sg_count(struct scatterlist *sg_list, int nbytes)
-{
-	int sg_nents = sg_nents_for_len(sg_list, nbytes);
-
-	if (likely(sg_nents == 1))
-		return 0;
-
-	return sg_nents;
-}
diff --git a/drivers/crypto/cavium/cpt/Kconfig b/drivers/crypto/cavium/cpt/Kconfig
new file mode 100644
index 000000000000..cbd51b1aa046
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/Kconfig
@@ -0,0 +1,17 @@
+#
+# Cavium crypto device configuration
+#
+
+config CRYPTO_DEV_CPT
+	tristate
+
+config CAVIUM_CPT
+	tristate "Cavium Cryptographic Accelerator driver"
+	depends on ARCH_THUNDER || COMPILE_TEST
+	depends on PCI_MSI && 64BIT
+	select CRYPTO_DEV_CPT
+	help
+	  Support for Cavium CPT block found in octeon-tx series of
+	  processors.
+
+	  To compile this as a module, choose M here.
diff --git a/drivers/crypto/cavium/cpt/Makefile b/drivers/crypto/cavium/cpt/Makefile
new file mode 100644
index 000000000000..dbf055e14622
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_CAVIUM_CPT) += cptpf.o cptvf.o
+cptpf-objs := cptpf_main.o cptpf_mbox.o
+cptvf-objs := cptvf_main.o cptvf_reqmanager.o cptvf_mbox.o cptvf_algs.o
diff --git a/drivers/crypto/cavium/cpt/cpt_common.h b/drivers/crypto/cavium/cpt/cpt_common.h
new file mode 100644
index 000000000000..225078d03773
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cpt_common.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __CPT_COMMON_H
+#define __CPT_COMMON_H
+
+#include <asm/byteorder.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+
+#include "cpt_hw_types.h"
+
+/* Device ID */
+#define CPT_81XX_PCI_PF_DEVICE_ID 0xa040
+#define CPT_81XX_PCI_VF_DEVICE_ID 0xa041
+
+/* flags to indicate the features supported */
+#define CPT_FLAG_SRIOV_ENABLED BIT(1)
+#define CPT_FLAG_VF_DRIVER BIT(2)
+#define CPT_FLAG_DEVICE_READY BIT(3)
+
+#define cpt_sriov_enabled(cpt) ((cpt)->flags & CPT_FLAG_SRIOV_ENABLED)
+#define cpt_vf_driver(cpt) ((cpt)->flags & CPT_FLAG_VF_DRIVER)
+#define cpt_device_ready(cpt) ((cpt)->flags & CPT_FLAG_DEVICE_READY)
+
+#define CPT_MBOX_MSG_TYPE_ACK 1
+#define CPT_MBOX_MSG_TYPE_NACK 2
+#define CPT_MBOX_MSG_TIMEOUT 2000
+#define VF_STATE_DOWN 0
+#define VF_STATE_UP 1
+
+/*
+ * CPT Registers map for 81xx
+ */
+
+/* PF registers */
+#define CPTX_PF_CONSTANTS(a) (0x0ll + ((u64)(a) << 36))
+#define CPTX_PF_RESET(a) (0x100ll + ((u64)(a) << 36))
+#define CPTX_PF_DIAG(a) (0x120ll + ((u64)(a) << 36))
+#define CPTX_PF_BIST_STATUS(a) (0x160ll + ((u64)(a) << 36))
+#define CPTX_PF_ECC0_CTL(a) (0x200ll + ((u64)(a) << 36))
+#define CPTX_PF_ECC0_FLIP(a) (0x210ll + ((u64)(a) << 36))
+#define CPTX_PF_ECC0_INT(a) (0x220ll + ((u64)(a) << 36))
+#define CPTX_PF_ECC0_INT_W1S(a) (0x230ll + ((u64)(a) << 36))
+#define CPTX_PF_ECC0_ENA_W1S(a)	(0x240ll + ((u64)(a) << 36))
+#define CPTX_PF_ECC0_ENA_W1C(a)	(0x250ll + ((u64)(a) << 36))
+#define CPTX_PF_MBOX_INTX(a, b)	\
+	(0x400ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_MBOX_INT_W1SX(a, b) \
+	(0x420ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_MBOX_ENA_W1CX(a, b) \
+	(0x440ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_MBOX_ENA_W1SX(a, b) \
+	(0x460ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_EXEC_INT(a) (0x500ll + 0x1000000000ll * ((a) & 0x1))
+#define CPTX_PF_EXEC_INT_W1S(a)	(0x520ll + ((u64)(a) << 36))
+#define CPTX_PF_EXEC_ENA_W1C(a)	(0x540ll + ((u64)(a) << 36))
+#define CPTX_PF_EXEC_ENA_W1S(a)	(0x560ll + ((u64)(a) << 36))
+#define CPTX_PF_GX_EN(a, b) \
+	(0x600ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_EXEC_INFO(a) (0x700ll + ((u64)(a) << 36))
+#define CPTX_PF_EXEC_BUSY(a) (0x800ll + ((u64)(a) << 36))
+#define CPTX_PF_EXEC_INFO0(a) (0x900ll + ((u64)(a) << 36))
+#define CPTX_PF_EXEC_INFO1(a) (0x910ll + ((u64)(a) << 36))
+#define CPTX_PF_INST_REQ_PC(a) (0x10000ll + ((u64)(a) << 36))
+#define CPTX_PF_INST_LATENCY_PC(a) \
+	(0x10020ll + ((u64)(a) << 36))
+#define CPTX_PF_RD_REQ_PC(a) (0x10040ll + ((u64)(a) << 36))
+#define CPTX_PF_RD_LATENCY_PC(a) (0x10060ll + ((u64)(a) << 36))
+#define CPTX_PF_RD_UC_PC(a) (0x10080ll + ((u64)(a) << 36))
+#define CPTX_PF_ACTIVE_CYCLES_PC(a) (0x10100ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_CTL(a) (0x4000000ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_STATUS(a) (0x4000008ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_CLK(a) (0x4000010ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_DBG_CTL(a) (0x4000018ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_DBG_DATA(a)	(0x4000020ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_BIST_STATUS(a) (0x4000028ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_REQ_TIMER(a) (0x4000030ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_MEM_CTL(a) (0x4000038ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_PERF_CTL(a)	(0x4001000ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_DBG_CNTX(a, b) \
+	(0x4001100ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_EXE_PERF_EVENT_CNT(a) (0x4001180ll + ((u64)(a) << 36))
+#define CPTX_PF_EXE_EPCI_INBX_CNT(a, b) \
+	(0x4001200ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_EXE_EPCI_OUTBX_CNT(a, b) \
+	(0x4001240ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_ENGX_UCODE_BASE(a, b) \
+	(0x4002000ll + ((u64)(a) << 36) + ((b) << 3))
+#define CPTX_PF_QX_CTL(a, b) \
+	(0x8000000ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_PF_QX_GMCTL(a, b) \
+	(0x8000020ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_PF_QX_CTL2(a, b) \
+	(0x8000100ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_PF_VFX_MBOXX(a, b, c) \
+	(0x8001000ll + ((u64)(a) << 36) + ((b) << 20) + ((c) << 8))
+
+/* VF registers */
+#define CPTX_VQX_CTL(a, b) (0x100ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_SADDR(a, b) (0x200ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE_WAIT(a, b) (0x400ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_INPROG(a, b) (0x410ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE(a, b) (0x420ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE_ACK(a, b) (0x440ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE_INT_W1S(a, b) (0x460ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE_INT_W1C(a, b) (0x468ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE_ENA_W1S(a, b) (0x470ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DONE_ENA_W1C(a, b) (0x478ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_MISC_INT(a, b)	(0x500ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_MISC_INT_W1S(a, b) (0x508ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_MISC_ENA_W1S(a, b) (0x510ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_MISC_ENA_W1C(a, b) (0x518ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VQX_DOORBELL(a, b) (0x600ll + ((u64)(a) << 36) + ((b) << 20))
+#define CPTX_VFX_PF_MBOXX(a, b, c) \
+	(0x1000ll + ((u64)(a) << 36) + ((b) << 20) + ((c) << 3))
+
+enum vftype {
+	AE_TYPES = 1,
+	SE_TYPES = 2,
+	BAD_CPT_TYPES,
+};
+
+/* Max CPT devices supported */
+enum cpt_mbox_opcode {
+	CPT_MSG_VF_UP = 1,
+	CPT_MSG_VF_DOWN,
+	CPT_MSG_READY,
+	CPT_MSG_QLEN,
+	CPT_MSG_QBIND_GRP,
+	CPT_MSG_VQ_PRIORITY,
+};
+
+/* CPT mailbox structure */
+struct cpt_mbox {
+	u64 msg; /* Message type MBOX[0] */
+	u64 data;/* Data         MBOX[1] */
+};
+
+/* Register read/write APIs */
+static inline void cpt_write_csr64(u8 __iomem *hw_addr, u64 offset,
+				   u64 val)
+{
+	writeq(val, hw_addr + offset);
+}
+
+static inline u64 cpt_read_csr64(u8 __iomem *hw_addr, u64 offset)
+{
+	return readq(hw_addr + offset);
+}
+#endif /* __CPT_COMMON_H */
diff --git a/drivers/crypto/cavium/cpt/cpt_hw_types.h b/drivers/crypto/cavium/cpt/cpt_hw_types.h
new file mode 100644
index 000000000000..279669494196
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cpt_hw_types.h
@@ -0,0 +1,658 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __CPT_HW_TYPES_H
+#define __CPT_HW_TYPES_H
+
+#include "cpt_common.h"
+
+/**
+ * Enumeration cpt_comp_e
+ *
+ * CPT Completion Enumeration
+ * Enumerates the values of CPT_RES_S[COMPCODE].
+ */
+enum cpt_comp_e {
+	CPT_COMP_E_NOTDONE = 0x00,
+	CPT_COMP_E_GOOD = 0x01,
+	CPT_COMP_E_FAULT = 0x02,
+	CPT_COMP_E_SWERR = 0x03,
+	CPT_COMP_E_LAST_ENTRY = 0xFF
+};
+
+/**
+ * Structure cpt_inst_s
+ *
+ * CPT Instruction Structure
+ * This structure specifies the instruction layout. Instructions are
+ * stored in memory as little-endian unless CPT()_PF_Q()_CTL[INST_BE] is set.
+ * cpt_inst_s_s
+ * Word 0
+ * doneint:1 Done interrupt.
+ *	0 = No interrupts related to this instruction.
+ *	1 = When the instruction completes, CPT()_VQ()_DONE[DONE] will be
+ *	incremented,and based on the rules described there an interrupt may
+ *	occur.
+ * Word 1
+ * res_addr [127: 64] Result IOVA.
+ *	If nonzero, specifies where to write CPT_RES_S.
+ *	If zero, no result structure will be written.
+ *	Address must be 16-byte aligned.
+ *	Bits <63:49> are ignored by hardware; software should use a
+ *	sign-extended bit <48> for forward compatibility.
+ * Word 2
+ *  grp:10 [171:162] If [WQ_PTR] is nonzero, the SSO guest-group to use when
+ *	CPT submits work SSO.
+ *	For the SSO to not discard the add-work request, FPA_PF_MAP() must map
+ *	[GRP] and CPT()_PF_Q()_GMCTL[GMID] as valid.
+ *  tt:2 [161:160] If [WQ_PTR] is nonzero, the SSO tag type to use when CPT
+ *	submits work to SSO
+ *  tag:32 [159:128] If [WQ_PTR] is nonzero, the SSO tag to use when CPT
+ *	submits work to SSO.
+ * Word 3
+ *  wq_ptr [255:192] If [WQ_PTR] is nonzero, it is a pointer to a
+ *	work-queue entry that CPT submits work to SSO after all context,
+ *	output data, and result write operations are visible to other
+ *	CNXXXX units and the cores. Bits <2:0> must be zero.
+ *	Bits <63:49> are ignored by hardware; software should
+ *	use a sign-extended bit <48> for forward compatibility.
+ *	Internal:
+ *	Bits <63:49>, <2:0> are ignored by hardware, treated as always 0x0.
+ * Word 4
+ *  ei0; [319:256] Engine instruction word 0. Passed to the AE/SE.
+ * Word 5
+ *  ei1; [383:320] Engine instruction word 1. Passed to the AE/SE.
+ * Word 6
+ *  ei2; [447:384] Engine instruction word 1. Passed to the AE/SE.
+ * Word 7
+ *  ei3; [511:448] Engine instruction word 1. Passed to the AE/SE.
+ *
+ */
+union cpt_inst_s {
+	u64 u[8];
+	struct cpt_inst_s_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_17_63:47;
+		u64 doneint:1;
+		u64 reserved_0_1:16;
+#else /* Word 0 - Little Endian */
+		u64 reserved_0_15:16;
+		u64 doneint:1;
+		u64 reserved_17_63:47;
+#endif /* Word 0 - End */
+		u64 res_addr;
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 2 - Big Endian */
+		u64 reserved_172_19:20;
+		u64 grp:10;
+		u64 tt:2;
+		u64 tag:32;
+#else /* Word 2 - Little Endian */
+		u64 tag:32;
+		u64 tt:2;
+		u64 grp:10;
+		u64 reserved_172_191:20;
+#endif /* Word 2 - End */
+		u64 wq_ptr;
+		u64 ei0;
+		u64 ei1;
+		u64 ei2;
+		u64 ei3;
+	} s;
+};
+
+/**
+ * Structure cpt_res_s
+ *
+ * CPT Result Structure
+ * The CPT coprocessor writes the result structure after it completes a
+ * CPT_INST_S instruction. The result structure is exactly 16 bytes, and
+ * each instruction completion produces exactly one result structure.
+ *
+ * This structure is stored in memory as little-endian unless
+ * CPT()_PF_Q()_CTL[INST_BE] is set.
+ * cpt_res_s_s
+ * Word 0
+ *  doneint:1 [16:16] Done interrupt. This bit is copied from the
+ *	corresponding instruction's CPT_INST_S[DONEINT].
+ *  compcode:8 [7:0] Indicates completion/error status of the CPT coprocessor
+ *	for the	associated instruction, as enumerated by CPT_COMP_E.
+ *	Core software may write the memory location containing [COMPCODE] to
+ *	0x0 before ringing the doorbell, and then poll for completion by
+ *	checking for a nonzero value.
+ *	Once the core observes a nonzero [COMPCODE] value in this case,the CPT
+ *	coprocessor will have also completed L2/DRAM write operations.
+ * Word 1
+ *  reserved
+ *
+ */
+union cpt_res_s {
+	u64 u[2];
+	struct cpt_res_s_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_17_63:47;
+		u64 doneint:1;
+		u64 reserved_8_15:8;
+		u64 compcode:8;
+#else /* Word 0 - Little Endian */
+		u64 compcode:8;
+		u64 reserved_8_15:8;
+		u64 doneint:1;
+		u64 reserved_17_63:47;
+#endif /* Word 0 - End */
+		u64 reserved_64_127;
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_pf_bist_status
+ *
+ * CPT PF Control Bist Status Register
+ * This register has the BIST status of memories. Each bit is the BIST result
+ * of an individual memory (per bit, 0 = pass and 1 = fail).
+ * cptx_pf_bist_status_s
+ * Word0
+ *  bstatus [29:0](RO/H) BIST status. One bit per memory, enumerated by
+ *	CPT_RAMS_E.
+ */
+union cptx_pf_bist_status {
+	u64 u;
+	struct cptx_pf_bist_status_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_30_63:34;
+		u64 bstatus:30;
+#else /* Word 0 - Little Endian */
+		u64 bstatus:30;
+		u64 reserved_30_63:34;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_pf_constants
+ *
+ * CPT PF Constants Register
+ * This register contains implementation-related parameters of CPT in CNXXXX.
+ * cptx_pf_constants_s
+ * Word 0
+ *  reserved_40_63:24 [63:40] Reserved.
+ *  epcis:8 [39:32](RO) Number of EPCI busses.
+ *  grps:8 [31:24](RO) Number of engine groups implemented.
+ *  ae:8 [23:16](RO/H) Number of AEs. In CNXXXX, for CPT0 returns 0x0,
+ *	for CPT1 returns 0x18, or less if there are fuse-disables.
+ *  se:8 [15:8](RO/H) Number of SEs. In CNXXXX, for CPT0 returns 0x30,
+ *	or less if there are fuse-disables, for CPT1 returns 0x0.
+ *  vq:8 [7:0](RO) Number of VQs.
+ */
+union cptx_pf_constants {
+	u64 u;
+	struct cptx_pf_constants_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_40_63:24;
+		u64 epcis:8;
+		u64 grps:8;
+		u64 ae:8;
+		u64 se:8;
+		u64 vq:8;
+#else /* Word 0 - Little Endian */
+		u64 vq:8;
+		u64 se:8;
+		u64 ae:8;
+		u64 grps:8;
+		u64 epcis:8;
+		u64 reserved_40_63:24;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_pf_exe_bist_status
+ *
+ * CPT PF Engine Bist Status Register
+ * This register has the BIST status of each engine.  Each bit is the
+ * BIST result of an individual engine (per bit, 0 = pass and 1 = fail).
+ * cptx_pf_exe_bist_status_s
+ * Word0
+ *  reserved_48_63:16 [63:48] reserved
+ *  bstatus:48 [47:0](RO/H) BIST status. One bit per engine.
+ *
+ */
+union cptx_pf_exe_bist_status {
+	u64 u;
+	struct cptx_pf_exe_bist_status_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_48_63:16;
+		u64 bstatus:48;
+#else /* Word 0 - Little Endian */
+		u64 bstatus:48;
+		u64 reserved_48_63:16;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_pf_q#_ctl
+ *
+ * CPT Queue Control Register
+ * This register configures queues. This register should be changed only
+ * when quiescent (see CPT()_VQ()_INPROG[INFLIGHT]).
+ * cptx_pf_qx_ctl_s
+ * Word0
+ *  reserved_60_63:4 [63:60] reserved.
+ *  aura:12; [59:48](R/W) Guest-aura for returning this queue's
+ *	instruction-chunk buffers to FPA. Only used when [INST_FREE] is set.
+ *	For the FPA to not discard the request, FPA_PF_MAP() must map
+ *	[AURA] and CPT()_PF_Q()_GMCTL[GMID] as valid.
+ *  reserved_45_47:3 [47:45] reserved.
+ *  size:13 [44:32](R/W) Command-buffer size, in number of 64-bit words per
+ *	command buffer segment. Must be 8*n + 1, where n is the number of
+ *	instructions per buffer segment.
+ *  reserved_11_31:21 [31:11] Reserved.
+ *  cont_err:1 [10:10](R/W) Continue on error.
+ *	0 = When CPT()_VQ()_MISC_INT[NWRP], CPT()_VQ()_MISC_INT[IRDE] or
+ *	CPT()_VQ()_MISC_INT[DOVF] are set by hardware or software via
+ *	CPT()_VQ()_MISC_INT_W1S, then CPT()_VQ()_CTL[ENA] is cleared.  Due to
+ *	pipelining, additional instructions may have been processed between the
+ *	instruction causing the error and the next instruction in the disabled
+ *	queue (the instruction at CPT()_VQ()_SADDR).
+ *	1 = Ignore errors and continue processing instructions.
+ *	For diagnostic use only.
+ *  inst_free:1 [9:9](R/W) Instruction FPA free. When set, when CPT reaches the
+ *	end of an instruction chunk, that chunk will be freed to the FPA.
+ *  inst_be:1 [8:8](R/W) Instruction big-endian control. When set, instructions,
+ *	instruction next chunk pointers, and result structures are stored in
+ *	big-endian format in memory.
+ *  iqb_ldwb:1 [7:7](R/W) Instruction load don't write back.
+ *	0 = The hardware issues NCB transient load (LDT) towards the cache,
+ *	which if the line hits and is is dirty will cause the line to be
+ *	written back before being replaced.
+ *	1 = The hardware issues NCB LDWB read-and-invalidate command towards
+ *	the cache when fetching the last word of instructions; as a result the
+ *	line will not be written back when replaced.  This improves
+ *	performance, but software must not read the instructions after they are
+ *	posted to the hardware.	Reads that do not consume the last word of a
+ *	cache line always use LDI.
+ *  reserved_4_6:3 [6:4] Reserved.
+ *  grp:3; [3:1](R/W) Engine group.
+ *  pri:1; [0:0](R/W) Queue priority.
+ *	1 = This queue has higher priority. Round-robin between higher
+ *	priority queues.
+ *	0 = This queue has lower priority. Round-robin between lower
+ *	priority queues.
+ */
+union cptx_pf_qx_ctl {
+	u64 u;
+	struct cptx_pf_qx_ctl_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_60_63:4;
+		u64 aura:12;
+		u64 reserved_45_47:3;
+		u64 size:13;
+		u64 reserved_11_31:21;
+		u64 cont_err:1;
+		u64 inst_free:1;
+		u64 inst_be:1;
+		u64 iqb_ldwb:1;
+		u64 reserved_4_6:3;
+		u64 grp:3;
+		u64 pri:1;
+#else /* Word 0 - Little Endian */
+		u64 pri:1;
+		u64 grp:3;
+		u64 reserved_4_6:3;
+		u64 iqb_ldwb:1;
+		u64 inst_be:1;
+		u64 inst_free:1;
+		u64 cont_err:1;
+		u64 reserved_11_31:21;
+		u64 size:13;
+		u64 reserved_45_47:3;
+		u64 aura:12;
+		u64 reserved_60_63:4;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_saddr
+ *
+ * CPT Queue Starting Buffer Address Registers
+ * These registers set the instruction buffer starting address.
+ * cptx_vqx_saddr_s
+ * Word0
+ *  reserved_49_63:15 [63:49] Reserved.
+ *  ptr:43 [48:6](R/W/H) Instruction buffer IOVA <48:6> (64-byte aligned).
+ *	When written, it is the initial buffer starting address; when read,
+ *	it is the next read pointer to be requested from L2C. The PTR field
+ *	is overwritten with the next pointer each time that the command buffer
+ *	segment is exhausted. New commands will then be read from the newly
+ *	specified command buffer pointer.
+ *  reserved_0_5:6 [5:0] Reserved.
+ *
+ */
+union cptx_vqx_saddr {
+	u64 u;
+	struct cptx_vqx_saddr_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_49_63:15;
+		u64 ptr:43;
+		u64 reserved_0_5:6;
+#else /* Word 0 - Little Endian */
+		u64 reserved_0_5:6;
+		u64 ptr:43;
+		u64 reserved_49_63:15;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_misc_ena_w1s
+ *
+ * CPT Queue Misc Interrupt Enable Set Register
+ * This register sets interrupt enable bits.
+ * cptx_vqx_misc_ena_w1s_s
+ * Word0
+ * reserved_5_63:59 [63:5] Reserved.
+ * swerr:1 [4:4](R/W1S/H) Reads or sets enable for
+ *	CPT(0..1)_VQ(0..63)_MISC_INT[SWERR].
+ * nwrp:1 [3:3](R/W1S/H) Reads or sets enable for
+ *	CPT(0..1)_VQ(0..63)_MISC_INT[NWRP].
+ * irde:1 [2:2](R/W1S/H) Reads or sets enable for
+ *	CPT(0..1)_VQ(0..63)_MISC_INT[IRDE].
+ * dovf:1 [1:1](R/W1S/H) Reads or sets enable for
+ *	CPT(0..1)_VQ(0..63)_MISC_INT[DOVF].
+ * mbox:1 [0:0](R/W1S/H) Reads or sets enable for
+ *	CPT(0..1)_VQ(0..63)_MISC_INT[MBOX].
+ *
+ */
+union cptx_vqx_misc_ena_w1s {
+	u64 u;
+	struct cptx_vqx_misc_ena_w1s_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_5_63:59;
+		u64 swerr:1;
+		u64 nwrp:1;
+		u64 irde:1;
+		u64 dovf:1;
+		u64 mbox:1;
+#else /* Word 0 - Little Endian */
+		u64 mbox:1;
+		u64 dovf:1;
+		u64 irde:1;
+		u64 nwrp:1;
+		u64 swerr:1;
+		u64 reserved_5_63:59;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_doorbell
+ *
+ * CPT Queue Doorbell Registers
+ * Doorbells for the CPT instruction queues.
+ * cptx_vqx_doorbell_s
+ * Word0
+ *  reserved_20_63:44 [63:20] Reserved.
+ *  dbell_cnt:20 [19:0](R/W/H) Number of instruction queue 64-bit words to add
+ *	to the CPT instruction doorbell count. Readback value is the the
+ *	current number of pending doorbell requests. If counter overflows
+ *	CPT()_VQ()_MISC_INT[DBELL_DOVF] is set. To reset the count back to
+ *	zero, write one to clear CPT()_VQ()_MISC_INT_ENA_W1C[DBELL_DOVF],
+ *	then write a value of 2^20 minus the read [DBELL_CNT], then write one
+ *	to CPT()_VQ()_MISC_INT_W1C[DBELL_DOVF] and
+ *	CPT()_VQ()_MISC_INT_ENA_W1S[DBELL_DOVF]. Must be a multiple of 8.
+ *	All CPT instructions are 8 words and require a doorbell count of
+ *	multiple of 8.
+ */
+union cptx_vqx_doorbell {
+	u64 u;
+	struct cptx_vqx_doorbell_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_20_63:44;
+		u64 dbell_cnt:20;
+#else /* Word 0 - Little Endian */
+		u64 dbell_cnt:20;
+		u64 reserved_20_63:44;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_inprog
+ *
+ * CPT Queue In Progress Count Registers
+ * These registers contain the per-queue instruction in flight registers.
+ * cptx_vqx_inprog_s
+ * Word0
+ *  reserved_8_63:56 [63:8] Reserved.
+ *  inflight:8 [7:0](RO/H) Inflight count. Counts the number of instructions
+ *	for the VF for which CPT is fetching, executing or responding to
+ *	instructions. However this does not include any interrupts that are
+ *	awaiting software handling (CPT()_VQ()_DONE[DONE] != 0x0).
+ *	A queue may not be reconfigured until:
+ *	1. CPT()_VQ()_CTL[ENA] is cleared by software.
+ *	2. [INFLIGHT] is polled until equals to zero.
+ */
+union cptx_vqx_inprog {
+	u64 u;
+	struct cptx_vqx_inprog_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_8_63:56;
+		u64 inflight:8;
+#else /* Word 0 - Little Endian */
+		u64 inflight:8;
+		u64 reserved_8_63:56;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_misc_int
+ *
+ * CPT Queue Misc Interrupt Register
+ * These registers contain the per-queue miscellaneous interrupts.
+ * cptx_vqx_misc_int_s
+ * Word 0
+ *  reserved_5_63:59 [63:5] Reserved.
+ *  swerr:1 [4:4](R/W1C/H) Software error from engines.
+ *  nwrp:1  [3:3](R/W1C/H) NCB result write response error.
+ *  irde:1  [2:2](R/W1C/H) Instruction NCB read response error.
+ *  dovf:1 [1:1](R/W1C/H) Doorbell overflow.
+ *  mbox:1 [0:0](R/W1C/H) PF to VF mailbox interrupt. Set when
+ *	CPT()_VF()_PF_MBOX(0) is written.
+ *
+ */
+union cptx_vqx_misc_int {
+	u64 u;
+	struct cptx_vqx_misc_int_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_5_63:59;
+		u64 swerr:1;
+		u64 nwrp:1;
+		u64 irde:1;
+		u64 dovf:1;
+		u64 mbox:1;
+#else /* Word 0 - Little Endian */
+		u64 mbox:1;
+		u64 dovf:1;
+		u64 irde:1;
+		u64 nwrp:1;
+		u64 swerr:1;
+		u64 reserved_5_63:59;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_done_ack
+ *
+ * CPT Queue Done Count Ack Registers
+ * This register is written by software to acknowledge interrupts.
+ * cptx_vqx_done_ack_s
+ * Word0
+ *  reserved_20_63:44 [63:20] Reserved.
+ *  done_ack:20 [19:0](R/W/H) Number of decrements to CPT()_VQ()_DONE[DONE].
+ *	Reads CPT()_VQ()_DONE[DONE]. Written by software to acknowledge
+ *	interrupts. If CPT()_VQ()_DONE[DONE] is still nonzero the interrupt
+ *	will be re-sent if the conditions described in CPT()_VQ()_DONE[DONE]
+ *	are satisfied.
+ *
+ */
+union cptx_vqx_done_ack {
+	u64 u;
+	struct cptx_vqx_done_ack_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_20_63:44;
+		u64 done_ack:20;
+#else /* Word 0 - Little Endian */
+		u64 done_ack:20;
+		u64 reserved_20_63:44;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_done
+ *
+ * CPT Queue Done Count Registers
+ * These registers contain the per-queue instruction done count.
+ * cptx_vqx_done_s
+ * Word0
+ *  reserved_20_63:44 [63:20] Reserved.
+ *  done:20 [19:0](R/W/H) Done count. When CPT_INST_S[DONEINT] set and that
+ *	instruction completes, CPT()_VQ()_DONE[DONE] is incremented when the
+ *	instruction finishes. Write to this field are for diagnostic use only;
+ *	instead software writes CPT()_VQ()_DONE_ACK with the number of
+ *	decrements for this field.
+ *	Interrupts are sent as follows:
+ *	* When CPT()_VQ()_DONE[DONE] = 0, then no results are pending, the
+ *	interrupt coalescing timer is held to zero, and an interrupt is not
+ *	sent.
+ *	* When CPT()_VQ()_DONE[DONE] != 0, then the interrupt coalescing timer
+ *	counts. If the counter is >= CPT()_VQ()_DONE_WAIT[TIME_WAIT]*1024, or
+ *	CPT()_VQ()_DONE[DONE] >= CPT()_VQ()_DONE_WAIT[NUM_WAIT], i.e. enough
+ *	time has passed or enough results have arrived, then the interrupt is
+ *	sent.
+ *	* When CPT()_VQ()_DONE_ACK is written (or CPT()_VQ()_DONE is written
+ *	but this is not typical), the interrupt coalescing timer restarts.
+ *	Note after decrementing this interrupt equation is recomputed,
+ *	for example if CPT()_VQ()_DONE[DONE] >= CPT()_VQ()_DONE_WAIT[NUM_WAIT]
+ *	and because the timer is zero, the interrupt will be resent immediately.
+ *	(This covers the race case between software acknowledging an interrupt
+ *	and a result returning.)
+ *	* When CPT()_VQ()_DONE_ENA_W1S[DONE] = 0, interrupts are not sent,
+ *	but the counting described above still occurs.
+ *	Since CPT instructions complete out-of-order, if software is using
+ *	completion interrupts the suggested scheme is to request a DONEINT on
+ *	each request, and when an interrupt arrives perform a "greedy" scan for
+ *	completions; even if a later command is acknowledged first this will
+ *	not result in missing a completion.
+ *	Software is responsible for making sure [DONE] does not overflow;
+ *	for example by insuring there are not more than 2^20-1 instructions in
+ *	flight that may request interrupts.
+ *
+ */
+union cptx_vqx_done {
+	u64 u;
+	struct cptx_vqx_done_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_20_63:44;
+		u64 done:20;
+#else /* Word 0 - Little Endian */
+		u64 done:20;
+		u64 reserved_20_63:44;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_done_wait
+ *
+ * CPT Queue Done Interrupt Coalescing Wait Registers
+ * Specifies the per queue interrupt coalescing settings.
+ * cptx_vqx_done_wait_s
+ * Word0
+ *  reserved_48_63:16 [63:48] Reserved.
+ *  time_wait:16; [47:32](R/W) Time hold-off. When CPT()_VQ()_DONE[DONE] = 0
+ *	or CPT()_VQ()_DONE_ACK is written a timer is cleared. When the timer
+ *	reaches [TIME_WAIT]*1024 then interrupt coalescing ends.
+ *	see CPT()_VQ()_DONE[DONE]. If 0x0, time coalescing is disabled.
+ *  reserved_20_31:12 [31:20] Reserved.
+ *  num_wait:20 [19:0](R/W) Number of messages hold-off.
+ *	When CPT()_VQ()_DONE[DONE] >= [NUM_WAIT] then interrupt coalescing ends
+ *	see CPT()_VQ()_DONE[DONE]. If 0x0, same behavior as 0x1.
+ *
+ */
+union cptx_vqx_done_wait {
+	u64 u;
+	struct cptx_vqx_done_wait_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_48_63:16;
+		u64 time_wait:16;
+		u64 reserved_20_31:12;
+		u64 num_wait:20;
+#else /* Word 0 - Little Endian */
+		u64 num_wait:20;
+		u64 reserved_20_31:12;
+		u64 time_wait:16;
+		u64 reserved_48_63:16;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_done_ena_w1s
+ *
+ * CPT Queue Done Interrupt Enable Set Registers
+ * Write 1 to these registers will enable the DONEINT interrupt for the queue.
+ * cptx_vqx_done_ena_w1s_s
+ * Word0
+ *  reserved_1_63:63 [63:1] Reserved.
+ *  done:1 [0:0](R/W1S/H) Write 1 will enable DONEINT for this queue.
+ *	Write 0 has no effect. Read will return the enable bit.
+ */
+union cptx_vqx_done_ena_w1s {
+	u64 u;
+	struct cptx_vqx_done_ena_w1s_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_1_63:63;
+		u64 done:1;
+#else /* Word 0 - Little Endian */
+		u64 done:1;
+		u64 reserved_1_63:63;
+#endif /* Word 0 - End */
+	} s;
+};
+
+/**
+ * Register (NCB) cpt#_vq#_ctl
+ *
+ * CPT VF Queue Control Registers
+ * This register configures queues. This register should be changed (other than
+ * clearing [ENA]) only when quiescent (see CPT()_VQ()_INPROG[INFLIGHT]).
+ * cptx_vqx_ctl_s
+ * Word0
+ *  reserved_1_63:63 [63:1] Reserved.
+ *  ena:1 [0:0](R/W/H) Enables the logical instruction queue.
+ *	See also CPT()_PF_Q()_CTL[CONT_ERR] and	CPT()_VQ()_INPROG[INFLIGHT].
+ *	1 = Queue is enabled.
+ *	0 = Queue is disabled.
+ */
+union cptx_vqx_ctl {
+	u64 u;
+	struct cptx_vqx_ctl_s {
+#if defined(__BIG_ENDIAN_BITFIELD) /* Word 0 - Big Endian */
+		u64 reserved_1_63:63;
+		u64 ena:1;
+#else /* Word 0 - Little Endian */
+		u64 ena:1;
+		u64 reserved_1_63:63;
+#endif /* Word 0 - End */
+	} s;
+};
+#endif /*__CPT_HW_TYPES_H*/
diff --git a/drivers/crypto/cavium/cpt/cptpf.h b/drivers/crypto/cavium/cpt/cptpf.h
new file mode 100644
index 000000000000..c0556c5f63c9
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptpf.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __CPTPF_H
+#define __CPTPF_H
+
+#include "cpt_common.h"
+
+#define CSR_DELAY 30
+#define CPT_MAX_CORE_GROUPS 8
+#define CPT_MAX_SE_CORES 10
+#define CPT_MAX_AE_CORES 6
+#define CPT_MAX_TOTAL_CORES (CPT_MAX_SE_CORES + CPT_MAX_AE_CORES)
+#define CPT_MAX_VF_NUM 16
+#define	CPT_PF_MSIX_VECTORS 3
+#define CPT_PF_INT_VEC_E_MBOXX(a) (0x02 + (a))
+#define CPT_UCODE_VERSION_SZ 32
+struct cpt_device;
+
+struct microcode {
+	u8 is_mc_valid;
+	u8 is_ae;
+	u8 group;
+	u8 num_cores;
+	u32 code_size;
+	u64 core_mask;
+	u8 version[CPT_UCODE_VERSION_SZ];
+	/* Base info */
+	dma_addr_t phys_base;
+	void *code;
+};
+
+struct cpt_vf_info {
+	u8 state;
+	u8 priority;
+	u8 id;
+	u32 qlen;
+};
+
+/**
+ * cpt device structure
+ */
+struct cpt_device {
+	u16 flags;	/* Flags to hold device status bits */
+	u8 num_vf_en; /* Number of VFs enabled (0...CPT_MAX_VF_NUM) */
+	struct cpt_vf_info vfinfo[CPT_MAX_VF_NUM]; /* Per VF info */
+
+	void __iomem *reg_base; /* Register start address */
+	struct pci_dev *pdev; /* pci device handle */
+
+	struct microcode mcode[CPT_MAX_CORE_GROUPS];
+	u8 next_mc_idx; /* next microcode index */
+	u8 next_group;
+	u8 max_se_cores;
+	u8 max_ae_cores;
+};
+
+void cpt_mbox_intr_handler(struct cpt_device *cpt, int mbx);
+#endif /* __CPTPF_H */
diff --git a/drivers/crypto/cavium/cpt/cptpf_main.c b/drivers/crypto/cavium/cpt/cptpf_main.c
new file mode 100644
index 000000000000..4119c40e7c4b
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptpf_main.c
@@ -0,0 +1,670 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/firmware.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/printk.h>
+#include <linux/version.h>
+
+#include "cptpf.h"
+
+#define DRV_NAME	"thunder-cpt"
+#define DRV_VERSION	"1.0"
+
+static u32 num_vfs = 4; /* Default 4 VF enabled */
+module_param(num_vfs, uint, 0444);
+MODULE_PARM_DESC(num_vfs, "Number of VFs to enable(1-16)");
+
+/*
+ * Disable cores specified by coremask
+ */
+static void cpt_disable_cores(struct cpt_device *cpt, u64 coremask,
+			      u8 type, u8 grp)
+{
+	u64 pf_exe_ctl;
+	u32 timeout = 100;
+	u64 grpmask = 0;
+	struct device *dev = &cpt->pdev->dev;
+
+	if (type == AE_TYPES)
+		coremask = (coremask << cpt->max_se_cores);
+
+	/* Disengage the cores from groups */
+	grpmask = cpt_read_csr64(cpt->reg_base, CPTX_PF_GX_EN(0, grp));
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_GX_EN(0, grp),
+			(grpmask & ~coremask));
+	udelay(CSR_DELAY);
+	grp = cpt_read_csr64(cpt->reg_base, CPTX_PF_EXEC_BUSY(0));
+	while (grp & coremask) {
+		dev_err(dev, "Cores still busy %llx", coremask);
+		grp = cpt_read_csr64(cpt->reg_base,
+				     CPTX_PF_EXEC_BUSY(0));
+		if (timeout--)
+			break;
+
+		udelay(CSR_DELAY);
+	}
+
+	/* Disable the cores */
+	pf_exe_ctl = cpt_read_csr64(cpt->reg_base, CPTX_PF_EXE_CTL(0));
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_EXE_CTL(0),
+			(pf_exe_ctl & ~coremask));
+	udelay(CSR_DELAY);
+}
+
+/*
+ * Enable cores specified by coremask
+ */
+static void cpt_enable_cores(struct cpt_device *cpt, u64 coremask,
+			     u8 type)
+{
+	u64 pf_exe_ctl;
+
+	if (type == AE_TYPES)
+		coremask = (coremask << cpt->max_se_cores);
+
+	pf_exe_ctl = cpt_read_csr64(cpt->reg_base, CPTX_PF_EXE_CTL(0));
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_EXE_CTL(0),
+			(pf_exe_ctl | coremask));
+	udelay(CSR_DELAY);
+}
+
+static void cpt_configure_group(struct cpt_device *cpt, u8 grp,
+				u64 coremask, u8 type)
+{
+	u64 pf_gx_en = 0;
+
+	if (type == AE_TYPES)
+		coremask = (coremask << cpt->max_se_cores);
+
+	pf_gx_en = cpt_read_csr64(cpt->reg_base, CPTX_PF_GX_EN(0, grp));
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_GX_EN(0, grp),
+			(pf_gx_en | coremask));
+	udelay(CSR_DELAY);
+}
+
+static void cpt_disable_mbox_interrupts(struct cpt_device *cpt)
+{
+	/* Clear mbox(0) interupts for all vfs */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_MBOX_ENA_W1CX(0, 0), ~0ull);
+}
+
+static void cpt_disable_ecc_interrupts(struct cpt_device *cpt)
+{
+	/* Clear ecc(0) interupts for all vfs */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_ECC0_ENA_W1C(0), ~0ull);
+}
+
+static void cpt_disable_exec_interrupts(struct cpt_device *cpt)
+{
+	/* Clear exec interupts for all vfs */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_EXEC_ENA_W1C(0), ~0ull);
+}
+
+static void cpt_disable_all_interrupts(struct cpt_device *cpt)
+{
+	cpt_disable_mbox_interrupts(cpt);
+	cpt_disable_ecc_interrupts(cpt);
+	cpt_disable_exec_interrupts(cpt);
+}
+
+static void cpt_enable_mbox_interrupts(struct cpt_device *cpt)
+{
+	/* Set mbox(0) interupts for all vfs */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_MBOX_ENA_W1SX(0, 0), ~0ull);
+}
+
+static int cpt_load_microcode(struct cpt_device *cpt, struct microcode *mcode)
+{
+	int ret = 0, core = 0, shift = 0;
+	u32 total_cores = 0;
+	struct device *dev = &cpt->pdev->dev;
+
+	if (!mcode || !mcode->code) {
+		dev_err(dev, "Either the mcode is null or data is NULL\n");
+		return -EINVAL;
+	}
+
+	if (mcode->code_size == 0) {
+		dev_err(dev, "microcode size is 0\n");
+		return -EINVAL;
+	}
+
+	/* Assumes 0-9 are SE cores for UCODE_BASE registers and
+	 * AE core bases follow
+	 */
+	if (mcode->is_ae) {
+		core = CPT_MAX_SE_CORES; /* start couting from 10 */
+		total_cores = CPT_MAX_TOTAL_CORES; /* upto 15 */
+	} else {
+		core = 0; /* start couting from 0 */
+		total_cores = CPT_MAX_SE_CORES; /* upto 9 */
+	}
+
+	/* Point to microcode for each core of the group */
+	for (; core < total_cores ; core++, shift++) {
+		if (mcode->core_mask & (1 << shift)) {
+			cpt_write_csr64(cpt->reg_base,
+					CPTX_PF_ENGX_UCODE_BASE(0, core),
+					(u64)mcode->phys_base);
+		}
+	}
+	return ret;
+}
+
+static int do_cpt_init(struct cpt_device *cpt, struct microcode *mcode)
+{
+	int ret = 0;
+	struct device *dev = &cpt->pdev->dev;
+
+	/* Make device not ready */
+	cpt->flags &= ~CPT_FLAG_DEVICE_READY;
+	/* Disable All PF interrupts */
+	cpt_disable_all_interrupts(cpt);
+	/* Calculate mcode group and coremasks */
+	if (mcode->is_ae) {
+		if (mcode->num_cores > cpt->max_ae_cores) {
+			dev_err(dev, "Requested for more cores than available AE cores\n");
+			ret = -EINVAL;
+			goto cpt_init_fail;
+		}
+
+		if (cpt->next_group >= CPT_MAX_CORE_GROUPS) {
+			dev_err(dev, "Can't load, all eight microcode groups in use");
+			return -ENFILE;
+		}
+
+		mcode->group = cpt->next_group;
+		/* Convert requested cores to mask */
+		mcode->core_mask = GENMASK(mcode->num_cores, 0);
+		cpt_disable_cores(cpt, mcode->core_mask, AE_TYPES,
+				  mcode->group);
+		/* Load microcode for AE engines */
+		ret = cpt_load_microcode(cpt, mcode);
+		if (ret) {
+			dev_err(dev, "Microcode load Failed for %s\n",
+				mcode->version);
+			goto cpt_init_fail;
+		}
+		cpt->next_group++;
+		/* Configure group mask for the mcode */
+		cpt_configure_group(cpt, mcode->group, mcode->core_mask,
+				    AE_TYPES);
+		/* Enable AE cores for the group mask */
+		cpt_enable_cores(cpt, mcode->core_mask, AE_TYPES);
+	} else {
+		if (mcode->num_cores > cpt->max_se_cores) {
+			dev_err(dev, "Requested for more cores than available SE cores\n");
+			ret = -EINVAL;
+			goto cpt_init_fail;
+		}
+		if (cpt->next_group >= CPT_MAX_CORE_GROUPS) {
+			dev_err(dev, "Can't load, all eight microcode groups in use");
+			return -ENFILE;
+		}
+
+		mcode->group = cpt->next_group;
+		/* Covert requested cores to mask */
+		mcode->core_mask = GENMASK(mcode->num_cores, 0);
+		cpt_disable_cores(cpt, mcode->core_mask, SE_TYPES,
+				  mcode->group);
+		/* Load microcode for SE engines */
+		ret = cpt_load_microcode(cpt, mcode);
+		if (ret) {
+			dev_err(dev, "Microcode load Failed for %s\n",
+				mcode->version);
+			goto cpt_init_fail;
+		}
+		cpt->next_group++;
+		/* Configure group mask for the mcode */
+		cpt_configure_group(cpt, mcode->group, mcode->core_mask,
+				    SE_TYPES);
+		/* Enable SE cores for the group mask */
+		cpt_enable_cores(cpt, mcode->core_mask, SE_TYPES);
+	}
+
+	/* Enabled PF mailbox interrupts */
+	cpt_enable_mbox_interrupts(cpt);
+	cpt->flags |= CPT_FLAG_DEVICE_READY;
+
+	return ret;
+
+cpt_init_fail:
+	/* Enabled PF mailbox interrupts */
+	cpt_enable_mbox_interrupts(cpt);
+
+	return ret;
+}
+
+struct ucode_header {
+	u8 version[CPT_UCODE_VERSION_SZ];
+	u32 code_length;
+	u32 data_length;
+	u64 sram_address;
+};
+
+static int cpt_ucode_load_fw(struct cpt_device *cpt, const u8 *fw, bool is_ae)
+{
+	const struct firmware *fw_entry;
+	struct device *dev = &cpt->pdev->dev;
+	struct ucode_header *ucode;
+	struct microcode *mcode;
+	int j, ret = 0;
+
+	ret = request_firmware(&fw_entry, fw, dev);
+	if (ret)
+		return ret;
+
+	ucode = (struct ucode_header *)fw_entry->data;
+	mcode = &cpt->mcode[cpt->next_mc_idx];
+	memcpy(mcode->version, (u8 *)fw_entry->data, CPT_UCODE_VERSION_SZ);
+	mcode->code_size = ntohl(ucode->code_length) * 2;
+	if (!mcode->code_size)
+		return -EINVAL;
+
+	mcode->is_ae = is_ae;
+	mcode->core_mask = 0ULL;
+	mcode->num_cores = is_ae ? 6 : 10;
+
+	/*  Allocate DMAable space */
+	mcode->code = dma_zalloc_coherent(&cpt->pdev->dev, mcode->code_size,
+					  &mcode->phys_base, GFP_KERNEL);
+	if (!mcode->code) {
+		dev_err(dev, "Unable to allocate space for microcode");
+		return -ENOMEM;
+	}
+
+	memcpy((void *)mcode->code, (void *)(fw_entry->data + sizeof(*ucode)),
+	       mcode->code_size);
+
+	/* Byte swap 64-bit */
+	for (j = 0; j < (mcode->code_size / 8); j++)
+		((u64 *)mcode->code)[j] = cpu_to_be64(((u64 *)mcode->code)[j]);
+	/*  MC needs 16-bit swap */
+	for (j = 0; j < (mcode->code_size / 2); j++)
+		((u16 *)mcode->code)[j] = cpu_to_be16(((u16 *)mcode->code)[j]);
+
+	dev_dbg(dev, "mcode->code_size = %u\n", mcode->code_size);
+	dev_dbg(dev, "mcode->is_ae = %u\n", mcode->is_ae);
+	dev_dbg(dev, "mcode->num_cores = %u\n", mcode->num_cores);
+	dev_dbg(dev, "mcode->code = %llx\n", (u64)mcode->code);
+	dev_dbg(dev, "mcode->phys_base = %llx\n", mcode->phys_base);
+
+	ret = do_cpt_init(cpt, mcode);
+	if (ret) {
+		dev_err(dev, "do_cpt_init failed with ret: %d\n", ret);
+		return ret;
+	}
+
+	dev_info(dev, "Microcode Loaded %s\n", mcode->version);
+	mcode->is_mc_valid = 1;
+	cpt->next_mc_idx++;
+	release_firmware(fw_entry);
+
+	return ret;
+}
+
+static int cpt_ucode_load(struct cpt_device *cpt)
+{
+	int ret = 0;
+	struct device *dev = &cpt->pdev->dev;
+
+	ret = cpt_ucode_load_fw(cpt, "cpt8x-mc-ae.out", true);
+	if (ret) {
+		dev_err(dev, "ae:cpt_ucode_load failed with ret: %d\n", ret);
+		return ret;
+	}
+	ret = cpt_ucode_load_fw(cpt, "cpt8x-mc-se.out", false);
+	if (ret) {
+		dev_err(dev, "se:cpt_ucode_load failed with ret: %d\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+static irqreturn_t cpt_mbx0_intr_handler(int irq, void *cpt_irq)
+{
+	struct cpt_device *cpt = (struct cpt_device *)cpt_irq;
+
+	cpt_mbox_intr_handler(cpt, 0);
+
+	return IRQ_HANDLED;
+}
+
+static void cpt_reset(struct cpt_device *cpt)
+{
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_RESET(0), 1);
+}
+
+static void cpt_find_max_enabled_cores(struct cpt_device *cpt)
+{
+	union cptx_pf_constants pf_cnsts = {0};
+
+	pf_cnsts.u = cpt_read_csr64(cpt->reg_base, CPTX_PF_CONSTANTS(0));
+	cpt->max_se_cores = pf_cnsts.s.se;
+	cpt->max_ae_cores = pf_cnsts.s.ae;
+}
+
+static u32 cpt_check_bist_status(struct cpt_device *cpt)
+{
+	union cptx_pf_bist_status bist_sts = {0};
+
+	bist_sts.u = cpt_read_csr64(cpt->reg_base,
+				    CPTX_PF_BIST_STATUS(0));
+
+	return bist_sts.u;
+}
+
+static u64 cpt_check_exe_bist_status(struct cpt_device *cpt)
+{
+	union cptx_pf_exe_bist_status bist_sts = {0};
+
+	bist_sts.u = cpt_read_csr64(cpt->reg_base,
+				    CPTX_PF_EXE_BIST_STATUS(0));
+
+	return bist_sts.u;
+}
+
+static void cpt_disable_all_cores(struct cpt_device *cpt)
+{
+	u32 grp, timeout = 100;
+	struct device *dev = &cpt->pdev->dev;
+
+	/* Disengage the cores from groups */
+	for (grp = 0; grp < CPT_MAX_CORE_GROUPS; grp++) {
+		cpt_write_csr64(cpt->reg_base, CPTX_PF_GX_EN(0, grp), 0);
+		udelay(CSR_DELAY);
+	}
+
+	grp = cpt_read_csr64(cpt->reg_base, CPTX_PF_EXEC_BUSY(0));
+	while (grp) {
+		dev_err(dev, "Cores still busy");
+		grp = cpt_read_csr64(cpt->reg_base,
+				     CPTX_PF_EXEC_BUSY(0));
+		if (timeout--)
+			break;
+
+		udelay(CSR_DELAY);
+	}
+	/* Disable the cores */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_EXE_CTL(0), 0);
+}
+
+/**
+ * Ensure all cores are disengaged from all groups by
+ * calling cpt_disable_all_cores() before calling this
+ * function.
+ */
+static void cpt_unload_microcode(struct cpt_device *cpt)
+{
+	u32 grp = 0, core;
+
+	/* Free microcode bases and reset group masks */
+	for (grp = 0; grp < CPT_MAX_CORE_GROUPS; grp++) {
+		struct microcode *mcode = &cpt->mcode[grp];
+
+		if (cpt->mcode[grp].code)
+			dma_free_coherent(&cpt->pdev->dev, mcode->code_size,
+					  mcode->code, mcode->phys_base);
+		mcode->code = NULL;
+	}
+	/* Clear UCODE_BASE registers for all engines */
+	for (core = 0; core < CPT_MAX_TOTAL_CORES; core++)
+		cpt_write_csr64(cpt->reg_base,
+				CPTX_PF_ENGX_UCODE_BASE(0, core), 0ull);
+}
+
+static int cpt_device_init(struct cpt_device *cpt)
+{
+	u64 bist;
+	struct device *dev = &cpt->pdev->dev;
+
+	/* Reset the PF when probed first */
+	cpt_reset(cpt);
+	mdelay(100);
+
+	/*Check BIST status*/
+	bist = (u64)cpt_check_bist_status(cpt);
+	if (bist) {
+		dev_err(dev, "RAM BIST failed with code 0x%llx", bist);
+		return -ENODEV;
+	}
+
+	bist = cpt_check_exe_bist_status(cpt);
+	if (bist) {
+		dev_err(dev, "Engine BIST failed with code 0x%llx", bist);
+		return -ENODEV;
+	}
+
+	/*Get CLK frequency*/
+	/*Get max enabled cores */
+	cpt_find_max_enabled_cores(cpt);
+	/*Disable all cores*/
+	cpt_disable_all_cores(cpt);
+	/*Reset device parameters*/
+	cpt->next_mc_idx   = 0;
+	cpt->next_group = 0;
+	/* PF is ready */
+	cpt->flags |= CPT_FLAG_DEVICE_READY;
+
+	return 0;
+}
+
+static int cpt_register_interrupts(struct cpt_device *cpt)
+{
+	int ret;
+	struct device *dev = &cpt->pdev->dev;
+
+	/* Enable MSI-X */
+	ret = pci_alloc_irq_vectors(cpt->pdev, CPT_PF_MSIX_VECTORS,
+			CPT_PF_MSIX_VECTORS, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dev_err(&cpt->pdev->dev, "Request for #%d msix vectors failed\n",
+			CPT_PF_MSIX_VECTORS);
+		return ret;
+	}
+
+	/* Register mailbox interrupt handlers */
+	ret = request_irq(pci_irq_vector(cpt->pdev, CPT_PF_INT_VEC_E_MBOXX(0)),
+			  cpt_mbx0_intr_handler, 0, "CPT Mbox0", cpt);
+	if (ret)
+		goto fail;
+
+	/* Enable mailbox interrupt */
+	cpt_enable_mbox_interrupts(cpt);
+	return 0;
+
+fail:
+	dev_err(dev, "Request irq failed\n");
+	pci_disable_msix(cpt->pdev);
+	return ret;
+}
+
+static void cpt_unregister_interrupts(struct cpt_device *cpt)
+{
+	free_irq(pci_irq_vector(cpt->pdev, CPT_PF_INT_VEC_E_MBOXX(0)), cpt);
+	pci_disable_msix(cpt->pdev);
+}
+
+static int cpt_sriov_init(struct cpt_device *cpt, int num_vfs)
+{
+	int pos = 0;
+	int err;
+	u16 total_vf_cnt;
+	struct pci_dev *pdev = cpt->pdev;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+	if (!pos) {
+		dev_err(&pdev->dev, "SRIOV capability is not found in PCIe config space\n");
+		return -ENODEV;
+	}
+
+	cpt->num_vf_en = num_vfs; /* User requested VFs */
+	pci_read_config_word(pdev, (pos + PCI_SRIOV_TOTAL_VF), &total_vf_cnt);
+	if (total_vf_cnt < cpt->num_vf_en)
+		cpt->num_vf_en = total_vf_cnt;
+
+	if (!total_vf_cnt)
+		return 0;
+
+	/*Enabled the available VFs */
+	err = pci_enable_sriov(pdev, cpt->num_vf_en);
+	if (err) {
+		dev_err(&pdev->dev, "SRIOV enable failed, num VF is %d\n",
+			cpt->num_vf_en);
+		cpt->num_vf_en = 0;
+		return err;
+	}
+
+	/* TODO: Optionally enable static VQ priorities feature */
+
+	dev_info(&pdev->dev, "SRIOV enabled, number of VF available %d\n",
+		 cpt->num_vf_en);
+
+	cpt->flags |= CPT_FLAG_SRIOV_ENABLED;
+
+	return 0;
+}
+
+static int cpt_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct device *dev = &pdev->dev;
+	struct cpt_device *cpt;
+	int err;
+
+	if (num_vfs > 16 || num_vfs < 4) {
+		dev_warn(dev, "Invalid vf count %d, Resetting it to 4(default)\n",
+			 num_vfs);
+		num_vfs = 4;
+	}
+
+	cpt = devm_kzalloc(dev, sizeof(*cpt), GFP_KERNEL);
+	if (!cpt)
+		return -ENOMEM;
+
+	pci_set_drvdata(pdev, cpt);
+	cpt->pdev = pdev;
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(dev, "Failed to enable PCI device\n");
+		pci_set_drvdata(pdev, NULL);
+		return err;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(dev, "PCI request regions failed 0x%x\n", err);
+		goto cpt_err_disable_device;
+	}
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(48));
+	if (err) {
+		dev_err(dev, "Unable to get usable DMA configuration\n");
+		goto cpt_err_release_regions;
+	}
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(48));
+	if (err) {
+		dev_err(dev, "Unable to get 48-bit DMA for consistent allocations\n");
+		goto cpt_err_release_regions;
+	}
+
+	/* MAP PF's configuration registers */
+	cpt->reg_base = pcim_iomap(pdev, 0, 0);
+	if (!cpt->reg_base) {
+		dev_err(dev, "Cannot map config register space, aborting\n");
+		err = -ENOMEM;
+		goto cpt_err_release_regions;
+	}
+
+	/* CPT device HW initialization */
+	cpt_device_init(cpt);
+
+	/* Register interrupts */
+	err = cpt_register_interrupts(cpt);
+	if (err)
+		goto cpt_err_release_regions;
+
+	err = cpt_ucode_load(cpt);
+	if (err)
+		goto cpt_err_unregister_interrupts;
+
+	/* Configure SRIOV */
+	err = cpt_sriov_init(cpt, num_vfs);
+	if (err)
+		goto cpt_err_unregister_interrupts;
+
+	return 0;
+
+cpt_err_unregister_interrupts:
+	cpt_unregister_interrupts(cpt);
+cpt_err_release_regions:
+	pci_release_regions(pdev);
+cpt_err_disable_device:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static void cpt_remove(struct pci_dev *pdev)
+{
+	struct cpt_device *cpt = pci_get_drvdata(pdev);
+
+	/* Disengage SE and AE cores from all groups*/
+	cpt_disable_all_cores(cpt);
+	/* Unload microcodes */
+	cpt_unload_microcode(cpt);
+	cpt_unregister_interrupts(cpt);
+	pci_disable_sriov(pdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static void cpt_shutdown(struct pci_dev *pdev)
+{
+	struct cpt_device *cpt = pci_get_drvdata(pdev);
+
+	if (!cpt)
+		return;
+
+	dev_info(&pdev->dev, "Shutdown device %x:%x.\n",
+		 (u32)pdev->vendor, (u32)pdev->device);
+
+	cpt_unregister_interrupts(cpt);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+/* Supported devices */
+static const struct pci_device_id cpt_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, CPT_81XX_PCI_PF_DEVICE_ID) },
+	{ 0, }  /* end of table */
+};
+
+static struct pci_driver cpt_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = cpt_id_table,
+	.probe = cpt_probe,
+	.remove = cpt_remove,
+	.shutdown = cpt_shutdown,
+};
+
+module_pci_driver(cpt_pci_driver);
+
+MODULE_AUTHOR("George Cherian <george.cherian@cavium.com>");
+MODULE_DESCRIPTION("Cavium Thunder CPT Physical Function Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(DRV_VERSION);
+MODULE_DEVICE_TABLE(pci, cpt_id_table);
diff --git a/drivers/crypto/cavium/cpt/cptpf_mbox.c b/drivers/crypto/cavium/cpt/cptpf_mbox.c
new file mode 100644
index 000000000000..20f2c6ee46a5
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptpf_mbox.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include "cptpf.h"
+
+static void cpt_send_msg_to_vf(struct cpt_device *cpt, int vf,
+			       struct cpt_mbox *mbx)
+{
+	/* Writing mbox(0) causes interrupt */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_VFX_MBOXX(0, vf, 1),
+			mbx->data);
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_VFX_MBOXX(0, vf, 0), mbx->msg);
+}
+
+/* ACKs VF's mailbox message
+ * @vf: VF to which ACK to be sent
+ */
+static void cpt_mbox_send_ack(struct cpt_device *cpt, int vf,
+			      struct cpt_mbox *mbx)
+{
+	mbx->data = 0ull;
+	mbx->msg = CPT_MBOX_MSG_TYPE_ACK;
+	cpt_send_msg_to_vf(cpt, vf, mbx);
+}
+
+static void cpt_clear_mbox_intr(struct cpt_device *cpt, u32 vf)
+{
+	/* W1C for the VF */
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_MBOX_INTX(0, 0), (1 << vf));
+}
+
+/*
+ *  Configure QLEN/Chunk sizes for VF
+ */
+static void cpt_cfg_qlen_for_vf(struct cpt_device *cpt, int vf, u32 size)
+{
+	union cptx_pf_qx_ctl pf_qx_ctl;
+
+	pf_qx_ctl.u = cpt_read_csr64(cpt->reg_base, CPTX_PF_QX_CTL(0, vf));
+	pf_qx_ctl.s.size = size;
+	pf_qx_ctl.s.cont_err = true;
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_QX_CTL(0, vf), pf_qx_ctl.u);
+}
+
+/*
+ * Configure VQ priority
+ */
+static void cpt_cfg_vq_priority(struct cpt_device *cpt, int vf, u32 pri)
+{
+	union cptx_pf_qx_ctl pf_qx_ctl;
+
+	pf_qx_ctl.u = cpt_read_csr64(cpt->reg_base, CPTX_PF_QX_CTL(0, vf));
+	pf_qx_ctl.s.pri = pri;
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_QX_CTL(0, vf), pf_qx_ctl.u);
+}
+
+static int cpt_bind_vq_to_grp(struct cpt_device *cpt, u8 q, u8 grp)
+{
+	struct microcode *mcode = cpt->mcode;
+	union cptx_pf_qx_ctl pf_qx_ctl;
+	struct device *dev = &cpt->pdev->dev;
+
+	if (q >= CPT_MAX_VF_NUM) {
+		dev_err(dev, "Queues are more than cores in the group");
+		return -EINVAL;
+	}
+	if (grp >= CPT_MAX_CORE_GROUPS) {
+		dev_err(dev, "Request group is more than possible groups");
+		return -EINVAL;
+	}
+	if (grp >= cpt->next_mc_idx) {
+		dev_err(dev, "Request group is higher than available functional groups");
+		return -EINVAL;
+	}
+	pf_qx_ctl.u = cpt_read_csr64(cpt->reg_base, CPTX_PF_QX_CTL(0, q));
+	pf_qx_ctl.s.grp = mcode[grp].group;
+	cpt_write_csr64(cpt->reg_base, CPTX_PF_QX_CTL(0, q), pf_qx_ctl.u);
+	dev_dbg(dev, "VF %d TYPE %s", q, (mcode[grp].is_ae ? "AE" : "SE"));
+
+	return mcode[grp].is_ae ? AE_TYPES : SE_TYPES;
+}
+
+/* Interrupt handler to handle mailbox messages from VFs */
+static void cpt_handle_mbox_intr(struct cpt_device *cpt, int vf)
+{
+	struct cpt_vf_info *vfx = &cpt->vfinfo[vf];
+	struct cpt_mbox mbx = {};
+	int vftype;
+	struct device *dev = &cpt->pdev->dev;
+	/*
+	 * MBOX[0] contains msg
+	 * MBOX[1] contains data
+	 */
+	mbx.msg  = cpt_read_csr64(cpt->reg_base, CPTX_PF_VFX_MBOXX(0, vf, 0));
+	mbx.data = cpt_read_csr64(cpt->reg_base, CPTX_PF_VFX_MBOXX(0, vf, 1));
+	dev_dbg(dev, "%s: Mailbox msg 0x%llx from VF%d", __func__, mbx.msg, vf);
+	switch (mbx.msg) {
+	case CPT_MSG_VF_UP:
+		vfx->state = VF_STATE_UP;
+		try_module_get(THIS_MODULE);
+		cpt_mbox_send_ack(cpt, vf, &mbx);
+		break;
+	case CPT_MSG_READY:
+		mbx.msg  = CPT_MSG_READY;
+		mbx.data = vf;
+		cpt_send_msg_to_vf(cpt, vf, &mbx);
+		break;
+	case CPT_MSG_VF_DOWN:
+		/* First msg in VF teardown sequence */
+		vfx->state = VF_STATE_DOWN;
+		module_put(THIS_MODULE);
+		cpt_mbox_send_ack(cpt, vf, &mbx);
+		break;
+	case CPT_MSG_QLEN:
+		vfx->qlen = mbx.data;
+		cpt_cfg_qlen_for_vf(cpt, vf, vfx->qlen);
+		cpt_mbox_send_ack(cpt, vf, &mbx);
+		break;
+	case CPT_MSG_QBIND_GRP:
+		vftype = cpt_bind_vq_to_grp(cpt, vf, (u8)mbx.data);
+		if ((vftype != AE_TYPES) && (vftype != SE_TYPES))
+			dev_err(dev, "Queue %d binding to group %llu failed",
+				vf, mbx.data);
+		else {
+			dev_dbg(dev, "Queue %d binding to group %llu successful",
+				vf, mbx.data);
+			mbx.msg = CPT_MSG_QBIND_GRP;
+			mbx.data = vftype;
+			cpt_send_msg_to_vf(cpt, vf, &mbx);
+		}
+		break;
+	case CPT_MSG_VQ_PRIORITY:
+		vfx->priority = mbx.data;
+		cpt_cfg_vq_priority(cpt, vf, vfx->priority);
+		cpt_mbox_send_ack(cpt, vf, &mbx);
+		break;
+	default:
+		dev_err(&cpt->pdev->dev, "Invalid msg from VF%d, msg 0x%llx\n",
+			vf, mbx.msg);
+		break;
+	}
+}
+
+void cpt_mbox_intr_handler (struct cpt_device *cpt, int mbx)
+{
+	u64 intr;
+	u8  vf;
+
+	intr = cpt_read_csr64(cpt->reg_base, CPTX_PF_MBOX_INTX(0, 0));
+	dev_dbg(&cpt->pdev->dev, "PF interrupt Mbox%d 0x%llx\n", mbx, intr);
+	for (vf = 0; vf < CPT_MAX_VF_NUM; vf++) {
+		if (intr & (1ULL << vf)) {
+			dev_dbg(&cpt->pdev->dev, "Intr from VF %d\n", vf);
+			cpt_handle_mbox_intr(cpt, vf);
+			cpt_clear_mbox_intr(cpt, vf);
+		}
+	}
+}
diff --git a/drivers/crypto/cavium/cpt/cptvf.h b/drivers/crypto/cavium/cpt/cptvf.h
new file mode 100644
index 000000000000..0a835a07d4f2
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptvf.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __CPTVF_H
+#define __CPTVF_H
+
+#include <linux/list.h>
+#include "cpt_common.h"
+
+/* Default command queue length */
+#define CPT_CMD_QLEN 2046
+#define CPT_CMD_QCHUNK_SIZE 1023
+
+/* Default command timeout in seconds */
+#define CPT_COMMAND_TIMEOUT 4
+#define CPT_TIMER_THOLD	0xFFFF
+#define CPT_NUM_QS_PER_VF 1
+#define CPT_INST_SIZE 64
+#define CPT_NEXT_CHUNK_PTR_SIZE 8
+
+#define	CPT_VF_MSIX_VECTORS 2
+#define CPT_VF_INTR_MBOX_MASK BIT(0)
+#define CPT_VF_INTR_DOVF_MASK BIT(1)
+#define CPT_VF_INTR_IRDE_MASK BIT(2)
+#define CPT_VF_INTR_NWRP_MASK BIT(3)
+#define CPT_VF_INTR_SERR_MASK BIT(4)
+#define DMA_DIRECT_DIRECT 0 /* Input DIRECT, Output DIRECT */
+#define DMA_GATHER_SCATTER 1
+#define FROM_DPTR 1
+
+/**
+ * Enumeration cpt_vf_int_vec_e
+ *
+ * CPT VF MSI-X Vector Enumeration
+ * Enumerates the MSI-X interrupt vectors.
+ */
+enum cpt_vf_int_vec_e {
+	CPT_VF_INT_VEC_E_MISC = 0x00,
+	CPT_VF_INT_VEC_E_DONE = 0x01
+};
+
+struct command_chunk {
+	u8 *head;
+	dma_addr_t dma_addr;
+	u32 size; /* Chunk size, max CPT_INST_CHUNK_MAX_SIZE */
+	struct hlist_node nextchunk;
+};
+
+struct command_queue {
+	spinlock_t lock; /* command queue lock */
+	u32 idx; /* Command queue host write idx */
+	u32 nchunks; /* Number of command chunks */
+	struct command_chunk *qhead;	/* Command queue head, instructions
+					 * are inserted here
+					 */
+	struct hlist_head chead;
+};
+
+struct command_qinfo {
+	u32 cmd_size;
+	u32 qchunksize; /* Command queue chunk size */
+	struct command_queue queue[CPT_NUM_QS_PER_VF];
+};
+
+struct pending_entry {
+	u8 busy; /* Entry status (free/busy) */
+
+	volatile u64 *completion_addr; /* Completion address */
+	void *post_arg;
+	void (*callback)(int, void *); /* Kernel ASYNC request callabck */
+	void *callback_arg; /* Kernel ASYNC request callabck arg */
+};
+
+struct pending_queue {
+	struct pending_entry *head;	/* head of the queue */
+	u32 front; /* Process work from here */
+	u32 rear; /* Append new work here */
+	atomic64_t pending_count;
+	spinlock_t lock; /* Queue lock */
+};
+
+struct pending_qinfo {
+	u32 nr_queues;	/* Number of queues supported */
+	u32 qlen; /* Queue length */
+	struct pending_queue queue[CPT_NUM_QS_PER_VF];
+};
+
+#define for_each_pending_queue(qinfo, q, i)	\
+	for (i = 0, q = &qinfo->queue[i]; i < qinfo->nr_queues; i++, \
+	     q = &qinfo->queue[i])
+
+struct cpt_vf {
+	u16 flags; /* Flags to hold device status bits */
+	u8 vfid; /* Device Index 0...CPT_MAX_VF_NUM */
+	u8 vftype; /* VF type of SE_TYPE(1) or AE_TYPE(1) */
+	u8 vfgrp; /* VF group (0 - 8) */
+	u8 node; /* Operating node: Bits (46:44) in BAR0 address */
+	u8 priority; /* VF priority ring: 1-High proirity round
+		      * robin ring;0-Low priority round robin ring;
+		      */
+	struct pci_dev *pdev; /* pci device handle */
+	void __iomem *reg_base; /* Register start address */
+	void *wqe_info;	/* BH worker info */
+	/* MSI-X */
+	cpumask_var_t affinity_mask[CPT_VF_MSIX_VECTORS];
+	/* Command and Pending queues */
+	u32 qsize;
+	u32 nr_queues;
+	struct command_qinfo cqinfo; /* Command queue information */
+	struct pending_qinfo pqinfo; /* Pending queue information */
+	/* VF-PF mailbox communication */
+	bool pf_acked;
+	bool pf_nacked;
+};
+
+int cptvf_send_vf_up(struct cpt_vf *cptvf);
+int cptvf_send_vf_down(struct cpt_vf *cptvf);
+int cptvf_send_vf_to_grp_msg(struct cpt_vf *cptvf);
+int cptvf_send_vf_priority_msg(struct cpt_vf *cptvf);
+int cptvf_send_vq_size_msg(struct cpt_vf *cptvf);
+int cptvf_check_pf_ready(struct cpt_vf *cptvf);
+void cptvf_handle_mbox_intr(struct cpt_vf *cptvf);
+void cvm_crypto_exit(void);
+int cvm_crypto_init(struct cpt_vf *cptvf);
+void vq_post_process(struct cpt_vf *cptvf, u32 qno);
+void cptvf_write_vq_doorbell(struct cpt_vf *cptvf, u32 val);
+#endif /* __CPTVF_H */
diff --git a/drivers/crypto/cavium/cpt/cptvf_algs.c b/drivers/crypto/cavium/cpt/cptvf_algs.c
new file mode 100644
index 000000000000..cc853f913d4b
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptvf_algs.c
@@ -0,0 +1,444 @@
+
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/authenc.h>
+#include <crypto/cryptd.h>
+#include <crypto/crypto_wq.h>
+#include <crypto/des.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+
+#include "cptvf.h"
+#include "cptvf_algs.h"
+
+struct cpt_device_handle {
+	void *cdev[MAX_DEVICES];
+	u32 dev_count;
+};
+
+static struct cpt_device_handle dev_handle;
+
+static void cvm_callback(u32 status, void *arg)
+{
+	struct crypto_async_request *req = (struct crypto_async_request *)arg;
+
+	req->complete(req, !status);
+}
+
+static inline void update_input_iv(struct cpt_request_info *req_info,
+				   u8 *iv, u32 enc_iv_len,
+				   u32 *argcnt)
+{
+	/* Setting the iv information */
+	req_info->in[*argcnt].vptr = (void *)iv;
+	req_info->in[*argcnt].size = enc_iv_len;
+	req_info->req.dlen += enc_iv_len;
+
+	++(*argcnt);
+}
+
+static inline void update_output_iv(struct cpt_request_info *req_info,
+				    u8 *iv, u32 enc_iv_len,
+				    u32 *argcnt)
+{
+	/* Setting the iv information */
+	req_info->out[*argcnt].vptr = (void *)iv;
+	req_info->out[*argcnt].size = enc_iv_len;
+	req_info->rlen += enc_iv_len;
+
+	++(*argcnt);
+}
+
+static inline void update_input_data(struct cpt_request_info *req_info,
+				     struct scatterlist *inp_sg,
+				     u32 nbytes, u32 *argcnt)
+{
+	req_info->req.dlen += nbytes;
+
+	while (nbytes) {
+		u32 len = min(nbytes, inp_sg->length);
+		u8 *ptr = sg_virt(inp_sg);
+
+		req_info->in[*argcnt].vptr = (void *)ptr;
+		req_info->in[*argcnt].size = len;
+		nbytes -= len;
+
+		++(*argcnt);
+		++inp_sg;
+	}
+}
+
+static inline void update_output_data(struct cpt_request_info *req_info,
+				      struct scatterlist *outp_sg,
+				      u32 nbytes, u32 *argcnt)
+{
+	req_info->rlen += nbytes;
+
+	while (nbytes) {
+		u32 len = min(nbytes, outp_sg->length);
+		u8 *ptr = sg_virt(outp_sg);
+
+		req_info->out[*argcnt].vptr = (void *)ptr;
+		req_info->out[*argcnt].size = len;
+		nbytes -= len;
+		++(*argcnt);
+		++outp_sg;
+	}
+}
+
+static inline u32 create_ctx_hdr(struct ablkcipher_request *req, u32 enc,
+				 u32 cipher_type, u32 aes_key_type,
+				 u32 *argcnt)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct cvm_enc_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct cvm_req_ctx *rctx = ablkcipher_request_ctx(req);
+	struct fc_context *fctx = &rctx->fctx;
+	u64 *offset_control = &rctx->control_word;
+	u32 enc_iv_len = crypto_ablkcipher_ivsize(tfm);
+	struct cpt_request_info *req_info = &rctx->cpt_req;
+	u64 *ctrl_flags = NULL;
+
+	req_info->ctrl.s.grp = 0;
+	req_info->ctrl.s.dma_mode = DMA_GATHER_SCATTER;
+	req_info->ctrl.s.se_req = SE_CORE_REQ;
+
+	req_info->req.opcode.s.major = MAJOR_OP_FC |
+					DMA_MODE_FLAG(DMA_GATHER_SCATTER);
+	if (enc)
+		req_info->req.opcode.s.minor = 2;
+	else
+		req_info->req.opcode.s.minor = 3;
+
+	req_info->req.param1 = req->nbytes; /* Encryption Data length */
+	req_info->req.param2 = 0; /*Auth data length */
+
+	fctx->enc.enc_ctrl.e.enc_cipher = cipher_type;
+	fctx->enc.enc_ctrl.e.aes_key = aes_key_type;
+	fctx->enc.enc_ctrl.e.iv_source = FROM_DPTR;
+
+	if (cipher_type == AES_XTS)
+		memcpy(fctx->enc.encr_key, ctx->enc_key, ctx->key_len * 2);
+	else
+		memcpy(fctx->enc.encr_key, ctx->enc_key, ctx->key_len);
+	ctrl_flags = (u64 *)&fctx->enc.enc_ctrl.flags;
+	*ctrl_flags = cpu_to_be64(*ctrl_flags);
+
+	*offset_control = cpu_to_be64(((u64)(enc_iv_len) << 16));
+	/* Storing  Packet Data Information in offset
+	 * Control Word First 8 bytes
+	 */
+	req_info->in[*argcnt].vptr = (u8 *)offset_control;
+	req_info->in[*argcnt].size = CONTROL_WORD_LEN;
+	req_info->req.dlen += CONTROL_WORD_LEN;
+	++(*argcnt);
+
+	req_info->in[*argcnt].vptr = (u8 *)fctx;
+	req_info->in[*argcnt].size = sizeof(struct fc_context);
+	req_info->req.dlen += sizeof(struct fc_context);
+
+	++(*argcnt);
+
+	return 0;
+}
+
+static inline u32 create_input_list(struct ablkcipher_request  *req, u32 enc,
+				    u32 cipher_type, u32 aes_key_type,
+				    u32 enc_iv_len)
+{
+	struct cvm_req_ctx *rctx = ablkcipher_request_ctx(req);
+	struct cpt_request_info *req_info = &rctx->cpt_req;
+	u32 argcnt =  0;
+
+	create_ctx_hdr(req, enc, cipher_type, aes_key_type, &argcnt);
+	update_input_iv(req_info, req->info, enc_iv_len, &argcnt);
+	update_input_data(req_info, req->src, req->nbytes, &argcnt);
+	req_info->incnt = argcnt;
+
+	return 0;
+}
+
+static inline void store_cb_info(struct ablkcipher_request *req,
+				 struct cpt_request_info *req_info)
+{
+	req_info->callback = (void *)cvm_callback;
+	req_info->callback_arg = (void *)&req->base;
+}
+
+static inline void create_output_list(struct ablkcipher_request *req,
+				      u32 cipher_type,
+				      u32 enc_iv_len)
+{
+	struct cvm_req_ctx *rctx = ablkcipher_request_ctx(req);
+	struct cpt_request_info *req_info = &rctx->cpt_req;
+	u32 argcnt = 0;
+
+	/* OUTPUT Buffer Processing
+	 * AES encryption/decryption output would be
+	 * received in the following format
+	 *
+	 * ------IV--------|------ENCRYPTED/DECRYPTED DATA-----|
+	 * [ 16 Bytes/     [   Request Enc/Dec/ DATA Len AES CBC ]
+	 */
+	/* Reading IV information */
+	update_output_iv(req_info, req->info, enc_iv_len, &argcnt);
+	update_output_data(req_info, req->dst, req->nbytes, &argcnt);
+	req_info->outcnt = argcnt;
+}
+
+static inline int cvm_enc_dec(struct ablkcipher_request *req, u32 enc,
+			      u32 cipher_type)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct cvm_enc_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	u32 key_type = AES_128_BIT;
+	struct cvm_req_ctx *rctx = ablkcipher_request_ctx(req);
+	u32 enc_iv_len = crypto_ablkcipher_ivsize(tfm);
+	struct fc_context *fctx = &rctx->fctx;
+	struct cpt_request_info *req_info = &rctx->cpt_req;
+	void *cdev = NULL;
+	int status;
+
+	switch (ctx->key_len) {
+	case 16:
+		key_type = AES_128_BIT;
+		break;
+	case 24:
+		key_type = AES_192_BIT;
+		break;
+	case 32:
+		if (cipher_type == AES_XTS)
+			key_type = AES_128_BIT;
+		else
+			key_type = AES_256_BIT;
+		break;
+	case 64:
+		if (cipher_type == AES_XTS)
+			key_type = AES_256_BIT;
+		else
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (cipher_type == DES3_CBC)
+		key_type = 0;
+
+	memset(req_info, 0, sizeof(struct cpt_request_info));
+	memset(fctx, 0, sizeof(struct fc_context));
+	create_input_list(req, enc, cipher_type, key_type, enc_iv_len);
+	create_output_list(req, cipher_type, enc_iv_len);
+	store_cb_info(req, req_info);
+	cdev = dev_handle.cdev[smp_processor_id()];
+	status = cptvf_do_request(cdev, req_info);
+	/* We perform an asynchronous send and once
+	 * the request is completed the driver would
+	 * intimate through  registered call back functions
+	 */
+
+	if (status)
+		return status;
+	else
+		return -EINPROGRESS;
+}
+
+int cvm_des3_encrypt_cbc(struct ablkcipher_request *req)
+{
+	return cvm_enc_dec(req, true, DES3_CBC);
+}
+
+int cvm_des3_decrypt_cbc(struct ablkcipher_request *req)
+{
+	return cvm_enc_dec(req, false, DES3_CBC);
+}
+
+int cvm_aes_encrypt_xts(struct ablkcipher_request *req)
+{
+	return cvm_enc_dec(req, true, AES_XTS);
+}
+
+int cvm_aes_decrypt_xts(struct ablkcipher_request *req)
+{
+	return cvm_enc_dec(req, false, AES_XTS);
+}
+
+int cvm_aes_encrypt_cbc(struct ablkcipher_request *req)
+{
+	return cvm_enc_dec(req, true, AES_CBC);
+}
+
+int cvm_aes_decrypt_cbc(struct ablkcipher_request *req)
+{
+	return cvm_enc_dec(req, false, AES_CBC);
+}
+
+int cvm_xts_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+		   u32 keylen)
+{
+	struct crypto_tfm *tfm = crypto_ablkcipher_tfm(cipher);
+	struct cvm_enc_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+	const u8 *key1 = key;
+	const u8 *key2 = key + (keylen / 2);
+
+	err = xts_check_key(tfm, key, keylen);
+	if (err)
+		return err;
+	ctx->key_len = keylen;
+	memcpy(ctx->enc_key, key1, keylen / 2);
+	memcpy(ctx->enc_key + KEY2_OFFSET, key2, keylen / 2);
+
+	return 0;
+}
+
+int cvm_enc_dec_setkey(struct crypto_ablkcipher *cipher, const u8 *key,
+		       u32 keylen)
+{
+	struct crypto_tfm *tfm = crypto_ablkcipher_tfm(cipher);
+	struct cvm_enc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if ((keylen == 16) || (keylen == 24) || (keylen == 32)) {
+		ctx->key_len = keylen;
+		memcpy(ctx->enc_key, key, keylen);
+		return 0;
+	}
+	crypto_ablkcipher_set_flags(cipher, CRYPTO_TFM_RES_BAD_KEY_LEN);
+
+	return -EINVAL;
+}
+
+int cvm_enc_dec_init(struct crypto_tfm *tfm)
+{
+	struct cvm_enc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	memset(ctx, 0, sizeof(*ctx));
+	tfm->crt_ablkcipher.reqsize = sizeof(struct cvm_req_ctx) +
+					sizeof(struct ablkcipher_request);
+	/* Additional memory for ablkcipher_request is
+	 * allocated since the cryptd daemon uses
+	 * this memory for request_ctx information
+	 */
+
+	return 0;
+}
+
+struct crypto_alg algs[] = { {
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cvm_enc_ctx),
+	.cra_alignmask = 7,
+	.cra_priority = 4001,
+	.cra_name = "xts(aes)",
+	.cra_driver_name = "cavium-xts-aes",
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_u = {
+		.ablkcipher = {
+			.ivsize = AES_BLOCK_SIZE,
+			.min_keysize = 2 * AES_MIN_KEY_SIZE,
+			.max_keysize = 2 * AES_MAX_KEY_SIZE,
+			.setkey = cvm_xts_setkey,
+			.encrypt = cvm_aes_encrypt_xts,
+			.decrypt = cvm_aes_decrypt_xts,
+		},
+	},
+	.cra_init = cvm_enc_dec_init,
+	.cra_module = THIS_MODULE,
+}, {
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = AES_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cvm_enc_ctx),
+	.cra_alignmask = 7,
+	.cra_priority = 4001,
+	.cra_name = "cbc(aes)",
+	.cra_driver_name = "cavium-cbc-aes",
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_u = {
+		.ablkcipher = {
+			.ivsize = AES_BLOCK_SIZE,
+			.min_keysize = AES_MIN_KEY_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE,
+			.setkey = cvm_enc_dec_setkey,
+			.encrypt = cvm_aes_encrypt_cbc,
+			.decrypt = cvm_aes_decrypt_cbc,
+		},
+	},
+	.cra_init = cvm_enc_dec_init,
+	.cra_module = THIS_MODULE,
+}, {
+	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize = sizeof(struct cvm_des3_ctx),
+	.cra_alignmask = 7,
+	.cra_priority = 4001,
+	.cra_name = "cbc(des3_ede)",
+	.cra_driver_name = "cavium-cbc-des3_ede",
+	.cra_type = &crypto_ablkcipher_type,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = DES3_EDE_KEY_SIZE,
+			.max_keysize = DES3_EDE_KEY_SIZE,
+			.ivsize = DES_BLOCK_SIZE,
+			.setkey = cvm_enc_dec_setkey,
+			.encrypt = cvm_des3_encrypt_cbc,
+			.decrypt = cvm_des3_decrypt_cbc,
+		},
+	},
+	.cra_init = cvm_enc_dec_init,
+	.cra_module = THIS_MODULE,
+} };
+
+static inline int cav_register_algs(void)
+{
+	int err = 0;
+
+	err = crypto_register_algs(algs, ARRAY_SIZE(algs));
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static inline void cav_unregister_algs(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+int cvm_crypto_init(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	u32 dev_count;
+
+	dev_count = dev_handle.dev_count;
+	dev_handle.cdev[dev_count] = cptvf;
+	dev_handle.dev_count++;
+
+	if (dev_count == 3) {
+		if (cav_register_algs()) {
+			dev_err(&pdev->dev, "Error in registering crypto algorithms\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+void cvm_crypto_exit(void)
+{
+	u32 dev_count;
+
+	dev_count = --dev_handle.dev_count;
+	if (!dev_count)
+		cav_unregister_algs();
+}
diff --git a/drivers/crypto/cavium/cpt/cptvf_algs.h b/drivers/crypto/cavium/cpt/cptvf_algs.h
new file mode 100644
index 000000000000..a12050d11b0c
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptvf_algs.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef _CPTVF_ALGS_H_
+#define _CPTVF_ALGS_H_
+
+#include "request_manager.h"
+
+#define MAX_DEVICES 16
+#define MAJOR_OP_FC 0x33
+#define MAX_ENC_KEY_SIZE 32
+#define MAX_HASH_KEY_SIZE 64
+#define MAX_KEY_SIZE (MAX_ENC_KEY_SIZE + MAX_HASH_KEY_SIZE)
+#define CONTROL_WORD_LEN 8
+#define KEY2_OFFSET 48
+
+#define DMA_MODE_FLAG(dma_mode) \
+	(((dma_mode) == DMA_GATHER_SCATTER) ? (1 << 7) : 0)
+
+enum req_type {
+	AE_CORE_REQ,
+	SE_CORE_REQ,
+};
+
+enum cipher_type {
+	DES3_CBC = 0x1,
+	DES3_ECB = 0x2,
+	AES_CBC = 0x3,
+	AES_ECB = 0x4,
+	AES_CFB = 0x5,
+	AES_CTR = 0x6,
+	AES_GCM = 0x7,
+	AES_XTS = 0x8
+};
+
+enum aes_type {
+	AES_128_BIT = 0x1,
+	AES_192_BIT = 0x2,
+	AES_256_BIT = 0x3
+};
+
+union encr_ctrl {
+	u64 flags;
+	struct {
+#if defined(__BIG_ENDIAN_BITFIELD)
+		u64 enc_cipher:4;
+		u64 reserved1:1;
+		u64 aes_key:2;
+		u64 iv_source:1;
+		u64 hash_type:4;
+		u64 reserved2:3;
+		u64 auth_input_type:1;
+		u64 mac_len:8;
+		u64 reserved3:8;
+		u64 encr_offset:16;
+		u64 iv_offset:8;
+		u64 auth_offset:8;
+#else
+		u64 auth_offset:8;
+		u64 iv_offset:8;
+		u64 encr_offset:16;
+		u64 reserved3:8;
+		u64 mac_len:8;
+		u64 auth_input_type:1;
+		u64 reserved2:3;
+		u64 hash_type:4;
+		u64 iv_source:1;
+		u64 aes_key:2;
+		u64 reserved1:1;
+		u64 enc_cipher:4;
+#endif
+	} e;
+};
+
+struct enc_context {
+	union encr_ctrl enc_ctrl;
+	u8 encr_key[32];
+	u8 encr_iv[16];
+};
+
+struct fchmac_context {
+	u8 ipad[64];
+	u8 opad[64]; /* or OPAD */
+};
+
+struct fc_context {
+	struct enc_context enc;
+	struct fchmac_context hmac;
+};
+
+struct cvm_enc_ctx {
+	u32 key_len;
+	u8 enc_key[MAX_KEY_SIZE];
+};
+
+struct cvm_des3_ctx {
+	u32 key_len;
+	u8 des3_key[MAX_KEY_SIZE];
+};
+
+struct cvm_req_ctx {
+	struct cpt_request_info cpt_req;
+	u64 control_word;
+	struct fc_context fctx;
+};
+
+int cptvf_do_request(void *cptvf, struct cpt_request_info *req);
+#endif /*_CPTVF_ALGS_H_*/
diff --git a/drivers/crypto/cavium/cpt/cptvf_main.c b/drivers/crypto/cavium/cpt/cptvf_main.c
new file mode 100644
index 000000000000..aac2966ff8d9
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptvf_main.c
@@ -0,0 +1,863 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "cptvf.h"
+
+#define DRV_NAME	"thunder-cptvf"
+#define DRV_VERSION	"1.0"
+
+struct cptvf_wqe {
+	struct tasklet_struct twork;
+	void *cptvf;
+	u32 qno;
+};
+
+struct cptvf_wqe_info {
+	struct cptvf_wqe vq_wqe[CPT_NUM_QS_PER_VF];
+};
+
+static void vq_work_handler(unsigned long data)
+{
+	struct cptvf_wqe_info *cwqe_info = (struct cptvf_wqe_info *)data;
+	struct cptvf_wqe *cwqe = &cwqe_info->vq_wqe[0];
+
+	vq_post_process(cwqe->cptvf, cwqe->qno);
+}
+
+static int init_worker_threads(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cptvf_wqe_info *cwqe_info;
+	int i;
+
+	cwqe_info = kzalloc(sizeof(*cwqe_info), GFP_KERNEL);
+	if (!cwqe_info)
+		return -ENOMEM;
+
+	if (cptvf->nr_queues) {
+		dev_info(&pdev->dev, "Creating VQ worker threads (%d)\n",
+			 cptvf->nr_queues);
+	}
+
+	for (i = 0; i < cptvf->nr_queues; i++) {
+		tasklet_init(&cwqe_info->vq_wqe[i].twork, vq_work_handler,
+			     (u64)cwqe_info);
+		cwqe_info->vq_wqe[i].qno = i;
+		cwqe_info->vq_wqe[i].cptvf = cptvf;
+	}
+
+	cptvf->wqe_info = cwqe_info;
+
+	return 0;
+}
+
+static void cleanup_worker_threads(struct cpt_vf *cptvf)
+{
+	struct cptvf_wqe_info *cwqe_info;
+	struct pci_dev *pdev = cptvf->pdev;
+	int i;
+
+	cwqe_info = (struct cptvf_wqe_info *)cptvf->wqe_info;
+	if (!cwqe_info)
+		return;
+
+	if (cptvf->nr_queues) {
+		dev_info(&pdev->dev, "Cleaning VQ worker threads (%u)\n",
+			 cptvf->nr_queues);
+	}
+
+	for (i = 0; i < cptvf->nr_queues; i++)
+		tasklet_kill(&cwqe_info->vq_wqe[i].twork);
+
+	kzfree(cwqe_info);
+	cptvf->wqe_info = NULL;
+}
+
+static void free_pending_queues(struct pending_qinfo *pqinfo)
+{
+	int i;
+	struct pending_queue *queue;
+
+	for_each_pending_queue(pqinfo, queue, i) {
+		if (!queue->head)
+			continue;
+
+		/* free single queue */
+		kzfree((queue->head));
+
+		queue->front = 0;
+		queue->rear = 0;
+
+		return;
+	}
+
+	pqinfo->qlen = 0;
+	pqinfo->nr_queues = 0;
+}
+
+static int alloc_pending_queues(struct pending_qinfo *pqinfo, u32 qlen,
+				u32 nr_queues)
+{
+	u32 i;
+	size_t size;
+	int ret;
+	struct pending_queue *queue = NULL;
+
+	pqinfo->nr_queues = nr_queues;
+	pqinfo->qlen = qlen;
+
+	size = (qlen * sizeof(struct pending_entry));
+
+	for_each_pending_queue(pqinfo, queue, i) {
+		queue->head = kzalloc((size), GFP_KERNEL);
+		if (!queue->head) {
+			ret = -ENOMEM;
+			goto pending_qfail;
+		}
+
+		queue->front = 0;
+		queue->rear = 0;
+		atomic64_set((&queue->pending_count), (0));
+
+		/* init queue spin lock */
+		spin_lock_init(&queue->lock);
+	}
+
+	return 0;
+
+pending_qfail:
+	free_pending_queues(pqinfo);
+
+	return ret;
+}
+
+static int init_pending_queues(struct cpt_vf *cptvf, u32 qlen, u32 nr_queues)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	int ret;
+
+	if (!nr_queues)
+		return 0;
+
+	ret = alloc_pending_queues(&cptvf->pqinfo, qlen, nr_queues);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to setup pending queues (%u)\n",
+			nr_queues);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void cleanup_pending_queues(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (!cptvf->nr_queues)
+		return;
+
+	dev_info(&pdev->dev, "Cleaning VQ pending queue (%u)\n",
+		 cptvf->nr_queues);
+	free_pending_queues(&cptvf->pqinfo);
+}
+
+static void free_command_queues(struct cpt_vf *cptvf,
+				struct command_qinfo *cqinfo)
+{
+	int i;
+	struct command_queue *queue = NULL;
+	struct command_chunk *chunk = NULL;
+	struct pci_dev *pdev = cptvf->pdev;
+	struct hlist_node *node;
+
+	/* clean up for each queue */
+	for (i = 0; i < cptvf->nr_queues; i++) {
+		queue = &cqinfo->queue[i];
+		if (hlist_empty(&cqinfo->queue[i].chead))
+			continue;
+
+		hlist_for_each_entry_safe(chunk, node, &cqinfo->queue[i].chead,
+					  nextchunk) {
+			dma_free_coherent(&pdev->dev, chunk->size,
+					  chunk->head,
+					  chunk->dma_addr);
+			chunk->head = NULL;
+			chunk->dma_addr = 0;
+			hlist_del(&chunk->nextchunk);
+			kzfree(chunk);
+		}
+
+		queue->nchunks = 0;
+		queue->idx = 0;
+	}
+
+	/* common cleanup */
+	cqinfo->cmd_size = 0;
+}
+
+static int alloc_command_queues(struct cpt_vf *cptvf,
+				struct command_qinfo *cqinfo, size_t cmd_size,
+				u32 qlen)
+{
+	int i;
+	size_t q_size;
+	struct command_queue *queue = NULL;
+	struct pci_dev *pdev = cptvf->pdev;
+
+	/* common init */
+	cqinfo->cmd_size = cmd_size;
+	/* Qsize in dwords, needed for SADDR config, 1-next chunk pointer */
+	cptvf->qsize = min(qlen, cqinfo->qchunksize) *
+			CPT_NEXT_CHUNK_PTR_SIZE + 1;
+	/* Qsize in bytes to create space for alignment */
+	q_size = qlen * cqinfo->cmd_size;
+
+	/* per queue initialization */
+	for (i = 0; i < cptvf->nr_queues; i++) {
+		size_t c_size = 0;
+		size_t rem_q_size = q_size;
+		struct command_chunk *curr = NULL, *first = NULL, *last = NULL;
+		u32 qcsize_bytes = cqinfo->qchunksize * cqinfo->cmd_size;
+
+		queue = &cqinfo->queue[i];
+		INIT_HLIST_HEAD(&cqinfo->queue[i].chead);
+		do {
+			curr = kzalloc(sizeof(*curr), GFP_KERNEL);
+			if (!curr)
+				goto cmd_qfail;
+
+			c_size = (rem_q_size > qcsize_bytes) ? qcsize_bytes :
+					rem_q_size;
+			curr->head = (u8 *)dma_zalloc_coherent(&pdev->dev,
+					  c_size + CPT_NEXT_CHUNK_PTR_SIZE,
+					  &curr->dma_addr, GFP_KERNEL);
+			if (!curr->head) {
+				dev_err(&pdev->dev, "Command Q (%d) chunk (%d) allocation failed\n",
+					i, queue->nchunks);
+				goto cmd_qfail;
+			}
+
+			curr->size = c_size;
+			if (queue->nchunks == 0) {
+				hlist_add_head(&curr->nextchunk,
+					       &cqinfo->queue[i].chead);
+				first = curr;
+			} else {
+				hlist_add_behind(&curr->nextchunk,
+						 &last->nextchunk);
+			}
+
+			queue->nchunks++;
+			rem_q_size -= c_size;
+			if (last)
+				*((u64 *)(&last->head[last->size])) = (u64)curr->dma_addr;
+
+			last = curr;
+		} while (rem_q_size);
+
+		/* Make the queue circular */
+		/* Tie back last chunk entry to head */
+		curr = first;
+		*((u64 *)(&last->head[last->size])) = (u64)curr->dma_addr;
+		queue->qhead = curr;
+		spin_lock_init(&queue->lock);
+	}
+	return 0;
+
+cmd_qfail:
+	free_command_queues(cptvf, cqinfo);
+	return -ENOMEM;
+}
+
+static int init_command_queues(struct cpt_vf *cptvf, u32 qlen)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	int ret;
+
+	/* setup AE command queues */
+	ret = alloc_command_queues(cptvf, &cptvf->cqinfo, CPT_INST_SIZE,
+				   qlen);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to allocate AE command queues (%u)\n",
+			cptvf->nr_queues);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void cleanup_command_queues(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (!cptvf->nr_queues)
+		return;
+
+	dev_info(&pdev->dev, "Cleaning VQ command queue (%u)\n",
+		 cptvf->nr_queues);
+	free_command_queues(cptvf, &cptvf->cqinfo);
+}
+
+static void cptvf_sw_cleanup(struct cpt_vf *cptvf)
+{
+	cleanup_worker_threads(cptvf);
+	cleanup_pending_queues(cptvf);
+	cleanup_command_queues(cptvf);
+}
+
+static int cptvf_sw_init(struct cpt_vf *cptvf, u32 qlen, u32 nr_queues)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	int ret = 0;
+	u32 max_dev_queues = 0;
+
+	max_dev_queues = CPT_NUM_QS_PER_VF;
+	/* possible cpus */
+	nr_queues = min_t(u32, nr_queues, max_dev_queues);
+	cptvf->nr_queues = nr_queues;
+
+	ret = init_command_queues(cptvf, qlen);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to setup command queues (%u)\n",
+			nr_queues);
+		return ret;
+	}
+
+	ret = init_pending_queues(cptvf, qlen, nr_queues);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to setup pending queues (%u)\n",
+			nr_queues);
+		goto setup_pqfail;
+	}
+
+	/* Create worker threads for BH processing */
+	ret = init_worker_threads(cptvf);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to setup worker threads\n");
+		goto init_work_fail;
+	}
+
+	return 0;
+
+init_work_fail:
+	cleanup_worker_threads(cptvf);
+	cleanup_pending_queues(cptvf);
+
+setup_pqfail:
+	cleanup_command_queues(cptvf);
+
+	return ret;
+}
+
+static void cptvf_free_irq_affinity(struct cpt_vf *cptvf, int vec)
+{
+	irq_set_affinity_hint(pci_irq_vector(cptvf->pdev, vec), NULL);
+	free_cpumask_var(cptvf->affinity_mask[vec]);
+}
+
+static void cptvf_write_vq_ctl(struct cpt_vf *cptvf, bool val)
+{
+	union cptx_vqx_ctl vqx_ctl;
+
+	vqx_ctl.u = cpt_read_csr64(cptvf->reg_base, CPTX_VQX_CTL(0, 0));
+	vqx_ctl.s.ena = val;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_CTL(0, 0), vqx_ctl.u);
+}
+
+void cptvf_write_vq_doorbell(struct cpt_vf *cptvf, u32 val)
+{
+	union cptx_vqx_doorbell vqx_dbell;
+
+	vqx_dbell.u = cpt_read_csr64(cptvf->reg_base,
+				     CPTX_VQX_DOORBELL(0, 0));
+	vqx_dbell.s.dbell_cnt = val * 8; /* Num of Instructions * 8 words */
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_DOORBELL(0, 0),
+			vqx_dbell.u);
+}
+
+static void cptvf_write_vq_inprog(struct cpt_vf *cptvf, u8 val)
+{
+	union cptx_vqx_inprog vqx_inprg;
+
+	vqx_inprg.u = cpt_read_csr64(cptvf->reg_base, CPTX_VQX_INPROG(0, 0));
+	vqx_inprg.s.inflight = val;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_INPROG(0, 0), vqx_inprg.u);
+}
+
+static void cptvf_write_vq_done_numwait(struct cpt_vf *cptvf, u32 val)
+{
+	union cptx_vqx_done_wait vqx_dwait;
+
+	vqx_dwait.u = cpt_read_csr64(cptvf->reg_base,
+				     CPTX_VQX_DONE_WAIT(0, 0));
+	vqx_dwait.s.num_wait = val;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_DONE_WAIT(0, 0),
+			vqx_dwait.u);
+}
+
+static void cptvf_write_vq_done_timewait(struct cpt_vf *cptvf, u16 time)
+{
+	union cptx_vqx_done_wait vqx_dwait;
+
+	vqx_dwait.u = cpt_read_csr64(cptvf->reg_base,
+				     CPTX_VQX_DONE_WAIT(0, 0));
+	vqx_dwait.s.time_wait = time;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_DONE_WAIT(0, 0),
+			vqx_dwait.u);
+}
+
+static void cptvf_enable_swerr_interrupts(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_ena_w1s vqx_misc_ena;
+
+	vqx_misc_ena.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_ENA_W1S(0, 0));
+	/* Set mbox(0) interupts for the requested vf */
+	vqx_misc_ena.s.swerr = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_MISC_ENA_W1S(0, 0),
+			vqx_misc_ena.u);
+}
+
+static void cptvf_enable_mbox_interrupts(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_ena_w1s vqx_misc_ena;
+
+	vqx_misc_ena.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_ENA_W1S(0, 0));
+	/* Set mbox(0) interupts for the requested vf */
+	vqx_misc_ena.s.mbox = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_MISC_ENA_W1S(0, 0),
+			vqx_misc_ena.u);
+}
+
+static void cptvf_enable_done_interrupts(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_done_ena_w1s vqx_done_ena;
+
+	vqx_done_ena.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_DONE_ENA_W1S(0, 0));
+	/* Set DONE interrupt for the requested vf */
+	vqx_done_ena.s.done = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_DONE_ENA_W1S(0, 0),
+			vqx_done_ena.u);
+}
+
+static void cptvf_clear_dovf_intr(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_int vqx_misc_int;
+
+	vqx_misc_int.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_INT(0, 0));
+	/* W1C for the VF */
+	vqx_misc_int.s.dovf = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_MISC_INT(0, 0),
+			vqx_misc_int.u);
+}
+
+static void cptvf_clear_irde_intr(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_int vqx_misc_int;
+
+	vqx_misc_int.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_INT(0, 0));
+	/* W1C for the VF */
+	vqx_misc_int.s.irde = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_MISC_INT(0, 0),
+			vqx_misc_int.u);
+}
+
+static void cptvf_clear_nwrp_intr(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_int vqx_misc_int;
+
+	vqx_misc_int.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_INT(0, 0));
+	/* W1C for the VF */
+	vqx_misc_int.s.nwrp = 1;
+	cpt_write_csr64(cptvf->reg_base,
+			CPTX_VQX_MISC_INT(0, 0), vqx_misc_int.u);
+}
+
+static void cptvf_clear_mbox_intr(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_int vqx_misc_int;
+
+	vqx_misc_int.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_INT(0, 0));
+	/* W1C for the VF */
+	vqx_misc_int.s.mbox = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_MISC_INT(0, 0),
+			vqx_misc_int.u);
+}
+
+static void cptvf_clear_swerr_intr(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_misc_int vqx_misc_int;
+
+	vqx_misc_int.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_MISC_INT(0, 0));
+	/* W1C for the VF */
+	vqx_misc_int.s.swerr = 1;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_MISC_INT(0, 0),
+			vqx_misc_int.u);
+}
+
+static u64 cptvf_read_vf_misc_intr_status(struct cpt_vf *cptvf)
+{
+	return cpt_read_csr64(cptvf->reg_base, CPTX_VQX_MISC_INT(0, 0));
+}
+
+static irqreturn_t cptvf_misc_intr_handler(int irq, void *cptvf_irq)
+{
+	struct cpt_vf *cptvf = (struct cpt_vf *)cptvf_irq;
+	struct pci_dev *pdev = cptvf->pdev;
+	u64 intr;
+
+	intr = cptvf_read_vf_misc_intr_status(cptvf);
+	/*Check for MISC interrupt types*/
+	if (likely(intr & CPT_VF_INTR_MBOX_MASK)) {
+		dev_err(&pdev->dev, "Mailbox interrupt 0x%llx on CPT VF %d\n",
+			intr, cptvf->vfid);
+		cptvf_handle_mbox_intr(cptvf);
+		cptvf_clear_mbox_intr(cptvf);
+	} else if (unlikely(intr & CPT_VF_INTR_DOVF_MASK)) {
+		cptvf_clear_dovf_intr(cptvf);
+		/*Clear doorbell count*/
+		cptvf_write_vq_doorbell(cptvf, 0);
+		dev_err(&pdev->dev, "Doorbell overflow error interrupt 0x%llx on CPT VF %d\n",
+			intr, cptvf->vfid);
+	} else if (unlikely(intr & CPT_VF_INTR_IRDE_MASK)) {
+		cptvf_clear_irde_intr(cptvf);
+		dev_err(&pdev->dev, "Instruction NCB read error interrupt 0x%llx on CPT VF %d\n",
+			intr, cptvf->vfid);
+	} else if (unlikely(intr & CPT_VF_INTR_NWRP_MASK)) {
+		cptvf_clear_nwrp_intr(cptvf);
+		dev_err(&pdev->dev, "NCB response write error interrupt 0x%llx on CPT VF %d\n",
+			intr, cptvf->vfid);
+	} else if (unlikely(intr & CPT_VF_INTR_SERR_MASK)) {
+		cptvf_clear_swerr_intr(cptvf);
+		dev_err(&pdev->dev, "Software error interrupt 0x%llx on CPT VF %d\n",
+			intr, cptvf->vfid);
+	} else {
+		dev_err(&pdev->dev, "Unhandled interrupt in CPT VF %d\n",
+			cptvf->vfid);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static inline struct cptvf_wqe *get_cptvf_vq_wqe(struct cpt_vf *cptvf,
+						 int qno)
+{
+	struct cptvf_wqe_info *nwqe_info;
+
+	if (unlikely(qno >= cptvf->nr_queues))
+		return NULL;
+	nwqe_info = (struct cptvf_wqe_info *)cptvf->wqe_info;
+
+	return &nwqe_info->vq_wqe[qno];
+}
+
+static inline u32 cptvf_read_vq_done_count(struct cpt_vf *cptvf)
+{
+	union cptx_vqx_done vqx_done;
+
+	vqx_done.u = cpt_read_csr64(cptvf->reg_base, CPTX_VQX_DONE(0, 0));
+	return vqx_done.s.done;
+}
+
+static inline void cptvf_write_vq_done_ack(struct cpt_vf *cptvf,
+					   u32 ackcnt)
+{
+	union cptx_vqx_done_ack vqx_dack_cnt;
+
+	vqx_dack_cnt.u = cpt_read_csr64(cptvf->reg_base,
+					CPTX_VQX_DONE_ACK(0, 0));
+	vqx_dack_cnt.s.done_ack = ackcnt;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_DONE_ACK(0, 0),
+			vqx_dack_cnt.u);
+}
+
+static irqreturn_t cptvf_done_intr_handler(int irq, void *cptvf_irq)
+{
+	struct cpt_vf *cptvf = (struct cpt_vf *)cptvf_irq;
+	struct pci_dev *pdev = cptvf->pdev;
+	/* Read the number of completions */
+	u32 intr = cptvf_read_vq_done_count(cptvf);
+
+	if (intr) {
+		struct cptvf_wqe *wqe;
+
+		/* Acknowledge the number of
+		 * scheduled completions for processing
+		 */
+		cptvf_write_vq_done_ack(cptvf, intr);
+		wqe = get_cptvf_vq_wqe(cptvf, 0);
+		if (unlikely(!wqe)) {
+			dev_err(&pdev->dev, "No work to schedule for VF (%d)",
+				cptvf->vfid);
+			return IRQ_NONE;
+		}
+		tasklet_hi_schedule(&wqe->twork);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void cptvf_set_irq_affinity(struct cpt_vf *cptvf, int vec)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&cptvf->affinity_mask[vec],
+				GFP_KERNEL)) {
+		dev_err(&pdev->dev, "Allocation failed for affinity_mask for VF %d",
+			cptvf->vfid);
+		return;
+	}
+
+	cpu = cptvf->vfid % num_online_cpus();
+	cpumask_set_cpu(cpumask_local_spread(cpu, cptvf->node),
+			cptvf->affinity_mask[vec]);
+	irq_set_affinity_hint(pci_irq_vector(pdev, vec),
+			cptvf->affinity_mask[vec]);
+}
+
+static void cptvf_write_vq_saddr(struct cpt_vf *cptvf, u64 val)
+{
+	union cptx_vqx_saddr vqx_saddr;
+
+	vqx_saddr.u = val;
+	cpt_write_csr64(cptvf->reg_base, CPTX_VQX_SADDR(0, 0), vqx_saddr.u);
+}
+
+void cptvf_device_init(struct cpt_vf *cptvf)
+{
+	u64 base_addr = 0;
+
+	/* Disable the VQ */
+	cptvf_write_vq_ctl(cptvf, 0);
+	/* Reset the doorbell */
+	cptvf_write_vq_doorbell(cptvf, 0);
+	/* Clear inflight */
+	cptvf_write_vq_inprog(cptvf, 0);
+	/* Write VQ SADDR */
+	/* TODO: for now only one queue, so hard coded */
+	base_addr = (u64)(cptvf->cqinfo.queue[0].qhead->dma_addr);
+	cptvf_write_vq_saddr(cptvf, base_addr);
+	/* Configure timerhold / coalescence */
+	cptvf_write_vq_done_timewait(cptvf, CPT_TIMER_THOLD);
+	cptvf_write_vq_done_numwait(cptvf, 1);
+	/* Enable the VQ */
+	cptvf_write_vq_ctl(cptvf, 1);
+	/* Flag the VF ready */
+	cptvf->flags |= CPT_FLAG_DEVICE_READY;
+}
+
+static int cptvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct device *dev = &pdev->dev;
+	struct cpt_vf *cptvf;
+	int    err;
+
+	cptvf = devm_kzalloc(dev, sizeof(*cptvf), GFP_KERNEL);
+	if (!cptvf)
+		return -ENOMEM;
+
+	pci_set_drvdata(pdev, cptvf);
+	cptvf->pdev = pdev;
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(dev, "Failed to enable PCI device\n");
+		pci_set_drvdata(pdev, NULL);
+		return err;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(dev, "PCI request regions failed 0x%x\n", err);
+		goto cptvf_err_disable_device;
+	}
+	/* Mark as VF driver */
+	cptvf->flags |= CPT_FLAG_VF_DRIVER;
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(48));
+	if (err) {
+		dev_err(dev, "Unable to get usable DMA configuration\n");
+		goto cptvf_err_release_regions;
+	}
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(48));
+	if (err) {
+		dev_err(dev, "Unable to get 48-bit DMA for consistent allocations\n");
+		goto cptvf_err_release_regions;
+	}
+
+	/* MAP PF's configuration registers */
+	cptvf->reg_base = pcim_iomap(pdev, 0, 0);
+	if (!cptvf->reg_base) {
+		dev_err(dev, "Cannot map config register space, aborting\n");
+		err = -ENOMEM;
+		goto cptvf_err_release_regions;
+	}
+
+	cptvf->node = dev_to_node(&pdev->dev);
+	err = pci_alloc_irq_vectors(pdev, CPT_VF_MSIX_VECTORS,
+			CPT_VF_MSIX_VECTORS, PCI_IRQ_MSIX);
+	if (err < 0) {
+		dev_err(dev, "Request for #%d msix vectors failed\n",
+			CPT_VF_MSIX_VECTORS);
+		goto cptvf_err_release_regions;
+	}
+
+	err = request_irq(pci_irq_vector(pdev, CPT_VF_INT_VEC_E_MISC),
+			  cptvf_misc_intr_handler, 0, "CPT VF misc intr",
+			  cptvf);
+	if (err) {
+		dev_err(dev, "Request misc irq failed");
+		goto cptvf_free_vectors;
+	}
+
+	/* Enable mailbox interrupt */
+	cptvf_enable_mbox_interrupts(cptvf);
+	cptvf_enable_swerr_interrupts(cptvf);
+
+	/* Check ready with PF */
+	/* Gets chip ID / device Id from PF if ready */
+	err = cptvf_check_pf_ready(cptvf);
+	if (err) {
+		dev_err(dev, "PF not responding to READY msg");
+		goto cptvf_free_misc_irq;
+	}
+
+	/* CPT VF software resources initialization */
+	cptvf->cqinfo.qchunksize = CPT_CMD_QCHUNK_SIZE;
+	err = cptvf_sw_init(cptvf, CPT_CMD_QLEN, CPT_NUM_QS_PER_VF);
+	if (err) {
+		dev_err(dev, "cptvf_sw_init() failed");
+		goto cptvf_free_misc_irq;
+	}
+	/* Convey VQ LEN to PF */
+	err = cptvf_send_vq_size_msg(cptvf);
+	if (err) {
+		dev_err(dev, "PF not responding to QLEN msg");
+		goto cptvf_free_misc_irq;
+	}
+
+	/* CPT VF device initialization */
+	cptvf_device_init(cptvf);
+	/* Send msg to PF to assign currnet Q to required group */
+	cptvf->vfgrp = 1;
+	err = cptvf_send_vf_to_grp_msg(cptvf);
+	if (err) {
+		dev_err(dev, "PF not responding to VF_GRP msg");
+		goto cptvf_free_misc_irq;
+	}
+
+	cptvf->priority = 1;
+	err = cptvf_send_vf_priority_msg(cptvf);
+	if (err) {
+		dev_err(dev, "PF not responding to VF_PRIO msg");
+		goto cptvf_free_misc_irq;
+	}
+
+	err = request_irq(pci_irq_vector(pdev, CPT_VF_INT_VEC_E_DONE),
+			  cptvf_done_intr_handler, 0, "CPT VF done intr",
+			  cptvf);
+	if (err) {
+		dev_err(dev, "Request done irq failed\n");
+		goto cptvf_free_misc_irq;
+	}
+
+	/* Enable mailbox interrupt */
+	cptvf_enable_done_interrupts(cptvf);
+
+	/* Set irq affinity masks */
+	cptvf_set_irq_affinity(cptvf, CPT_VF_INT_VEC_E_MISC);
+	cptvf_set_irq_affinity(cptvf, CPT_VF_INT_VEC_E_DONE);
+
+	err = cptvf_send_vf_up(cptvf);
+	if (err) {
+		dev_err(dev, "PF not responding to UP msg");
+		goto cptvf_free_irq_affinity;
+	}
+	err = cvm_crypto_init(cptvf);
+	if (err) {
+		dev_err(dev, "Algorithm register failed\n");
+		goto cptvf_free_irq_affinity;
+	}
+	return 0;
+
+cptvf_free_irq_affinity:
+	cptvf_free_irq_affinity(cptvf, CPT_VF_INT_VEC_E_DONE);
+	cptvf_free_irq_affinity(cptvf, CPT_VF_INT_VEC_E_MISC);
+cptvf_free_misc_irq:
+	free_irq(pci_irq_vector(pdev, CPT_VF_INT_VEC_E_MISC), cptvf);
+cptvf_free_vectors:
+	pci_free_irq_vectors(cptvf->pdev);
+cptvf_err_release_regions:
+	pci_release_regions(pdev);
+cptvf_err_disable_device:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+
+	return err;
+}
+
+static void cptvf_remove(struct pci_dev *pdev)
+{
+	struct cpt_vf *cptvf = pci_get_drvdata(pdev);
+
+	if (!cptvf)
+		dev_err(&pdev->dev, "Invalid CPT-VF device\n");
+
+	/* Convey DOWN to PF */
+	if (cptvf_send_vf_down(cptvf)) {
+		dev_err(&pdev->dev, "PF not responding to DOWN msg");
+	} else {
+		cptvf_free_irq_affinity(cptvf, CPT_VF_INT_VEC_E_DONE);
+		cptvf_free_irq_affinity(cptvf, CPT_VF_INT_VEC_E_MISC);
+		free_irq(pci_irq_vector(pdev, CPT_VF_INT_VEC_E_DONE), cptvf);
+		free_irq(pci_irq_vector(pdev, CPT_VF_INT_VEC_E_MISC), cptvf);
+		pci_free_irq_vectors(cptvf->pdev);
+		cptvf_sw_cleanup(cptvf);
+		pci_set_drvdata(pdev, NULL);
+		pci_release_regions(pdev);
+		pci_disable_device(pdev);
+		cvm_crypto_exit();
+	}
+}
+
+static void cptvf_shutdown(struct pci_dev *pdev)
+{
+	cptvf_remove(pdev);
+}
+
+/* Supported devices */
+static const struct pci_device_id cptvf_id_table[] = {
+	{PCI_VDEVICE(CAVIUM, CPT_81XX_PCI_VF_DEVICE_ID), 0},
+	{ 0, }  /* end of table */
+};
+
+static struct pci_driver cptvf_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = cptvf_id_table,
+	.probe = cptvf_probe,
+	.remove = cptvf_remove,
+	.shutdown = cptvf_shutdown,
+};
+
+module_pci_driver(cptvf_pci_driver);
+
+MODULE_AUTHOR("George Cherian <george.cherian@cavium.com>");
+MODULE_DESCRIPTION("Cavium Thunder CPT Virtual Function Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(DRV_VERSION);
+MODULE_DEVICE_TABLE(pci, cptvf_id_table);
diff --git a/drivers/crypto/cavium/cpt/cptvf_mbox.c b/drivers/crypto/cavium/cpt/cptvf_mbox.c
new file mode 100644
index 000000000000..d5ec3b8a9e61
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptvf_mbox.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include "cptvf.h"
+
+static void cptvf_send_msg_to_pf(struct cpt_vf *cptvf, struct cpt_mbox *mbx)
+{
+	/* Writing mbox(1) causes interrupt */
+	cpt_write_csr64(cptvf->reg_base, CPTX_VFX_PF_MBOXX(0, 0, 0),
+			mbx->msg);
+	cpt_write_csr64(cptvf->reg_base, CPTX_VFX_PF_MBOXX(0, 0, 1),
+			mbx->data);
+}
+
+/* ACKs PF's mailbox message
+ */
+void cptvf_mbox_send_ack(struct cpt_vf *cptvf, struct cpt_mbox *mbx)
+{
+	mbx->msg = CPT_MBOX_MSG_TYPE_ACK;
+	cptvf_send_msg_to_pf(cptvf, mbx);
+}
+
+/* NACKs PF's mailbox message that VF is not able to
+ * complete the action
+ */
+void cptvf_mbox_send_nack(struct cpt_vf *cptvf, struct cpt_mbox *mbx)
+{
+	mbx->msg = CPT_MBOX_MSG_TYPE_NACK;
+	cptvf_send_msg_to_pf(cptvf, mbx);
+}
+
+/* Interrupt handler to handle mailbox messages from VFs */
+void cptvf_handle_mbox_intr(struct cpt_vf *cptvf)
+{
+	struct cpt_mbox mbx = {};
+
+	/*
+	 * MBOX[0] contains msg
+	 * MBOX[1] contains data
+	 */
+	mbx.msg  = cpt_read_csr64(cptvf->reg_base, CPTX_VFX_PF_MBOXX(0, 0, 0));
+	mbx.data = cpt_read_csr64(cptvf->reg_base, CPTX_VFX_PF_MBOXX(0, 0, 1));
+	dev_dbg(&cptvf->pdev->dev, "%s: Mailbox msg 0x%llx from PF\n",
+		__func__, mbx.msg);
+	switch (mbx.msg) {
+	case CPT_MSG_READY:
+	{
+		cptvf->pf_acked = true;
+		cptvf->vfid = mbx.data;
+		dev_dbg(&cptvf->pdev->dev, "Received VFID %d\n", cptvf->vfid);
+		break;
+	}
+	case CPT_MSG_QBIND_GRP:
+		cptvf->pf_acked = true;
+		cptvf->vftype = mbx.data;
+		dev_dbg(&cptvf->pdev->dev, "VF %d type %s group %d\n",
+			cptvf->vfid, ((mbx.data == SE_TYPES) ? "SE" : "AE"),
+			cptvf->vfgrp);
+		break;
+	case CPT_MBOX_MSG_TYPE_ACK:
+		cptvf->pf_acked = true;
+		break;
+	case CPT_MBOX_MSG_TYPE_NACK:
+		cptvf->pf_nacked = true;
+		break;
+	default:
+		dev_err(&cptvf->pdev->dev, "Invalid msg from PF, msg 0x%llx\n",
+			mbx.msg);
+		break;
+	}
+}
+
+static int cptvf_send_msg_to_pf_timeout(struct cpt_vf *cptvf,
+					struct cpt_mbox *mbx)
+{
+	int timeout = CPT_MBOX_MSG_TIMEOUT;
+	int sleep = 10;
+
+	cptvf->pf_acked = false;
+	cptvf->pf_nacked = false;
+	cptvf_send_msg_to_pf(cptvf, mbx);
+	/* Wait for previous message to be acked, timeout 2sec */
+	while (!cptvf->pf_acked) {
+		if (cptvf->pf_nacked)
+			return -EINVAL;
+		msleep(sleep);
+		if (cptvf->pf_acked)
+			break;
+		timeout -= sleep;
+		if (!timeout) {
+			dev_err(&cptvf->pdev->dev, "PF didn't ack to mbox msg %llx from VF%u\n",
+				(mbx->msg & 0xFF), cptvf->vfid);
+			return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Checks if VF is able to comminicate with PF
+ * and also gets the CPT number this VF is associated to.
+ */
+int cptvf_check_pf_ready(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_mbox mbx = {};
+
+	mbx.msg = CPT_MSG_READY;
+	if (cptvf_send_msg_to_pf_timeout(cptvf, &mbx)) {
+		dev_err(&pdev->dev, "PF didn't respond to READY msg\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * Communicate VQs size to PF to program CPT(0)_PF_Q(0-15)_CTL of the VF.
+ * Must be ACKed.
+ */
+int cptvf_send_vq_size_msg(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_mbox mbx = {};
+
+	mbx.msg = CPT_MSG_QLEN;
+	mbx.data = cptvf->qsize;
+	if (cptvf_send_msg_to_pf_timeout(cptvf, &mbx)) {
+		dev_err(&pdev->dev, "PF didn't respond to vq_size msg\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * Communicate VF group required to PF and get the VQ binded to that group
+ */
+int cptvf_send_vf_to_grp_msg(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_mbox mbx = {};
+
+	mbx.msg = CPT_MSG_QBIND_GRP;
+	/* Convey group of the VF */
+	mbx.data = cptvf->vfgrp;
+	if (cptvf_send_msg_to_pf_timeout(cptvf, &mbx)) {
+		dev_err(&pdev->dev, "PF didn't respond to vf_type msg\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * Communicate VF group required to PF and get the VQ binded to that group
+ */
+int cptvf_send_vf_priority_msg(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_mbox mbx = {};
+
+	mbx.msg = CPT_MSG_VQ_PRIORITY;
+	/* Convey group of the VF */
+	mbx.data = cptvf->priority;
+	if (cptvf_send_msg_to_pf_timeout(cptvf, &mbx)) {
+		dev_err(&pdev->dev, "PF didn't respond to vf_type msg\n");
+		return -EBUSY;
+	}
+	return 0;
+}
+
+/*
+ * Communicate to PF that VF is UP and running
+ */
+int cptvf_send_vf_up(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_mbox mbx = {};
+
+	mbx.msg = CPT_MSG_VF_UP;
+	if (cptvf_send_msg_to_pf_timeout(cptvf, &mbx)) {
+		dev_err(&pdev->dev, "PF didn't respond to UP msg\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * Communicate to PF that VF is DOWN and running
+ */
+int cptvf_send_vf_down(struct cpt_vf *cptvf)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_mbox mbx = {};
+
+	mbx.msg = CPT_MSG_VF_DOWN;
+	if (cptvf_send_msg_to_pf_timeout(cptvf, &mbx)) {
+		dev_err(&pdev->dev, "PF didn't respond to DOWN msg\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
diff --git a/drivers/crypto/cavium/cpt/cptvf_reqmanager.c b/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
new file mode 100644
index 000000000000..7f57f30f8863
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#include "cptvf.h"
+#include "request_manager.h"
+
+/**
+ * get_free_pending_entry - get free entry from pending queue
+ * @param pqinfo: pending_qinfo structure
+ * @param qno: queue number
+ */
+static struct pending_entry *get_free_pending_entry(struct pending_queue *q,
+						    int qlen)
+{
+	struct pending_entry *ent = NULL;
+
+	ent = &q->head[q->rear];
+	if (unlikely(ent->busy)) {
+		ent = NULL;
+		goto no_free_entry;
+	}
+
+	q->rear++;
+	if (unlikely(q->rear == qlen))
+		q->rear = 0;
+
+no_free_entry:
+	return ent;
+}
+
+static inline void pending_queue_inc_front(struct pending_qinfo *pqinfo,
+					   int qno)
+{
+	struct pending_queue *queue = &pqinfo->queue[qno];
+
+	queue->front++;
+	if (unlikely(queue->front == pqinfo->qlen))
+		queue->front = 0;
+}
+
+static int setup_sgio_components(struct cpt_vf *cptvf, struct buf_ptr *list,
+				 int buf_count, u8 *buffer)
+{
+	int ret = 0, i, j;
+	int components;
+	struct sglist_component *sg_ptr = NULL;
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (unlikely(!list)) {
+		dev_err(&pdev->dev, "Input List pointer is NULL\n");
+		return -EFAULT;
+	}
+
+	for (i = 0; i < buf_count; i++) {
+		if (likely(list[i].vptr)) {
+			list[i].dma_addr = dma_map_single(&pdev->dev,
+							  list[i].vptr,
+							  list[i].size,
+							  DMA_BIDIRECTIONAL);
+			if (unlikely(dma_mapping_error(&pdev->dev,
+						       list[i].dma_addr))) {
+				dev_err(&pdev->dev, "DMA map kernel buffer failed for component: %d\n",
+					i);
+				ret = -EIO;
+				goto sg_cleanup;
+			}
+		}
+	}
+
+	components = buf_count / 4;
+	sg_ptr = (struct sglist_component *)buffer;
+	for (i = 0; i < components; i++) {
+		sg_ptr->u.s.len0 = cpu_to_be16(list[i * 4 + 0].size);
+		sg_ptr->u.s.len1 = cpu_to_be16(list[i * 4 + 1].size);
+		sg_ptr->u.s.len2 = cpu_to_be16(list[i * 4 + 2].size);
+		sg_ptr->u.s.len3 = cpu_to_be16(list[i * 4 + 3].size);
+		sg_ptr->ptr0 = cpu_to_be64(list[i * 4 + 0].dma_addr);
+		sg_ptr->ptr1 = cpu_to_be64(list[i * 4 + 1].dma_addr);
+		sg_ptr->ptr2 = cpu_to_be64(list[i * 4 + 2].dma_addr);
+		sg_ptr->ptr3 = cpu_to_be64(list[i * 4 + 3].dma_addr);
+		sg_ptr++;
+	}
+
+	components = buf_count % 4;
+
+	switch (components) {
+	case 3:
+		sg_ptr->u.s.len2 = cpu_to_be16(list[i * 4 + 2].size);
+		sg_ptr->ptr2 = cpu_to_be64(list[i * 4 + 2].dma_addr);
+		/* Fall through */
+	case 2:
+		sg_ptr->u.s.len1 = cpu_to_be16(list[i * 4 + 1].size);
+		sg_ptr->ptr1 = cpu_to_be64(list[i * 4 + 1].dma_addr);
+		/* Fall through */
+	case 1:
+		sg_ptr->u.s.len0 = cpu_to_be16(list[i * 4 + 0].size);
+		sg_ptr->ptr0 = cpu_to_be64(list[i * 4 + 0].dma_addr);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+
+sg_cleanup:
+	for (j = 0; j < i; j++) {
+		if (list[j].dma_addr) {
+			dma_unmap_single(&pdev->dev, list[i].dma_addr,
+					 list[i].size, DMA_BIDIRECTIONAL);
+		}
+
+		list[j].dma_addr = 0;
+	}
+
+	return ret;
+}
+
+static inline int setup_sgio_list(struct cpt_vf *cptvf,
+				  struct cpt_info_buffer *info,
+				  struct cpt_request_info *req)
+{
+	u16 g_sz_bytes = 0, s_sz_bytes = 0;
+	int ret = 0;
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (req->incnt > MAX_SG_IN_CNT || req->outcnt > MAX_SG_OUT_CNT) {
+		dev_err(&pdev->dev, "Request SG components are higher than supported\n");
+		ret = -EINVAL;
+		goto  scatter_gather_clean;
+	}
+
+	/* Setup gather (input) components */
+	g_sz_bytes = ((req->incnt + 3) / 4) * sizeof(struct sglist_component);
+	info->gather_components = kzalloc(g_sz_bytes, GFP_KERNEL);
+	if (!info->gather_components) {
+		ret = -ENOMEM;
+		goto  scatter_gather_clean;
+	}
+
+	ret = setup_sgio_components(cptvf, req->in,
+				    req->incnt,
+				    info->gather_components);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to setup gather list\n");
+		ret = -EFAULT;
+		goto  scatter_gather_clean;
+	}
+
+	/* Setup scatter (output) components */
+	s_sz_bytes = ((req->outcnt + 3) / 4) * sizeof(struct sglist_component);
+	info->scatter_components = kzalloc(s_sz_bytes, GFP_KERNEL);
+	if (!info->scatter_components) {
+		ret = -ENOMEM;
+		goto  scatter_gather_clean;
+	}
+
+	ret = setup_sgio_components(cptvf, req->out,
+				    req->outcnt,
+				    info->scatter_components);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to setup gather list\n");
+		ret = -EFAULT;
+		goto  scatter_gather_clean;
+	}
+
+	/* Create and initialize DPTR */
+	info->dlen = g_sz_bytes + s_sz_bytes + SG_LIST_HDR_SIZE;
+	info->in_buffer = kzalloc(info->dlen, GFP_KERNEL);
+	if (!info->in_buffer) {
+		ret = -ENOMEM;
+		goto  scatter_gather_clean;
+	}
+
+	((u16 *)info->in_buffer)[0] = req->outcnt;
+	((u16 *)info->in_buffer)[1] = req->incnt;
+	((u16 *)info->in_buffer)[2] = 0;
+	((u16 *)info->in_buffer)[3] = 0;
+	*(u64 *)info->in_buffer = cpu_to_be64p((u64 *)info->in_buffer);
+
+	memcpy(&info->in_buffer[8], info->gather_components,
+	       g_sz_bytes);
+	memcpy(&info->in_buffer[8 + g_sz_bytes],
+	       info->scatter_components, s_sz_bytes);
+
+	info->dptr_baddr = dma_map_single(&pdev->dev,
+					  (void *)info->in_buffer,
+					  info->dlen,
+					  DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&pdev->dev, info->dptr_baddr)) {
+		dev_err(&pdev->dev, "Mapping DPTR Failed %d\n", info->dlen);
+		ret = -EIO;
+		goto  scatter_gather_clean;
+	}
+
+	/* Create and initialize RPTR */
+	info->out_buffer = kzalloc(COMPLETION_CODE_SIZE, GFP_KERNEL);
+	if (!info->out_buffer) {
+		ret = -ENOMEM;
+		goto scatter_gather_clean;
+	}
+
+	*((u64 *)info->out_buffer) = ~((u64)COMPLETION_CODE_INIT);
+	info->alternate_caddr = (u64 *)info->out_buffer;
+	info->rptr_baddr = dma_map_single(&pdev->dev,
+					  (void *)info->out_buffer,
+					  COMPLETION_CODE_SIZE,
+					  DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&pdev->dev, info->rptr_baddr)) {
+		dev_err(&pdev->dev, "Mapping RPTR Failed %d\n",
+			COMPLETION_CODE_SIZE);
+		ret = -EIO;
+		goto  scatter_gather_clean;
+	}
+
+	return 0;
+
+scatter_gather_clean:
+	return ret;
+}
+
+int send_cpt_command(struct cpt_vf *cptvf, union cpt_inst_s *cmd,
+		     u32 qno)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct command_qinfo *qinfo = NULL;
+	struct command_queue *queue;
+	struct command_chunk *chunk;
+	u8 *ent;
+	int ret = 0;
+
+	if (unlikely(qno >= cptvf->nr_queues)) {
+		dev_err(&pdev->dev, "Invalid queue (qno: %d, nr_queues: %d)\n",
+			qno, cptvf->nr_queues);
+		return -EINVAL;
+	}
+
+	qinfo = &cptvf->cqinfo;
+	queue = &qinfo->queue[qno];
+	/* lock commad queue */
+	spin_lock(&queue->lock);
+	ent = &queue->qhead->head[queue->idx * qinfo->cmd_size];
+	memcpy(ent, (void *)cmd, qinfo->cmd_size);
+
+	if (++queue->idx >= queue->qhead->size / 64) {
+		struct hlist_node *node;
+
+		hlist_for_each(node, &queue->chead) {
+			chunk = hlist_entry(node, struct command_chunk,
+					    nextchunk);
+			if (chunk == queue->qhead) {
+				continue;
+			} else {
+				queue->qhead = chunk;
+				break;
+			}
+		}
+		queue->idx = 0;
+	}
+	/* make sure all memory stores are done before ringing doorbell */
+	smp_wmb();
+	cptvf_write_vq_doorbell(cptvf, 1);
+	/* unlock command queue */
+	spin_unlock(&queue->lock);
+
+	return ret;
+}
+
+void do_request_cleanup(struct cpt_vf *cptvf,
+			struct cpt_info_buffer *info)
+{
+	int i;
+	struct pci_dev *pdev = cptvf->pdev;
+	struct cpt_request_info *req;
+
+	if (info->dptr_baddr)
+		dma_unmap_single(&pdev->dev, info->dptr_baddr,
+				 info->dlen, DMA_BIDIRECTIONAL);
+
+	if (info->rptr_baddr)
+		dma_unmap_single(&pdev->dev, info->rptr_baddr,
+				 COMPLETION_CODE_SIZE, DMA_BIDIRECTIONAL);
+
+	if (info->comp_baddr)
+		dma_unmap_single(&pdev->dev, info->comp_baddr,
+				 sizeof(union cpt_res_s), DMA_BIDIRECTIONAL);
+
+	if (info->req) {
+		req = info->req;
+		for (i = 0; i < req->outcnt; i++) {
+			if (req->out[i].dma_addr)
+				dma_unmap_single(&pdev->dev,
+						 req->out[i].dma_addr,
+						 req->out[i].size,
+						 DMA_BIDIRECTIONAL);
+		}
+
+		for (i = 0; i < req->incnt; i++) {
+			if (req->in[i].dma_addr)
+				dma_unmap_single(&pdev->dev,
+						 req->in[i].dma_addr,
+						 req->in[i].size,
+						 DMA_BIDIRECTIONAL);
+		}
+	}
+
+	if (info->scatter_components)
+		kzfree(info->scatter_components);
+
+	if (info->gather_components)
+		kzfree(info->gather_components);
+
+	if (info->out_buffer)
+		kzfree(info->out_buffer);
+
+	if (info->in_buffer)
+		kzfree(info->in_buffer);
+
+	if (info->completion_addr)
+		kzfree((void *)info->completion_addr);
+
+	kzfree(info);
+}
+
+void do_post_process(struct cpt_vf *cptvf, struct cpt_info_buffer *info)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (!info || !cptvf) {
+		dev_err(&pdev->dev, "Input params are incorrect for post processing\n");
+		return;
+	}
+
+	do_request_cleanup(cptvf, info);
+}
+
+static inline void process_pending_queue(struct cpt_vf *cptvf,
+					 struct pending_qinfo *pqinfo,
+					 int qno)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+	struct pending_queue *pqueue = &pqinfo->queue[qno];
+	struct pending_entry *pentry = NULL;
+	struct cpt_info_buffer *info = NULL;
+	union cpt_res_s *status = NULL;
+	unsigned char ccode;
+
+	while (1) {
+		spin_lock_bh(&pqueue->lock);
+		pentry = &pqueue->head[pqueue->front];
+		if (unlikely(!pentry->busy)) {
+			spin_unlock_bh(&pqueue->lock);
+			break;
+		}
+
+		info = (struct cpt_info_buffer *)pentry->post_arg;
+		if (unlikely(!info)) {
+			dev_err(&pdev->dev, "Pending Entry post arg NULL\n");
+			pending_queue_inc_front(pqinfo, qno);
+			spin_unlock_bh(&pqueue->lock);
+			continue;
+		}
+
+		status = (union cpt_res_s *)pentry->completion_addr;
+		ccode = status->s.compcode;
+		if ((status->s.compcode == CPT_COMP_E_FAULT) ||
+		    (status->s.compcode == CPT_COMP_E_SWERR)) {
+			dev_err(&pdev->dev, "Request failed with %s\n",
+				(status->s.compcode == CPT_COMP_E_FAULT) ?
+				"DMA Fault" : "Software error");
+			pentry->completion_addr = NULL;
+			pentry->busy = false;
+			atomic64_dec((&pqueue->pending_count));
+			pentry->post_arg = NULL;
+			pending_queue_inc_front(pqinfo, qno);
+			do_request_cleanup(cptvf, info);
+			spin_unlock_bh(&pqueue->lock);
+			break;
+		} else if (status->s.compcode == COMPLETION_CODE_INIT) {
+			/* check for timeout */
+			if (time_after_eq(jiffies,
+					  (info->time_in +
+					  (CPT_COMMAND_TIMEOUT * HZ)))) {
+				dev_err(&pdev->dev, "Request timed out");
+				pentry->completion_addr = NULL;
+				pentry->busy = false;
+				atomic64_dec((&pqueue->pending_count));
+				pentry->post_arg = NULL;
+				pending_queue_inc_front(pqinfo, qno);
+				do_request_cleanup(cptvf, info);
+				spin_unlock_bh(&pqueue->lock);
+				break;
+			} else if ((*info->alternate_caddr ==
+				(~COMPLETION_CODE_INIT)) &&
+				(info->extra_time < TIME_IN_RESET_COUNT)) {
+				info->time_in = jiffies;
+				info->extra_time++;
+				spin_unlock_bh(&pqueue->lock);
+				break;
+			}
+		}
+
+		pentry->completion_addr = NULL;
+		pentry->busy = false;
+		pentry->post_arg = NULL;
+		atomic64_dec((&pqueue->pending_count));
+		pending_queue_inc_front(pqinfo, qno);
+		spin_unlock_bh(&pqueue->lock);
+
+		do_post_process(info->cptvf, info);
+		/*
+		 * Calling callback after we find
+		 * that the request has been serviced
+		 */
+		pentry->callback(ccode, pentry->callback_arg);
+	}
+}
+
+int process_request(struct cpt_vf *cptvf, struct cpt_request_info *req)
+{
+	int ret = 0, clear = 0, queue = 0;
+	struct cpt_info_buffer *info = NULL;
+	struct cptvf_request *cpt_req = NULL;
+	union ctrl_info *ctrl = NULL;
+	union cpt_res_s *result = NULL;
+	struct pending_entry *pentry = NULL;
+	struct pending_queue *pqueue = NULL;
+	struct pci_dev *pdev = cptvf->pdev;
+	u8 group = 0;
+	struct cpt_vq_command vq_cmd;
+	union cpt_inst_s cptinst;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (unlikely(!info)) {
+		dev_err(&pdev->dev, "Unable to allocate memory for info_buffer\n");
+		return -ENOMEM;
+	}
+
+	cpt_req = (struct cptvf_request *)&req->req;
+	ctrl = (union ctrl_info *)&req->ctrl;
+
+	info->cptvf = cptvf;
+	group = ctrl->s.grp;
+	ret = setup_sgio_list(cptvf, info, req);
+	if (ret) {
+		dev_err(&pdev->dev, "Setting up SG list failed");
+		goto request_cleanup;
+	}
+
+	cpt_req->dlen = info->dlen;
+	/*
+	 * Get buffer for union cpt_res_s response
+	 * structure and its physical address
+	 */
+	info->completion_addr = kzalloc(sizeof(union cpt_res_s), GFP_KERNEL);
+	if (unlikely(!info->completion_addr)) {
+		dev_err(&pdev->dev, "Unable to allocate memory for completion_addr\n");
+		return -ENOMEM;
+	}
+
+	result = (union cpt_res_s *)info->completion_addr;
+	result->s.compcode = COMPLETION_CODE_INIT;
+	info->comp_baddr = dma_map_single(&pdev->dev,
+					       (void *)info->completion_addr,
+					       sizeof(union cpt_res_s),
+					       DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&pdev->dev, info->comp_baddr)) {
+		dev_err(&pdev->dev, "mapping compptr Failed %lu\n",
+			sizeof(union cpt_res_s));
+		ret = -EFAULT;
+		goto  request_cleanup;
+	}
+
+	/* Fill the VQ command */
+	vq_cmd.cmd.u64 = 0;
+	vq_cmd.cmd.s.opcode = cpu_to_be16(cpt_req->opcode.flags);
+	vq_cmd.cmd.s.param1 = cpu_to_be16(cpt_req->param1);
+	vq_cmd.cmd.s.param2 = cpu_to_be16(cpt_req->param2);
+	vq_cmd.cmd.s.dlen   = cpu_to_be16(cpt_req->dlen);
+
+	/* 64-bit swap for microcode data reads, not needed for addresses*/
+	vq_cmd.cmd.u64 = cpu_to_be64(vq_cmd.cmd.u64);
+	vq_cmd.dptr = info->dptr_baddr;
+	vq_cmd.rptr = info->rptr_baddr;
+	vq_cmd.cptr.u64 = 0;
+	vq_cmd.cptr.s.grp = group;
+	/* Get Pending Entry to submit command */
+	/* Always queue 0, because 1 queue per VF */
+	queue = 0;
+	pqueue = &cptvf->pqinfo.queue[queue];
+
+	if (atomic64_read(&pqueue->pending_count) > PENDING_THOLD) {
+		dev_err(&pdev->dev, "pending threshold reached\n");
+		process_pending_queue(cptvf, &cptvf->pqinfo, queue);
+	}
+
+get_pending_entry:
+	spin_lock_bh(&pqueue->lock);
+	pentry = get_free_pending_entry(pqueue, cptvf->pqinfo.qlen);
+	if (unlikely(!pentry)) {
+		spin_unlock_bh(&pqueue->lock);
+		if (clear == 0) {
+			process_pending_queue(cptvf, &cptvf->pqinfo, queue);
+			clear = 1;
+			goto get_pending_entry;
+		}
+		dev_err(&pdev->dev, "Get free entry failed\n");
+		dev_err(&pdev->dev, "queue: %d, rear: %d, front: %d\n",
+			queue, pqueue->rear, pqueue->front);
+		ret = -EFAULT;
+		goto request_cleanup;
+	}
+
+	pentry->completion_addr = info->completion_addr;
+	pentry->post_arg = (void *)info;
+	pentry->callback = req->callback;
+	pentry->callback_arg = req->callback_arg;
+	info->pentry = pentry;
+	pentry->busy = true;
+	atomic64_inc(&pqueue->pending_count);
+
+	/* Send CPT command */
+	info->pentry = pentry;
+	info->time_in = jiffies;
+	info->req = req;
+
+	/* Create the CPT_INST_S type command for HW intrepretation */
+	cptinst.s.doneint = true;
+	cptinst.s.res_addr = (u64)info->comp_baddr;
+	cptinst.s.tag = 0;
+	cptinst.s.grp = 0;
+	cptinst.s.wq_ptr = 0;
+	cptinst.s.ei0 = vq_cmd.cmd.u64;
+	cptinst.s.ei1 = vq_cmd.dptr;
+	cptinst.s.ei2 = vq_cmd.rptr;
+	cptinst.s.ei3 = vq_cmd.cptr.u64;
+
+	ret = send_cpt_command(cptvf, &cptinst, queue);
+	spin_unlock_bh(&pqueue->lock);
+	if (unlikely(ret)) {
+		dev_err(&pdev->dev, "Send command failed for AE\n");
+		ret = -EFAULT;
+		goto request_cleanup;
+	}
+
+	return 0;
+
+request_cleanup:
+	dev_dbg(&pdev->dev, "Failed to submit CPT command\n");
+	do_request_cleanup(cptvf, info);
+
+	return ret;
+}
+
+void vq_post_process(struct cpt_vf *cptvf, u32 qno)
+{
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (unlikely(qno > cptvf->nr_queues)) {
+		dev_err(&pdev->dev, "Request for post processing on invalid pending queue: %u\n",
+			qno);
+		return;
+	}
+
+	process_pending_queue(cptvf, &cptvf->pqinfo, qno);
+}
+
+int cptvf_do_request(void *vfdev, struct cpt_request_info *req)
+{
+	struct cpt_vf *cptvf = (struct cpt_vf *)vfdev;
+	struct pci_dev *pdev = cptvf->pdev;
+
+	if (!cpt_device_ready(cptvf)) {
+		dev_err(&pdev->dev, "CPT Device is not ready");
+		return -ENODEV;
+	}
+
+	if ((cptvf->vftype == SE_TYPES) && (!req->ctrl.s.se_req)) {
+		dev_err(&pdev->dev, "CPTVF-%d of SE TYPE got AE request",
+			cptvf->vfid);
+		return -EINVAL;
+	} else if ((cptvf->vftype == AE_TYPES) && (req->ctrl.s.se_req)) {
+		dev_err(&pdev->dev, "CPTVF-%d of AE TYPE got SE request",
+			cptvf->vfid);
+		return -EINVAL;
+	}
+
+	return process_request(cptvf, req);
+}
diff --git a/drivers/crypto/cavium/cpt/request_manager.h b/drivers/crypto/cavium/cpt/request_manager.h
new file mode 100644
index 000000000000..80ee074c6e0c
--- /dev/null
+++ b/drivers/crypto/cavium/cpt/request_manager.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2016 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __REQUEST_MANAGER_H
+#define __REQUEST_MANAGER_H
+
+#include "cpt_common.h"
+
+#define TIME_IN_RESET_COUNT  5
+#define COMPLETION_CODE_SIZE 8
+#define COMPLETION_CODE_INIT 0
+#define PENDING_THOLD  100
+#define MAX_SG_IN_CNT 12
+#define MAX_SG_OUT_CNT 13
+#define SG_LIST_HDR_SIZE  8
+#define MAX_BUF_CNT	16
+
+union ctrl_info {
+	u32 flags;
+	struct {
+#if defined(__BIG_ENDIAN_BITFIELD)
+		u32 reserved0:26;
+		u32 grp:3; /* Group bits */
+		u32 dma_mode:2; /* DMA mode */
+		u32 se_req:1;/* To SE core */
+#else
+		u32 se_req:1; /* To SE core */
+		u32 dma_mode:2; /* DMA mode */
+		u32 grp:3; /* Group bits */
+		u32 reserved0:26;
+#endif
+	} s;
+};
+
+union opcode_info {
+	u16 flags;
+	struct {
+		u8 major;
+		u8 minor;
+	} s;
+};
+
+struct cptvf_request {
+	union opcode_info opcode;
+	u16 param1;
+	u16 param2;
+	u16 dlen;
+};
+
+struct buf_ptr {
+	u8 *vptr;
+	dma_addr_t dma_addr;
+	u16 size;
+};
+
+struct cpt_request_info {
+	u8 incnt; /* Number of input buffers */
+	u8 outcnt; /* Number of output buffers */
+	u16 rlen; /* Output length */
+	union ctrl_info ctrl; /* User control information */
+	struct cptvf_request req; /* Request Information (Core specific) */
+
+	struct buf_ptr in[MAX_BUF_CNT];
+	struct buf_ptr out[MAX_BUF_CNT];
+
+	void (*callback)(int, void *); /* Kernel ASYNC request callabck */
+	void *callback_arg; /* Kernel ASYNC request callabck arg */
+};
+
+struct sglist_component {
+	union {
+		u64 len;
+		struct {
+			u16 len0;
+			u16 len1;
+			u16 len2;
+			u16 len3;
+		} s;
+	} u;
+	u64 ptr0;
+	u64 ptr1;
+	u64 ptr2;
+	u64 ptr3;
+};
+
+struct cpt_info_buffer {
+	struct cpt_vf *cptvf;
+	unsigned long time_in;
+	u8 extra_time;
+
+	struct cpt_request_info *req;
+	dma_addr_t dptr_baddr;
+	u32 dlen;
+	dma_addr_t rptr_baddr;
+	dma_addr_t comp_baddr;
+	u8 *in_buffer;
+	u8 *out_buffer;
+	u8 *gather_components;
+	u8 *scatter_components;
+
+	struct pending_entry *pentry;
+	volatile u64 *completion_addr;
+	volatile u64 *alternate_caddr;
+};
+
+/*
+ * CPT_INST_S software command definitions
+ * Words EI (0-3)
+ */
+union vq_cmd_word0 {
+	u64 u64;
+	struct {
+		u16 opcode;
+		u16 param1;
+		u16 param2;
+		u16 dlen;
+	} s;
+};
+
+union vq_cmd_word3 {
+	u64 u64;
+	struct {
+#if defined(__BIG_ENDIAN_BITFIELD)
+		u64 grp:3;
+		u64 cptr:61;
+#else
+		u64 cptr:61;
+		u64 grp:3;
+#endif
+	} s;
+};
+
+struct cpt_vq_command {
+	union vq_cmd_word0 cmd;
+	u64 dptr;
+	u64 rptr;
+	union vq_cmd_word3 cptr;
+};
+
+void vq_post_process(struct cpt_vf *cptvf, u32 qno);
+int process_request(struct cpt_vf *cptvf, struct cpt_request_info *req);
+#endif /* __REQUEST_MANAGER_H */
diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c
index 612898b4aaad..41cc853f8569 100644
--- a/drivers/crypto/ccp/ccp-dev-v5.c
+++ b/drivers/crypto/ccp/ccp-dev-v5.c
@@ -250,17 +250,20 @@ static int ccp5_do_cmd(struct ccp5_desc *desc,
 		ret = wait_event_interruptible(cmd_q->int_queue,
 					       cmd_q->int_rcvd);
 		if (ret || cmd_q->cmd_error) {
+			/* Log the error and flush the queue by
+			 * moving the head pointer
+			 */
 			if (cmd_q->cmd_error)
 				ccp_log_error(cmd_q->ccp,
 					      cmd_q->cmd_error);
-			/* A version 5 device doesn't use Job IDs... */
+			iowrite32(tail, cmd_q->reg_head_lo);
 			if (!ret)
 				ret = -EIO;
 		}
 		cmd_q->int_rcvd = 0;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int ccp5_perform_aes(struct ccp_op *op)
@@ -284,8 +287,7 @@ static int ccp5_perform_aes(struct ccp_op *op)
 	CCP_AES_ENCRYPT(&function) = op->u.aes.action;
 	CCP_AES_MODE(&function) = op->u.aes.mode;
 	CCP_AES_TYPE(&function) = op->u.aes.type;
-	if (op->u.aes.mode == CCP_AES_MODE_CFB)
-		CCP_AES_SIZE(&function) = 0x7f;
+	CCP_AES_SIZE(&function) = op->u.aes.size;
 
 	CCP5_CMD_FUNCTION(&desc) = function.raw;
 
@@ -532,7 +534,7 @@ static int ccp_find_lsb_regions(struct ccp_cmd_queue *cmd_q, u64 status)
 		status >>= LSB_REGION_WIDTH;
 	}
 	queues = bitmap_weight(cmd_q->lsbmask, MAX_LSB_CNT);
-	dev_info(cmd_q->ccp->dev, "Queue %d can access %d LSB regions\n",
+	dev_dbg(cmd_q->ccp->dev, "Queue %d can access %d LSB regions\n",
 		 cmd_q->id, queues);
 
 	return queues ? 0 : -EINVAL;
@@ -574,7 +576,7 @@ static int ccp_find_and_assign_lsb_to_q(struct ccp_device *ccp,
 					 */
 					cmd_q->lsb = bitno;
 					bitmap_clear(lsb_pub, bitno, 1);
-					dev_info(ccp->dev,
+					dev_dbg(ccp->dev,
 						 "Queue %d gets LSB %d\n",
 						 i, bitno);
 					break;
@@ -732,7 +734,6 @@ static int ccp5_init(struct ccp_device *ccp)
 		ret = -EIO;
 		goto e_pool;
 	}
-	dev_notice(dev, "%u command queues available\n", ccp->cmd_q_count);
 
 	/* Turn off the queues and disable interrupts until ready */
 	for (i = 0; i < ccp->cmd_q_count; i++) {
diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h
index 649e5610a5ce..2b5c01fade05 100644
--- a/drivers/crypto/ccp/ccp-dev.h
+++ b/drivers/crypto/ccp/ccp-dev.h
@@ -467,6 +467,7 @@ struct ccp_aes_op {
 	enum ccp_aes_type type;
 	enum ccp_aes_mode mode;
 	enum ccp_aes_action action;
+	unsigned int size;
 };
 
 struct ccp_xts_aes_op {
diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c
index 50fae4442801..f1396c3aedac 100644
--- a/drivers/crypto/ccp/ccp-ops.c
+++ b/drivers/crypto/ccp/ccp-ops.c
@@ -184,62 +184,46 @@ static void ccp_get_dm_area(struct ccp_dm_workarea *wa, unsigned int wa_offset,
 }
 
 static int ccp_reverse_set_dm_area(struct ccp_dm_workarea *wa,
+				   unsigned int wa_offset,
 				   struct scatterlist *sg,
-				   unsigned int len, unsigned int se_len,
-				   bool sign_extend)
+				   unsigned int sg_offset,
+				   unsigned int len)
 {
-	unsigned int nbytes, sg_offset, dm_offset, sb_len, i;
-	u8 buffer[CCP_REVERSE_BUF_SIZE];
+	u8 *p, *q;
 
-	if (WARN_ON(se_len > sizeof(buffer)))
-		return -EINVAL;
+	ccp_set_dm_area(wa, wa_offset, sg, sg_offset, len);
 
-	sg_offset = len;
-	dm_offset = 0;
-	nbytes = len;
-	while (nbytes) {
-		sb_len = min_t(unsigned int, nbytes, se_len);
-		sg_offset -= sb_len;
-
-		scatterwalk_map_and_copy(buffer, sg, sg_offset, sb_len, 0);
-		for (i = 0; i < sb_len; i++)
-			wa->address[dm_offset + i] = buffer[sb_len - i - 1];
-
-		dm_offset += sb_len;
-		nbytes -= sb_len;
-
-		if ((sb_len != se_len) && sign_extend) {
-			/* Must sign-extend to nearest sign-extend length */
-			if (wa->address[dm_offset - 1] & 0x80)
-				memset(wa->address + dm_offset, 0xff,
-				       se_len - sb_len);
-		}
+	p = wa->address + wa_offset;
+	q = p + len - 1;
+	while (p < q) {
+		*p = *p ^ *q;
+		*q = *p ^ *q;
+		*p = *p ^ *q;
+		p++;
+		q--;
 	}
-
 	return 0;
 }
 
 static void ccp_reverse_get_dm_area(struct ccp_dm_workarea *wa,
+				    unsigned int wa_offset,
 				    struct scatterlist *sg,
+				    unsigned int sg_offset,
 				    unsigned int len)
 {
-	unsigned int nbytes, sg_offset, dm_offset, sb_len, i;
-	u8 buffer[CCP_REVERSE_BUF_SIZE];
+	u8 *p, *q;
 
-	sg_offset = 0;
-	dm_offset = len;
-	nbytes = len;
-	while (nbytes) {
-		sb_len = min_t(unsigned int, nbytes, sizeof(buffer));
-		dm_offset -= sb_len;
-
-		for (i = 0; i < sb_len; i++)
-			buffer[sb_len - i - 1] = wa->address[dm_offset + i];
-		scatterwalk_map_and_copy(buffer, sg, sg_offset, sb_len, 1);
-
-		sg_offset += sb_len;
-		nbytes -= sb_len;
+	p = wa->address + wa_offset;
+	q = p + len - 1;
+	while (p < q) {
+		*p = *p ^ *q;
+		*q = *p ^ *q;
+		*p = *p ^ *q;
+		p++;
+		q--;
 	}
+
+	ccp_get_dm_area(wa, wa_offset, sg, sg_offset, len);
 }
 
 static void ccp_free_data(struct ccp_data *data, struct ccp_cmd_queue *cmd_q)
@@ -692,6 +676,14 @@ static int ccp_run_aes_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 			goto e_ctx;
 		}
 	}
+	switch (aes->mode) {
+	case CCP_AES_MODE_CFB: /* CFB128 only */
+	case CCP_AES_MODE_CTR:
+		op.u.aes.size = AES_BLOCK_SIZE * BITS_PER_BYTE - 1;
+		break;
+	default:
+		op.u.aes.size = 0;
+	}
 
 	/* Prepare the input and output data workareas. For in-place
 	 * operations we need to set the dma direction to BIDIRECTIONAL
@@ -1261,8 +1253,7 @@ static int ccp_run_rsa_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 	if (ret)
 		goto e_sb;
 
-	ret = ccp_reverse_set_dm_area(&exp, rsa->exp, rsa->exp_len,
-				      CCP_SB_BYTES, false);
+	ret = ccp_reverse_set_dm_area(&exp, 0, rsa->exp, 0, rsa->exp_len);
 	if (ret)
 		goto e_exp;
 	ret = ccp_copy_to_sb(cmd_q, &exp, op.jobid, op.sb_key,
@@ -1280,16 +1271,12 @@ static int ccp_run_rsa_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 	if (ret)
 		goto e_exp;
 
-	ret = ccp_reverse_set_dm_area(&src, rsa->mod, rsa->mod_len,
-				      CCP_SB_BYTES, false);
+	ret = ccp_reverse_set_dm_area(&src, 0, rsa->mod, 0, rsa->mod_len);
 	if (ret)
 		goto e_src;
-	src.address += o_len;	/* Adjust the address for the copy operation */
-	ret = ccp_reverse_set_dm_area(&src, rsa->src, rsa->src_len,
-				      CCP_SB_BYTES, false);
+	ret = ccp_reverse_set_dm_area(&src, o_len, rsa->src, 0, rsa->src_len);
 	if (ret)
 		goto e_src;
-	src.address -= o_len;	/* Reset the address to original value */
 
 	/* Prepare the output area for the operation */
 	ret = ccp_init_data(&dst, cmd_q, rsa->dst, rsa->mod_len,
@@ -1314,7 +1301,7 @@ static int ccp_run_rsa_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 		goto e_dst;
 	}
 
-	ccp_reverse_get_dm_area(&dst.dm_wa, rsa->dst, rsa->mod_len);
+	ccp_reverse_get_dm_area(&dst.dm_wa, 0, rsa->dst, 0, rsa->mod_len);
 
 e_dst:
 	ccp_free_data(&dst, cmd_q);
@@ -1566,25 +1553,22 @@ static int ccp_run_ecc_mm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 	save = src.address;
 
 	/* Copy the ECC modulus */
-	ret = ccp_reverse_set_dm_area(&src, ecc->mod, ecc->mod_len,
-				      CCP_ECC_OPERAND_SIZE, false);
+	ret = ccp_reverse_set_dm_area(&src, 0, ecc->mod, 0, ecc->mod_len);
 	if (ret)
 		goto e_src;
 	src.address += CCP_ECC_OPERAND_SIZE;
 
 	/* Copy the first operand */
-	ret = ccp_reverse_set_dm_area(&src, ecc->u.mm.operand_1,
-				      ecc->u.mm.operand_1_len,
-				      CCP_ECC_OPERAND_SIZE, false);
+	ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.mm.operand_1, 0,
+				      ecc->u.mm.operand_1_len);
 	if (ret)
 		goto e_src;
 	src.address += CCP_ECC_OPERAND_SIZE;
 
 	if (ecc->function != CCP_ECC_FUNCTION_MINV_384BIT) {
 		/* Copy the second operand */
-		ret = ccp_reverse_set_dm_area(&src, ecc->u.mm.operand_2,
-					      ecc->u.mm.operand_2_len,
-					      CCP_ECC_OPERAND_SIZE, false);
+		ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.mm.operand_2, 0,
+					      ecc->u.mm.operand_2_len);
 		if (ret)
 			goto e_src;
 		src.address += CCP_ECC_OPERAND_SIZE;
@@ -1623,7 +1607,8 @@ static int ccp_run_ecc_mm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 	}
 
 	/* Save the ECC result */
-	ccp_reverse_get_dm_area(&dst, ecc->u.mm.result, CCP_ECC_MODULUS_BYTES);
+	ccp_reverse_get_dm_area(&dst, 0, ecc->u.mm.result, 0,
+				CCP_ECC_MODULUS_BYTES);
 
 e_dst:
 	ccp_dm_free(&dst);
@@ -1691,22 +1676,19 @@ static int ccp_run_ecc_pm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 	save = src.address;
 
 	/* Copy the ECC modulus */
-	ret = ccp_reverse_set_dm_area(&src, ecc->mod, ecc->mod_len,
-				      CCP_ECC_OPERAND_SIZE, false);
+	ret = ccp_reverse_set_dm_area(&src, 0, ecc->mod, 0, ecc->mod_len);
 	if (ret)
 		goto e_src;
 	src.address += CCP_ECC_OPERAND_SIZE;
 
 	/* Copy the first point X and Y coordinate */
-	ret = ccp_reverse_set_dm_area(&src, ecc->u.pm.point_1.x,
-				      ecc->u.pm.point_1.x_len,
-				      CCP_ECC_OPERAND_SIZE, false);
+	ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.pm.point_1.x, 0,
+				      ecc->u.pm.point_1.x_len);
 	if (ret)
 		goto e_src;
 	src.address += CCP_ECC_OPERAND_SIZE;
-	ret = ccp_reverse_set_dm_area(&src, ecc->u.pm.point_1.y,
-				      ecc->u.pm.point_1.y_len,
-				      CCP_ECC_OPERAND_SIZE, false);
+	ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.pm.point_1.y, 0,
+				      ecc->u.pm.point_1.y_len);
 	if (ret)
 		goto e_src;
 	src.address += CCP_ECC_OPERAND_SIZE;
@@ -1717,15 +1699,13 @@ static int ccp_run_ecc_pm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 
 	if (ecc->function == CCP_ECC_FUNCTION_PADD_384BIT) {
 		/* Copy the second point X and Y coordinate */
-		ret = ccp_reverse_set_dm_area(&src, ecc->u.pm.point_2.x,
-					      ecc->u.pm.point_2.x_len,
-					      CCP_ECC_OPERAND_SIZE, false);
+		ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.pm.point_2.x, 0,
+					      ecc->u.pm.point_2.x_len);
 		if (ret)
 			goto e_src;
 		src.address += CCP_ECC_OPERAND_SIZE;
-		ret = ccp_reverse_set_dm_area(&src, ecc->u.pm.point_2.y,
-					      ecc->u.pm.point_2.y_len,
-					      CCP_ECC_OPERAND_SIZE, false);
+		ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.pm.point_2.y, 0,
+					      ecc->u.pm.point_2.y_len);
 		if (ret)
 			goto e_src;
 		src.address += CCP_ECC_OPERAND_SIZE;
@@ -1735,19 +1715,17 @@ static int ccp_run_ecc_pm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 		src.address += CCP_ECC_OPERAND_SIZE;
 	} else {
 		/* Copy the Domain "a" parameter */
-		ret = ccp_reverse_set_dm_area(&src, ecc->u.pm.domain_a,
-					      ecc->u.pm.domain_a_len,
-					      CCP_ECC_OPERAND_SIZE, false);
+		ret = ccp_reverse_set_dm_area(&src, 0, ecc->u.pm.domain_a, 0,
+					      ecc->u.pm.domain_a_len);
 		if (ret)
 			goto e_src;
 		src.address += CCP_ECC_OPERAND_SIZE;
 
 		if (ecc->function == CCP_ECC_FUNCTION_PMUL_384BIT) {
 			/* Copy the scalar value */
-			ret = ccp_reverse_set_dm_area(&src, ecc->u.pm.scalar,
-						      ecc->u.pm.scalar_len,
-						      CCP_ECC_OPERAND_SIZE,
-						      false);
+			ret = ccp_reverse_set_dm_area(&src, 0,
+						      ecc->u.pm.scalar, 0,
+						      ecc->u.pm.scalar_len);
 			if (ret)
 				goto e_src;
 			src.address += CCP_ECC_OPERAND_SIZE;
@@ -1792,10 +1770,10 @@ static int ccp_run_ecc_pm_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 	save = dst.address;
 
 	/* Save the ECC result X and Y coordinates */
-	ccp_reverse_get_dm_area(&dst, ecc->u.pm.result.x,
+	ccp_reverse_get_dm_area(&dst, 0, ecc->u.pm.result.x, 0,
 				CCP_ECC_MODULUS_BYTES);
 	dst.address += CCP_ECC_OUTPUT_SIZE;
-	ccp_reverse_get_dm_area(&dst, ecc->u.pm.result.y,
+	ccp_reverse_get_dm_area(&dst, 0, ecc->u.pm.result.y, 0,
 				CCP_ECC_MODULUS_BYTES);
 	dst.address += CCP_ECC_OUTPUT_SIZE;
 
diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
old mode 100644
new mode 100755
index b4b78b37f8a6..41bc7f4f58cd
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -171,7 +171,7 @@ int chcr_handle_resp(struct crypto_async_request *req, unsigned char *input,
 		}
 		break;
 
-	case CRYPTO_ALG_TYPE_BLKCIPHER:
+	case CRYPTO_ALG_TYPE_ABLKCIPHER:
 		ctx_req.req.ablk_req = (struct ablkcipher_request *)req;
 		ctx_req.ctx.ablk_ctx =
 			ablkcipher_request_ctx(ctx_req.req.ablk_req);
@@ -542,10 +542,11 @@ static inline void create_wreq(struct chcr_context *ctx,
 				    (calc_tx_flits_ofld(skb) * 8), 16)));
 	chcr_req->wreq.cookie = cpu_to_be64((uintptr_t)req);
 	chcr_req->wreq.rx_chid_to_rx_q_id =
-		FILL_WR_RX_Q_ID(ctx->dev->tx_channel_id, qid,
-				is_iv ? iv_loc : IV_NOP);
+		FILL_WR_RX_Q_ID(ctx->dev->rx_channel_id, qid,
+				is_iv ? iv_loc : IV_NOP, ctx->tx_channel_id);
 
-	chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->dev->tx_channel_id);
+	chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->dev->tx_channel_id,
+						       qid);
 	chcr_req->ulptx.len = htonl((DIV_ROUND_UP((calc_tx_flits_ofld(skb) * 8),
 					16) - ((sizeof(chcr_req->wreq)) >> 4)));
 
@@ -606,7 +607,7 @@ static struct sk_buff
 	chcr_req = (struct chcr_wr *)__skb_put(skb, transhdr_len);
 	memset(chcr_req, 0, transhdr_len);
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(ctx->dev->tx_channel_id, 2, 1);
+		FILL_SEC_CPL_OP_IVINSR(ctx->dev->rx_channel_id, 2, 1);
 
 	chcr_req->sec_cpl.pldlen = htonl(ivsize + req->nbytes);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi =
@@ -782,6 +783,7 @@ static int chcr_device_init(struct chcr_context *ctx)
 		spin_lock(&ctx->dev->lock_chcr_dev);
 		ctx->tx_channel_id = rxq_idx;
 		ctx->dev->tx_channel_id = !ctx->dev->tx_channel_id;
+		ctx->dev->rx_channel_id = 0;
 		spin_unlock(&ctx->dev->lock_chcr_dev);
 	}
 out:
@@ -874,7 +876,7 @@ static struct sk_buff *create_hash_wr(struct ahash_request *req,
 	memset(chcr_req, 0, transhdr_len);
 
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(ctx->dev->tx_channel_id, 2, 0);
+		FILL_SEC_CPL_OP_IVINSR(ctx->dev->rx_channel_id, 2, 0);
 	chcr_req->sec_cpl.pldlen = htonl(param->bfr_len + param->sg_len);
 
 	chcr_req->sec_cpl.aadstart_cipherstop_hi =
@@ -1425,7 +1427,7 @@ static struct sk_buff *create_authenc_wr(struct aead_request *req,
 	 * to the hardware spec
 	 */
 	chcr_req->sec_cpl.op_ivinsrtofst =
-		FILL_SEC_CPL_OP_IVINSR(ctx->dev->tx_channel_id, 2,
+		FILL_SEC_CPL_OP_IVINSR(ctx->dev->rx_channel_id, 2,
 				       (ivsize ? (assoclen + 1) : 0));
 	chcr_req->sec_cpl.pldlen = htonl(assoclen + ivsize + req->cryptlen);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
@@ -1601,7 +1603,7 @@ static void fill_sec_cpl_for_aead(struct cpl_tx_sec_pdu *sec_cpl,
 	unsigned int ivsize = AES_BLOCK_SIZE;
 	unsigned int cipher_mode = CHCR_SCMD_CIPHER_MODE_AES_CCM;
 	unsigned int mac_mode = CHCR_SCMD_AUTH_MODE_CBCMAC;
-	unsigned int c_id = chcrctx->dev->tx_channel_id;
+	unsigned int c_id = chcrctx->dev->rx_channel_id;
 	unsigned int ccm_xtra;
 	unsigned char tag_offset = 0, auth_offset = 0;
 	unsigned char hmac_ctrl = get_hmac(crypto_aead_authsize(tfm));
@@ -1877,7 +1879,7 @@ static struct sk_buff *create_gcm_wr(struct aead_request *req,
 
 	tag_offset = (op_type == CHCR_ENCRYPT_OP) ? 0 : authsize;
 	chcr_req->sec_cpl.op_ivinsrtofst = FILL_SEC_CPL_OP_IVINSR(
-					ctx->dev->tx_channel_id, 2, (ivsize ?
+					ctx->dev->rx_channel_id, 2, (ivsize ?
 					(req->assoclen + 1) : 0));
 	chcr_req->sec_cpl.pldlen = htonl(req->assoclen + ivsize + crypt_len);
 	chcr_req->sec_cpl.aadstart_cipherstop_hi = FILL_SEC_CPL_CIPHERSTOP_HI(
@@ -2187,8 +2189,7 @@ static int chcr_gcm_setkey(struct crypto_aead *aead, const u8 *key,
 	struct chcr_context *ctx = crypto_aead_ctx(aead);
 	struct chcr_aead_ctx *aeadctx = AEAD_CTX(ctx);
 	struct chcr_gcm_ctx *gctx = GCM_CTX(aeadctx);
-	struct blkcipher_desc h_desc;
-	struct scatterlist src[1];
+	struct crypto_cipher *cipher;
 	unsigned int ck_size;
 	int ret = 0, key_ctx_size = 0;
 
@@ -2221,27 +2222,26 @@ static int chcr_gcm_setkey(struct crypto_aead *aead, const u8 *key,
 						CHCR_KEYCTX_MAC_KEY_SIZE_128,
 						0, 0,
 						key_ctx_size >> 4);
-	/* Calculate the H = CIPH(K, 0 repeated 16 times) using sync aes
-	 * blkcipher It will go on key context
+	/* Calculate the H = CIPH(K, 0 repeated 16 times).
+	 * It will go in key context
 	 */
-	h_desc.tfm = crypto_alloc_blkcipher("cbc(aes-generic)", 0, 0);
-	if (IS_ERR(h_desc.tfm)) {
+	cipher = crypto_alloc_cipher("aes-generic", 0, 0);
+	if (IS_ERR(cipher)) {
 		aeadctx->enckey_len = 0;
 		ret = -ENOMEM;
 		goto out;
 	}
-	h_desc.flags = 0;
-	ret = crypto_blkcipher_setkey(h_desc.tfm, key, keylen);
+
+	ret = crypto_cipher_setkey(cipher, key, keylen);
 	if (ret) {
 		aeadctx->enckey_len = 0;
 		goto out1;
 	}
 	memset(gctx->ghash_h, 0, AEAD_H_SIZE);
-	sg_init_one(&src[0], gctx->ghash_h, AEAD_H_SIZE);
-	ret = crypto_blkcipher_encrypt(&h_desc, &src[0], &src[0], AEAD_H_SIZE);
+	crypto_cipher_encrypt_one(cipher, gctx->ghash_h, gctx->ghash_h);
 
 out1:
-	crypto_free_blkcipher(h_desc.tfm);
+	crypto_free_cipher(cipher);
 out:
 	return ret;
 }
@@ -2456,13 +2456,14 @@ static int chcr_aead_op(struct aead_request *req,
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct chcr_context *ctx = crypto_aead_ctx(tfm);
-	struct uld_ctx *u_ctx = ULD_CTX(ctx);
+	struct uld_ctx *u_ctx;
 	struct sk_buff *skb;
 
-	if (ctx && !ctx->dev) {
+	if (!ctx->dev) {
 		pr_err("chcr : %s : No crypto device.\n", __func__);
 		return -ENXIO;
 	}
+	u_ctx = ULD_CTX(ctx);
 	if (cxgb4_is_crypto_q_full(u_ctx->lldi.ports[0],
 				   ctx->tx_channel_id)) {
 		if (!(req->base.flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
@@ -2492,7 +2493,7 @@ static struct chcr_alg_template driver_algs[] = {
 			.cra_name		= "cbc(aes)",
 			.cra_driver_name	= "cbc-aes-chcr",
 			.cra_priority		= CHCR_CRA_PRIORITY,
-			.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+			.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
 				CRYPTO_ALG_ASYNC,
 			.cra_blocksize		= AES_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct chcr_context)
@@ -2519,7 +2520,7 @@ static struct chcr_alg_template driver_algs[] = {
 			.cra_name		= "xts(aes)",
 			.cra_driver_name	= "xts-aes-chcr",
 			.cra_priority		= CHCR_CRA_PRIORITY,
-			.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+			.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
 				CRYPTO_ALG_ASYNC,
 			.cra_blocksize		= AES_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(struct chcr_context) +
diff --git a/drivers/crypto/chelsio/chcr_algo.h b/drivers/crypto/chelsio/chcr_algo.h
index 3c7c51f7bedf..ba38bae7ce80 100644
--- a/drivers/crypto/chelsio/chcr_algo.h
+++ b/drivers/crypto/chelsio/chcr_algo.h
@@ -185,20 +185,21 @@
 			FW_CRYPTO_LOOKASIDE_WR_CCTX_LOC_V(1) | \
 			FW_CRYPTO_LOOKASIDE_WR_CCTX_SIZE_V((ctx_len)))
 
-#define FILL_WR_RX_Q_ID(cid, qid, wr_iv) \
+#define FILL_WR_RX_Q_ID(cid, qid, wr_iv, fid) \
 		htonl( \
 			FW_CRYPTO_LOOKASIDE_WR_RX_CHID_V((cid)) | \
 			FW_CRYPTO_LOOKASIDE_WR_RX_Q_ID_V((qid)) | \
 			FW_CRYPTO_LOOKASIDE_WR_LCB_V(0) | \
-			FW_CRYPTO_LOOKASIDE_WR_IV_V((wr_iv)))
+			FW_CRYPTO_LOOKASIDE_WR_IV_V((wr_iv)) | \
+			FW_CRYPTO_LOOKASIDE_WR_FQIDX_V(fid))
 
-#define FILL_ULPTX_CMD_DEST(cid) \
+#define FILL_ULPTX_CMD_DEST(cid, qid) \
 	htonl(ULPTX_CMD_V(ULP_TX_PKT) | \
 	      ULP_TXPKT_DEST_V(0) | \
 	      ULP_TXPKT_DATAMODIFY_V(0) | \
 	      ULP_TXPKT_CHANNELID_V((cid)) | \
 	      ULP_TXPKT_RO_V(1) | \
-	      ULP_TXPKT_FID_V(0))
+	      ULP_TXPKT_FID_V(qid))
 
 #define KEYCTX_ALIGN_PAD(bs) ({unsigned int _bs = (bs);\
 			      _bs == SHA1_DIGEST_SIZE ? 12 : 0; })
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index 1c65f07e1cc9..c28e018e0773 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -61,7 +61,7 @@ int assign_chcr_device(struct chcr_dev **dev)
 	 */
 	mutex_lock(&dev_mutex); /* TODO ? */
 	list_for_each_entry(u_ctx, &uld_ctx_list, entry)
-		if (u_ctx && u_ctx->dev) {
+		if (u_ctx->dev) {
 			*dev = u_ctx->dev;
 			ret = 0;
 			break;
@@ -151,18 +151,17 @@ int chcr_uld_rx_handler(void *handle, const __be64 *rsp,
 {
 	struct uld_ctx *u_ctx = (struct uld_ctx *)handle;
 	struct chcr_dev *dev = u_ctx->dev;
-	const struct cpl_act_establish *rpl = (struct cpl_act_establish
-					       *)rsp;
+	const struct cpl_fw6_pld *rpl = (struct cpl_fw6_pld *)rsp;
 
-	if (rpl->ot.opcode != CPL_FW6_PLD) {
+	if (rpl->opcode != CPL_FW6_PLD) {
 		pr_err("Unsupported opcode\n");
 		return 0;
 	}
 
 	if (!pgl)
-		work_handlers[rpl->ot.opcode](dev, (unsigned char *)&rsp[1]);
+		work_handlers[rpl->opcode](dev, (unsigned char *)&rsp[1]);
 	else
-		work_handlers[rpl->ot.opcode](dev, pgl->va);
+		work_handlers[rpl->opcode](dev, pgl->va);
 	return 0;
 }
 
diff --git a/drivers/crypto/chelsio/chcr_core.h b/drivers/crypto/chelsio/chcr_core.h
index c7088a4e0a49..79da22b5cdc9 100644
--- a/drivers/crypto/chelsio/chcr_core.h
+++ b/drivers/crypto/chelsio/chcr_core.h
@@ -75,6 +75,7 @@ struct chcr_dev {
 	spinlock_t lock_chcr_dev;
 	struct uld_ctx *u_ctx;
 	unsigned char tx_channel_id;
+	unsigned char rx_channel_id;
 };
 
 struct uld_ctx {
diff --git a/drivers/crypto/chelsio/chcr_crypto.h b/drivers/crypto/chelsio/chcr_crypto.h
index 7ec0a8f12475..81cfd0ba132e 100644
--- a/drivers/crypto/chelsio/chcr_crypto.h
+++ b/drivers/crypto/chelsio/chcr_crypto.h
@@ -48,7 +48,7 @@
  * giving the processed data
  */
 
-#define CHCR_CRA_PRIORITY 300
+#define CHCR_CRA_PRIORITY 3000
 
 #define CHCR_AES_MAX_KEY_LEN  (2 * (AES_MAX_KEY_SIZE)) /* consider xts */
 #define CHCR_MAX_CRYPTO_IV_LEN 16 /* AES IV len */
diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c
index a2e77b87485b..9b07f3d88feb 100644
--- a/drivers/crypto/img-hash.c
+++ b/drivers/crypto/img-hash.c
@@ -226,7 +226,7 @@ static int img_hash_xmit_dma(struct img_hash_dev *hdev, struct scatterlist *sg)
 	struct dma_async_tx_descriptor *desc;
 	struct img_hash_request_ctx *ctx = ahash_request_ctx(hdev->req);
 
-	ctx->dma_ct = dma_map_sg(hdev->dev, sg, 1, DMA_MEM_TO_DEV);
+	ctx->dma_ct = dma_map_sg(hdev->dev, sg, 1, DMA_TO_DEVICE);
 	if (ctx->dma_ct == 0) {
 		dev_err(hdev->dev, "Invalid DMA sg\n");
 		hdev->err = -EINVAL;
@@ -241,7 +241,7 @@ static int img_hash_xmit_dma(struct img_hash_dev *hdev, struct scatterlist *sg)
 	if (!desc) {
 		dev_err(hdev->dev, "Null DMA descriptor\n");
 		hdev->err = -EINVAL;
-		dma_unmap_sg(hdev->dev, sg, 1, DMA_MEM_TO_DEV);
+		dma_unmap_sg(hdev->dev, sg, 1, DMA_TO_DEVICE);
 		return -EINVAL;
 	}
 	desc->callback = img_hash_dma_callback;
diff --git a/drivers/crypto/mediatek/Makefile b/drivers/crypto/mediatek/Makefile
new file mode 100644
index 000000000000..187be79c7f3e
--- /dev/null
+++ b/drivers/crypto/mediatek/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_CRYPTO_DEV_MEDIATEK) += mtk-crypto.o
+mtk-crypto-objs:= mtk-platform.o mtk-aes.o mtk-sha.o
diff --git a/drivers/crypto/mediatek/mtk-aes.c b/drivers/crypto/mediatek/mtk-aes.c
new file mode 100644
index 000000000000..3a47cdb8f0c8
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-aes.c
@@ -0,0 +1,1299 @@
+/*
+ * Cryptographic API.
+ *
+ * Driver for EIP97 AES acceleration.
+ *
+ * Copyright (c) 2016 Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Some ideas are from atmel-aes.c drivers.
+ */
+
+#include <crypto/aes.h>
+#include "mtk-platform.h"
+
+#define AES_QUEUE_SIZE		512
+#define AES_BUF_ORDER		2
+#define AES_BUF_SIZE		((PAGE_SIZE << AES_BUF_ORDER) \
+				& ~(AES_BLOCK_SIZE - 1))
+
+/* AES command token size */
+#define AES_CT_SIZE_ECB		2
+#define AES_CT_SIZE_CBC		3
+#define AES_CT_SIZE_CTR		3
+#define AES_CT_SIZE_GCM_OUT	5
+#define AES_CT_SIZE_GCM_IN	6
+#define AES_CT_CTRL_HDR		cpu_to_le32(0x00220000)
+
+/* AES-CBC/ECB/CTR command token */
+#define AES_CMD0		cpu_to_le32(0x05000000)
+#define AES_CMD1		cpu_to_le32(0x2d060000)
+#define AES_CMD2		cpu_to_le32(0xe4a63806)
+/* AES-GCM command token */
+#define AES_GCM_CMD0		cpu_to_le32(0x0b000000)
+#define AES_GCM_CMD1		cpu_to_le32(0xa0800000)
+#define AES_GCM_CMD2		cpu_to_le32(0x25000010)
+#define AES_GCM_CMD3		cpu_to_le32(0x0f020000)
+#define AES_GCM_CMD4		cpu_to_le32(0x21e60000)
+#define AES_GCM_CMD5		cpu_to_le32(0x40e60000)
+#define AES_GCM_CMD6		cpu_to_le32(0xd0070000)
+
+/* AES transform information word 0 fields */
+#define AES_TFM_BASIC_OUT	cpu_to_le32(0x4 << 0)
+#define AES_TFM_BASIC_IN	cpu_to_le32(0x5 << 0)
+#define AES_TFM_GCM_OUT		cpu_to_le32(0x6 << 0)
+#define AES_TFM_GCM_IN		cpu_to_le32(0xf << 0)
+#define AES_TFM_SIZE(x)		cpu_to_le32((x) << 8)
+#define AES_TFM_128BITS		cpu_to_le32(0xb << 16)
+#define AES_TFM_192BITS		cpu_to_le32(0xd << 16)
+#define AES_TFM_256BITS		cpu_to_le32(0xf << 16)
+/* AES transform information word 1 fields */
+#define AES_TFM_ECB		cpu_to_le32(0x0 << 0)
+#define AES_TFM_CBC		cpu_to_le32(0x1 << 0)
+#define AES_TFM_CTR_INIT	cpu_to_le32(0x2 << 0)	/* init counter to 1 */
+#define AES_TFM_CTR_LOAD	cpu_to_le32(0x6 << 0)	/* load/reuse counter */
+#define AES_TFM_3IV		cpu_to_le32(0x7 << 5)	/* using IV 0-2 */
+#define AES_TFM_FULL_IV		cpu_to_le32(0xf << 5)	/* using IV 0-3 */
+#define AES_TFM_IV_CTR_MODE	cpu_to_le32(0x1 << 10)
+#define AES_TFM_ENC_HASH	cpu_to_le32(0x1 << 17)
+#define AES_TFM_GHASH_DIG	cpu_to_le32(0x2 << 21)
+#define AES_TFM_GHASH		cpu_to_le32(0x4 << 23)
+
+/* AES flags */
+#define AES_FLAGS_ECB		BIT(0)
+#define AES_FLAGS_CBC		BIT(1)
+#define AES_FLAGS_CTR		BIT(2)
+#define AES_FLAGS_GCM		BIT(3)
+#define AES_FLAGS_ENCRYPT	BIT(4)
+#define AES_FLAGS_BUSY		BIT(5)
+
+/**
+ * Command token(CT) is a set of hardware instructions that
+ * are used to control engine's processing flow of AES.
+ *
+ * Transform information(TFM) is used to define AES state and
+ * contains all keys and initial vectors.
+ *
+ * The engine requires CT and TFM to do:
+ * - Commands decoding and control of the engine's data path.
+ * - Coordinating hardware data fetch and store operations.
+ * - Result token construction and output.
+ *
+ * Memory map of GCM's TFM:
+ * /-----------\
+ * |  AES KEY  | 128/196/256 bits
+ * |-----------|
+ * |  HASH KEY | a string 128 zero bits encrypted using the block cipher
+ * |-----------|
+ * |    IVs    | 4 * 4 bytes
+ * \-----------/
+ */
+struct mtk_aes_ct {
+	__le32 cmd[AES_CT_SIZE_GCM_IN];
+};
+
+struct mtk_aes_tfm {
+	__le32 ctrl[2];
+	__le32 state[SIZE_IN_WORDS(AES_KEYSIZE_256 + AES_BLOCK_SIZE * 2)];
+};
+
+struct mtk_aes_reqctx {
+	u64 mode;
+};
+
+struct mtk_aes_base_ctx {
+	struct mtk_cryp *cryp;
+	u32 keylen;
+	mtk_aes_fn start;
+
+	struct mtk_aes_ct ct;
+	dma_addr_t ct_dma;
+	struct mtk_aes_tfm tfm;
+	dma_addr_t tfm_dma;
+
+	__le32 ct_hdr;
+	u32 ct_size;
+};
+
+struct mtk_aes_ctx {
+	struct mtk_aes_base_ctx	base;
+};
+
+struct mtk_aes_ctr_ctx {
+	struct mtk_aes_base_ctx base;
+
+	u32	iv[AES_BLOCK_SIZE / sizeof(u32)];
+	size_t offset;
+	struct scatterlist src[2];
+	struct scatterlist dst[2];
+};
+
+struct mtk_aes_gcm_ctx {
+	struct mtk_aes_base_ctx base;
+
+	u32 authsize;
+	size_t textlen;
+
+	struct crypto_skcipher *ctr;
+};
+
+struct mtk_aes_gcm_setkey_result {
+	int err;
+	struct completion completion;
+};
+
+struct mtk_aes_drv {
+	struct list_head dev_list;
+	/* Device list lock */
+	spinlock_t lock;
+};
+
+static struct mtk_aes_drv mtk_aes = {
+	.dev_list = LIST_HEAD_INIT(mtk_aes.dev_list),
+	.lock = __SPIN_LOCK_UNLOCKED(mtk_aes.lock),
+};
+
+static inline u32 mtk_aes_read(struct mtk_cryp *cryp, u32 offset)
+{
+	return readl_relaxed(cryp->base + offset);
+}
+
+static inline void mtk_aes_write(struct mtk_cryp *cryp,
+				 u32 offset, u32 value)
+{
+	writel_relaxed(value, cryp->base + offset);
+}
+
+static struct mtk_cryp *mtk_aes_find_dev(struct mtk_aes_base_ctx *ctx)
+{
+	struct mtk_cryp *cryp = NULL;
+	struct mtk_cryp *tmp;
+
+	spin_lock_bh(&mtk_aes.lock);
+	if (!ctx->cryp) {
+		list_for_each_entry(tmp, &mtk_aes.dev_list, aes_list) {
+			cryp = tmp;
+			break;
+		}
+		ctx->cryp = cryp;
+	} else {
+		cryp = ctx->cryp;
+	}
+	spin_unlock_bh(&mtk_aes.lock);
+
+	return cryp;
+}
+
+static inline size_t mtk_aes_padlen(size_t len)
+{
+	len &= AES_BLOCK_SIZE - 1;
+	return len ? AES_BLOCK_SIZE - len : 0;
+}
+
+static bool mtk_aes_check_aligned(struct scatterlist *sg, size_t len,
+				  struct mtk_aes_dma *dma)
+{
+	int nents;
+
+	if (!IS_ALIGNED(len, AES_BLOCK_SIZE))
+		return false;
+
+	for (nents = 0; sg; sg = sg_next(sg), ++nents) {
+		if (!IS_ALIGNED(sg->offset, sizeof(u32)))
+			return false;
+
+		if (len <= sg->length) {
+			if (!IS_ALIGNED(len, AES_BLOCK_SIZE))
+				return false;
+
+			dma->nents = nents + 1;
+			dma->remainder = sg->length - len;
+			sg->length = len;
+			return true;
+		}
+
+		if (!IS_ALIGNED(sg->length, AES_BLOCK_SIZE))
+			return false;
+
+		len -= sg->length;
+	}
+
+	return false;
+}
+
+static inline void mtk_aes_set_mode(struct mtk_aes_rec *aes,
+				    const struct mtk_aes_reqctx *rctx)
+{
+	/* Clear all but persistent flags and set request flags. */
+	aes->flags = (aes->flags & AES_FLAGS_BUSY) | rctx->mode;
+}
+
+static inline void mtk_aes_restore_sg(const struct mtk_aes_dma *dma)
+{
+	struct scatterlist *sg = dma->sg;
+	int nents = dma->nents;
+
+	if (!dma->remainder)
+		return;
+
+	while (--nents > 0 && sg)
+		sg = sg_next(sg);
+
+	if (!sg)
+		return;
+
+	sg->length += dma->remainder;
+}
+
+/*
+ * Write descriptors for processing. This will configure the engine, load
+ * the transform information and then start the packet processing.
+ */
+static int mtk_aes_xmit(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct mtk_ring *ring = cryp->ring[aes->id];
+	struct mtk_desc *cmd = NULL, *res = NULL;
+	struct scatterlist *ssg = aes->src.sg, *dsg = aes->dst.sg;
+	u32 slen = aes->src.sg_len, dlen = aes->dst.sg_len;
+	int nents;
+
+	/* Write command descriptors */
+	for (nents = 0; nents < slen; ++nents, ssg = sg_next(ssg)) {
+		cmd = ring->cmd_base + ring->cmd_pos;
+		cmd->hdr = MTK_DESC_BUF_LEN(ssg->length);
+		cmd->buf = cpu_to_le32(sg_dma_address(ssg));
+
+		if (nents == 0) {
+			cmd->hdr |= MTK_DESC_FIRST |
+				    MTK_DESC_CT_LEN(aes->ctx->ct_size);
+			cmd->ct = cpu_to_le32(aes->ctx->ct_dma);
+			cmd->ct_hdr = aes->ctx->ct_hdr;
+			cmd->tfm = cpu_to_le32(aes->ctx->tfm_dma);
+		}
+
+		if (++ring->cmd_pos == MTK_DESC_NUM)
+			ring->cmd_pos = 0;
+	}
+	cmd->hdr |= MTK_DESC_LAST;
+
+	/* Prepare result descriptors */
+	for (nents = 0; nents < dlen; ++nents, dsg = sg_next(dsg)) {
+		res = ring->res_base + ring->res_pos;
+		res->hdr = MTK_DESC_BUF_LEN(dsg->length);
+		res->buf = cpu_to_le32(sg_dma_address(dsg));
+
+		if (nents == 0)
+			res->hdr |= MTK_DESC_FIRST;
+
+		if (++ring->res_pos == MTK_DESC_NUM)
+			ring->res_pos = 0;
+	}
+	res->hdr |= MTK_DESC_LAST;
+
+	/* Prepare enough space for authenticated tag */
+	if (aes->flags & AES_FLAGS_GCM)
+		res->hdr += AES_BLOCK_SIZE;
+
+	/*
+	 * Make sure that all changes to the DMA ring are done before we
+	 * start engine.
+	 */
+	wmb();
+	/* Start DMA transfer */
+	mtk_aes_write(cryp, RDR_PREP_COUNT(aes->id), MTK_DESC_CNT(dlen));
+	mtk_aes_write(cryp, CDR_PREP_COUNT(aes->id), MTK_DESC_CNT(slen));
+
+	return -EINPROGRESS;
+}
+
+static void mtk_aes_unmap(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct mtk_aes_base_ctx *ctx = aes->ctx;
+
+	dma_unmap_single(cryp->dev, ctx->ct_dma, sizeof(ctx->ct),
+			 DMA_TO_DEVICE);
+	dma_unmap_single(cryp->dev, ctx->tfm_dma, sizeof(ctx->tfm),
+			 DMA_TO_DEVICE);
+
+	if (aes->src.sg == aes->dst.sg) {
+		dma_unmap_sg(cryp->dev, aes->src.sg, aes->src.nents,
+			     DMA_BIDIRECTIONAL);
+
+		if (aes->src.sg != &aes->aligned_sg)
+			mtk_aes_restore_sg(&aes->src);
+	} else {
+		dma_unmap_sg(cryp->dev, aes->dst.sg, aes->dst.nents,
+			     DMA_FROM_DEVICE);
+
+		if (aes->dst.sg != &aes->aligned_sg)
+			mtk_aes_restore_sg(&aes->dst);
+
+		dma_unmap_sg(cryp->dev, aes->src.sg, aes->src.nents,
+			     DMA_TO_DEVICE);
+
+		if (aes->src.sg != &aes->aligned_sg)
+			mtk_aes_restore_sg(&aes->src);
+	}
+
+	if (aes->dst.sg == &aes->aligned_sg)
+		sg_copy_from_buffer(aes->real_dst, sg_nents(aes->real_dst),
+				    aes->buf, aes->total);
+}
+
+static int mtk_aes_map(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct mtk_aes_base_ctx *ctx = aes->ctx;
+
+	ctx->ct_dma = dma_map_single(cryp->dev, &ctx->ct, sizeof(ctx->ct),
+				     DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(cryp->dev, ctx->ct_dma)))
+		return -EINVAL;
+
+	ctx->tfm_dma = dma_map_single(cryp->dev, &ctx->tfm, sizeof(ctx->tfm),
+				      DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(cryp->dev, ctx->tfm_dma)))
+		goto tfm_map_err;
+
+	if (aes->src.sg == aes->dst.sg) {
+		aes->src.sg_len = dma_map_sg(cryp->dev, aes->src.sg,
+					     aes->src.nents,
+					     DMA_BIDIRECTIONAL);
+		aes->dst.sg_len = aes->src.sg_len;
+		if (unlikely(!aes->src.sg_len))
+			goto sg_map_err;
+	} else {
+		aes->src.sg_len = dma_map_sg(cryp->dev, aes->src.sg,
+					     aes->src.nents, DMA_TO_DEVICE);
+		if (unlikely(!aes->src.sg_len))
+			goto sg_map_err;
+
+		aes->dst.sg_len = dma_map_sg(cryp->dev, aes->dst.sg,
+					     aes->dst.nents, DMA_FROM_DEVICE);
+		if (unlikely(!aes->dst.sg_len)) {
+			dma_unmap_sg(cryp->dev, aes->src.sg, aes->src.nents,
+				     DMA_TO_DEVICE);
+			goto sg_map_err;
+		}
+	}
+
+	return mtk_aes_xmit(cryp, aes);
+
+sg_map_err:
+	dma_unmap_single(cryp->dev, ctx->tfm_dma, sizeof(ctx->tfm),
+			 DMA_TO_DEVICE);
+tfm_map_err:
+	dma_unmap_single(cryp->dev, ctx->ct_dma, sizeof(ctx->ct),
+			 DMA_TO_DEVICE);
+
+	return -EINVAL;
+}
+
+/* Initialize transform information of CBC/ECB/CTR mode */
+static void mtk_aes_info_init(struct mtk_cryp *cryp, struct mtk_aes_rec *aes,
+			      size_t len)
+{
+	struct ablkcipher_request *req = ablkcipher_request_cast(aes->areq);
+	struct mtk_aes_base_ctx *ctx = aes->ctx;
+
+	ctx->ct_hdr = AES_CT_CTRL_HDR | cpu_to_le32(len);
+	ctx->ct.cmd[0] = AES_CMD0 | cpu_to_le32(len);
+	ctx->ct.cmd[1] = AES_CMD1;
+
+	if (aes->flags & AES_FLAGS_ENCRYPT)
+		ctx->tfm.ctrl[0] = AES_TFM_BASIC_OUT;
+	else
+		ctx->tfm.ctrl[0] = AES_TFM_BASIC_IN;
+
+	if (ctx->keylen == SIZE_IN_WORDS(AES_KEYSIZE_128))
+		ctx->tfm.ctrl[0] |= AES_TFM_128BITS;
+	else if (ctx->keylen == SIZE_IN_WORDS(AES_KEYSIZE_256))
+		ctx->tfm.ctrl[0] |= AES_TFM_256BITS;
+	else
+		ctx->tfm.ctrl[0] |= AES_TFM_192BITS;
+
+	if (aes->flags & AES_FLAGS_CBC) {
+		const u32 *iv = (const u32 *)req->info;
+		u32 *iv_state = ctx->tfm.state + ctx->keylen;
+		int i;
+
+		ctx->tfm.ctrl[0] |= AES_TFM_SIZE(ctx->keylen +
+				    SIZE_IN_WORDS(AES_BLOCK_SIZE));
+		ctx->tfm.ctrl[1] = AES_TFM_CBC | AES_TFM_FULL_IV;
+
+		for (i = 0; i < SIZE_IN_WORDS(AES_BLOCK_SIZE); i++)
+			iv_state[i] = cpu_to_le32(iv[i]);
+
+		ctx->ct.cmd[2] = AES_CMD2;
+		ctx->ct_size = AES_CT_SIZE_CBC;
+	} else if (aes->flags & AES_FLAGS_ECB) {
+		ctx->tfm.ctrl[0] |= AES_TFM_SIZE(ctx->keylen);
+		ctx->tfm.ctrl[1] = AES_TFM_ECB;
+
+		ctx->ct_size = AES_CT_SIZE_ECB;
+	} else if (aes->flags & AES_FLAGS_CTR) {
+		ctx->tfm.ctrl[0] |= AES_TFM_SIZE(ctx->keylen +
+				    SIZE_IN_WORDS(AES_BLOCK_SIZE));
+		ctx->tfm.ctrl[1] = AES_TFM_CTR_LOAD | AES_TFM_FULL_IV;
+
+		ctx->ct.cmd[2] = AES_CMD2;
+		ctx->ct_size = AES_CT_SIZE_CTR;
+	}
+}
+
+static int mtk_aes_dma(struct mtk_cryp *cryp, struct mtk_aes_rec *aes,
+		       struct scatterlist *src, struct scatterlist *dst,
+		       size_t len)
+{
+	size_t padlen = 0;
+	bool src_aligned, dst_aligned;
+
+	aes->total = len;
+	aes->src.sg = src;
+	aes->dst.sg = dst;
+	aes->real_dst = dst;
+
+	src_aligned = mtk_aes_check_aligned(src, len, &aes->src);
+	if (src == dst)
+		dst_aligned = src_aligned;
+	else
+		dst_aligned = mtk_aes_check_aligned(dst, len, &aes->dst);
+
+	if (!src_aligned || !dst_aligned) {
+		padlen = mtk_aes_padlen(len);
+
+		if (len + padlen > AES_BUF_SIZE)
+			return -ENOMEM;
+
+		if (!src_aligned) {
+			sg_copy_to_buffer(src, sg_nents(src), aes->buf, len);
+			aes->src.sg = &aes->aligned_sg;
+			aes->src.nents = 1;
+			aes->src.remainder = 0;
+		}
+
+		if (!dst_aligned) {
+			aes->dst.sg = &aes->aligned_sg;
+			aes->dst.nents = 1;
+			aes->dst.remainder = 0;
+		}
+
+		sg_init_table(&aes->aligned_sg, 1);
+		sg_set_buf(&aes->aligned_sg, aes->buf, len + padlen);
+	}
+
+	mtk_aes_info_init(cryp, aes, len + padlen);
+
+	return mtk_aes_map(cryp, aes);
+}
+
+static int mtk_aes_handle_queue(struct mtk_cryp *cryp, u8 id,
+				struct crypto_async_request *new_areq)
+{
+	struct mtk_aes_rec *aes = cryp->aes[id];
+	struct crypto_async_request *areq, *backlog;
+	struct mtk_aes_base_ctx *ctx;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&aes->lock, flags);
+	if (new_areq)
+		ret = crypto_enqueue_request(&aes->queue, new_areq);
+	if (aes->flags & AES_FLAGS_BUSY) {
+		spin_unlock_irqrestore(&aes->lock, flags);
+		return ret;
+	}
+	backlog = crypto_get_backlog(&aes->queue);
+	areq = crypto_dequeue_request(&aes->queue);
+	if (areq)
+		aes->flags |= AES_FLAGS_BUSY;
+	spin_unlock_irqrestore(&aes->lock, flags);
+
+	if (!areq)
+		return ret;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	ctx = crypto_tfm_ctx(areq->tfm);
+
+	aes->areq = areq;
+	aes->ctx = ctx;
+
+	return ctx->start(cryp, aes);
+}
+
+static int mtk_aes_complete(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	aes->flags &= ~AES_FLAGS_BUSY;
+	aes->areq->complete(aes->areq, 0);
+
+	/* Handle new request */
+	return mtk_aes_handle_queue(cryp, aes->id, NULL);
+}
+
+static int mtk_aes_start(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct ablkcipher_request *req = ablkcipher_request_cast(aes->areq);
+	struct mtk_aes_reqctx *rctx = ablkcipher_request_ctx(req);
+
+	mtk_aes_set_mode(aes, rctx);
+	aes->resume = mtk_aes_complete;
+
+	return mtk_aes_dma(cryp, aes, req->src, req->dst, req->nbytes);
+}
+
+static inline struct mtk_aes_ctr_ctx *
+mtk_aes_ctr_ctx_cast(struct mtk_aes_base_ctx *ctx)
+{
+	return container_of(ctx, struct mtk_aes_ctr_ctx, base);
+}
+
+static int mtk_aes_ctr_transfer(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct mtk_aes_base_ctx *ctx = aes->ctx;
+	struct mtk_aes_ctr_ctx *cctx = mtk_aes_ctr_ctx_cast(ctx);
+	struct ablkcipher_request *req = ablkcipher_request_cast(aes->areq);
+	struct scatterlist *src, *dst;
+	int i;
+	u32 start, end, ctr, blocks, *iv_state;
+	size_t datalen;
+	bool fragmented = false;
+
+	/* Check for transfer completion. */
+	cctx->offset += aes->total;
+	if (cctx->offset >= req->nbytes)
+		return mtk_aes_complete(cryp, aes);
+
+	/* Compute data length. */
+	datalen = req->nbytes - cctx->offset;
+	blocks = DIV_ROUND_UP(datalen, AES_BLOCK_SIZE);
+	ctr = be32_to_cpu(cctx->iv[3]);
+
+	/* Check 32bit counter overflow. */
+	start = ctr;
+	end = start + blocks - 1;
+	if (end < start) {
+		ctr |= 0xffffffff;
+		datalen = AES_BLOCK_SIZE * -start;
+		fragmented = true;
+	}
+
+	/* Jump to offset. */
+	src = scatterwalk_ffwd(cctx->src, req->src, cctx->offset);
+	dst = ((req->src == req->dst) ? src :
+	       scatterwalk_ffwd(cctx->dst, req->dst, cctx->offset));
+
+	/* Write IVs into transform state buffer. */
+	iv_state = ctx->tfm.state + ctx->keylen;
+	for (i = 0; i < SIZE_IN_WORDS(AES_BLOCK_SIZE); i++)
+		iv_state[i] = cpu_to_le32(cctx->iv[i]);
+
+	if (unlikely(fragmented)) {
+	/*
+	 * Increment the counter manually to cope with the hardware
+	 * counter overflow.
+	 */
+		cctx->iv[3] = cpu_to_be32(ctr);
+		crypto_inc((u8 *)cctx->iv, AES_BLOCK_SIZE);
+	}
+	aes->resume = mtk_aes_ctr_transfer;
+
+	return mtk_aes_dma(cryp, aes, src, dst, datalen);
+}
+
+static int mtk_aes_ctr_start(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct mtk_aes_ctr_ctx *cctx = mtk_aes_ctr_ctx_cast(aes->ctx);
+	struct ablkcipher_request *req = ablkcipher_request_cast(aes->areq);
+	struct mtk_aes_reqctx *rctx = ablkcipher_request_ctx(req);
+
+	mtk_aes_set_mode(aes, rctx);
+
+	memcpy(cctx->iv, req->info, AES_BLOCK_SIZE);
+	cctx->offset = 0;
+	aes->total = 0;
+
+	return mtk_aes_ctr_transfer(cryp, aes);
+}
+
+/* Check and set the AES key to transform state buffer */
+static int mtk_aes_setkey(struct crypto_ablkcipher *tfm,
+			  const u8 *key, u32 keylen)
+{
+	struct mtk_aes_base_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	const u32 *aes_key = (const u32 *)key;
+	u32 *key_state = ctx->tfm.state;
+	int i;
+
+	if (keylen != AES_KEYSIZE_128 &&
+	    keylen != AES_KEYSIZE_192 &&
+	    keylen != AES_KEYSIZE_256) {
+		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	ctx->keylen = SIZE_IN_WORDS(keylen);
+
+	for (i = 0; i < ctx->keylen; i++)
+		key_state[i] = cpu_to_le32(aes_key[i]);
+
+	return 0;
+}
+
+static int mtk_aes_crypt(struct ablkcipher_request *req, u64 mode)
+{
+	struct mtk_aes_base_ctx *ctx;
+	struct mtk_aes_reqctx *rctx;
+
+	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
+	rctx = ablkcipher_request_ctx(req);
+	rctx->mode = mode;
+
+	return mtk_aes_handle_queue(ctx->cryp, !(mode & AES_FLAGS_ENCRYPT),
+				    &req->base);
+}
+
+static int mtk_aes_ecb_encrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ENCRYPT | AES_FLAGS_ECB);
+}
+
+static int mtk_aes_ecb_decrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ECB);
+}
+
+static int mtk_aes_cbc_encrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ENCRYPT | AES_FLAGS_CBC);
+}
+
+static int mtk_aes_cbc_decrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_CBC);
+}
+
+static int mtk_aes_ctr_encrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ENCRYPT | AES_FLAGS_CTR);
+}
+
+static int mtk_aes_ctr_decrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_CTR);
+}
+
+static int mtk_aes_cra_init(struct crypto_tfm *tfm)
+{
+	struct mtk_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mtk_cryp *cryp = NULL;
+
+	cryp = mtk_aes_find_dev(&ctx->base);
+	if (!cryp) {
+		pr_err("can't find crypto device\n");
+		return -ENODEV;
+	}
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct mtk_aes_reqctx);
+	ctx->base.start = mtk_aes_start;
+	return 0;
+}
+
+static int mtk_aes_ctr_cra_init(struct crypto_tfm *tfm)
+{
+	struct mtk_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mtk_cryp *cryp = NULL;
+
+	cryp = mtk_aes_find_dev(&ctx->base);
+	if (!cryp) {
+		pr_err("can't find crypto device\n");
+		return -ENODEV;
+	}
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct mtk_aes_reqctx);
+	ctx->base.start = mtk_aes_ctr_start;
+	return 0;
+}
+
+static struct crypto_alg aes_algs[] = {
+{
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-mtk",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+				  CRYPTO_ALG_ASYNC,
+	.cra_init		= mtk_aes_cra_init,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct mtk_aes_ctx),
+	.cra_alignmask		= 0xf,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.setkey		= mtk_aes_setkey,
+		.encrypt	= mtk_aes_cbc_encrypt,
+		.decrypt	= mtk_aes_cbc_decrypt,
+		.ivsize		= AES_BLOCK_SIZE,
+	}
+},
+{
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-mtk",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+				  CRYPTO_ALG_ASYNC,
+	.cra_init		= mtk_aes_cra_init,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct mtk_aes_ctx),
+	.cra_alignmask		= 0xf,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.setkey		= mtk_aes_setkey,
+		.encrypt	= mtk_aes_ecb_encrypt,
+		.decrypt	= mtk_aes_ecb_decrypt,
+	}
+},
+{
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-mtk",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+				  CRYPTO_ALG_ASYNC,
+	.cra_init		= mtk_aes_ctr_cra_init,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct mtk_aes_ctr_ctx),
+	.cra_alignmask		= 0xf,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= mtk_aes_setkey,
+		.encrypt	= mtk_aes_ctr_encrypt,
+		.decrypt	= mtk_aes_ctr_decrypt,
+	}
+},
+};
+
+static inline struct mtk_aes_gcm_ctx *
+mtk_aes_gcm_ctx_cast(struct mtk_aes_base_ctx *ctx)
+{
+	return container_of(ctx, struct mtk_aes_gcm_ctx, base);
+}
+
+/* Initialize transform information of GCM mode */
+static void mtk_aes_gcm_info_init(struct mtk_cryp *cryp,
+				  struct mtk_aes_rec *aes,
+				  size_t len)
+{
+	struct aead_request *req = aead_request_cast(aes->areq);
+	struct mtk_aes_base_ctx *ctx = aes->ctx;
+	struct mtk_aes_gcm_ctx *gctx = mtk_aes_gcm_ctx_cast(ctx);
+	const u32 *iv = (const u32 *)req->iv;
+	u32 *iv_state = ctx->tfm.state + ctx->keylen +
+			SIZE_IN_WORDS(AES_BLOCK_SIZE);
+	u32 ivsize = crypto_aead_ivsize(crypto_aead_reqtfm(req));
+	int i;
+
+	ctx->ct_hdr = AES_CT_CTRL_HDR | len;
+
+	ctx->ct.cmd[0] = AES_GCM_CMD0 | cpu_to_le32(req->assoclen);
+	ctx->ct.cmd[1] = AES_GCM_CMD1 | cpu_to_le32(req->assoclen);
+	ctx->ct.cmd[2] = AES_GCM_CMD2;
+	ctx->ct.cmd[3] = AES_GCM_CMD3 | cpu_to_le32(gctx->textlen);
+
+	if (aes->flags & AES_FLAGS_ENCRYPT) {
+		ctx->ct.cmd[4] = AES_GCM_CMD4 | cpu_to_le32(gctx->authsize);
+		ctx->ct_size = AES_CT_SIZE_GCM_OUT;
+		ctx->tfm.ctrl[0] = AES_TFM_GCM_OUT;
+	} else {
+		ctx->ct.cmd[4] = AES_GCM_CMD5 | cpu_to_le32(gctx->authsize);
+		ctx->ct.cmd[5] = AES_GCM_CMD6 | cpu_to_le32(gctx->authsize);
+		ctx->ct_size = AES_CT_SIZE_GCM_IN;
+		ctx->tfm.ctrl[0] = AES_TFM_GCM_IN;
+	}
+
+	if (ctx->keylen == SIZE_IN_WORDS(AES_KEYSIZE_128))
+		ctx->tfm.ctrl[0] |= AES_TFM_128BITS;
+	else if (ctx->keylen == SIZE_IN_WORDS(AES_KEYSIZE_256))
+		ctx->tfm.ctrl[0] |= AES_TFM_256BITS;
+	else
+		ctx->tfm.ctrl[0] |= AES_TFM_192BITS;
+
+	ctx->tfm.ctrl[0] |= AES_TFM_GHASH_DIG | AES_TFM_GHASH |
+			    AES_TFM_SIZE(ctx->keylen + SIZE_IN_WORDS(
+			    AES_BLOCK_SIZE + ivsize));
+	ctx->tfm.ctrl[1] = AES_TFM_CTR_INIT | AES_TFM_IV_CTR_MODE |
+			   AES_TFM_3IV | AES_TFM_ENC_HASH;
+
+	for (i = 0; i < SIZE_IN_WORDS(ivsize); i++)
+		iv_state[i] = cpu_to_le32(iv[i]);
+}
+
+static int mtk_aes_gcm_dma(struct mtk_cryp *cryp, struct mtk_aes_rec *aes,
+			   struct scatterlist *src, struct scatterlist *dst,
+			   size_t len)
+{
+	bool src_aligned, dst_aligned;
+
+	aes->src.sg = src;
+	aes->dst.sg = dst;
+	aes->real_dst = dst;
+
+	src_aligned = mtk_aes_check_aligned(src, len, &aes->src);
+	if (src == dst)
+		dst_aligned = src_aligned;
+	else
+		dst_aligned = mtk_aes_check_aligned(dst, len, &aes->dst);
+
+	if (!src_aligned || !dst_aligned) {
+		if (aes->total > AES_BUF_SIZE)
+			return -ENOMEM;
+
+		if (!src_aligned) {
+			sg_copy_to_buffer(src, sg_nents(src), aes->buf, len);
+			aes->src.sg = &aes->aligned_sg;
+			aes->src.nents = 1;
+			aes->src.remainder = 0;
+		}
+
+		if (!dst_aligned) {
+			aes->dst.sg = &aes->aligned_sg;
+			aes->dst.nents = 1;
+			aes->dst.remainder = 0;
+		}
+
+		sg_init_table(&aes->aligned_sg, 1);
+		sg_set_buf(&aes->aligned_sg, aes->buf, aes->total);
+	}
+
+	mtk_aes_gcm_info_init(cryp, aes, len);
+
+	return mtk_aes_map(cryp, aes);
+}
+
+/* Todo: GMAC */
+static int mtk_aes_gcm_start(struct mtk_cryp *cryp, struct mtk_aes_rec *aes)
+{
+	struct mtk_aes_gcm_ctx *gctx = mtk_aes_gcm_ctx_cast(aes->ctx);
+	struct aead_request *req = aead_request_cast(aes->areq);
+	struct mtk_aes_reqctx *rctx = aead_request_ctx(req);
+	u32 len = req->assoclen + req->cryptlen;
+
+	mtk_aes_set_mode(aes, rctx);
+
+	if (aes->flags & AES_FLAGS_ENCRYPT) {
+		u32 tag[4];
+		/* Compute total process length. */
+		aes->total = len + gctx->authsize;
+		/* Compute text length. */
+		gctx->textlen = req->cryptlen;
+		/* Hardware will append authenticated tag to output buffer */
+		scatterwalk_map_and_copy(tag, req->dst, len, gctx->authsize, 1);
+	} else {
+		aes->total = len;
+		gctx->textlen = req->cryptlen - gctx->authsize;
+	}
+	aes->resume = mtk_aes_complete;
+
+	return mtk_aes_gcm_dma(cryp, aes, req->src, req->dst, len);
+}
+
+static int mtk_aes_gcm_crypt(struct aead_request *req, u64 mode)
+{
+	struct mtk_aes_base_ctx *ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
+	struct mtk_aes_reqctx *rctx = aead_request_ctx(req);
+
+	rctx->mode = AES_FLAGS_GCM | mode;
+
+	return mtk_aes_handle_queue(ctx->cryp, !!(mode & AES_FLAGS_ENCRYPT),
+								&req->base);
+}
+
+static void mtk_gcm_setkey_done(struct crypto_async_request *req, int err)
+{
+	struct mtk_aes_gcm_setkey_result *result = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+
+	result->err = err;
+	complete(&result->completion);
+}
+
+/*
+ * Because of the hardware limitation, we need to pre-calculate key(H)
+ * for the GHASH operation. The result of the encryption operation
+ * need to be stored in the transform state buffer.
+ */
+static int mtk_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
+			      u32 keylen)
+{
+	struct mtk_aes_base_ctx *ctx = crypto_aead_ctx(aead);
+	struct mtk_aes_gcm_ctx *gctx = mtk_aes_gcm_ctx_cast(ctx);
+	struct crypto_skcipher *ctr = gctx->ctr;
+	struct {
+		u32 hash[4];
+		u8 iv[8];
+
+		struct mtk_aes_gcm_setkey_result result;
+
+		struct scatterlist sg[1];
+		struct skcipher_request req;
+	} *data;
+	const u32 *aes_key;
+	u32 *key_state, *hash_state;
+	int err, i;
+
+	if (keylen != AES_KEYSIZE_256 &&
+	    keylen != AES_KEYSIZE_192 &&
+	    keylen != AES_KEYSIZE_128) {
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	key_state = ctx->tfm.state;
+	aes_key = (u32 *)key;
+	ctx->keylen = SIZE_IN_WORDS(keylen);
+
+	for (i = 0; i < ctx->keylen; i++)
+		ctx->tfm.state[i] = cpu_to_le32(aes_key[i]);
+
+	/* Same as crypto_gcm_setkey() from crypto/gcm.c */
+	crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK);
+	crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) &
+				  CRYPTO_TFM_REQ_MASK);
+	err = crypto_skcipher_setkey(ctr, key, keylen);
+	crypto_aead_set_flags(aead, crypto_skcipher_get_flags(ctr) &
+			      CRYPTO_TFM_RES_MASK);
+	if (err)
+		return err;
+
+	data = kzalloc(sizeof(*data) + crypto_skcipher_reqsize(ctr),
+		       GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	init_completion(&data->result.completion);
+	sg_init_one(data->sg, &data->hash, AES_BLOCK_SIZE);
+	skcipher_request_set_tfm(&data->req, ctr);
+	skcipher_request_set_callback(&data->req, CRYPTO_TFM_REQ_MAY_SLEEP |
+				      CRYPTO_TFM_REQ_MAY_BACKLOG,
+				      mtk_gcm_setkey_done, &data->result);
+	skcipher_request_set_crypt(&data->req, data->sg, data->sg,
+				   AES_BLOCK_SIZE, data->iv);
+
+	err = crypto_skcipher_encrypt(&data->req);
+	if (err == -EINPROGRESS || err == -EBUSY) {
+		err = wait_for_completion_interruptible(
+			&data->result.completion);
+		if (!err)
+			err = data->result.err;
+	}
+	if (err)
+		goto out;
+
+	hash_state = key_state + ctx->keylen;
+
+	for (i = 0; i < 4; i++)
+		hash_state[i] = cpu_to_be32(data->hash[i]);
+out:
+	kzfree(data);
+	return err;
+}
+
+static int mtk_aes_gcm_setauthsize(struct crypto_aead *aead,
+				   u32 authsize)
+{
+	struct mtk_aes_base_ctx *ctx = crypto_aead_ctx(aead);
+	struct mtk_aes_gcm_ctx *gctx = mtk_aes_gcm_ctx_cast(ctx);
+
+	/* Same as crypto_gcm_authsize() from crypto/gcm.c */
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	gctx->authsize = authsize;
+	return 0;
+}
+
+static int mtk_aes_gcm_encrypt(struct aead_request *req)
+{
+	return mtk_aes_gcm_crypt(req, AES_FLAGS_ENCRYPT);
+}
+
+static int mtk_aes_gcm_decrypt(struct aead_request *req)
+{
+	return mtk_aes_gcm_crypt(req, 0);
+}
+
+static int mtk_aes_gcm_init(struct crypto_aead *aead)
+{
+	struct mtk_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+	struct mtk_cryp *cryp = NULL;
+
+	cryp = mtk_aes_find_dev(&ctx->base);
+	if (!cryp) {
+		pr_err("can't find crypto device\n");
+		return -ENODEV;
+	}
+
+	ctx->ctr = crypto_alloc_skcipher("ctr(aes)", 0,
+					 CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ctx->ctr)) {
+		pr_err("Error allocating ctr(aes)\n");
+		return PTR_ERR(ctx->ctr);
+	}
+
+	crypto_aead_set_reqsize(aead, sizeof(struct mtk_aes_reqctx));
+	ctx->base.start = mtk_aes_gcm_start;
+	return 0;
+}
+
+static void mtk_aes_gcm_exit(struct crypto_aead *aead)
+{
+	struct mtk_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+
+	crypto_free_skcipher(ctx->ctr);
+}
+
+static struct aead_alg aes_gcm_alg = {
+	.setkey		= mtk_aes_gcm_setkey,
+	.setauthsize	= mtk_aes_gcm_setauthsize,
+	.encrypt	= mtk_aes_gcm_encrypt,
+	.decrypt	= mtk_aes_gcm_decrypt,
+	.init		= mtk_aes_gcm_init,
+	.exit		= mtk_aes_gcm_exit,
+	.ivsize		= 12,
+	.maxauthsize	= AES_BLOCK_SIZE,
+
+	.base = {
+		.cra_name		= "gcm(aes)",
+		.cra_driver_name	= "gcm-aes-mtk",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct mtk_aes_gcm_ctx),
+		.cra_alignmask		= 0xf,
+		.cra_module		= THIS_MODULE,
+	},
+};
+
+static void mtk_aes_enc_task(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_aes_rec *aes = cryp->aes[0];
+
+	mtk_aes_unmap(cryp, aes);
+	aes->resume(cryp, aes);
+}
+
+static void mtk_aes_dec_task(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_aes_rec *aes = cryp->aes[1];
+
+	mtk_aes_unmap(cryp, aes);
+	aes->resume(cryp, aes);
+}
+
+static irqreturn_t mtk_aes_enc_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_aes_rec *aes = cryp->aes[0];
+	u32 val = mtk_aes_read(cryp, RDR_STAT(RING0));
+
+	mtk_aes_write(cryp, RDR_STAT(RING0), val);
+
+	if (likely(AES_FLAGS_BUSY & aes->flags)) {
+		mtk_aes_write(cryp, RDR_PROC_COUNT(RING0), MTK_CNT_RST);
+		mtk_aes_write(cryp, RDR_THRESH(RING0),
+			      MTK_RDR_PROC_THRESH | MTK_RDR_PROC_MODE);
+
+		tasklet_schedule(&aes->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mtk_aes_dec_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_aes_rec *aes = cryp->aes[1];
+	u32 val = mtk_aes_read(cryp, RDR_STAT(RING1));
+
+	mtk_aes_write(cryp, RDR_STAT(RING1), val);
+
+	if (likely(AES_FLAGS_BUSY & aes->flags)) {
+		mtk_aes_write(cryp, RDR_PROC_COUNT(RING1), MTK_CNT_RST);
+		mtk_aes_write(cryp, RDR_THRESH(RING1),
+			      MTK_RDR_PROC_THRESH | MTK_RDR_PROC_MODE);
+
+		tasklet_schedule(&aes->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * The purpose of creating encryption and decryption records is
+ * to process outbound/inbound data in parallel, it can improve
+ * performance in most use cases, such as IPSec VPN, especially
+ * under heavy network traffic.
+ */
+static int mtk_aes_record_init(struct mtk_cryp *cryp)
+{
+	struct mtk_aes_rec **aes = cryp->aes;
+	int i, err = -ENOMEM;
+
+	for (i = 0; i < MTK_REC_NUM; i++) {
+		aes[i] = kzalloc(sizeof(**aes), GFP_KERNEL);
+		if (!aes[i])
+			goto err_cleanup;
+
+		aes[i]->buf = (void *)__get_free_pages(GFP_KERNEL,
+						AES_BUF_ORDER);
+		if (!aes[i]->buf)
+			goto err_cleanup;
+
+		aes[i]->id = i;
+
+		spin_lock_init(&aes[i]->lock);
+		crypto_init_queue(&aes[i]->queue, AES_QUEUE_SIZE);
+	}
+
+	tasklet_init(&aes[0]->task, mtk_aes_enc_task, (unsigned long)cryp);
+	tasklet_init(&aes[1]->task, mtk_aes_dec_task, (unsigned long)cryp);
+
+	return 0;
+
+err_cleanup:
+	for (; i--; ) {
+		free_page((unsigned long)aes[i]->buf);
+		kfree(aes[i]);
+	}
+
+	return err;
+}
+
+static void mtk_aes_record_free(struct mtk_cryp *cryp)
+{
+	int i;
+
+	for (i = 0; i < MTK_REC_NUM; i++) {
+		tasklet_kill(&cryp->aes[i]->task);
+		free_page((unsigned long)cryp->aes[i]->buf);
+		kfree(cryp->aes[i]);
+	}
+}
+
+static void mtk_aes_unregister_algs(void)
+{
+	int i;
+
+	crypto_unregister_aead(&aes_gcm_alg);
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
+		crypto_unregister_alg(&aes_algs[i]);
+}
+
+static int mtk_aes_register_algs(void)
+{
+	int err, i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		err = crypto_register_alg(&aes_algs[i]);
+		if (err)
+			goto err_aes_algs;
+	}
+
+	err = crypto_register_aead(&aes_gcm_alg);
+	if (err)
+		goto err_aes_algs;
+
+	return 0;
+
+err_aes_algs:
+	for (; i--; )
+		crypto_unregister_alg(&aes_algs[i]);
+
+	return err;
+}
+
+int mtk_cipher_alg_register(struct mtk_cryp *cryp)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&cryp->aes_list);
+
+	/* Initialize two cipher records */
+	ret = mtk_aes_record_init(cryp);
+	if (ret)
+		goto err_record;
+
+	/* Ring0 is use by encryption record */
+	ret = devm_request_irq(cryp->dev, cryp->irq[RING0], mtk_aes_enc_irq,
+			       IRQF_TRIGGER_LOW, "mtk-aes", cryp);
+	if (ret) {
+		dev_err(cryp->dev, "unable to request AES encryption irq.\n");
+		goto err_res;
+	}
+
+	/* Ring1 is use by decryption record */
+	ret = devm_request_irq(cryp->dev, cryp->irq[RING1], mtk_aes_dec_irq,
+			       IRQF_TRIGGER_LOW, "mtk-aes", cryp);
+	if (ret) {
+		dev_err(cryp->dev, "unable to request AES decryption irq.\n");
+		goto err_res;
+	}
+
+	/* Enable ring0 and ring1 interrupt */
+	mtk_aes_write(cryp, AIC_ENABLE_SET(RING0), MTK_IRQ_RDR0);
+	mtk_aes_write(cryp, AIC_ENABLE_SET(RING1), MTK_IRQ_RDR1);
+
+	spin_lock(&mtk_aes.lock);
+	list_add_tail(&cryp->aes_list, &mtk_aes.dev_list);
+	spin_unlock(&mtk_aes.lock);
+
+	ret = mtk_aes_register_algs();
+	if (ret)
+		goto err_algs;
+
+	return 0;
+
+err_algs:
+	spin_lock(&mtk_aes.lock);
+	list_del(&cryp->aes_list);
+	spin_unlock(&mtk_aes.lock);
+err_res:
+	mtk_aes_record_free(cryp);
+err_record:
+
+	dev_err(cryp->dev, "mtk-aes initialization failed.\n");
+	return ret;
+}
+
+void mtk_cipher_alg_release(struct mtk_cryp *cryp)
+{
+	spin_lock(&mtk_aes.lock);
+	list_del(&cryp->aes_list);
+	spin_unlock(&mtk_aes.lock);
+
+	mtk_aes_unregister_algs();
+	mtk_aes_record_free(cryp);
+}
diff --git a/drivers/crypto/mediatek/mtk-platform.c b/drivers/crypto/mediatek/mtk-platform.c
new file mode 100644
index 000000000000..a9c713d4c733
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-platform.c
@@ -0,0 +1,604 @@
+/*
+ * Driver for EIP97 cryptographic accelerator.
+ *
+ * Copyright (c) 2016 Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include "mtk-platform.h"
+
+#define MTK_BURST_SIZE_MSK		GENMASK(7, 4)
+#define MTK_BURST_SIZE(x)		((x) << 4)
+#define MTK_DESC_SIZE(x)		((x) << 0)
+#define MTK_DESC_OFFSET(x)		((x) << 16)
+#define MTK_DESC_FETCH_SIZE(x)		((x) << 0)
+#define MTK_DESC_FETCH_THRESH(x)	((x) << 16)
+#define MTK_DESC_OVL_IRQ_EN		BIT(25)
+#define MTK_DESC_ATP_PRESENT		BIT(30)
+
+#define MTK_DFSE_IDLE			GENMASK(3, 0)
+#define MTK_DFSE_THR_CTRL_EN		BIT(30)
+#define MTK_DFSE_THR_CTRL_RESET		BIT(31)
+#define MTK_DFSE_RING_ID(x)		(((x) >> 12) & GENMASK(3, 0))
+#define MTK_DFSE_MIN_DATA(x)		((x) << 0)
+#define MTK_DFSE_MAX_DATA(x)		((x) << 8)
+#define MTK_DFE_MIN_CTRL(x)		((x) << 16)
+#define MTK_DFE_MAX_CTRL(x)		((x) << 24)
+
+#define MTK_IN_BUF_MIN_THRESH(x)	((x) << 8)
+#define MTK_IN_BUF_MAX_THRESH(x)	((x) << 12)
+#define MTK_OUT_BUF_MIN_THRESH(x)	((x) << 0)
+#define MTK_OUT_BUF_MAX_THRESH(x)	((x) << 4)
+#define MTK_IN_TBUF_SIZE(x)		(((x) >> 4) & GENMASK(3, 0))
+#define MTK_IN_DBUF_SIZE(x)		(((x) >> 8) & GENMASK(3, 0))
+#define MTK_OUT_DBUF_SIZE(x)		(((x) >> 16) & GENMASK(3, 0))
+#define MTK_CMD_FIFO_SIZE(x)		(((x) >> 8) & GENMASK(3, 0))
+#define MTK_RES_FIFO_SIZE(x)		(((x) >> 12) & GENMASK(3, 0))
+
+#define MTK_PE_TK_LOC_AVL		BIT(2)
+#define MTK_PE_PROC_HELD		BIT(14)
+#define MTK_PE_TK_TIMEOUT_EN		BIT(22)
+#define MTK_PE_INPUT_DMA_ERR		BIT(0)
+#define MTK_PE_OUTPUT_DMA_ERR		BIT(1)
+#define MTK_PE_PKT_PORC_ERR		BIT(2)
+#define MTK_PE_PKT_TIMEOUT		BIT(3)
+#define MTK_PE_FATAL_ERR		BIT(14)
+#define MTK_PE_INPUT_DMA_ERR_EN		BIT(16)
+#define MTK_PE_OUTPUT_DMA_ERR_EN	BIT(17)
+#define MTK_PE_PKT_PORC_ERR_EN		BIT(18)
+#define MTK_PE_PKT_TIMEOUT_EN		BIT(19)
+#define MTK_PE_FATAL_ERR_EN		BIT(30)
+#define MTK_PE_INT_OUT_EN		BIT(31)
+
+#define MTK_HIA_SIGNATURE		((u16)0x35ca)
+#define MTK_HIA_DATA_WIDTH(x)		(((x) >> 25) & GENMASK(1, 0))
+#define MTK_HIA_DMA_LENGTH(x)		(((x) >> 20) & GENMASK(4, 0))
+#define MTK_CDR_STAT_CLR		GENMASK(4, 0)
+#define MTK_RDR_STAT_CLR		GENMASK(7, 0)
+
+#define MTK_AIC_INT_MSK			GENMASK(5, 0)
+#define MTK_AIC_VER_MSK			(GENMASK(15, 0) | GENMASK(27, 20))
+#define MTK_AIC_VER11			0x011036c9
+#define MTK_AIC_VER12			0x012036c9
+#define MTK_AIC_G_CLR			GENMASK(30, 20)
+
+/**
+ * EIP97 is an integrated security subsystem to accelerate cryptographic
+ * functions and protocols to offload the host processor.
+ * Some important hardware modules are briefly introduced below:
+ *
+ * Host Interface Adapter(HIA) - the main interface between the host
+ * system and the hardware subsystem. It is responsible for attaching
+ * processing engine to the specific host bus interface and provides a
+ * standardized software view for off loading tasks to the engine.
+ *
+ * Command Descriptor Ring Manager(CDR Manager) - keeps track of how many
+ * CD the host has prepared in the CDR. It monitors the fill level of its
+ * CD-FIFO and if there's sufficient space for the next block of descriptors,
+ * then it fires off a DMA request to fetch a block of CDs.
+ *
+ * Data fetch engine(DFE) - It is responsible for parsing the CD and
+ * setting up the required control and packet data DMA transfers from
+ * system memory to the processing engine.
+ *
+ * Result Descriptor Ring Manager(RDR Manager) - same as CDR Manager,
+ * but target is result descriptors, Moreover, it also handles the RD
+ * updates under control of the DSE. For each packet data segment
+ * processed, the DSE triggers the RDR Manager to write the updated RD.
+ * If triggered to update, the RDR Manager sets up a DMA operation to
+ * copy the RD from the DSE to the correct location in the RDR.
+ *
+ * Data Store Engine(DSE) - It is responsible for parsing the prepared RD
+ * and setting up the required control and packet data DMA transfers from
+ * the processing engine to system memory.
+ *
+ * Advanced Interrupt Controllers(AICs) - receive interrupt request signals
+ * from various sources and combine them into one interrupt output.
+ * The AICs are used by:
+ * - One for the HIA global and processing engine interrupts.
+ * - The others for the descriptor ring interrupts.
+ */
+
+/* Cryptographic engine capabilities */
+struct mtk_sys_cap {
+	/* host interface adapter */
+	u32 hia_ver;
+	u32 hia_opt;
+	/* packet engine */
+	u32 pkt_eng_opt;
+	/* global hardware */
+	u32 hw_opt;
+};
+
+static void mtk_desc_ring_link(struct mtk_cryp *cryp, u32 mask)
+{
+	/* Assign rings to DFE/DSE thread and enable it */
+	writel(MTK_DFSE_THR_CTRL_EN | mask, cryp->base + DFE_THR_CTRL);
+	writel(MTK_DFSE_THR_CTRL_EN | mask, cryp->base + DSE_THR_CTRL);
+}
+
+static void mtk_dfe_dse_buf_setup(struct mtk_cryp *cryp,
+				  struct mtk_sys_cap *cap)
+{
+	u32 width = MTK_HIA_DATA_WIDTH(cap->hia_opt) + 2;
+	u32 len = MTK_HIA_DMA_LENGTH(cap->hia_opt) - 1;
+	u32 ipbuf = min((u32)MTK_IN_DBUF_SIZE(cap->hw_opt) + width, len);
+	u32 opbuf = min((u32)MTK_OUT_DBUF_SIZE(cap->hw_opt) + width, len);
+	u32 itbuf = min((u32)MTK_IN_TBUF_SIZE(cap->hw_opt) + width, len);
+
+	writel(MTK_DFSE_MIN_DATA(ipbuf - 1) |
+	       MTK_DFSE_MAX_DATA(ipbuf) |
+	       MTK_DFE_MIN_CTRL(itbuf - 1) |
+	       MTK_DFE_MAX_CTRL(itbuf),
+	       cryp->base + DFE_CFG);
+
+	writel(MTK_DFSE_MIN_DATA(opbuf - 1) |
+	       MTK_DFSE_MAX_DATA(opbuf),
+	       cryp->base + DSE_CFG);
+
+	writel(MTK_IN_BUF_MIN_THRESH(ipbuf - 1) |
+	       MTK_IN_BUF_MAX_THRESH(ipbuf),
+	       cryp->base + PE_IN_DBUF_THRESH);
+
+	writel(MTK_IN_BUF_MIN_THRESH(itbuf - 1) |
+	       MTK_IN_BUF_MAX_THRESH(itbuf),
+	       cryp->base + PE_IN_TBUF_THRESH);
+
+	writel(MTK_OUT_BUF_MIN_THRESH(opbuf - 1) |
+	       MTK_OUT_BUF_MAX_THRESH(opbuf),
+	       cryp->base + PE_OUT_DBUF_THRESH);
+
+	writel(0, cryp->base + PE_OUT_TBUF_THRESH);
+	writel(0, cryp->base + PE_OUT_BUF_CTRL);
+}
+
+static int mtk_dfe_dse_state_check(struct mtk_cryp *cryp)
+{
+	int ret = -EINVAL;
+	u32 val;
+
+	/* Check for completion of all DMA transfers */
+	val = readl(cryp->base + DFE_THR_STAT);
+	if (MTK_DFSE_RING_ID(val) == MTK_DFSE_IDLE) {
+		val = readl(cryp->base + DSE_THR_STAT);
+		if (MTK_DFSE_RING_ID(val) == MTK_DFSE_IDLE)
+			ret = 0;
+	}
+
+	if (!ret) {
+		/* Take DFE/DSE thread out of reset */
+		writel(0, cryp->base + DFE_THR_CTRL);
+		writel(0, cryp->base + DSE_THR_CTRL);
+	} else {
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int mtk_dfe_dse_reset(struct mtk_cryp *cryp)
+{
+	int err;
+
+	/* Reset DSE/DFE and correct system priorities for all rings. */
+	writel(MTK_DFSE_THR_CTRL_RESET, cryp->base + DFE_THR_CTRL);
+	writel(0, cryp->base + DFE_PRIO_0);
+	writel(0, cryp->base + DFE_PRIO_1);
+	writel(0, cryp->base + DFE_PRIO_2);
+	writel(0, cryp->base + DFE_PRIO_3);
+
+	writel(MTK_DFSE_THR_CTRL_RESET, cryp->base + DSE_THR_CTRL);
+	writel(0, cryp->base + DSE_PRIO_0);
+	writel(0, cryp->base + DSE_PRIO_1);
+	writel(0, cryp->base + DSE_PRIO_2);
+	writel(0, cryp->base + DSE_PRIO_3);
+
+	err = mtk_dfe_dse_state_check(cryp);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void mtk_cmd_desc_ring_setup(struct mtk_cryp *cryp,
+				    int i, struct mtk_sys_cap *cap)
+{
+	/* Full descriptor that fits FIFO minus one */
+	u32 count =
+		((1 << MTK_CMD_FIFO_SIZE(cap->hia_opt)) / MTK_DESC_SZ) - 1;
+
+	/* Temporarily disable external triggering */
+	writel(0, cryp->base + CDR_CFG(i));
+
+	/* Clear CDR count */
+	writel(MTK_CNT_RST, cryp->base + CDR_PREP_COUNT(i));
+	writel(MTK_CNT_RST, cryp->base + CDR_PROC_COUNT(i));
+
+	writel(0, cryp->base + CDR_PREP_PNTR(i));
+	writel(0, cryp->base + CDR_PROC_PNTR(i));
+	writel(0, cryp->base + CDR_DMA_CFG(i));
+
+	/* Configure CDR host address space */
+	writel(0, cryp->base + CDR_BASE_ADDR_HI(i));
+	writel(cryp->ring[i]->cmd_dma, cryp->base + CDR_BASE_ADDR_LO(i));
+
+	writel(MTK_DESC_RING_SZ, cryp->base + CDR_RING_SIZE(i));
+
+	/* Clear and disable all CDR interrupts */
+	writel(MTK_CDR_STAT_CLR, cryp->base + CDR_STAT(i));
+
+	/*
+	 * Set command descriptor offset and enable additional
+	 * token present in descriptor.
+	 */
+	writel(MTK_DESC_SIZE(MTK_DESC_SZ) |
+		   MTK_DESC_OFFSET(MTK_DESC_OFF) |
+	       MTK_DESC_ATP_PRESENT,
+	       cryp->base + CDR_DESC_SIZE(i));
+
+	writel(MTK_DESC_FETCH_SIZE(count * MTK_DESC_OFF) |
+		   MTK_DESC_FETCH_THRESH(count * MTK_DESC_SZ),
+		   cryp->base + CDR_CFG(i));
+}
+
+static void mtk_res_desc_ring_setup(struct mtk_cryp *cryp,
+				    int i, struct mtk_sys_cap *cap)
+{
+	u32 rndup = 2;
+	u32 count = ((1 << MTK_RES_FIFO_SIZE(cap->hia_opt)) / rndup) - 1;
+
+	/* Temporarily disable external triggering */
+	writel(0, cryp->base + RDR_CFG(i));
+
+	/* Clear RDR count */
+	writel(MTK_CNT_RST, cryp->base + RDR_PREP_COUNT(i));
+	writel(MTK_CNT_RST, cryp->base + RDR_PROC_COUNT(i));
+
+	writel(0, cryp->base + RDR_PREP_PNTR(i));
+	writel(0, cryp->base + RDR_PROC_PNTR(i));
+	writel(0, cryp->base + RDR_DMA_CFG(i));
+
+	/* Configure RDR host address space */
+	writel(0, cryp->base + RDR_BASE_ADDR_HI(i));
+	writel(cryp->ring[i]->res_dma, cryp->base + RDR_BASE_ADDR_LO(i));
+
+	writel(MTK_DESC_RING_SZ, cryp->base + RDR_RING_SIZE(i));
+	writel(MTK_RDR_STAT_CLR, cryp->base + RDR_STAT(i));
+
+	/*
+	 * RDR manager generates update interrupts on a per-completed-packet,
+	 * and the rd_proc_thresh_irq interrupt is fired when proc_pkt_count
+	 * for the RDR exceeds the number of packets.
+	 */
+	writel(MTK_RDR_PROC_THRESH | MTK_RDR_PROC_MODE,
+	       cryp->base + RDR_THRESH(i));
+
+	/*
+	 * Configure a threshold and time-out value for the processed
+	 * result descriptors (or complete packets) that are written to
+	 * the RDR.
+	 */
+	writel(MTK_DESC_SIZE(MTK_DESC_SZ) | MTK_DESC_OFFSET(MTK_DESC_OFF),
+	       cryp->base + RDR_DESC_SIZE(i));
+
+	/*
+	 * Configure HIA fetch size and fetch threshold that are used to
+	 * fetch blocks of multiple descriptors.
+	 */
+	writel(MTK_DESC_FETCH_SIZE(count * MTK_DESC_OFF) |
+	       MTK_DESC_FETCH_THRESH(count * rndup) |
+	       MTK_DESC_OVL_IRQ_EN,
+		   cryp->base + RDR_CFG(i));
+}
+
+static int mtk_packet_engine_setup(struct mtk_cryp *cryp)
+{
+	struct mtk_sys_cap cap;
+	int i, err;
+	u32 val;
+
+	cap.hia_ver = readl(cryp->base + HIA_VERSION);
+	cap.hia_opt = readl(cryp->base + HIA_OPTIONS);
+	cap.hw_opt = readl(cryp->base + EIP97_OPTIONS);
+
+	if (!(((u16)cap.hia_ver) == MTK_HIA_SIGNATURE))
+		return -EINVAL;
+
+	/* Configure endianness conversion method for master (DMA) interface */
+	writel(0, cryp->base + EIP97_MST_CTRL);
+
+	/* Set HIA burst size */
+	val = readl(cryp->base + HIA_MST_CTRL);
+	val &= ~MTK_BURST_SIZE_MSK;
+	val |= MTK_BURST_SIZE(5);
+	writel(val, cryp->base + HIA_MST_CTRL);
+
+	err = mtk_dfe_dse_reset(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Failed to reset DFE and DSE.\n");
+		return err;
+	}
+
+	mtk_dfe_dse_buf_setup(cryp, &cap);
+
+	/* Enable the 4 rings for the packet engines. */
+	mtk_desc_ring_link(cryp, 0xf);
+
+	for (i = 0; i < RING_MAX; i++) {
+		mtk_cmd_desc_ring_setup(cryp, i, &cap);
+		mtk_res_desc_ring_setup(cryp, i, &cap);
+	}
+
+	writel(MTK_PE_TK_LOC_AVL | MTK_PE_PROC_HELD | MTK_PE_TK_TIMEOUT_EN,
+	       cryp->base + PE_TOKEN_CTRL_STAT);
+
+	/* Clear all pending interrupts */
+	writel(MTK_AIC_G_CLR, cryp->base + AIC_G_ACK);
+	writel(MTK_PE_INPUT_DMA_ERR | MTK_PE_OUTPUT_DMA_ERR |
+	       MTK_PE_PKT_PORC_ERR | MTK_PE_PKT_TIMEOUT |
+	       MTK_PE_FATAL_ERR | MTK_PE_INPUT_DMA_ERR_EN |
+	       MTK_PE_OUTPUT_DMA_ERR_EN | MTK_PE_PKT_PORC_ERR_EN |
+	       MTK_PE_PKT_TIMEOUT_EN | MTK_PE_FATAL_ERR_EN |
+	       MTK_PE_INT_OUT_EN,
+	       cryp->base + PE_INTERRUPT_CTRL_STAT);
+
+	return 0;
+}
+
+static int mtk_aic_cap_check(struct mtk_cryp *cryp, int hw)
+{
+	u32 val;
+
+	if (hw == RING_MAX)
+		val = readl(cryp->base + AIC_G_VERSION);
+	else
+		val = readl(cryp->base + AIC_VERSION(hw));
+
+	val &= MTK_AIC_VER_MSK;
+	if (val != MTK_AIC_VER11 && val != MTK_AIC_VER12)
+		return -ENXIO;
+
+	if (hw == RING_MAX)
+		val = readl(cryp->base + AIC_G_OPTIONS);
+	else
+		val = readl(cryp->base + AIC_OPTIONS(hw));
+
+	val &= MTK_AIC_INT_MSK;
+	if (!val || val > 32)
+		return -ENXIO;
+
+	return 0;
+}
+
+static int mtk_aic_init(struct mtk_cryp *cryp, int hw)
+{
+	int err;
+
+	err = mtk_aic_cap_check(cryp, hw);
+	if (err)
+		return err;
+
+	/* Disable all interrupts and set initial configuration */
+	if (hw == RING_MAX) {
+		writel(0, cryp->base + AIC_G_ENABLE_CTRL);
+		writel(0, cryp->base + AIC_G_POL_CTRL);
+		writel(0, cryp->base + AIC_G_TYPE_CTRL);
+		writel(0, cryp->base + AIC_G_ENABLE_SET);
+	} else {
+		writel(0, cryp->base + AIC_ENABLE_CTRL(hw));
+		writel(0, cryp->base + AIC_POL_CTRL(hw));
+		writel(0, cryp->base + AIC_TYPE_CTRL(hw));
+		writel(0, cryp->base + AIC_ENABLE_SET(hw));
+	}
+
+	return 0;
+}
+
+static int mtk_accelerator_init(struct mtk_cryp *cryp)
+{
+	int i, err;
+
+	/* Initialize advanced interrupt controller(AIC) */
+	for (i = 0; i < MTK_IRQ_NUM; i++) {
+		err = mtk_aic_init(cryp, i);
+		if (err) {
+			dev_err(cryp->dev, "Failed to initialize AIC.\n");
+			return err;
+		}
+	}
+
+	/* Initialize packet engine */
+	err = mtk_packet_engine_setup(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Failed to configure packet engine.\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static void mtk_desc_dma_free(struct mtk_cryp *cryp)
+{
+	int i;
+
+	for (i = 0; i < RING_MAX; i++) {
+		dma_free_coherent(cryp->dev, MTK_DESC_RING_SZ,
+				  cryp->ring[i]->res_base,
+				  cryp->ring[i]->res_dma);
+		dma_free_coherent(cryp->dev, MTK_DESC_RING_SZ,
+				  cryp->ring[i]->cmd_base,
+				  cryp->ring[i]->cmd_dma);
+		kfree(cryp->ring[i]);
+	}
+}
+
+static int mtk_desc_ring_alloc(struct mtk_cryp *cryp)
+{
+	struct mtk_ring **ring = cryp->ring;
+	int i, err = ENOMEM;
+
+	for (i = 0; i < RING_MAX; i++) {
+		ring[i] = kzalloc(sizeof(**ring), GFP_KERNEL);
+		if (!ring[i])
+			goto err_cleanup;
+
+		ring[i]->cmd_base = dma_zalloc_coherent(cryp->dev,
+					   MTK_DESC_RING_SZ,
+					   &ring[i]->cmd_dma,
+					   GFP_KERNEL);
+		if (!ring[i]->cmd_base)
+			goto err_cleanup;
+
+		ring[i]->res_base = dma_zalloc_coherent(cryp->dev,
+					   MTK_DESC_RING_SZ,
+					   &ring[i]->res_dma,
+					   GFP_KERNEL);
+		if (!ring[i]->res_base)
+			goto err_cleanup;
+	}
+	return 0;
+
+err_cleanup:
+	for (; i--; ) {
+		dma_free_coherent(cryp->dev, MTK_DESC_RING_SZ,
+				  ring[i]->res_base, ring[i]->res_dma);
+		dma_free_coherent(cryp->dev, MTK_DESC_RING_SZ,
+				  ring[i]->cmd_base, ring[i]->cmd_dma);
+		kfree(ring[i]);
+	}
+	return err;
+}
+
+static int mtk_crypto_probe(struct platform_device *pdev)
+{
+	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	struct mtk_cryp *cryp;
+	int i, err;
+
+	cryp = devm_kzalloc(&pdev->dev, sizeof(*cryp), GFP_KERNEL);
+	if (!cryp)
+		return -ENOMEM;
+
+	cryp->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(cryp->base))
+		return PTR_ERR(cryp->base);
+
+	for (i = 0; i < MTK_IRQ_NUM; i++) {
+		cryp->irq[i] = platform_get_irq(pdev, i);
+		if (cryp->irq[i] < 0) {
+			dev_err(cryp->dev, "no IRQ:%d resource info\n", i);
+			return -ENXIO;
+		}
+	}
+
+	cryp->clk_ethif = devm_clk_get(&pdev->dev, "ethif");
+	cryp->clk_cryp = devm_clk_get(&pdev->dev, "cryp");
+	if (IS_ERR(cryp->clk_ethif) || IS_ERR(cryp->clk_cryp))
+		return -EPROBE_DEFER;
+
+	cryp->dev = &pdev->dev;
+	pm_runtime_enable(cryp->dev);
+	pm_runtime_get_sync(cryp->dev);
+
+	err = clk_prepare_enable(cryp->clk_ethif);
+	if (err)
+		goto err_clk_ethif;
+
+	err = clk_prepare_enable(cryp->clk_cryp);
+	if (err)
+		goto err_clk_cryp;
+
+	/* Allocate four command/result descriptor rings */
+	err = mtk_desc_ring_alloc(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Unable to allocate descriptor rings.\n");
+		goto err_resource;
+	}
+
+	/* Initialize hardware modules */
+	err = mtk_accelerator_init(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Failed to initialize cryptographic engine.\n");
+		goto err_engine;
+	}
+
+	err = mtk_cipher_alg_register(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Unable to register cipher algorithm.\n");
+		goto err_cipher;
+	}
+
+	err = mtk_hash_alg_register(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Unable to register hash algorithm.\n");
+		goto err_hash;
+	}
+
+	platform_set_drvdata(pdev, cryp);
+	return 0;
+
+err_hash:
+	mtk_cipher_alg_release(cryp);
+err_cipher:
+	mtk_dfe_dse_reset(cryp);
+err_engine:
+	mtk_desc_dma_free(cryp);
+err_resource:
+	clk_disable_unprepare(cryp->clk_cryp);
+err_clk_cryp:
+	clk_disable_unprepare(cryp->clk_ethif);
+err_clk_ethif:
+	pm_runtime_put_sync(cryp->dev);
+	pm_runtime_disable(cryp->dev);
+
+	return err;
+}
+
+static int mtk_crypto_remove(struct platform_device *pdev)
+{
+	struct mtk_cryp *cryp = platform_get_drvdata(pdev);
+
+	mtk_hash_alg_release(cryp);
+	mtk_cipher_alg_release(cryp);
+	mtk_desc_dma_free(cryp);
+
+	clk_disable_unprepare(cryp->clk_cryp);
+	clk_disable_unprepare(cryp->clk_ethif);
+
+	pm_runtime_put_sync(cryp->dev);
+	pm_runtime_disable(cryp->dev);
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+static const struct of_device_id of_crypto_id[] = {
+	{ .compatible = "mediatek,eip97-crypto" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, of_crypto_id);
+
+static struct platform_driver mtk_crypto_driver = {
+	.probe = mtk_crypto_probe,
+	.remove = mtk_crypto_remove,
+	.driver = {
+		   .name = "mtk-crypto",
+		   .owner = THIS_MODULE,
+		   .of_match_table = of_crypto_id,
+	},
+};
+module_platform_driver(mtk_crypto_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ryder Lee <ryder.lee@mediatek.com>");
+MODULE_DESCRIPTION("Cryptographic accelerator driver for EIP97");
diff --git a/drivers/crypto/mediatek/mtk-platform.h b/drivers/crypto/mediatek/mtk-platform.h
new file mode 100644
index 000000000000..ed6d8717f7f4
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-platform.h
@@ -0,0 +1,231 @@
+/*
+ * Driver for EIP97 cryptographic accelerator.
+ *
+ * Copyright (c) 2016 Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __MTK_PLATFORM_H_
+#define __MTK_PLATFORM_H_
+
+#include <crypto/algapi.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/hash.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include "mtk-regs.h"
+
+#define MTK_RDR_PROC_THRESH	BIT(0)
+#define MTK_RDR_PROC_MODE	BIT(23)
+#define MTK_CNT_RST		BIT(31)
+#define MTK_IRQ_RDR0		BIT(1)
+#define MTK_IRQ_RDR1		BIT(3)
+#define MTK_IRQ_RDR2		BIT(5)
+#define MTK_IRQ_RDR3		BIT(7)
+
+#define SIZE_IN_WORDS(x)	((x) >> 2)
+
+/**
+ * Ring 0/1 are used by AES encrypt and decrypt.
+ * Ring 2/3 are used by SHA.
+ */
+enum {
+	RING0 = 0,
+	RING1,
+	RING2,
+	RING3,
+	RING_MAX,
+};
+
+#define MTK_REC_NUM		(RING_MAX / 2)
+#define MTK_IRQ_NUM		5
+
+/**
+ * struct mtk_desc - DMA descriptor
+ * @hdr:	the descriptor control header
+ * @buf:	DMA address of input buffer segment
+ * @ct:		DMA address of command token that control operation flow
+ * @ct_hdr:	the command token control header
+ * @tag:	the user-defined field
+ * @tfm:	DMA address of transform state
+ * @bound:	align descriptors offset boundary
+ *
+ * Structure passed to the crypto engine to describe where source
+ * data needs to be fetched and how it needs to be processed.
+ */
+struct mtk_desc {
+	__le32 hdr;
+	__le32 buf;
+	__le32 ct;
+	__le32 ct_hdr;
+	__le32 tag;
+	__le32 tfm;
+	__le32 bound[2];
+};
+
+#define MTK_DESC_NUM		512
+#define MTK_DESC_OFF		SIZE_IN_WORDS(sizeof(struct mtk_desc))
+#define MTK_DESC_SZ		(MTK_DESC_OFF - 2)
+#define MTK_DESC_RING_SZ	((sizeof(struct mtk_desc) * MTK_DESC_NUM))
+#define MTK_DESC_CNT(x)		((MTK_DESC_OFF * (x)) << 2)
+#define MTK_DESC_LAST		cpu_to_le32(BIT(22))
+#define MTK_DESC_FIRST		cpu_to_le32(BIT(23))
+#define MTK_DESC_BUF_LEN(x)	cpu_to_le32(x)
+#define MTK_DESC_CT_LEN(x)	cpu_to_le32((x) << 24)
+
+/**
+ * struct mtk_ring - Descriptor ring
+ * @cmd_base:	pointer to command descriptor ring base
+ * @cmd_dma:	DMA address of command descriptor ring
+ * @cmd_pos:	current position in the command descriptor ring
+ * @res_base:	pointer to result descriptor ring base
+ * @res_dma:	DMA address of result descriptor ring
+ * @res_pos:	current position in the result descriptor ring
+ *
+ * A descriptor ring is a circular buffer that is used to manage
+ * one or more descriptors. There are two type of descriptor rings;
+ * the command descriptor ring and result descriptor ring.
+ */
+struct mtk_ring {
+	struct mtk_desc *cmd_base;
+	dma_addr_t cmd_dma;
+	u32 cmd_pos;
+	struct mtk_desc *res_base;
+	dma_addr_t res_dma;
+	u32 res_pos;
+};
+
+/**
+ * struct mtk_aes_dma - Structure that holds sg list info
+ * @sg:		pointer to scatter-gather list
+ * @nents:	number of entries in the sg list
+ * @remainder:	remainder of sg list
+ * @sg_len:	number of entries in the sg mapped list
+ */
+struct mtk_aes_dma {
+	struct scatterlist *sg;
+	int nents;
+	u32 remainder;
+	u32 sg_len;
+};
+
+struct mtk_aes_base_ctx;
+struct mtk_aes_rec;
+struct mtk_cryp;
+
+typedef int (*mtk_aes_fn)(struct mtk_cryp *cryp, struct mtk_aes_rec *aes);
+
+/**
+ * struct mtk_aes_rec - AES operation record
+ * @queue:	crypto request queue
+ * @areq:	pointer to async request
+ * @task:	the tasklet is use in AES interrupt
+ * @ctx:	pointer to current context
+ * @src:	the structure that holds source sg list info
+ * @dst:	the structure that holds destination sg list info
+ * @aligned_sg:	the scatter list is use to alignment
+ * @real_dst:	pointer to the destination sg list
+ * @resume:	pointer to resume function
+ * @total:	request buffer length
+ * @buf:	pointer to page buffer
+ * @id:		record identification
+ * @flags:	it's describing AES operation state
+ * @lock:	the async queue lock
+ *
+ * Structure used to record AES execution state.
+ */
+struct mtk_aes_rec {
+	struct crypto_queue queue;
+	struct crypto_async_request *areq;
+	struct tasklet_struct task;
+	struct mtk_aes_base_ctx *ctx;
+	struct mtk_aes_dma src;
+	struct mtk_aes_dma dst;
+
+	struct scatterlist aligned_sg;
+	struct scatterlist *real_dst;
+
+	mtk_aes_fn resume;
+
+	size_t total;
+	void *buf;
+
+	u8 id;
+	unsigned long flags;
+	/* queue lock */
+	spinlock_t lock;
+};
+
+/**
+ * struct mtk_sha_rec - SHA operation record
+ * @queue:	crypto request queue
+ * @req:	pointer to ahash request
+ * @task:	the tasklet is use in SHA interrupt
+ * @id:		record identification
+ * @flags:	it's describing SHA operation state
+ * @lock:	the ablkcipher queue lock
+ *
+ * Structure used to record SHA execution state.
+ */
+struct mtk_sha_rec {
+	struct crypto_queue queue;
+	struct ahash_request *req;
+	struct tasklet_struct task;
+
+	u8 id;
+	unsigned long flags;
+	/* queue lock */
+	spinlock_t lock;
+};
+
+/**
+ * struct mtk_cryp - Cryptographic device
+ * @base:	pointer to mapped register I/O base
+ * @dev:	pointer to device
+ * @clk_ethif:	pointer to ethif clock
+ * @clk_cryp:	pointer to crypto clock
+ * @irq:	global system and rings IRQ
+ * @ring:	pointer to execution state of AES
+ * @aes:	pointer to execution state of SHA
+ * @sha:	each execution record map to a ring
+ * @aes_list:	device list of AES
+ * @sha_list:	device list of SHA
+ * @tmp:	pointer to temporary buffer for internal use
+ * @tmp_dma:	DMA address of temporary buffer
+ * @rec:	it's used to select SHA record for tfm
+ *
+ * Structure storing cryptographic device information.
+ */
+struct mtk_cryp {
+	void __iomem *base;
+	struct device *dev;
+	struct clk *clk_ethif;
+	struct clk *clk_cryp;
+	int irq[MTK_IRQ_NUM];
+
+	struct mtk_ring *ring[RING_MAX];
+	struct mtk_aes_rec *aes[MTK_REC_NUM];
+	struct mtk_sha_rec *sha[MTK_REC_NUM];
+
+	struct list_head aes_list;
+	struct list_head sha_list;
+
+	void *tmp;
+	dma_addr_t tmp_dma;
+	bool rec;
+};
+
+int mtk_cipher_alg_register(struct mtk_cryp *cryp);
+void mtk_cipher_alg_release(struct mtk_cryp *cryp);
+int mtk_hash_alg_register(struct mtk_cryp *cryp);
+void mtk_hash_alg_release(struct mtk_cryp *cryp);
+
+#endif /* __MTK_PLATFORM_H_ */
diff --git a/drivers/crypto/mediatek/mtk-regs.h b/drivers/crypto/mediatek/mtk-regs.h
new file mode 100644
index 000000000000..94f4eb85be3f
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-regs.h
@@ -0,0 +1,194 @@
+/*
+ * Support for MediaTek cryptographic accelerator.
+ *
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ */
+
+#ifndef __MTK_REGS_H__
+#define __MTK_REGS_H__
+
+/* HIA, Command Descriptor Ring Manager */
+#define CDR_BASE_ADDR_LO(x)		(0x0 + ((x) << 12))
+#define CDR_BASE_ADDR_HI(x)		(0x4 + ((x) << 12))
+#define CDR_DATA_BASE_ADDR_LO(x)	(0x8 + ((x) << 12))
+#define CDR_DATA_BASE_ADDR_HI(x)	(0xC + ((x) << 12))
+#define CDR_ACD_BASE_ADDR_LO(x)		(0x10 + ((x) << 12))
+#define CDR_ACD_BASE_ADDR_HI(x)		(0x14 + ((x) << 12))
+#define CDR_RING_SIZE(x)		(0x18 + ((x) << 12))
+#define CDR_DESC_SIZE(x)		(0x1C + ((x) << 12))
+#define CDR_CFG(x)			(0x20 + ((x) << 12))
+#define CDR_DMA_CFG(x)			(0x24 + ((x) << 12))
+#define CDR_THRESH(x)			(0x28 + ((x) << 12))
+#define CDR_PREP_COUNT(x)		(0x2C + ((x) << 12))
+#define CDR_PROC_COUNT(x)		(0x30 + ((x) << 12))
+#define CDR_PREP_PNTR(x)		(0x34 + ((x) << 12))
+#define CDR_PROC_PNTR(x)		(0x38 + ((x) << 12))
+#define CDR_STAT(x)			(0x3C + ((x) << 12))
+
+/* HIA, Result Descriptor Ring Manager */
+#define RDR_BASE_ADDR_LO(x)		(0x800 + ((x) << 12))
+#define RDR_BASE_ADDR_HI(x)		(0x804 + ((x) << 12))
+#define RDR_DATA_BASE_ADDR_LO(x)	(0x808 + ((x) << 12))
+#define RDR_DATA_BASE_ADDR_HI(x)	(0x80C + ((x) << 12))
+#define RDR_ACD_BASE_ADDR_LO(x)		(0x810 + ((x) << 12))
+#define RDR_ACD_BASE_ADDR_HI(x)		(0x814 + ((x) << 12))
+#define RDR_RING_SIZE(x)		(0x818 + ((x) << 12))
+#define RDR_DESC_SIZE(x)		(0x81C + ((x) << 12))
+#define RDR_CFG(x)			(0x820 + ((x) << 12))
+#define RDR_DMA_CFG(x)			(0x824 + ((x) << 12))
+#define RDR_THRESH(x)			(0x828 + ((x) << 12))
+#define RDR_PREP_COUNT(x)		(0x82C + ((x) << 12))
+#define RDR_PROC_COUNT(x)		(0x830 + ((x) << 12))
+#define RDR_PREP_PNTR(x)		(0x834 + ((x) << 12))
+#define RDR_PROC_PNTR(x)		(0x838 + ((x) << 12))
+#define RDR_STAT(x)			(0x83C + ((x) << 12))
+
+/* HIA, Ring AIC */
+#define AIC_POL_CTRL(x)			(0xE000 - ((x) << 12))
+#define	AIC_TYPE_CTRL(x)		(0xE004 - ((x) << 12))
+#define	AIC_ENABLE_CTRL(x)		(0xE008 - ((x) << 12))
+#define	AIC_RAW_STAL(x)			(0xE00C - ((x) << 12))
+#define	AIC_ENABLE_SET(x)		(0xE00C - ((x) << 12))
+#define	AIC_ENABLED_STAT(x)		(0xE010 - ((x) << 12))
+#define	AIC_ACK(x)			(0xE010 - ((x) << 12))
+#define	AIC_ENABLE_CLR(x)		(0xE014 - ((x) << 12))
+#define	AIC_OPTIONS(x)			(0xE018 - ((x) << 12))
+#define	AIC_VERSION(x)			(0xE01C - ((x) << 12))
+
+/* HIA, Global AIC */
+#define AIC_G_POL_CTRL			0xF800
+#define AIC_G_TYPE_CTRL			0xF804
+#define AIC_G_ENABLE_CTRL		0xF808
+#define AIC_G_RAW_STAT			0xF80C
+#define AIC_G_ENABLE_SET		0xF80C
+#define AIC_G_ENABLED_STAT		0xF810
+#define AIC_G_ACK			0xF810
+#define AIC_G_ENABLE_CLR		0xF814
+#define AIC_G_OPTIONS			0xF818
+#define AIC_G_VERSION			0xF81C
+
+/* HIA, Data Fetch Engine */
+#define DFE_CFG				0xF000
+#define DFE_PRIO_0			0xF010
+#define DFE_PRIO_1			0xF014
+#define DFE_PRIO_2			0xF018
+#define DFE_PRIO_3			0xF01C
+
+/* HIA, Data Fetch Engine access monitoring for CDR */
+#define DFE_RING_REGION_LO(x)		(0xF080 + ((x) << 3))
+#define DFE_RING_REGION_HI(x)		(0xF084 + ((x) << 3))
+
+/* HIA, Data Fetch Engine thread control and status for thread */
+#define DFE_THR_CTRL			0xF200
+#define DFE_THR_STAT			0xF204
+#define DFE_THR_DESC_CTRL		0xF208
+#define DFE_THR_DESC_DPTR_LO		0xF210
+#define DFE_THR_DESC_DPTR_HI		0xF214
+#define DFE_THR_DESC_ACDPTR_LO		0xF218
+#define DFE_THR_DESC_ACDPTR_HI		0xF21C
+
+/* HIA, Data Store Engine */
+#define DSE_CFG				0xF400
+#define DSE_PRIO_0			0xF410
+#define DSE_PRIO_1			0xF414
+#define DSE_PRIO_2			0xF418
+#define DSE_PRIO_3			0xF41C
+
+/* HIA, Data Store Engine access monitoring for RDR */
+#define DSE_RING_REGION_LO(x)		(0xF480 + ((x) << 3))
+#define DSE_RING_REGION_HI(x)		(0xF484 + ((x) << 3))
+
+/* HIA, Data Store Engine thread control and status for thread */
+#define DSE_THR_CTRL			0xF600
+#define DSE_THR_STAT			0xF604
+#define DSE_THR_DESC_CTRL		0xF608
+#define DSE_THR_DESC_DPTR_LO		0xF610
+#define DSE_THR_DESC_DPTR_HI		0xF614
+#define DSE_THR_DESC_S_DPTR_LO		0xF618
+#define DSE_THR_DESC_S_DPTR_HI		0xF61C
+#define DSE_THR_ERROR_STAT		0xF620
+
+/* HIA Global */
+#define HIA_MST_CTRL			0xFFF4
+#define HIA_OPTIONS			0xFFF8
+#define HIA_VERSION			0xFFFC
+
+/* Processing Engine Input Side, Processing Engine */
+#define PE_IN_DBUF_THRESH		0x10000
+#define PE_IN_TBUF_THRESH		0x10100
+
+/* Packet Engine Configuration / Status Registers */
+#define PE_TOKEN_CTRL_STAT		0x11000
+#define PE_FUNCTION_EN			0x11004
+#define PE_CONTEXT_CTRL			0x11008
+#define PE_INTERRUPT_CTRL_STAT		0x11010
+#define PE_CONTEXT_STAT			0x1100C
+#define PE_OUT_TRANS_CTRL_STAT		0x11018
+#define PE_OUT_BUF_CTRL			0x1101C
+
+/* Packet Engine PRNG Registers */
+#define PE_PRNG_STAT			0x11040
+#define PE_PRNG_CTRL			0x11044
+#define PE_PRNG_SEED_L			0x11048
+#define PE_PRNG_SEED_H			0x1104C
+#define PE_PRNG_KEY_0_L			0x11050
+#define PE_PRNG_KEY_0_H			0x11054
+#define PE_PRNG_KEY_1_L			0x11058
+#define PE_PRNG_KEY_1_H			0x1105C
+#define PE_PRNG_RES_0			0x11060
+#define PE_PRNG_RES_1			0x11064
+#define PE_PRNG_RES_2			0x11068
+#define PE_PRNG_RES_3			0x1106C
+#define PE_PRNG_LFSR_L			0x11070
+#define PE_PRNG_LFSR_H			0x11074
+
+/* Packet Engine AIC */
+#define PE_EIP96_AIC_POL_CTRL		0x113C0
+#define PE_EIP96_AIC_TYPE_CTRL		0x113C4
+#define PE_EIP96_AIC_ENABLE_CTRL	0x113C8
+#define PE_EIP96_AIC_RAW_STAT		0x113CC
+#define PE_EIP96_AIC_ENABLE_SET		0x113CC
+#define PE_EIP96_AIC_ENABLED_STAT	0x113D0
+#define PE_EIP96_AIC_ACK		0x113D0
+#define PE_EIP96_AIC_ENABLE_CLR		0x113D4
+#define PE_EIP96_AIC_OPTIONS		0x113D8
+#define PE_EIP96_AIC_VERSION		0x113DC
+
+/* Packet Engine Options & Version Registers */
+#define PE_EIP96_OPTIONS		0x113F8
+#define PE_EIP96_VERSION		0x113FC
+
+/* Processing Engine Output Side */
+#define PE_OUT_DBUF_THRESH		0x11C00
+#define PE_OUT_TBUF_THRESH		0x11D00
+
+/* Processing Engine Local AIC */
+#define PE_AIC_POL_CTRL			0x11F00
+#define PE_AIC_TYPE_CTRL		0x11F04
+#define PE_AIC_ENABLE_CTRL		0x11F08
+#define PE_AIC_RAW_STAT			0x11F0C
+#define PE_AIC_ENABLE_SET		0x11F0C
+#define PE_AIC_ENABLED_STAT		0x11F10
+#define PE_AIC_ENABLE_CLR		0x11F14
+#define PE_AIC_OPTIONS			0x11F18
+#define PE_AIC_VERSION			0x11F1C
+
+/* Processing Engine General Configuration and Version */
+#define PE_IN_FLIGHT			0x11FF0
+#define PE_OPTIONS			0x11FF8
+#define PE_VERSION			0x11FFC
+
+/* EIP-97 - Global */
+#define EIP97_CLOCK_STATE		0x1FFE4
+#define EIP97_FORCE_CLOCK_ON		0x1FFE8
+#define EIP97_FORCE_CLOCK_OFF		0x1FFEC
+#define EIP97_MST_CTRL			0x1FFF4
+#define EIP97_OPTIONS			0x1FFF8
+#define EIP97_VERSION			0x1FFFC
+#endif /* __MTK_REGS_H__ */
diff --git a/drivers/crypto/mediatek/mtk-sha.c b/drivers/crypto/mediatek/mtk-sha.c
new file mode 100644
index 000000000000..55e3805fba07
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-sha.c
@@ -0,0 +1,1435 @@
+/*
+ * Cryptographic API.
+ *
+ * Driver for EIP97 SHA1/SHA2(HMAC) acceleration.
+ *
+ * Copyright (c) 2016 Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Some ideas are from atmel-sha.c and omap-sham.c drivers.
+ */
+
+#include <crypto/sha.h>
+#include "mtk-platform.h"
+
+#define SHA_ALIGN_MSK		(sizeof(u32) - 1)
+#define SHA_QUEUE_SIZE		512
+#define SHA_TMP_BUF_SIZE	512
+#define SHA_BUF_SIZE		((u32)PAGE_SIZE)
+
+#define SHA_OP_UPDATE		1
+#define SHA_OP_FINAL		2
+
+#define SHA_DATA_LEN_MSK	cpu_to_le32(GENMASK(16, 0))
+
+/* SHA command token */
+#define SHA_CT_SIZE		5
+#define SHA_CT_CTRL_HDR		cpu_to_le32(0x02220000)
+#define SHA_CMD0		cpu_to_le32(0x03020000)
+#define SHA_CMD1		cpu_to_le32(0x21060000)
+#define SHA_CMD2		cpu_to_le32(0xe0e63802)
+
+/* SHA transform information */
+#define SHA_TFM_HASH		cpu_to_le32(0x2 << 0)
+#define SHA_TFM_INNER_DIG	cpu_to_le32(0x1 << 21)
+#define SHA_TFM_SIZE(x)		cpu_to_le32((x) << 8)
+#define SHA_TFM_START		cpu_to_le32(0x1 << 4)
+#define SHA_TFM_CONTINUE	cpu_to_le32(0x1 << 5)
+#define SHA_TFM_HASH_STORE	cpu_to_le32(0x1 << 19)
+#define SHA_TFM_SHA1		cpu_to_le32(0x2 << 23)
+#define SHA_TFM_SHA256		cpu_to_le32(0x3 << 23)
+#define SHA_TFM_SHA224		cpu_to_le32(0x4 << 23)
+#define SHA_TFM_SHA512		cpu_to_le32(0x5 << 23)
+#define SHA_TFM_SHA384		cpu_to_le32(0x6 << 23)
+#define SHA_TFM_DIGEST(x)	cpu_to_le32(((x) & GENMASK(3, 0)) << 24)
+
+/* SHA flags */
+#define SHA_FLAGS_BUSY		BIT(0)
+#define	SHA_FLAGS_FINAL		BIT(1)
+#define SHA_FLAGS_FINUP		BIT(2)
+#define SHA_FLAGS_SG		BIT(3)
+#define SHA_FLAGS_ALGO_MSK	GENMASK(8, 4)
+#define SHA_FLAGS_SHA1		BIT(4)
+#define SHA_FLAGS_SHA224	BIT(5)
+#define SHA_FLAGS_SHA256	BIT(6)
+#define SHA_FLAGS_SHA384	BIT(7)
+#define SHA_FLAGS_SHA512	BIT(8)
+#define SHA_FLAGS_HMAC		BIT(9)
+#define SHA_FLAGS_PAD		BIT(10)
+
+/**
+ * mtk_sha_ct is a set of hardware instructions(command token)
+ * that are used to control engine's processing flow of SHA,
+ * and it contains the first two words of transform state.
+ */
+struct mtk_sha_ct {
+	__le32 ctrl[2];
+	__le32 cmd[3];
+};
+
+/**
+ * mtk_sha_tfm is used to define SHA transform state
+ * and store result digest that produced by engine.
+ */
+struct mtk_sha_tfm {
+	__le32 ctrl[2];
+	__le32 digest[SIZE_IN_WORDS(SHA512_DIGEST_SIZE)];
+};
+
+/**
+ * mtk_sha_info consists of command token and transform state
+ * of SHA, its role is similar to mtk_aes_info.
+ */
+struct mtk_sha_info {
+	struct mtk_sha_ct ct;
+	struct mtk_sha_tfm tfm;
+};
+
+struct mtk_sha_reqctx {
+	struct mtk_sha_info info;
+	unsigned long flags;
+	unsigned long op;
+
+	u64 digcnt;
+	bool start;
+	size_t bufcnt;
+	dma_addr_t dma_addr;
+
+	__le32 ct_hdr;
+	u32 ct_size;
+	dma_addr_t ct_dma;
+	dma_addr_t tfm_dma;
+
+	/* Walk state */
+	struct scatterlist *sg;
+	u32 offset;	/* Offset in current sg */
+	u32 total;	/* Total request */
+	size_t ds;
+	size_t bs;
+
+	u8 *buffer;
+};
+
+struct mtk_sha_hmac_ctx {
+	struct crypto_shash	*shash;
+	u8 ipad[SHA512_BLOCK_SIZE] __aligned(sizeof(u32));
+	u8 opad[SHA512_BLOCK_SIZE] __aligned(sizeof(u32));
+};
+
+struct mtk_sha_ctx {
+	struct mtk_cryp *cryp;
+	unsigned long flags;
+	u8 id;
+	u8 buf[SHA_BUF_SIZE] __aligned(sizeof(u32));
+
+	struct mtk_sha_hmac_ctx	base[0];
+};
+
+struct mtk_sha_drv {
+	struct list_head dev_list;
+	/* Device list lock */
+	spinlock_t lock;
+};
+
+static struct mtk_sha_drv mtk_sha = {
+	.dev_list = LIST_HEAD_INIT(mtk_sha.dev_list),
+	.lock = __SPIN_LOCK_UNLOCKED(mtk_sha.lock),
+};
+
+static int mtk_sha_handle_queue(struct mtk_cryp *cryp, u8 id,
+				struct ahash_request *req);
+
+static inline u32 mtk_sha_read(struct mtk_cryp *cryp, u32 offset)
+{
+	return readl_relaxed(cryp->base + offset);
+}
+
+static inline void mtk_sha_write(struct mtk_cryp *cryp,
+				 u32 offset, u32 value)
+{
+	writel_relaxed(value, cryp->base + offset);
+}
+
+static struct mtk_cryp *mtk_sha_find_dev(struct mtk_sha_ctx *tctx)
+{
+	struct mtk_cryp *cryp = NULL;
+	struct mtk_cryp *tmp;
+
+	spin_lock_bh(&mtk_sha.lock);
+	if (!tctx->cryp) {
+		list_for_each_entry(tmp, &mtk_sha.dev_list, sha_list) {
+			cryp = tmp;
+			break;
+		}
+		tctx->cryp = cryp;
+	} else {
+		cryp = tctx->cryp;
+	}
+
+	/*
+	 * Assign record id to tfm in round-robin fashion, and this
+	 * will help tfm to bind  to corresponding descriptor rings.
+	 */
+	tctx->id = cryp->rec;
+	cryp->rec = !cryp->rec;
+
+	spin_unlock_bh(&mtk_sha.lock);
+
+	return cryp;
+}
+
+static int mtk_sha_append_sg(struct mtk_sha_reqctx *ctx)
+{
+	size_t count;
+
+	while ((ctx->bufcnt < SHA_BUF_SIZE) && ctx->total) {
+		count = min(ctx->sg->length - ctx->offset, ctx->total);
+		count = min(count, SHA_BUF_SIZE - ctx->bufcnt);
+
+		if (count <= 0) {
+			/*
+			 * Check if count <= 0 because the buffer is full or
+			 * because the sg length is 0. In the latest case,
+			 * check if there is another sg in the list, a 0 length
+			 * sg doesn't necessarily mean the end of the sg list.
+			 */
+			if ((ctx->sg->length == 0) && !sg_is_last(ctx->sg)) {
+				ctx->sg = sg_next(ctx->sg);
+				continue;
+			} else {
+				break;
+			}
+		}
+
+		scatterwalk_map_and_copy(ctx->buffer + ctx->bufcnt, ctx->sg,
+					 ctx->offset, count, 0);
+
+		ctx->bufcnt += count;
+		ctx->offset += count;
+		ctx->total -= count;
+
+		if (ctx->offset == ctx->sg->length) {
+			ctx->sg = sg_next(ctx->sg);
+			if (ctx->sg)
+				ctx->offset = 0;
+			else
+				ctx->total = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * The purpose of this padding is to ensure that the padded message is a
+ * multiple of 512 bits (SHA1/SHA224/SHA256) or 1024 bits (SHA384/SHA512).
+ * The bit "1" is appended at the end of the message followed by
+ * "padlen-1" zero bits. Then a 64 bits block (SHA1/SHA224/SHA256) or
+ * 128 bits block (SHA384/SHA512) equals to the message length in bits
+ * is appended.
+ *
+ * For SHA1/SHA224/SHA256, padlen is calculated as followed:
+ *  - if message length < 56 bytes then padlen = 56 - message length
+ *  - else padlen = 64 + 56 - message length
+ *
+ * For SHA384/SHA512, padlen is calculated as followed:
+ *  - if message length < 112 bytes then padlen = 112 - message length
+ *  - else padlen = 128 + 112 - message length
+ */
+static void mtk_sha_fill_padding(struct mtk_sha_reqctx *ctx, u32 len)
+{
+	u32 index, padlen;
+	u64 bits[2];
+	u64 size = ctx->digcnt;
+
+	size += ctx->bufcnt;
+	size += len;
+
+	bits[1] = cpu_to_be64(size << 3);
+	bits[0] = cpu_to_be64(size >> 61);
+
+	if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) {
+		index = ctx->bufcnt & 0x7f;
+		padlen = (index < 112) ? (112 - index) : ((128 + 112) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen - 1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16);
+		ctx->bufcnt += padlen + 16;
+		ctx->flags |= SHA_FLAGS_PAD;
+	} else {
+		index = ctx->bufcnt & 0x3f;
+		padlen = (index < 56) ? (56 - index) : ((64 + 56) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen - 1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8);
+		ctx->bufcnt += padlen + 8;
+		ctx->flags |= SHA_FLAGS_PAD;
+	}
+}
+
+/* Initialize basic transform information of SHA */
+static void mtk_sha_info_init(struct mtk_sha_reqctx *ctx)
+{
+	struct mtk_sha_ct *ct = &ctx->info.ct;
+	struct mtk_sha_tfm *tfm = &ctx->info.tfm;
+
+	ctx->ct_hdr = SHA_CT_CTRL_HDR;
+	ctx->ct_size = SHA_CT_SIZE;
+
+	tfm->ctrl[0] = SHA_TFM_HASH | SHA_TFM_INNER_DIG |
+		       SHA_TFM_SIZE(SIZE_IN_WORDS(ctx->ds));
+
+	switch (ctx->flags & SHA_FLAGS_ALGO_MSK) {
+	case SHA_FLAGS_SHA1:
+		tfm->ctrl[0] |= SHA_TFM_SHA1;
+		break;
+	case SHA_FLAGS_SHA224:
+		tfm->ctrl[0] |= SHA_TFM_SHA224;
+		break;
+	case SHA_FLAGS_SHA256:
+		tfm->ctrl[0] |= SHA_TFM_SHA256;
+		break;
+	case SHA_FLAGS_SHA384:
+		tfm->ctrl[0] |= SHA_TFM_SHA384;
+		break;
+	case SHA_FLAGS_SHA512:
+		tfm->ctrl[0] |= SHA_TFM_SHA512;
+		break;
+
+	default:
+		/* Should not happen... */
+		return;
+	}
+
+	tfm->ctrl[1] = SHA_TFM_HASH_STORE;
+	ct->ctrl[0] = tfm->ctrl[0] | SHA_TFM_CONTINUE | SHA_TFM_START;
+	ct->ctrl[1] = tfm->ctrl[1];
+
+	ct->cmd[0] = SHA_CMD0;
+	ct->cmd[1] = SHA_CMD1;
+	ct->cmd[2] = SHA_CMD2 | SHA_TFM_DIGEST(SIZE_IN_WORDS(ctx->ds));
+}
+
+/*
+ * Update input data length field of transform information and
+ * map it to DMA region.
+ */
+static int mtk_sha_info_update(struct mtk_cryp *cryp,
+			       struct mtk_sha_rec *sha,
+			       size_t len)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	struct mtk_sha_info *info = &ctx->info;
+	struct mtk_sha_ct *ct = &info->ct;
+
+	if (ctx->start)
+		ctx->start = false;
+	else
+		ct->ctrl[0] &= ~SHA_TFM_START;
+
+	ctx->ct_hdr &= ~SHA_DATA_LEN_MSK;
+	ctx->ct_hdr |= cpu_to_le32(len);
+	ct->cmd[0] &= ~SHA_DATA_LEN_MSK;
+	ct->cmd[0] |= cpu_to_le32(len);
+
+	ctx->digcnt += len;
+
+	ctx->ct_dma = dma_map_single(cryp->dev, info, sizeof(*info),
+				     DMA_BIDIRECTIONAL);
+	if (unlikely(dma_mapping_error(cryp->dev, ctx->ct_dma))) {
+		dev_err(cryp->dev, "dma %zu bytes error\n", sizeof(*info));
+		return -EINVAL;
+	}
+	ctx->tfm_dma = ctx->ct_dma + sizeof(*ct);
+
+	return 0;
+}
+
+/*
+ * Because of hardware limitation, we must pre-calculate the inner
+ * and outer digest that need to be processed firstly by engine, then
+ * apply the result digest to the input message. These complex hashing
+ * procedures limits HMAC performance, so we use fallback SW encoding.
+ */
+static int mtk_sha_finish_hmac(struct ahash_request *req)
+{
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
+	struct mtk_sha_hmac_ctx *bctx = tctx->base;
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	SHASH_DESC_ON_STACK(shash, bctx->shash);
+
+	shash->tfm = bctx->shash;
+	shash->flags = 0; /* not CRYPTO_TFM_REQ_MAY_SLEEP */
+
+	return crypto_shash_init(shash) ?:
+	       crypto_shash_update(shash, bctx->opad, ctx->bs) ?:
+	       crypto_shash_finup(shash, req->result, ctx->ds, req->result);
+}
+
+/* Initialize request context */
+static int mtk_sha_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct mtk_sha_ctx *tctx = crypto_ahash_ctx(tfm);
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	ctx->flags = 0;
+	ctx->ds = crypto_ahash_digestsize(tfm);
+
+	switch (ctx->ds) {
+	case SHA1_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA1;
+		ctx->bs = SHA1_BLOCK_SIZE;
+		break;
+	case SHA224_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA224;
+		ctx->bs = SHA224_BLOCK_SIZE;
+		break;
+	case SHA256_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA256;
+		ctx->bs = SHA256_BLOCK_SIZE;
+		break;
+	case SHA384_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA384;
+		ctx->bs = SHA384_BLOCK_SIZE;
+		break;
+	case SHA512_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA512;
+		ctx->bs = SHA512_BLOCK_SIZE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ctx->bufcnt = 0;
+	ctx->digcnt = 0;
+	ctx->buffer = tctx->buf;
+	ctx->start = true;
+
+	if (tctx->flags & SHA_FLAGS_HMAC) {
+		struct mtk_sha_hmac_ctx *bctx = tctx->base;
+
+		memcpy(ctx->buffer, bctx->ipad, ctx->bs);
+		ctx->bufcnt = ctx->bs;
+		ctx->flags |= SHA_FLAGS_HMAC;
+	}
+
+	return 0;
+}
+
+static int mtk_sha_xmit(struct mtk_cryp *cryp, struct mtk_sha_rec *sha,
+			dma_addr_t addr, size_t len)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	struct mtk_ring *ring = cryp->ring[sha->id];
+	struct mtk_desc *cmd = ring->cmd_base + ring->cmd_pos;
+	struct mtk_desc *res = ring->res_base + ring->res_pos;
+	int err;
+
+	err = mtk_sha_info_update(cryp, sha, len);
+	if (err)
+		return err;
+
+	/* Fill in the command/result descriptors */
+	res->hdr = MTK_DESC_FIRST | MTK_DESC_LAST | MTK_DESC_BUF_LEN(len);
+	res->buf = cpu_to_le32(cryp->tmp_dma);
+
+	cmd->hdr = MTK_DESC_FIRST | MTK_DESC_LAST | MTK_DESC_BUF_LEN(len) |
+		   MTK_DESC_CT_LEN(ctx->ct_size);
+
+	cmd->buf = cpu_to_le32(addr);
+	cmd->ct = cpu_to_le32(ctx->ct_dma);
+	cmd->ct_hdr = ctx->ct_hdr;
+	cmd->tfm = cpu_to_le32(ctx->tfm_dma);
+
+	if (++ring->cmd_pos == MTK_DESC_NUM)
+		ring->cmd_pos = 0;
+
+	ring->res_pos = ring->cmd_pos;
+	/*
+	 * Make sure that all changes to the DMA ring are done before we
+	 * start engine.
+	 */
+	wmb();
+	/* Start DMA transfer */
+	mtk_sha_write(cryp, RDR_PREP_COUNT(sha->id), MTK_DESC_CNT(1));
+	mtk_sha_write(cryp, CDR_PREP_COUNT(sha->id), MTK_DESC_CNT(1));
+
+	return -EINPROGRESS;
+}
+
+static int mtk_sha_xmit2(struct mtk_cryp *cryp,
+			 struct mtk_sha_rec *sha,
+			 struct mtk_sha_reqctx *ctx,
+			 size_t len1, size_t len2)
+{
+	struct mtk_ring *ring = cryp->ring[sha->id];
+	struct mtk_desc *cmd = ring->cmd_base + ring->cmd_pos;
+	struct mtk_desc *res = ring->res_base + ring->res_pos;
+	int err;
+
+	err = mtk_sha_info_update(cryp, sha, len1 + len2);
+	if (err)
+		return err;
+
+	/* Fill in the command/result descriptors */
+	res->hdr = MTK_DESC_BUF_LEN(len1) | MTK_DESC_FIRST;
+	res->buf = cpu_to_le32(cryp->tmp_dma);
+
+	cmd->hdr = MTK_DESC_BUF_LEN(len1) | MTK_DESC_FIRST |
+		   MTK_DESC_CT_LEN(ctx->ct_size);
+	cmd->buf = cpu_to_le32(sg_dma_address(ctx->sg));
+	cmd->ct = cpu_to_le32(ctx->ct_dma);
+	cmd->ct_hdr = ctx->ct_hdr;
+	cmd->tfm = cpu_to_le32(ctx->tfm_dma);
+
+	if (++ring->cmd_pos == MTK_DESC_NUM)
+		ring->cmd_pos = 0;
+
+	ring->res_pos = ring->cmd_pos;
+
+	cmd = ring->cmd_base + ring->cmd_pos;
+	res = ring->res_base + ring->res_pos;
+
+	res->hdr = MTK_DESC_BUF_LEN(len2) | MTK_DESC_LAST;
+	res->buf = cpu_to_le32(cryp->tmp_dma);
+
+	cmd->hdr = MTK_DESC_BUF_LEN(len2) | MTK_DESC_LAST;
+	cmd->buf = cpu_to_le32(ctx->dma_addr);
+
+	if (++ring->cmd_pos == MTK_DESC_NUM)
+		ring->cmd_pos = 0;
+
+	ring->res_pos = ring->cmd_pos;
+
+	/*
+	 * Make sure that all changes to the DMA ring are done before we
+	 * start engine.
+	 */
+	wmb();
+	/* Start DMA transfer */
+	mtk_sha_write(cryp, RDR_PREP_COUNT(sha->id), MTK_DESC_CNT(2));
+	mtk_sha_write(cryp, CDR_PREP_COUNT(sha->id), MTK_DESC_CNT(2));
+
+	return -EINPROGRESS;
+}
+
+static int mtk_sha_dma_map(struct mtk_cryp *cryp,
+			   struct mtk_sha_rec *sha,
+			   struct mtk_sha_reqctx *ctx,
+			   size_t count)
+{
+	ctx->dma_addr = dma_map_single(cryp->dev, ctx->buffer,
+				       SHA_BUF_SIZE, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(cryp->dev, ctx->dma_addr))) {
+		dev_err(cryp->dev, "dma map error\n");
+		return -EINVAL;
+	}
+
+	ctx->flags &= ~SHA_FLAGS_SG;
+
+	return mtk_sha_xmit(cryp, sha, ctx->dma_addr, count);
+}
+
+static int mtk_sha_update_slow(struct mtk_cryp *cryp,
+			       struct mtk_sha_rec *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	size_t count;
+	u32 final;
+
+	mtk_sha_append_sg(ctx);
+
+	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
+
+	dev_dbg(cryp->dev, "slow: bufcnt: %zu\n", ctx->bufcnt);
+
+	if (final) {
+		sha->flags |= SHA_FLAGS_FINAL;
+		mtk_sha_fill_padding(ctx, 0);
+	}
+
+	if (final || (ctx->bufcnt == SHA_BUF_SIZE && ctx->total)) {
+		count = ctx->bufcnt;
+		ctx->bufcnt = 0;
+
+		return mtk_sha_dma_map(cryp, sha, ctx, count);
+	}
+	return 0;
+}
+
+static int mtk_sha_update_start(struct mtk_cryp *cryp,
+				struct mtk_sha_rec *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	u32 len, final, tail;
+	struct scatterlist *sg;
+
+	if (!ctx->total)
+		return 0;
+
+	if (ctx->bufcnt || ctx->offset)
+		return mtk_sha_update_slow(cryp, sha);
+
+	sg = ctx->sg;
+
+	if (!IS_ALIGNED(sg->offset, sizeof(u32)))
+		return mtk_sha_update_slow(cryp, sha);
+
+	if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, ctx->bs))
+		/* size is not ctx->bs aligned */
+		return mtk_sha_update_slow(cryp, sha);
+
+	len = min(ctx->total, sg->length);
+
+	if (sg_is_last(sg)) {
+		if (!(ctx->flags & SHA_FLAGS_FINUP)) {
+			/* not last sg must be ctx->bs aligned */
+			tail = len & (ctx->bs - 1);
+			len -= tail;
+		}
+	}
+
+	ctx->total -= len;
+	ctx->offset = len; /* offset where to start slow */
+
+	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
+
+	/* Add padding */
+	if (final) {
+		size_t count;
+
+		tail = len & (ctx->bs - 1);
+		len -= tail;
+		ctx->total += tail;
+		ctx->offset = len; /* offset where to start slow */
+
+		sg = ctx->sg;
+		mtk_sha_append_sg(ctx);
+		mtk_sha_fill_padding(ctx, len);
+
+		ctx->dma_addr = dma_map_single(cryp->dev, ctx->buffer,
+					       SHA_BUF_SIZE, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(cryp->dev, ctx->dma_addr))) {
+			dev_err(cryp->dev, "dma map bytes error\n");
+			return -EINVAL;
+		}
+
+		sha->flags |= SHA_FLAGS_FINAL;
+		count = ctx->bufcnt;
+		ctx->bufcnt = 0;
+
+		if (len == 0) {
+			ctx->flags &= ~SHA_FLAGS_SG;
+			return mtk_sha_xmit(cryp, sha, ctx->dma_addr, count);
+
+		} else {
+			ctx->sg = sg;
+			if (!dma_map_sg(cryp->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
+				dev_err(cryp->dev, "dma_map_sg error\n");
+				return -EINVAL;
+			}
+
+			ctx->flags |= SHA_FLAGS_SG;
+			return mtk_sha_xmit2(cryp, sha, ctx, len, count);
+		}
+	}
+
+	if (!dma_map_sg(cryp->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
+		dev_err(cryp->dev, "dma_map_sg  error\n");
+		return -EINVAL;
+	}
+
+	ctx->flags |= SHA_FLAGS_SG;
+
+	return mtk_sha_xmit(cryp, sha, sg_dma_address(ctx->sg), len);
+}
+
+static int mtk_sha_final_req(struct mtk_cryp *cryp,
+			     struct mtk_sha_rec *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	size_t count;
+
+	mtk_sha_fill_padding(ctx, 0);
+
+	sha->flags |= SHA_FLAGS_FINAL;
+	count = ctx->bufcnt;
+	ctx->bufcnt = 0;
+
+	return mtk_sha_dma_map(cryp, sha, ctx, count);
+}
+
+/* Copy ready hash (+ finalize hmac) */
+static int mtk_sha_finish(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	u32 *digest = ctx->info.tfm.digest;
+	u32 *result = (u32 *)req->result;
+	int i;
+
+	/* Get the hash from the digest buffer */
+	for (i = 0; i < SIZE_IN_WORDS(ctx->ds); i++)
+		result[i] = le32_to_cpu(digest[i]);
+
+	if (ctx->flags & SHA_FLAGS_HMAC)
+		return mtk_sha_finish_hmac(req);
+
+	return 0;
+}
+
+static void mtk_sha_finish_req(struct mtk_cryp *cryp,
+			       struct mtk_sha_rec *sha,
+			       int err)
+{
+	if (likely(!err && (SHA_FLAGS_FINAL & sha->flags)))
+		err = mtk_sha_finish(sha->req);
+
+	sha->flags &= ~(SHA_FLAGS_BUSY | SHA_FLAGS_FINAL);
+
+	sha->req->base.complete(&sha->req->base, err);
+
+	/* Handle new request */
+	mtk_sha_handle_queue(cryp, sha->id - RING2, NULL);
+}
+
+static int mtk_sha_handle_queue(struct mtk_cryp *cryp, u8 id,
+				struct ahash_request *req)
+{
+	struct mtk_sha_rec *sha = cryp->sha[id];
+	struct crypto_async_request *async_req, *backlog;
+	struct mtk_sha_reqctx *ctx;
+	unsigned long flags;
+	int err = 0, ret = 0;
+
+	spin_lock_irqsave(&sha->lock, flags);
+	if (req)
+		ret = ahash_enqueue_request(&sha->queue, req);
+
+	if (SHA_FLAGS_BUSY & sha->flags) {
+		spin_unlock_irqrestore(&sha->lock, flags);
+		return ret;
+	}
+
+	backlog = crypto_get_backlog(&sha->queue);
+	async_req = crypto_dequeue_request(&sha->queue);
+	if (async_req)
+		sha->flags |= SHA_FLAGS_BUSY;
+	spin_unlock_irqrestore(&sha->lock, flags);
+
+	if (!async_req)
+		return ret;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ahash_request_cast(async_req);
+	ctx = ahash_request_ctx(req);
+
+	sha->req = req;
+
+	mtk_sha_info_init(ctx);
+
+	if (ctx->op == SHA_OP_UPDATE) {
+		err = mtk_sha_update_start(cryp, sha);
+		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP))
+			/* No final() after finup() */
+			err = mtk_sha_final_req(cryp, sha);
+	} else if (ctx->op == SHA_OP_FINAL) {
+		err = mtk_sha_final_req(cryp, sha);
+	}
+
+	if (unlikely(err != -EINPROGRESS))
+		/* Task will not finish it, so do it here */
+		mtk_sha_finish_req(cryp, sha, err);
+
+	return ret;
+}
+
+static int mtk_sha_enqueue(struct ahash_request *req, u32 op)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
+
+	ctx->op = op;
+
+	return mtk_sha_handle_queue(tctx->cryp, tctx->id, req);
+}
+
+static void mtk_sha_unmap(struct mtk_cryp *cryp, struct mtk_sha_rec *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+
+	dma_unmap_single(cryp->dev, ctx->ct_dma, sizeof(ctx->info),
+			 DMA_BIDIRECTIONAL);
+
+	if (ctx->flags & SHA_FLAGS_SG) {
+		dma_unmap_sg(cryp->dev, ctx->sg, 1, DMA_TO_DEVICE);
+		if (ctx->sg->length == ctx->offset) {
+			ctx->sg = sg_next(ctx->sg);
+			if (ctx->sg)
+				ctx->offset = 0;
+		}
+		if (ctx->flags & SHA_FLAGS_PAD) {
+			dma_unmap_single(cryp->dev, ctx->dma_addr,
+					 SHA_BUF_SIZE, DMA_TO_DEVICE);
+		}
+	} else
+		dma_unmap_single(cryp->dev, ctx->dma_addr,
+				 SHA_BUF_SIZE, DMA_TO_DEVICE);
+}
+
+static void mtk_sha_complete(struct mtk_cryp *cryp,
+			     struct mtk_sha_rec *sha)
+{
+	int err = 0;
+
+	err = mtk_sha_update_start(cryp, sha);
+	if (err != -EINPROGRESS)
+		mtk_sha_finish_req(cryp, sha, err);
+}
+
+static int mtk_sha_update(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	ctx->total = req->nbytes;
+	ctx->sg = req->src;
+	ctx->offset = 0;
+
+	if ((ctx->bufcnt + ctx->total < SHA_BUF_SIZE) &&
+	    !(ctx->flags & SHA_FLAGS_FINUP))
+		return mtk_sha_append_sg(ctx);
+
+	return mtk_sha_enqueue(req, SHA_OP_UPDATE);
+}
+
+static int mtk_sha_final(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	ctx->flags |= SHA_FLAGS_FINUP;
+
+	if (ctx->flags & SHA_FLAGS_PAD)
+		return mtk_sha_finish(req);
+
+	return mtk_sha_enqueue(req, SHA_OP_FINAL);
+}
+
+static int mtk_sha_finup(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	int err1, err2;
+
+	ctx->flags |= SHA_FLAGS_FINUP;
+
+	err1 = mtk_sha_update(req);
+	if (err1 == -EINPROGRESS || err1 == -EBUSY)
+		return err1;
+	/*
+	 * final() has to be always called to cleanup resources
+	 * even if update() failed
+	 */
+	err2 = mtk_sha_final(req);
+
+	return err1 ?: err2;
+}
+
+static int mtk_sha_digest(struct ahash_request *req)
+{
+	return mtk_sha_init(req) ?: mtk_sha_finup(req);
+}
+
+static int mtk_sha_setkey(struct crypto_ahash *tfm, const u8 *key,
+			  u32 keylen)
+{
+	struct mtk_sha_ctx *tctx = crypto_ahash_ctx(tfm);
+	struct mtk_sha_hmac_ctx *bctx = tctx->base;
+	size_t bs = crypto_shash_blocksize(bctx->shash);
+	size_t ds = crypto_shash_digestsize(bctx->shash);
+	int err, i;
+
+	SHASH_DESC_ON_STACK(shash, bctx->shash);
+
+	shash->tfm = bctx->shash;
+	shash->flags = crypto_shash_get_flags(bctx->shash) &
+		       CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	if (keylen > bs) {
+		err = crypto_shash_digest(shash, key, keylen, bctx->ipad);
+		if (err)
+			return err;
+		keylen = ds;
+	} else {
+		memcpy(bctx->ipad, key, keylen);
+	}
+
+	memset(bctx->ipad + keylen, 0, bs - keylen);
+	memcpy(bctx->opad, bctx->ipad, bs);
+
+	for (i = 0; i < bs; i++) {
+		bctx->ipad[i] ^= 0x36;
+		bctx->opad[i] ^= 0x5c;
+	}
+
+	return 0;
+}
+
+static int mtk_sha_export(struct ahash_request *req, void *out)
+{
+	const struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	memcpy(out, ctx, sizeof(*ctx));
+	return 0;
+}
+
+static int mtk_sha_import(struct ahash_request *req, const void *in)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	memcpy(ctx, in, sizeof(*ctx));
+	return 0;
+}
+
+static int mtk_sha_cra_init_alg(struct crypto_tfm *tfm,
+				const char *alg_base)
+{
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(tfm);
+	struct mtk_cryp *cryp = NULL;
+
+	cryp = mtk_sha_find_dev(tctx);
+	if (!cryp)
+		return -ENODEV;
+
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct mtk_sha_reqctx));
+
+	if (alg_base) {
+		struct mtk_sha_hmac_ctx *bctx = tctx->base;
+
+		tctx->flags |= SHA_FLAGS_HMAC;
+		bctx->shash = crypto_alloc_shash(alg_base, 0,
+					CRYPTO_ALG_NEED_FALLBACK);
+		if (IS_ERR(bctx->shash)) {
+			pr_err("base driver %s could not be loaded.\n",
+			       alg_base);
+
+			return PTR_ERR(bctx->shash);
+		}
+	}
+	return 0;
+}
+
+static int mtk_sha_cra_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, NULL);
+}
+
+static int mtk_sha_cra_sha1_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha1");
+}
+
+static int mtk_sha_cra_sha224_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha224");
+}
+
+static int mtk_sha_cra_sha256_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha256");
+}
+
+static int mtk_sha_cra_sha384_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha384");
+}
+
+static int mtk_sha_cra_sha512_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha512");
+}
+
+static void mtk_sha_cra_exit(struct crypto_tfm *tfm)
+{
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(tfm);
+
+	if (tctx->flags & SHA_FLAGS_HMAC) {
+		struct mtk_sha_hmac_ctx *bctx = tctx->base;
+
+		crypto_free_shash(bctx->shash);
+	}
+}
+
+static struct ahash_alg algs_sha1_sha224_sha256[] = {
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA1_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha1",
+		.cra_driver_name	= "mtk-sha1",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA1_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA224_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha224",
+		.cra_driver_name	= "mtk-sha224",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA224_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA256_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha256",
+		.cra_driver_name	= "mtk-sha256",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA256_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA1_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha1)",
+		.cra_driver_name	= "mtk-hmac-sha1",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA1_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha1_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA224_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha224)",
+		.cra_driver_name	= "mtk-hmac-sha224",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA224_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha224_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA256_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha256)",
+		.cra_driver_name	= "mtk-hmac-sha256",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA256_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha256_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+};
+
+static struct ahash_alg algs_sha384_sha512[] = {
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA384_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha384",
+		.cra_driver_name	= "mtk-sha384",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA384_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA512_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha512",
+		.cra_driver_name	= "mtk-sha512",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA384_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha384)",
+		.cra_driver_name	= "mtk-hmac-sha384",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA384_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha384_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA512_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha512)",
+		.cra_driver_name	= "mtk-hmac-sha512",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha512_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+};
+
+static void mtk_sha_task0(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_sha_rec *sha = cryp->sha[0];
+
+	mtk_sha_unmap(cryp, sha);
+	mtk_sha_complete(cryp, sha);
+}
+
+static void mtk_sha_task1(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_sha_rec *sha = cryp->sha[1];
+
+	mtk_sha_unmap(cryp, sha);
+	mtk_sha_complete(cryp, sha);
+}
+
+static irqreturn_t mtk_sha_ring2_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_sha_rec *sha = cryp->sha[0];
+	u32 val = mtk_sha_read(cryp, RDR_STAT(RING2));
+
+	mtk_sha_write(cryp, RDR_STAT(RING2), val);
+
+	if (likely((SHA_FLAGS_BUSY & sha->flags))) {
+		mtk_sha_write(cryp, RDR_PROC_COUNT(RING2), MTK_CNT_RST);
+		mtk_sha_write(cryp, RDR_THRESH(RING2),
+			      MTK_RDR_PROC_THRESH | MTK_RDR_PROC_MODE);
+
+		tasklet_schedule(&sha->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mtk_sha_ring3_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_sha_rec *sha = cryp->sha[1];
+	u32 val = mtk_sha_read(cryp, RDR_STAT(RING3));
+
+	mtk_sha_write(cryp, RDR_STAT(RING3), val);
+
+	if (likely((SHA_FLAGS_BUSY & sha->flags))) {
+		mtk_sha_write(cryp, RDR_PROC_COUNT(RING3), MTK_CNT_RST);
+		mtk_sha_write(cryp, RDR_THRESH(RING3),
+			      MTK_RDR_PROC_THRESH | MTK_RDR_PROC_MODE);
+
+		tasklet_schedule(&sha->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * The purpose of two SHA records is used to get extra performance.
+ * It is similar to mtk_aes_record_init().
+ */
+static int mtk_sha_record_init(struct mtk_cryp *cryp)
+{
+	struct mtk_sha_rec **sha = cryp->sha;
+	int i, err = -ENOMEM;
+
+	for (i = 0; i < MTK_REC_NUM; i++) {
+		sha[i] = kzalloc(sizeof(**sha), GFP_KERNEL);
+		if (!sha[i])
+			goto err_cleanup;
+
+		sha[i]->id = i + RING2;
+
+		spin_lock_init(&sha[i]->lock);
+		crypto_init_queue(&sha[i]->queue, SHA_QUEUE_SIZE);
+	}
+
+	tasklet_init(&sha[0]->task, mtk_sha_task0, (unsigned long)cryp);
+	tasklet_init(&sha[1]->task, mtk_sha_task1, (unsigned long)cryp);
+
+	cryp->rec = 1;
+
+	return 0;
+
+err_cleanup:
+	for (; i--; )
+		kfree(sha[i]);
+	return err;
+}
+
+static void mtk_sha_record_free(struct mtk_cryp *cryp)
+{
+	int i;
+
+	for (i = 0; i < MTK_REC_NUM; i++) {
+		tasklet_kill(&cryp->sha[i]->task);
+		kfree(cryp->sha[i]);
+	}
+}
+
+static void mtk_sha_unregister_algs(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha1_sha224_sha256); i++)
+		crypto_unregister_ahash(&algs_sha1_sha224_sha256[i]);
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha384_sha512); i++)
+		crypto_unregister_ahash(&algs_sha384_sha512[i]);
+}
+
+static int mtk_sha_register_algs(void)
+{
+	int err, i;
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha1_sha224_sha256); i++) {
+		err = crypto_register_ahash(&algs_sha1_sha224_sha256[i]);
+		if (err)
+			goto err_sha_224_256_algs;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha384_sha512); i++) {
+		err = crypto_register_ahash(&algs_sha384_sha512[i]);
+		if (err)
+			goto err_sha_384_512_algs;
+	}
+
+	return 0;
+
+err_sha_384_512_algs:
+	for (; i--; )
+		crypto_unregister_ahash(&algs_sha384_sha512[i]);
+	i = ARRAY_SIZE(algs_sha1_sha224_sha256);
+err_sha_224_256_algs:
+	for (; i--; )
+		crypto_unregister_ahash(&algs_sha1_sha224_sha256[i]);
+
+	return err;
+}
+
+int mtk_hash_alg_register(struct mtk_cryp *cryp)
+{
+	int err;
+
+	INIT_LIST_HEAD(&cryp->sha_list);
+
+	/* Initialize two hash records */
+	err = mtk_sha_record_init(cryp);
+	if (err)
+		goto err_record;
+
+	/* Ring2 is use by SHA record0 */
+	err = devm_request_irq(cryp->dev, cryp->irq[RING2],
+			       mtk_sha_ring2_irq, IRQF_TRIGGER_LOW,
+			       "mtk-sha", cryp);
+	if (err) {
+		dev_err(cryp->dev, "unable to request sha irq0.\n");
+		goto err_res;
+	}
+
+	/* Ring3 is use by SHA record1 */
+	err = devm_request_irq(cryp->dev, cryp->irq[RING3],
+			       mtk_sha_ring3_irq, IRQF_TRIGGER_LOW,
+			       "mtk-sha", cryp);
+	if (err) {
+		dev_err(cryp->dev, "unable to request sha irq1.\n");
+		goto err_res;
+	}
+
+	/* Enable ring2 and ring3 interrupt for hash */
+	mtk_sha_write(cryp, AIC_ENABLE_SET(RING2), MTK_IRQ_RDR2);
+	mtk_sha_write(cryp, AIC_ENABLE_SET(RING3), MTK_IRQ_RDR3);
+
+	cryp->tmp = dma_alloc_coherent(cryp->dev, SHA_TMP_BUF_SIZE,
+					&cryp->tmp_dma, GFP_KERNEL);
+	if (!cryp->tmp) {
+		dev_err(cryp->dev, "unable to allocate tmp buffer.\n");
+		err = -EINVAL;
+		goto err_res;
+	}
+
+	spin_lock(&mtk_sha.lock);
+	list_add_tail(&cryp->sha_list, &mtk_sha.dev_list);
+	spin_unlock(&mtk_sha.lock);
+
+	err = mtk_sha_register_algs();
+	if (err)
+		goto err_algs;
+
+	return 0;
+
+err_algs:
+	spin_lock(&mtk_sha.lock);
+	list_del(&cryp->sha_list);
+	spin_unlock(&mtk_sha.lock);
+	dma_free_coherent(cryp->dev, SHA_TMP_BUF_SIZE,
+			  cryp->tmp, cryp->tmp_dma);
+err_res:
+	mtk_sha_record_free(cryp);
+err_record:
+
+	dev_err(cryp->dev, "mtk-sha initialization failed.\n");
+	return err;
+}
+
+void mtk_hash_alg_release(struct mtk_cryp *cryp)
+{
+	spin_lock(&mtk_sha.lock);
+	list_del(&cryp->sha_list);
+	spin_unlock(&mtk_sha.lock);
+
+	mtk_sha_unregister_algs();
+	dma_free_coherent(cryp->dev, SHA_TMP_BUF_SIZE,
+			  cryp->tmp, cryp->tmp_dma);
+	mtk_sha_record_free(cryp);
+}
diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c
index 47576098831f..b6f14844702e 100644
--- a/drivers/crypto/picoxcell_crypto.c
+++ b/drivers/crypto/picoxcell_crypto.c
@@ -1616,32 +1616,17 @@ static const struct of_device_id spacc_of_id_table[] = {
 MODULE_DEVICE_TABLE(of, spacc_of_id_table);
 #endif /* CONFIG_OF */
 
-static bool spacc_is_compatible(struct platform_device *pdev,
-				const char *spacc_type)
-{
-	const struct platform_device_id *platid = platform_get_device_id(pdev);
-
-	if (platid && !strcmp(platid->name, spacc_type))
-		return true;
-
-#ifdef CONFIG_OF
-	if (of_device_is_compatible(pdev->dev.of_node, spacc_type))
-		return true;
-#endif /* CONFIG_OF */
-
-	return false;
-}
-
 static int spacc_probe(struct platform_device *pdev)
 {
 	int i, err, ret = -EINVAL;
 	struct resource *mem, *irq;
+	struct device_node *np = pdev->dev.of_node;
 	struct spacc_engine *engine = devm_kzalloc(&pdev->dev, sizeof(*engine),
 						   GFP_KERNEL);
 	if (!engine)
 		return -ENOMEM;
 
-	if (spacc_is_compatible(pdev, "picochip,spacc-ipsec")) {
+	if (of_device_is_compatible(np, "picochip,spacc-ipsec")) {
 		engine->max_ctxs	= SPACC_CRYPTO_IPSEC_MAX_CTXS;
 		engine->cipher_pg_sz	= SPACC_CRYPTO_IPSEC_CIPHER_PG_SZ;
 		engine->hash_pg_sz	= SPACC_CRYPTO_IPSEC_HASH_PG_SZ;
@@ -1650,7 +1635,7 @@ static int spacc_probe(struct platform_device *pdev)
 		engine->num_algs	= ARRAY_SIZE(ipsec_engine_algs);
 		engine->aeads		= ipsec_engine_aeads;
 		engine->num_aeads	= ARRAY_SIZE(ipsec_engine_aeads);
-	} else if (spacc_is_compatible(pdev, "picochip,spacc-l2")) {
+	} else if (of_device_is_compatible(np, "picochip,spacc-l2")) {
 		engine->max_ctxs	= SPACC_CRYPTO_L2_MAX_CTXS;
 		engine->cipher_pg_sz	= SPACC_CRYPTO_L2_CIPHER_PG_SZ;
 		engine->hash_pg_sz	= SPACC_CRYPTO_L2_HASH_PG_SZ;
@@ -1803,12 +1788,6 @@ static int spacc_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const struct platform_device_id spacc_id_table[] = {
-	{ "picochip,spacc-ipsec", },
-	{ "picochip,spacc-l2", },
-	{ }
-};
-
 static struct platform_driver spacc_driver = {
 	.probe		= spacc_probe,
 	.remove		= spacc_remove,
@@ -1819,7 +1798,6 @@ static struct platform_driver spacc_driver = {
 #endif /* CONFIG_PM */
 		.of_match_table	= of_match_ptr(spacc_of_id_table),
 	},
-	.id_table	= spacc_id_table,
 };
 
 module_platform_driver(spacc_driver);
diff --git a/drivers/crypto/qat/qat_c3xxx/adf_drv.c b/drivers/crypto/qat/qat_c3xxx/adf_drv.c
index 640c3fc870fd..f172171668ee 100644
--- a/drivers/crypto/qat/qat_c3xxx/adf_drv.c
+++ b/drivers/crypto/qat/qat_c3xxx/adf_drv.c
@@ -186,7 +186,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	/* Create dev top level debugfs entry */
-	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%02d",
+	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%d",
 		 ADF_DEVICE_NAME_PREFIX, hw_data->dev_class->name,
 		 pdev->bus->number, PCI_SLOT(pdev->devfn),
 		 PCI_FUNC(pdev->devfn));
diff --git a/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c b/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c
index 949d77b79fbe..24ec908eb26c 100644
--- a/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c
+++ b/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c
@@ -170,7 +170,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	accel_pci_dev->sku = hw_data->get_sku(hw_data);
 
 	/* Create dev top level debugfs entry */
-	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%02d",
+	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%d",
 		 ADF_DEVICE_NAME_PREFIX, hw_data->dev_class->name,
 		 pdev->bus->number, PCI_SLOT(pdev->devfn),
 		 PCI_FUNC(pdev->devfn));
diff --git a/drivers/crypto/qat/qat_c62x/adf_drv.c b/drivers/crypto/qat/qat_c62x/adf_drv.c
index 5b2d78a5b5aa..58a984c9c3ec 100644
--- a/drivers/crypto/qat/qat_c62x/adf_drv.c
+++ b/drivers/crypto/qat/qat_c62x/adf_drv.c
@@ -186,7 +186,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	/* Create dev top level debugfs entry */
-	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%02d",
+	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%d",
 		 ADF_DEVICE_NAME_PREFIX, hw_data->dev_class->name,
 		 pdev->bus->number, PCI_SLOT(pdev->devfn),
 		 PCI_FUNC(pdev->devfn));
diff --git a/drivers/crypto/qat/qat_c62xvf/adf_drv.c b/drivers/crypto/qat/qat_c62xvf/adf_drv.c
index 7540ce13b0d0..b9f3e0e4fde9 100644
--- a/drivers/crypto/qat/qat_c62xvf/adf_drv.c
+++ b/drivers/crypto/qat/qat_c62xvf/adf_drv.c
@@ -170,7 +170,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	accel_pci_dev->sku = hw_data->get_sku(hw_data);
 
 	/* Create dev top level debugfs entry */
-	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%02d",
+	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%d",
 		 ADF_DEVICE_NAME_PREFIX, hw_data->dev_class->name,
 		 pdev->bus->number, PCI_SLOT(pdev->devfn),
 		 PCI_FUNC(pdev->devfn));
diff --git a/drivers/crypto/qat/qat_common/adf_cfg_common.h b/drivers/crypto/qat/qat_common/adf_cfg_common.h
index 8c4f6573ce59..1211261de7c2 100644
--- a/drivers/crypto/qat/qat_common/adf_cfg_common.h
+++ b/drivers/crypto/qat/qat_common/adf_cfg_common.h
@@ -61,6 +61,7 @@
 #define ADF_CFG_AFFINITY_WHATEVER 0xFF
 #define MAX_DEVICE_NAME_SIZE 32
 #define ADF_MAX_DEVICES (32 * 32)
+#define ADF_DEVS_ARRAY_SIZE BITS_TO_LONGS(ADF_MAX_DEVICES)
 
 enum adf_cfg_val_type {
 	ADF_DEC,
diff --git a/drivers/crypto/qat/qat_common/adf_common_drv.h b/drivers/crypto/qat/qat_common/adf_common_drv.h
index 980e07475012..5c4c0a253129 100644
--- a/drivers/crypto/qat/qat_common/adf_common_drv.h
+++ b/drivers/crypto/qat/qat_common/adf_common_drv.h
@@ -87,8 +87,8 @@ enum adf_event {
 struct service_hndl {
 	int (*event_hld)(struct adf_accel_dev *accel_dev,
 			 enum adf_event event);
-	unsigned long init_status;
-	unsigned long start_status;
+	unsigned long init_status[ADF_DEVS_ARRAY_SIZE];
+	unsigned long start_status[ADF_DEVS_ARRAY_SIZE];
 	char *name;
 	struct list_head list;
 };
diff --git a/drivers/crypto/qat/qat_common/adf_dev_mgr.c b/drivers/crypto/qat/qat_common/adf_dev_mgr.c
index b3ebb25f9ca7..8afac52677a6 100644
--- a/drivers/crypto/qat/qat_common/adf_dev_mgr.c
+++ b/drivers/crypto/qat/qat_common/adf_dev_mgr.c
@@ -152,7 +152,7 @@ void adf_devmgr_update_class_index(struct adf_hw_device_data *hw_data)
 			ptr->hw_device->instance_id = i++;
 
 		if (i == class->instances)
-				break;
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(adf_devmgr_update_class_index);
diff --git a/drivers/crypto/qat/qat_common/adf_init.c b/drivers/crypto/qat/qat_common/adf_init.c
index 888c6675e7e5..26556c713049 100644
--- a/drivers/crypto/qat/qat_common/adf_init.c
+++ b/drivers/crypto/qat/qat_common/adf_init.c
@@ -64,8 +64,8 @@ static void adf_service_add(struct service_hndl *service)
 
 int adf_service_register(struct service_hndl *service)
 {
-	service->init_status = 0;
-	service->start_status = 0;
+	memset(service->init_status, 0, sizeof(service->init_status));
+	memset(service->start_status, 0, sizeof(service->start_status));
 	adf_service_add(service);
 	return 0;
 }
@@ -79,9 +79,13 @@ static void adf_service_remove(struct service_hndl *service)
 
 int adf_service_unregister(struct service_hndl *service)
 {
-	if (service->init_status || service->start_status) {
-		pr_err("QAT: Could not remove active service\n");
-		return -EFAULT;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(service->init_status); i++) {
+		if (service->init_status[i] || service->start_status[i]) {
+			pr_err("QAT: Could not remove active service\n");
+			return -EFAULT;
+		}
 	}
 	adf_service_remove(service);
 	return 0;
@@ -163,7 +167,7 @@ int adf_dev_init(struct adf_accel_dev *accel_dev)
 				service->name);
 			return -EFAULT;
 		}
-		set_bit(accel_dev->accel_id, &service->init_status);
+		set_bit(accel_dev->accel_id, service->init_status);
 	}
 
 	hw_data->enable_error_correction(accel_dev);
@@ -210,7 +214,7 @@ int adf_dev_start(struct adf_accel_dev *accel_dev)
 				service->name);
 			return -EFAULT;
 		}
-		set_bit(accel_dev->accel_id, &service->start_status);
+		set_bit(accel_dev->accel_id, service->start_status);
 	}
 
 	clear_bit(ADF_STATUS_STARTING, &accel_dev->status);
@@ -259,14 +263,14 @@ void adf_dev_stop(struct adf_accel_dev *accel_dev)
 
 	list_for_each(list_itr, &service_table) {
 		service = list_entry(list_itr, struct service_hndl, list);
-		if (!test_bit(accel_dev->accel_id, &service->start_status))
+		if (!test_bit(accel_dev->accel_id, service->start_status))
 			continue;
 		ret = service->event_hld(accel_dev, ADF_EVENT_STOP);
 		if (!ret) {
-			clear_bit(accel_dev->accel_id, &service->start_status);
+			clear_bit(accel_dev->accel_id, service->start_status);
 		} else if (ret == -EAGAIN) {
 			wait = true;
-			clear_bit(accel_dev->accel_id, &service->start_status);
+			clear_bit(accel_dev->accel_id, service->start_status);
 		}
 	}
 
@@ -317,14 +321,14 @@ void adf_dev_shutdown(struct adf_accel_dev *accel_dev)
 
 	list_for_each(list_itr, &service_table) {
 		service = list_entry(list_itr, struct service_hndl, list);
-		if (!test_bit(accel_dev->accel_id, &service->init_status))
+		if (!test_bit(accel_dev->accel_id, service->init_status))
 			continue;
 		if (service->event_hld(accel_dev, ADF_EVENT_SHUTDOWN))
 			dev_err(&GET_DEV(accel_dev),
 				"Failed to shutdown service %s\n",
 				service->name);
 		else
-			clear_bit(accel_dev->accel_id, &service->init_status);
+			clear_bit(accel_dev->accel_id, service->init_status);
 	}
 
 	hw_data->disable_iov(accel_dev);
diff --git a/drivers/crypto/qat/qat_common/adf_sriov.c b/drivers/crypto/qat/qat_common/adf_sriov.c
index 9320ae1d005b..b36d8653b1ba 100644
--- a/drivers/crypto/qat/qat_common/adf_sriov.c
+++ b/drivers/crypto/qat/qat_common/adf_sriov.c
@@ -162,9 +162,9 @@ static int adf_enable_sriov(struct adf_accel_dev *accel_dev)
 
 /**
  * adf_disable_sriov() - Disable SRIOV for the device
- * @pdev:  Pointer to pci device.
+ * @accel_dev:  Pointer to accel device.
  *
- * Function disables SRIOV for the pci device.
+ * Function disables SRIOV for the accel device.
  *
  * Return: 0 on success, error code otherwise.
  */
diff --git a/drivers/crypto/qat/qat_common/adf_vf_isr.c b/drivers/crypto/qat/qat_common/adf_vf_isr.c
index bf99e11a3403..4a73fc70f7a9 100644
--- a/drivers/crypto/qat/qat_common/adf_vf_isr.c
+++ b/drivers/crypto/qat/qat_common/adf_vf_isr.c
@@ -148,7 +148,7 @@ static void adf_pf2vf_bh_handler(void *data)
 		INIT_WORK(&stop_data->work, adf_dev_stop_async);
 		queue_work(adf_vf_stop_wq, &stop_data->work);
 		/* To ack, clear the PF2VFINT bit */
-		msg &= ~BIT(0);
+		msg &= ~ADF_PF2VF_INT;
 		ADF_CSR_WR(pmisc_bar_addr, hw_data->get_pf2vf_offset(0), msg);
 		return;
 	}
@@ -168,7 +168,7 @@ static void adf_pf2vf_bh_handler(void *data)
 	}
 
 	/* To ack, clear the PF2VFINT bit */
-	msg &= ~BIT(0);
+	msg &= ~ADF_PF2VF_INT;
 	ADF_CSR_WR(pmisc_bar_addr, hw_data->get_pf2vf_offset(0), msg);
 
 	/* Re-enable PF2VF interrupts */
diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
index 4d2de2838451..2ce01f010c74 100644
--- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
+++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c
@@ -186,7 +186,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	/* Create dev top level debugfs entry */
-	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%02d",
+	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%d",
 		 ADF_DEVICE_NAME_PREFIX, hw_data->dev_class->name,
 		 pdev->bus->number, PCI_SLOT(pdev->devfn),
 		 PCI_FUNC(pdev->devfn));
diff --git a/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c b/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c
index 60df98632fa2..26ab17bfc6da 100644
--- a/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c
+++ b/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c
@@ -170,7 +170,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	accel_pci_dev->sku = hw_data->get_sku(hw_data);
 
 	/* Create dev top level debugfs entry */
-	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%02d",
+	snprintf(name, sizeof(name), "%s%s_%02x:%02d.%d",
 		 ADF_DEVICE_NAME_PREFIX, hw_data->dev_class->name,
 		 pdev->bus->number, PCI_SLOT(pdev->devfn),
 		 PCI_FUNC(pdev->devfn));
diff --git a/drivers/crypto/virtio/Kconfig b/drivers/crypto/virtio/Kconfig
index d80f73366ae2..5db07495ddc5 100644
--- a/drivers/crypto/virtio/Kconfig
+++ b/drivers/crypto/virtio/Kconfig
@@ -4,6 +4,7 @@ config CRYPTO_DEV_VIRTIO
 	select CRYPTO_AEAD
 	select CRYPTO_AUTHENC
 	select CRYPTO_BLKCIPHER
+	select CRYPTO_ENGINE
 	default m
 	help
 	  This driver provides support for virtio crypto device. If you
diff --git a/drivers/crypto/virtio/virtio_crypto_algs.c b/drivers/crypto/virtio/virtio_crypto_algs.c
index c2374df9abae..49defda4e03d 100644
--- a/drivers/crypto/virtio/virtio_crypto_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_algs.c
@@ -288,8 +288,7 @@ static int virtio_crypto_ablkcipher_setkey(struct crypto_ablkcipher *tfm,
 static int
 __virtio_crypto_ablkcipher_do_req(struct virtio_crypto_request *vc_req,
 		struct ablkcipher_request *req,
-		struct data_queue *data_vq,
-		__u8 op)
+		struct data_queue *data_vq)
 {
 	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
 	unsigned int ivsize = crypto_ablkcipher_ivsize(tfm);
@@ -329,7 +328,7 @@ __virtio_crypto_ablkcipher_do_req(struct virtio_crypto_request *vc_req,
 	vc_req->req_data = req_data;
 	vc_req->type = VIRTIO_CRYPTO_SYM_OP_CIPHER;
 	/* Head of operation */
-	if (op) {
+	if (vc_req->encrypt) {
 		req_data->header.session_id =
 			cpu_to_le64(ctx->enc_sess_info.session_id);
 		req_data->header.opcode =
@@ -424,19 +423,15 @@ static int virtio_crypto_ablkcipher_encrypt(struct ablkcipher_request *req)
 	struct virtio_crypto_ablkcipher_ctx *ctx = crypto_ablkcipher_ctx(atfm);
 	struct virtio_crypto_request *vc_req = ablkcipher_request_ctx(req);
 	struct virtio_crypto *vcrypto = ctx->vcrypto;
-	int ret;
 	/* Use the first data virtqueue as default */
 	struct data_queue *data_vq = &vcrypto->data_vq[0];
 
 	vc_req->ablkcipher_ctx = ctx;
 	vc_req->ablkcipher_req = req;
-	ret = __virtio_crypto_ablkcipher_do_req(vc_req, req, data_vq, 1);
-	if (ret < 0) {
-		pr_err("virtio_crypto: Encryption failed!\n");
-		return ret;
-	}
+	vc_req->encrypt = true;
+	vc_req->dataq = data_vq;
 
-	return -EINPROGRESS;
+	return crypto_transfer_cipher_request_to_engine(data_vq->engine, req);
 }
 
 static int virtio_crypto_ablkcipher_decrypt(struct ablkcipher_request *req)
@@ -445,20 +440,16 @@ static int virtio_crypto_ablkcipher_decrypt(struct ablkcipher_request *req)
 	struct virtio_crypto_ablkcipher_ctx *ctx = crypto_ablkcipher_ctx(atfm);
 	struct virtio_crypto_request *vc_req = ablkcipher_request_ctx(req);
 	struct virtio_crypto *vcrypto = ctx->vcrypto;
-	int ret;
 	/* Use the first data virtqueue as default */
 	struct data_queue *data_vq = &vcrypto->data_vq[0];
 
 	vc_req->ablkcipher_ctx = ctx;
 	vc_req->ablkcipher_req = req;
 
-	ret = __virtio_crypto_ablkcipher_do_req(vc_req, req, data_vq, 0);
-	if (ret < 0) {
-		pr_err("virtio_crypto: Decryption failed!\n");
-		return ret;
-	}
+	vc_req->encrypt = false;
+	vc_req->dataq = data_vq;
 
-	return -EINPROGRESS;
+	return crypto_transfer_cipher_request_to_engine(data_vq->engine, req);
 }
 
 static int virtio_crypto_ablkcipher_init(struct crypto_tfm *tfm)
@@ -484,10 +475,37 @@ static void virtio_crypto_ablkcipher_exit(struct crypto_tfm *tfm)
 	ctx->vcrypto = NULL;
 }
 
+int virtio_crypto_ablkcipher_crypt_req(
+	struct crypto_engine *engine,
+	struct ablkcipher_request *req)
+{
+	struct virtio_crypto_request *vc_req = ablkcipher_request_ctx(req);
+	struct data_queue *data_vq = vc_req->dataq;
+	int ret;
+
+	ret = __virtio_crypto_ablkcipher_do_req(vc_req, req, data_vq);
+	if (ret < 0)
+		return ret;
+
+	virtqueue_kick(data_vq->vq);
+
+	return 0;
+}
+
+void virtio_crypto_ablkcipher_finalize_req(
+	struct virtio_crypto_request *vc_req,
+	struct ablkcipher_request *req,
+	int err)
+{
+	crypto_finalize_cipher_request(vc_req->dataq->engine, req, err);
+
+	virtcrypto_clear_request(vc_req);
+}
+
 static struct crypto_alg virtio_crypto_algs[] = { {
 	.cra_name = "cbc(aes)",
 	.cra_driver_name = "virtio_crypto_aes_cbc",
-	.cra_priority = 501,
+	.cra_priority = 150,
 	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize = AES_BLOCK_SIZE,
 	.cra_ctxsize  = sizeof(struct virtio_crypto_ablkcipher_ctx),
diff --git a/drivers/crypto/virtio/virtio_crypto_common.h b/drivers/crypto/virtio/virtio_crypto_common.h
index 3d6566b02876..da6d8c0ea407 100644
--- a/drivers/crypto/virtio/virtio_crypto_common.h
+++ b/drivers/crypto/virtio/virtio_crypto_common.h
@@ -25,6 +25,7 @@
 #include <crypto/aead.h>
 #include <crypto/aes.h>
 #include <crypto/authenc.h>
+#include <crypto/engine.h>
 
 
 /* Internal representation of a data virtqueue */
@@ -37,6 +38,8 @@ struct data_queue {
 
 	/* Name of the tx queue: dataq.$index */
 	char name[32];
+
+	struct crypto_engine *engine;
 };
 
 struct virtio_crypto {
@@ -97,6 +100,9 @@ struct virtio_crypto_request {
 	struct virtio_crypto_op_data_req *req_data;
 	struct scatterlist **sgs;
 	uint8_t *iv;
+	/* Encryption? */
+	bool encrypt;
+	struct data_queue *dataq;
 };
 
 int virtcrypto_devmgr_add_dev(struct virtio_crypto *vcrypto_dev);
@@ -110,6 +116,16 @@ int virtcrypto_dev_started(struct virtio_crypto *vcrypto_dev);
 struct virtio_crypto *virtcrypto_get_dev_node(int node);
 int virtcrypto_dev_start(struct virtio_crypto *vcrypto);
 void virtcrypto_dev_stop(struct virtio_crypto *vcrypto);
+int virtio_crypto_ablkcipher_crypt_req(
+	struct crypto_engine *engine,
+	struct ablkcipher_request *req);
+void virtio_crypto_ablkcipher_finalize_req(
+	struct virtio_crypto_request *vc_req,
+	struct ablkcipher_request *req,
+	int err);
+
+void
+virtcrypto_clear_request(struct virtio_crypto_request *vc_req);
 
 static inline int virtio_crypto_get_current_node(void)
 {
diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index fe70ec823b27..b5b153317376 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -25,7 +25,7 @@
 #include "virtio_crypto_common.h"
 
 
-static void
+void
 virtcrypto_clear_request(struct virtio_crypto_request *vc_req)
 {
 	if (vc_req) {
@@ -66,12 +66,12 @@ static void virtcrypto_dataq_callback(struct virtqueue *vq)
 					break;
 				}
 				ablk_req = vc_req->ablkcipher_req;
-				virtcrypto_clear_request(vc_req);
 
 				spin_unlock_irqrestore(
 					&vcrypto->data_vq[qid].lock, flags);
 				/* Finish the encrypt or decrypt process */
-				ablk_req->base.complete(&ablk_req->base, error);
+				virtio_crypto_ablkcipher_finalize_req(vc_req,
+				    ablk_req, error);
 				spin_lock_irqsave(
 					&vcrypto->data_vq[qid].lock, flags);
 			}
@@ -87,6 +87,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
 	int ret = -ENOMEM;
 	int i, total_vqs;
 	const char **names;
+	struct device *dev = &vi->vdev->dev;
 
 	/*
 	 * We expect 1 data virtqueue, followed by
@@ -128,6 +129,15 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
 	for (i = 0; i < vi->max_data_queues; i++) {
 		spin_lock_init(&vi->data_vq[i].lock);
 		vi->data_vq[i].vq = vqs[i];
+		/* Initialize crypto engine */
+		vi->data_vq[i].engine = crypto_engine_alloc_init(dev, 1);
+		if (!vi->data_vq[i].engine) {
+			ret = -ENOMEM;
+			goto err_engine;
+		}
+
+		vi->data_vq[i].engine->cipher_one_request =
+			virtio_crypto_ablkcipher_crypt_req;
 	}
 
 	kfree(names);
@@ -136,6 +146,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
 
 	return 0;
 
+err_engine:
 err_find:
 	kfree(names);
 err_names:
@@ -269,6 +280,38 @@ static int virtcrypto_update_status(struct virtio_crypto *vcrypto)
 	return 0;
 }
 
+static int virtcrypto_start_crypto_engines(struct virtio_crypto *vcrypto)
+{
+	int32_t i;
+	int ret;
+
+	for (i = 0; i < vcrypto->max_data_queues; i++) {
+		if (vcrypto->data_vq[i].engine) {
+			ret = crypto_engine_start(vcrypto->data_vq[i].engine);
+			if (ret)
+				goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	while (--i >= 0)
+		if (vcrypto->data_vq[i].engine)
+			crypto_engine_exit(vcrypto->data_vq[i].engine);
+
+	return ret;
+}
+
+static void virtcrypto_clear_crypto_engines(struct virtio_crypto *vcrypto)
+{
+	u32 i;
+
+	for (i = 0; i < vcrypto->max_data_queues; i++)
+		if (vcrypto->data_vq[i].engine)
+			crypto_engine_exit(vcrypto->data_vq[i].engine);
+}
+
 static void virtcrypto_del_vqs(struct virtio_crypto *vcrypto)
 {
 	struct virtio_device *vdev = vcrypto->vdev;
@@ -355,14 +398,21 @@ static int virtcrypto_probe(struct virtio_device *vdev)
 		dev_err(&vdev->dev, "Failed to initialize vqs.\n");
 		goto free_dev;
 	}
+
+	err = virtcrypto_start_crypto_engines(vcrypto);
+	if (err)
+		goto free_vqs;
+
 	virtio_device_ready(vdev);
 
 	err = virtcrypto_update_status(vcrypto);
 	if (err)
-		goto free_vqs;
+		goto free_engines;
 
 	return 0;
 
+free_engines:
+	virtcrypto_clear_crypto_engines(vcrypto);
 free_vqs:
 	vcrypto->vdev->config->reset(vdev);
 	virtcrypto_del_vqs(vcrypto);
@@ -398,6 +448,7 @@ static void virtcrypto_remove(struct virtio_device *vdev)
 		virtcrypto_dev_stop(vcrypto);
 	vdev->config->reset(vdev);
 	virtcrypto_free_unused_reqs(vcrypto);
+	virtcrypto_clear_crypto_engines(vcrypto);
 	virtcrypto_del_vqs(vcrypto);
 	virtcrypto_devmgr_rm_dev(vcrypto);
 	kfree(vcrypto);
@@ -420,6 +471,7 @@ static int virtcrypto_freeze(struct virtio_device *vdev)
 	if (virtcrypto_dev_started(vcrypto))
 		virtcrypto_dev_stop(vcrypto);
 
+	virtcrypto_clear_crypto_engines(vcrypto);
 	virtcrypto_del_vqs(vcrypto);
 	return 0;
 }
@@ -433,14 +485,26 @@ static int virtcrypto_restore(struct virtio_device *vdev)
 	if (err)
 		return err;
 
+	err = virtcrypto_start_crypto_engines(vcrypto);
+	if (err)
+		goto free_vqs;
+
 	virtio_device_ready(vdev);
+
 	err = virtcrypto_dev_start(vcrypto);
 	if (err) {
 		dev_err(&vdev->dev, "Failed to start virtio crypto device.\n");
-		return -EFAULT;
+		goto free_engines;
 	}
 
 	return 0;
+
+free_engines:
+	virtcrypto_clear_crypto_engines(vcrypto);
+free_vqs:
+	vcrypto->vdev->config->reset(vdev);
+	virtcrypto_del_vqs(vcrypto);
+	return err;
 }
 #endif
 
diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
index 38ed10d761d0..7cf6d31c1123 100644
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
@@ -80,11 +80,13 @@ static int p8_aes_ctr_setkey(struct crypto_tfm *tfm, const u8 *key,
 	int ret;
 	struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
 
+	preempt_disable();
 	pagefault_disable();
 	enable_kernel_vsx();
 	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
 	disable_kernel_vsx();
 	pagefault_enable();
+	preempt_enable();
 
 	ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen);
 	return ret;
@@ -99,11 +101,13 @@ static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx,
 	u8 *dst = walk->dst.virt.addr;
 	unsigned int nbytes = walk->nbytes;
 
+	preempt_disable();
 	pagefault_disable();
 	enable_kernel_vsx();
 	aes_p8_encrypt(ctrblk, keystream, &ctx->enc_key);
 	disable_kernel_vsx();
 	pagefault_enable();
+	preempt_enable();
 
 	crypto_xor(keystream, src, nbytes);
 	memcpy(dst, keystream, nbytes);
@@ -132,6 +136,7 @@ static int p8_aes_ctr_crypt(struct blkcipher_desc *desc,
 		blkcipher_walk_init(&walk, dst, src, nbytes);
 		ret = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
 		while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+			preempt_disable();
 			pagefault_disable();
 			enable_kernel_vsx();
 			aes_p8_ctr32_encrypt_blocks(walk.src.virt.addr,
@@ -143,6 +148,7 @@ static int p8_aes_ctr_crypt(struct blkcipher_desc *desc,
 						    walk.iv);
 			disable_kernel_vsx();
 			pagefault_enable();
+			preempt_enable();
 
 			/* We need to update IV mostly for last bytes/round */
 			inc = (nbytes & AES_BLOCK_MASK) / AES_BLOCK_SIZE;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 8d9e4b7a8e84..ccc05f874419 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -3385,6 +3385,14 @@ struct fw_crypto_lookaside_wr {
 #define FW_CRYPTO_LOOKASIDE_WR_IV_G(x) \
 	(((x) >> FW_CRYPTO_LOOKASIDE_WR_IV_S) & FW_CRYPTO_LOOKASIDE_WR_IV_M)
 
+#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_S   15
+#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_M   0xff
+#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_V(x) \
+	((x) << FW_CRYPTO_LOOKASIDE_WR_FQIDX_S)
+#define FW_CRYPTO_LOOKASIDE_WR_FQIDX_G(x) \
+	(((x) >> FW_CRYPTO_LOOKASIDE_WR_FQIDX_S) & \
+	 FW_CRYPTO_LOOKASIDE_WR_FQIDX_M)
+
 #define FW_CRYPTO_LOOKASIDE_WR_TX_CH_S 10
 #define FW_CRYPTO_LOOKASIDE_WR_TX_CH_M 0x3
 #define FW_CRYPTO_LOOKASIDE_WR_TX_CH_V(x) \
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 404e9558e879..ebe4ded0c55d 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -191,9 +191,25 @@ static inline unsigned int crypto_queue_len(struct crypto_queue *queue)
 	return queue->qlen;
 }
 
-/* These functions require the input/output to be aligned as u32. */
 void crypto_inc(u8 *a, unsigned int size);
-void crypto_xor(u8 *dst, const u8 *src, unsigned int size);
+void __crypto_xor(u8 *dst, const u8 *src, unsigned int size);
+
+static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
+{
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    __builtin_constant_p(size) &&
+	    (size % sizeof(unsigned long)) == 0) {
+		unsigned long *d = (unsigned long *)dst;
+		unsigned long *s = (unsigned long *)src;
+
+		while (size > 0) {
+			*d++ ^= *s++;
+			size -= sizeof(unsigned long);
+		}
+	} else {
+		__crypto_xor(dst, src, size);
+	}
+}
 
 int blkcipher_walk_done(struct blkcipher_desc *desc,
 			struct blkcipher_walk *walk, int err);
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index 20d20f681a72..445fc45f4b5b 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha20.h
@@ -5,6 +5,7 @@
 #ifndef _CRYPTO_CHACHA20_H
 #define _CRYPTO_CHACHA20_H
 
+#include <crypto/skcipher.h>
 #include <linux/types.h>
 #include <linux/crypto.h>
 
@@ -18,9 +19,8 @@ struct chacha20_ctx {
 
 void chacha20_block(u32 *state, void *stream);
 void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
-int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 			   unsigned int keysize);
-int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-			  struct scatterlist *src, unsigned int nbytes);
+int crypto_chacha20_crypt(struct skcipher_request *req);
 
 #endif
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 216a2b876147..b5727bcd2336 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -329,6 +329,16 @@ static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm)
 	return crypto_hash_alg_common(tfm)->digestsize;
 }
 
+/**
+ * crypto_ahash_statesize() - obtain size of the ahash state
+ * @tfm: cipher handle
+ *
+ * Return the size of the ahash state. With the crypto_ahash_export()
+ * function, the caller can export the state into a buffer whose size is
+ * defined with this function.
+ *
+ * Return: size of the ahash state
+ */
 static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm)
 {
 	return crypto_hash_alg_common(tfm)->statesize;
@@ -369,11 +379,7 @@ static inline struct crypto_ahash *crypto_ahash_reqtfm(
  * crypto_ahash_reqsize() - obtain size of the request data structure
  * @tfm: cipher handle
  *
- * Return the size of the ahash state size. With the crypto_ahash_export
- * function, the caller can export the state into a buffer whose size is
- * defined with this function.
- *
- * Return: size of the ahash state
+ * Return: size of the request data
  */
 static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm)
 {
@@ -453,7 +459,7 @@ int crypto_ahash_digest(struct ahash_request *req);
  *
  * This function exports the hash state of the ahash_request handle into the
  * caller-allocated output buffer out which must have sufficient size (e.g. by
- * calling crypto_ahash_reqsize).
+ * calling crypto_ahash_statesize()).
  *
  * Return: 0 if the export was successful; < 0 if an error occurred
  */
diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h
index 8735979ed341..e42f7063f245 100644
--- a/include/crypto/internal/skcipher.h
+++ b/include/crypto/internal/skcipher.h
@@ -66,7 +66,7 @@ struct skcipher_walk {
 
 	int flags;
 	unsigned int blocksize;
-	unsigned int chunksize;
+	unsigned int stride;
 	unsigned int alignmask;
 };
 
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index 750b14f1ada4..562001cb412b 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -115,6 +115,9 @@ struct crypto_skcipher {
  *	    IV of exactly that size to perform the encrypt or decrypt operation.
  * @chunksize: Equal to the block size except for stream ciphers such as
  *	       CTR where it is set to the underlying block size.
+ * @walksize: Equal to the chunk size except in cases where the algorithm is
+ * 	      considerably more efficient if it can operate on multiple chunks
+ * 	      in parallel. Should be a multiple of chunksize.
  * @base: Definition of a generic crypto algorithm.
  *
  * All fields except @ivsize are mandatory and must be filled.
@@ -131,6 +134,7 @@ struct skcipher_alg {
 	unsigned int max_keysize;
 	unsigned int ivsize;
 	unsigned int chunksize;
+	unsigned int walksize;
 
 	struct crypto_alg base;
 };
@@ -289,6 +293,19 @@ static inline unsigned int crypto_skcipher_alg_chunksize(
 	return alg->chunksize;
 }
 
+static inline unsigned int crypto_skcipher_alg_walksize(
+	struct skcipher_alg *alg)
+{
+	if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) ==
+	    CRYPTO_ALG_TYPE_BLKCIPHER)
+		return alg->base.cra_blocksize;
+
+	if (alg->base.cra_ablkcipher.encrypt)
+		return alg->base.cra_blocksize;
+
+	return alg->walksize;
+}
+
 /**
  * crypto_skcipher_chunksize() - obtain chunk size
  * @tfm: cipher handle
@@ -306,6 +323,23 @@ static inline unsigned int crypto_skcipher_chunksize(
 	return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm));
 }
 
+/**
+ * crypto_skcipher_walksize() - obtain walk size
+ * @tfm: cipher handle
+ *
+ * In some cases, algorithms can only perform optimally when operating on
+ * multiple blocks in parallel. This is reflected by the walksize, which
+ * must be a multiple of the chunksize (or equal if the concern does not
+ * apply)
+ *
+ * Return: walk size in bytes
+ */
+static inline unsigned int crypto_skcipher_walksize(
+	struct crypto_skcipher *tfm)
+{
+	return crypto_skcipher_alg_walksize(crypto_skcipher_alg(tfm));
+}
+
 /**
  * crypto_skcipher_blocksize() - obtain block size of cipher
  * @tfm: cipher handle
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 0444b1336268..fddd1a5eb322 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -116,6 +116,7 @@
  */
 #define __pure			__attribute__((pure))
 #define __aligned(x)		__attribute__((aligned(x)))
+#define __aligned_largest	__attribute__((aligned))
 #define __printf(a, b)		__attribute__((format(printf, a, b)))
 #define __scanf(a, b)		__attribute__((format(scanf, a, b)))
 #define __attribute_const__	__attribute__((__const__))
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 0590263c462c..762b5fec3383 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -32,6 +32,7 @@
 #define SGI_MMTIMER		153
 #define STORE_QUEUE_MINOR	155	/* unused */
 #define I2O_MINOR		166
+#define HWRNG_MINOR		183
 #define MICROCODE_MINOR		184
 #define IRNET_MINOR		187
 #define VFIO_MINOR		196