crypto: x86/salsa20 - remove x86 salsa20 implementations

commit b7b73cd5d7 upstream. The x86 assembly implementations of Salsa20 use the frame base pointer register (%ebp or %rbp), which breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Recent (v4.10+) kernels will warn about this, e.g. WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c [...] But after looking into it, I believe there's very little reason to still retain the x86 Salsa20 code. First, these are *not* vectorized (SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere close to the best Salsa20 performance on any remotely modern x86 processor; they're just regular x86 assembly. Second, it's still unclear that anyone is actually using the kernel's Salsa20 at all, especially given that now ChaCha20 is supported too, and with much more efficient SSSE3 and AVX2 implementations. Finally, in benchmarks I did on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic (~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm is only slightly faster than salsa20-generic (~15% faster on Skylake, ~20% faster on Zen). The gcc version made little difference. So, the x86_64 salsa20-asm is pretty clearly useless. That leaves just the i686 salsa20-asm, which based on my tests provides a 15-20% speed boost. But that's without updating the code to not use %ebp. And given the maintenance cost, the small speed difference vs. salsa20-generic, the fact that few people still use i686 kernels, the doubt that anyone is even using the kernel's Salsa20 at all, and the fact that a SSE2 implementation would almost certainly be much faster on any remotely modern x86 processor yet no one has cared enough to add one yet, I don't think it's worthwhile to keep. Thus, just remove both the x86_64 and i686 salsa20-asm implementations. Reported-by: syzbot+ffa3a158337bbc01ff09@syzkaller.appspotmail.com Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-05-26 00:08:58 -07:00 · 2018-05-26 00:08:58 -07:00 · 19f39eff68
parent 2a017ea2ea
commit 19f39eff68
5 changed files with 0 additions and 2179 deletions
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o

 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
-obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o

 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@ -59,7 +57,6 @@ endif

 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
-salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o

 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@ -68,7 +65,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@ -1,919 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/linkage.h>
-
-# enter salsa20_encrypt_bytes
-ENTRY(salsa20_encrypt_bytes)
-	mov	%rsp,%r11
-	and	$31,%r11
-	add	$256,%r11
-	sub	%r11,%rsp
-	# x = arg1
-	mov	%rdi,%r8
-	# m = arg2
-	mov	%rsi,%rsi
-	# out = arg3
-	mov	%rdx,%rdi
-	# bytes = arg4
-	mov	%rcx,%rdx
-	#               unsigned>? bytes - 0
-	cmp	$0,%rdx
-	# comment:fp stack unchanged by jump
-	# goto done if !unsigned>
-	jbe	._done
-	# comment:fp stack unchanged by fallthrough
-# start:
-._start:
-	# r11_stack = r11
-	movq	%r11,0(%rsp)
-	# r12_stack = r12
-	movq	%r12,8(%rsp)
-	# r13_stack = r13
-	movq	%r13,16(%rsp)
-	# r14_stack = r14
-	movq	%r14,24(%rsp)
-	# r15_stack = r15
-	movq	%r15,32(%rsp)
-	# rbx_stack = rbx
-	movq	%rbx,40(%rsp)
-	# rbp_stack = rbp
-	movq	%rbp,48(%rsp)
-	# in0 = *(uint64 *) (x + 0)
-	movq	0(%r8),%rcx
-	# in2 = *(uint64 *) (x + 8)
-	movq	8(%r8),%r9
-	# in4 = *(uint64 *) (x + 16)
-	movq	16(%r8),%rax
-	# in6 = *(uint64 *) (x + 24)
-	movq	24(%r8),%r10
-	# in8 = *(uint64 *) (x + 32)
-	movq	32(%r8),%r11
-	# in10 = *(uint64 *) (x + 40)
-	movq	40(%r8),%r12
-	# in12 = *(uint64 *) (x + 48)
-	movq	48(%r8),%r13
-	# in14 = *(uint64 *) (x + 56)
-	movq	56(%r8),%r14
-	# j0 = in0
-	movq	%rcx,56(%rsp)
-	# j2 = in2
-	movq	%r9,64(%rsp)
-	# j4 = in4
-	movq	%rax,72(%rsp)
-	# j6 = in6
-	movq	%r10,80(%rsp)
-	# j8 = in8
-	movq	%r11,88(%rsp)
-	# j10 = in10
-	movq	%r12,96(%rsp)
-	# j12 = in12
-	movq	%r13,104(%rsp)
-	# j14 = in14
-	movq	%r14,112(%rsp)
-	# x_backup = x
-	movq	%r8,120(%rsp)
-# bytesatleast1:
-._bytesatleast1:
-	#                   unsigned<? bytes - 64
-	cmp	$64,%rdx
-	# comment:fp stack unchanged by jump
-	#   goto nocopy if !unsigned<
-	jae	._nocopy
-	#     ctarget = out
-	movq	%rdi,128(%rsp)
-	#     out = &tmp
-	leaq	192(%rsp),%rdi
-	#     i = bytes
-	mov	%rdx,%rcx
-	#     while (i) { *out++ = *m++; --i }
-	rep	movsb
-	#     out = &tmp
-	leaq	192(%rsp),%rdi
-	#     m = &tmp
-	leaq	192(%rsp),%rsi
-	# comment:fp stack unchanged by fallthrough
-#   nocopy:
-._nocopy:
-	#   out_backup = out
-	movq	%rdi,136(%rsp)
-	#   m_backup = m
-	movq	%rsi,144(%rsp)
-	#   bytes_backup = bytes
-	movq	%rdx,152(%rsp)
-	#   x1 = j0
-	movq	56(%rsp),%rdi
-	#   x0 = x1
-	mov	%rdi,%rdx
-	#   (uint64) x1 >>= 32
-	shr	$32,%rdi
-	#   		x3 = j2
-	movq	64(%rsp),%rsi
-	#   		x2 = x3
-	mov	%rsi,%rcx
-	#   		(uint64) x3 >>= 32
-	shr	$32,%rsi
-	#   x5 = j4
-	movq	72(%rsp),%r8
-	#   x4 = x5
-	mov	%r8,%r9
-	#   (uint64) x5 >>= 32
-	shr	$32,%r8
-	#   x5_stack = x5
-	movq	%r8,160(%rsp)
-	#   		x7 = j6
-	movq	80(%rsp),%r8
-	#   		x6 = x7
-	mov	%r8,%rax
-	#   		(uint64) x7 >>= 32
-	shr	$32,%r8
-	#   x9 = j8
-	movq	88(%rsp),%r10
-	#   x8 = x9
-	mov	%r10,%r11
-	#   (uint64) x9 >>= 32
-	shr	$32,%r10
-	#   		x11 = j10
-	movq	96(%rsp),%r12
-	#   		x10 = x11
-	mov	%r12,%r13
-	#   		x10_stack = x10
-	movq	%r13,168(%rsp)
-	#   		(uint64) x11 >>= 32
-	shr	$32,%r12
-	#   x13 = j12
-	movq	104(%rsp),%r13
-	#   x12 = x13
-	mov	%r13,%r14
-	#   (uint64) x13 >>= 32
-	shr	$32,%r13
-	#   		x15 = j14
-	movq	112(%rsp),%r15
-	#   		x14 = x15
-	mov	%r15,%rbx
-	#   		(uint64) x15 >>= 32
-	shr	$32,%r15
-	#   		x15_stack = x15
-	movq	%r15,176(%rsp)
-	#   i = 20
-	mov	$20,%r15
-#   mainloop:
-._mainloop:
-	#   i_backup = i
-	movq	%r15,184(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x12 + x0
-	lea	(%r14,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x4 ^= a
-	xor	%rbp,%r9
-	# 		b = x1 + x5
-	lea	(%rdi,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x9 ^= b
-	xor	%rbp,%r10
-	# a = x0 + x4
-	lea	(%rdx,%r9),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x8 ^= a
-	xor	%rbp,%r11
-	# 		b = x5 + x9
-	lea	(%r15,%r10),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x13 ^= b
-	xor	%rbp,%r13
-	# a = x4 + x8
-	lea	(%r9,%r11),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x12 ^= a
-	xor	%rbp,%r14
-	# 		b = x9 + x13
-	lea	(%r10,%r13),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x1 ^= b
-	xor	%rbp,%rdi
-	# a = x8 + x12
-	lea	(%r11,%r14),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x13 + x1
-	lea	(%r13,%rdi),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x6 + x10
-	lea	(%rax,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x14 ^= c
-	xor	%r15,%rbx
-	# 				c = x10 + x14
-	lea	(%rbp,%rbx),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x2 ^= c
-	xor	%r15,%rcx
-	# 				c = x14 + x2
-	lea	(%rbx,%rcx),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x6 ^= c
-	xor	%r15,%rax
-	# 				c = x2 + x6
-	lea	(%rcx,%rax),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x11 + x15
-	lea	(%r12,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x3 ^= d
-	xor	%rbp,%rsi
-	# 						d = x15 + x3
-	lea	(%r15,%rsi),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x7 ^= d
-	xor	%rbp,%r8
-	# 						d = x3 + x7
-	lea	(%rsi,%r8),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x11 ^= d
-	xor	%rbp,%r12
-	# 						d = x7 + x11
-	lea	(%r8,%r12),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x3 + x0
-	lea	(%rsi,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x1 ^= a
-	xor	%rbp,%rdi
-	# 		b = x4 + x5
-	lea	(%r9,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x6 ^= b
-	xor	%rbp,%rax
-	# a = x0 + x1
-	lea	(%rdx,%rdi),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x2 ^= a
-	xor	%rbp,%rcx
-	# 		b = x5 + x6
-	lea	(%r15,%rax),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x7 ^= b
-	xor	%rbp,%r8
-	# a = x1 + x2
-	lea	(%rdi,%rcx),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x3 ^= a
-	xor	%rbp,%rsi
-	# 		b = x6 + x7
-	lea	(%rax,%r8),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x4 ^= b
-	xor	%rbp,%r9
-	# a = x2 + x3
-	lea	(%rcx,%rsi),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x7 + x4
-	lea	(%r8,%r9),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x9 + x10
-	lea	(%r10,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x11 ^= c
-	xor	%r15,%r12
-	# 				c = x10 + x11
-	lea	(%rbp,%r12),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x8 ^= c
-	xor	%r15,%r11
-	# 				c = x11 + x8
-	lea	(%r12,%r11),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x9 ^= c
-	xor	%r15,%r10
-	# 				c = x8 + x9
-	lea	(%r11,%r10),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x14 + x15
-	lea	(%rbx,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x12 ^= d
-	xor	%rbp,%r14
-	# 						d = x15 + x12
-	lea	(%r15,%r14),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x13 ^= d
-	xor	%rbp,%r13
-	# 						d = x12 + x13
-	lea	(%r14,%r13),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x14 ^= d
-	xor	%rbp,%rbx
-	# 						d = x13 + x14
-	lea	(%r13,%rbx),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x12 + x0
-	lea	(%r14,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x4 ^= a
-	xor	%rbp,%r9
-	# 		b = x1 + x5
-	lea	(%rdi,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x9 ^= b
-	xor	%rbp,%r10
-	# a = x0 + x4
-	lea	(%rdx,%r9),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x8 ^= a
-	xor	%rbp,%r11
-	# 		b = x5 + x9
-	lea	(%r15,%r10),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x13 ^= b
-	xor	%rbp,%r13
-	# a = x4 + x8
-	lea	(%r9,%r11),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x12 ^= a
-	xor	%rbp,%r14
-	# 		b = x9 + x13
-	lea	(%r10,%r13),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x1 ^= b
-	xor	%rbp,%rdi
-	# a = x8 + x12
-	lea	(%r11,%r14),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x13 + x1
-	lea	(%r13,%rdi),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x6 + x10
-	lea	(%rax,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x14 ^= c
-	xor	%r15,%rbx
-	# 				c = x10 + x14
-	lea	(%rbp,%rbx),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x2 ^= c
-	xor	%r15,%rcx
-	# 				c = x14 + x2
-	lea	(%rbx,%rcx),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x6 ^= c
-	xor	%r15,%rax
-	# 				c = x2 + x6
-	lea	(%rcx,%rax),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x11 + x15
-	lea	(%r12,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x3 ^= d
-	xor	%rbp,%rsi
-	# 						d = x15 + x3
-	lea	(%r15,%rsi),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x7 ^= d
-	xor	%rbp,%r8
-	# 						d = x3 + x7
-	lea	(%rsi,%r8),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x11 ^= d
-	xor	%rbp,%r12
-	# 						d = x7 + x11
-	lea	(%r8,%r12),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	# 		x5 = x5_stack
-	movq	160(%rsp),%r15
-	# a = x3 + x0
-	lea	(%rsi,%rdx),%rbp
-	# (uint32) a <<<= 7
-	rol	$7,%ebp
-	# x1 ^= a
-	xor	%rbp,%rdi
-	# 		b = x4 + x5
-	lea	(%r9,%r15),%rbp
-	# 		(uint32) b <<<= 7
-	rol	$7,%ebp
-	# 		x6 ^= b
-	xor	%rbp,%rax
-	# a = x0 + x1
-	lea	(%rdx,%rdi),%rbp
-	# (uint32) a <<<= 9
-	rol	$9,%ebp
-	# x2 ^= a
-	xor	%rbp,%rcx
-	# 		b = x5 + x6
-	lea	(%r15,%rax),%rbp
-	# 		(uint32) b <<<= 9
-	rol	$9,%ebp
-	# 		x7 ^= b
-	xor	%rbp,%r8
-	# a = x1 + x2
-	lea	(%rdi,%rcx),%rbp
-	# (uint32) a <<<= 13
-	rol	$13,%ebp
-	# x3 ^= a
-	xor	%rbp,%rsi
-	# 		b = x6 + x7
-	lea	(%rax,%r8),%rbp
-	# 		(uint32) b <<<= 13
-	rol	$13,%ebp
-	# 		x4 ^= b
-	xor	%rbp,%r9
-	# a = x2 + x3
-	lea	(%rcx,%rsi),%rbp
-	# (uint32) a <<<= 18
-	rol	$18,%ebp
-	# x0 ^= a
-	xor	%rbp,%rdx
-	# 		b = x7 + x4
-	lea	(%r8,%r9),%rbp
-	# 		(uint32) b <<<= 18
-	rol	$18,%ebp
-	# 		x5 ^= b
-	xor	%rbp,%r15
-	# 				x10 = x10_stack
-	movq	168(%rsp),%rbp
-	# 		x5_stack = x5
-	movq	%r15,160(%rsp)
-	# 				c = x9 + x10
-	lea	(%r10,%rbp),%r15
-	# 				(uint32) c <<<= 7
-	rol	$7,%r15d
-	# 				x11 ^= c
-	xor	%r15,%r12
-	# 				c = x10 + x11
-	lea	(%rbp,%r12),%r15
-	# 				(uint32) c <<<= 9
-	rol	$9,%r15d
-	# 				x8 ^= c
-	xor	%r15,%r11
-	# 				c = x11 + x8
-	lea	(%r12,%r11),%r15
-	# 				(uint32) c <<<= 13
-	rol	$13,%r15d
-	# 				x9 ^= c
-	xor	%r15,%r10
-	# 				c = x8 + x9
-	lea	(%r11,%r10),%r15
-	# 				(uint32) c <<<= 18
-	rol	$18,%r15d
-	# 				x10 ^= c
-	xor	%r15,%rbp
-	# 						x15 = x15_stack
-	movq	176(%rsp),%r15
-	# 				x10_stack = x10
-	movq	%rbp,168(%rsp)
-	# 						d = x14 + x15
-	lea	(%rbx,%r15),%rbp
-	# 						(uint32) d <<<= 7
-	rol	$7,%ebp
-	# 						x12 ^= d
-	xor	%rbp,%r14
-	# 						d = x15 + x12
-	lea	(%r15,%r14),%rbp
-	# 						(uint32) d <<<= 9
-	rol	$9,%ebp
-	# 						x13 ^= d
-	xor	%rbp,%r13
-	# 						d = x12 + x13
-	lea	(%r14,%r13),%rbp
-	# 						(uint32) d <<<= 13
-	rol	$13,%ebp
-	# 						x14 ^= d
-	xor	%rbp,%rbx
-	# 						d = x13 + x14
-	lea	(%r13,%rbx),%rbp
-	# 						(uint32) d <<<= 18
-	rol	$18,%ebp
-	# 						x15 ^= d
-	xor	%rbp,%r15
-	# 						x15_stack = x15
-	movq	%r15,176(%rsp)
-	#   i = i_backup
-	movq	184(%rsp),%r15
-	#                  unsigned>? i -= 4
-	sub	$4,%r15
-	# comment:fp stack unchanged by jump
-	# goto mainloop if unsigned>
-	ja	._mainloop
-	#   (uint32) x2 += j2
-	addl	64(%rsp),%ecx
-	#   x3 <<= 32
-	shl	$32,%rsi
-	#   x3 += j2
-	addq	64(%rsp),%rsi
-	#   (uint64) x3 >>= 32
-	shr	$32,%rsi
-	#   x3 <<= 32
-	shl	$32,%rsi
-	#   x2 += x3
-	add	%rsi,%rcx
-	#   (uint32) x6 += j6
-	addl	80(%rsp),%eax
-	#   x7 <<= 32
-	shl	$32,%r8
-	#   x7 += j6
-	addq	80(%rsp),%r8
-	#   (uint64) x7 >>= 32
-	shr	$32,%r8
-	#   x7 <<= 32
-	shl	$32,%r8
-	#   x6 += x7
-	add	%r8,%rax
-	#   (uint32) x8 += j8
-	addl	88(%rsp),%r11d
-	#   x9 <<= 32
-	shl	$32,%r10
-	#   x9 += j8
-	addq	88(%rsp),%r10
-	#   (uint64) x9 >>= 32
-	shr	$32,%r10
-	#   x9 <<= 32
-	shl	$32,%r10
-	#   x8 += x9
-	add	%r10,%r11
-	#   (uint32) x12 += j12
-	addl	104(%rsp),%r14d
-	#   x13 <<= 32
-	shl	$32,%r13
-	#   x13 += j12
-	addq	104(%rsp),%r13
-	#   (uint64) x13 >>= 32
-	shr	$32,%r13
-	#   x13 <<= 32
-	shl	$32,%r13
-	#   x12 += x13
-	add	%r13,%r14
-	#   (uint32) x0 += j0
-	addl	56(%rsp),%edx
-	#   x1 <<= 32
-	shl	$32,%rdi
-	#   x1 += j0
-	addq	56(%rsp),%rdi
-	#   (uint64) x1 >>= 32
-	shr	$32,%rdi
-	#   x1 <<= 32
-	shl	$32,%rdi
-	#   x0 += x1
-	add	%rdi,%rdx
-	#   x5 = x5_stack
-	movq	160(%rsp),%rdi
-	#   (uint32) x4 += j4
-	addl	72(%rsp),%r9d
-	#   x5 <<= 32
-	shl	$32,%rdi
-	#   x5 += j4
-	addq	72(%rsp),%rdi
-	#   (uint64) x5 >>= 32
-	shr	$32,%rdi
-	#   x5 <<= 32
-	shl	$32,%rdi
-	#   x4 += x5
-	add	%rdi,%r9
-	#   x10 = x10_stack
-	movq	168(%rsp),%r8
-	#   (uint32) x10 += j10
-	addl	96(%rsp),%r8d
-	#   x11 <<= 32
-	shl	$32,%r12
-	#   x11 += j10
-	addq	96(%rsp),%r12
-	#   (uint64) x11 >>= 32
-	shr	$32,%r12
-	#   x11 <<= 32
-	shl	$32,%r12
-	#   x10 += x11
-	add	%r12,%r8
-	#   x15 = x15_stack
-	movq	176(%rsp),%rdi
-	#   (uint32) x14 += j14
-	addl	112(%rsp),%ebx
-	#   x15 <<= 32
-	shl	$32,%rdi
-	#   x15 += j14
-	addq	112(%rsp),%rdi
-	#   (uint64) x15 >>= 32
-	shr	$32,%rdi
-	#   x15 <<= 32
-	shl	$32,%rdi
-	#   x14 += x15
-	add	%rdi,%rbx
-	#   out = out_backup
-	movq	136(%rsp),%rdi
-	#   m = m_backup
-	movq	144(%rsp),%rsi
-	#   x0 ^= *(uint64 *) (m + 0)
-	xorq	0(%rsi),%rdx
-	#   *(uint64 *) (out + 0) = x0
-	movq	%rdx,0(%rdi)
-	#   x2 ^= *(uint64 *) (m + 8)
-	xorq	8(%rsi),%rcx
-	#   *(uint64 *) (out + 8) = x2
-	movq	%rcx,8(%rdi)
-	#   x4 ^= *(uint64 *) (m + 16)
-	xorq	16(%rsi),%r9
-	#   *(uint64 *) (out + 16) = x4
-	movq	%r9,16(%rdi)
-	#   x6 ^= *(uint64 *) (m + 24)
-	xorq	24(%rsi),%rax
-	#   *(uint64 *) (out + 24) = x6
-	movq	%rax,24(%rdi)
-	#   x8 ^= *(uint64 *) (m + 32)
-	xorq	32(%rsi),%r11
-	#   *(uint64 *) (out + 32) = x8
-	movq	%r11,32(%rdi)
-	#   x10 ^= *(uint64 *) (m + 40)
-	xorq	40(%rsi),%r8
-	#   *(uint64 *) (out + 40) = x10
-	movq	%r8,40(%rdi)
-	#   x12 ^= *(uint64 *) (m + 48)
-	xorq	48(%rsi),%r14
-	#   *(uint64 *) (out + 48) = x12
-	movq	%r14,48(%rdi)
-	#   x14 ^= *(uint64 *) (m + 56)
-	xorq	56(%rsi),%rbx
-	#   *(uint64 *) (out + 56) = x14
-	movq	%rbx,56(%rdi)
-	#   bytes = bytes_backup
-	movq	152(%rsp),%rdx
-	#   in8 = j8
-	movq	88(%rsp),%rcx
-	#   in8 += 1
-	add	$1,%rcx
-	#   j8 = in8
-	movq	%rcx,88(%rsp)
-	#                          unsigned>? unsigned<? bytes - 64
-	cmp	$64,%rdx
-	# comment:fp stack unchanged by jump
-	#   goto bytesatleast65 if unsigned>
-	ja	._bytesatleast65
-	# comment:fp stack unchanged by jump
-	#     goto bytesatleast64 if !unsigned<
-	jae	._bytesatleast64
-	#       m = out
-	mov	%rdi,%rsi
-	#       out = ctarget
-	movq	128(%rsp),%rdi
-	#       i = bytes
-	mov	%rdx,%rcx
-	#       while (i) { *out++ = *m++; --i }
-	rep	movsb
-	# comment:fp stack unchanged by fallthrough
-#     bytesatleast64:
-._bytesatleast64:
-	#     x = x_backup
-	movq	120(%rsp),%rdi
-	#     in8 = j8
-	movq	88(%rsp),%rsi
-	#     *(uint64 *) (x + 32) = in8
-	movq	%rsi,32(%rdi)
-	#     r11 = r11_stack
-	movq	0(%rsp),%r11
-	#     r12 = r12_stack
-	movq	8(%rsp),%r12
-	#     r13 = r13_stack
-	movq	16(%rsp),%r13
-	#     r14 = r14_stack
-	movq	24(%rsp),%r14
-	#     r15 = r15_stack
-	movq	32(%rsp),%r15
-	#     rbx = rbx_stack
-	movq	40(%rsp),%rbx
-	#     rbp = rbp_stack
-	movq	48(%rsp),%rbp
-	# comment:fp stack unchanged by fallthrough
-#     done:
-._done:
-	#     leave
-	add	%r11,%rsp
-	mov	%rdi,%rax
-	mov	%rsi,%rdx
-	ret
-#   bytesatleast65:
-._bytesatleast65:
-	#   bytes -= 64
-	sub	$64,%rdx
-	#   out += 64
-	add	$64,%rdi
-	#   m += 64
-	add	$64,%rsi
-	# comment:fp stack unchanged by jump
-	# goto bytesatleast1
-	jmp	._bytesatleast1
-ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
-	mov	%rsp,%r11
-	and	$31,%r11
-	add	$256,%r11
-	sub	%r11,%rsp
-	#   k = arg2
-	mov	%rsi,%rsi
-	#   kbits = arg3
-	mov	%rdx,%rdx
-	#   x = arg1
-	mov	%rdi,%rdi
-	#   in0 = *(uint64 *) (k + 0)
-	movq	0(%rsi),%r8
-	#   in2 = *(uint64 *) (k + 8)
-	movq	8(%rsi),%r9
-	#   *(uint64 *) (x + 4) = in0
-	movq	%r8,4(%rdi)
-	#   *(uint64 *) (x + 12) = in2
-	movq	%r9,12(%rdi)
-	#                    unsigned<? kbits - 256
-	cmp	$256,%rdx
-	# comment:fp stack unchanged by jump
-	#   goto kbits128 if unsigned<
-	jb	._kbits128
-#   kbits256:
-._kbits256:
-	#     in10 = *(uint64 *) (k + 16)
-	movq	16(%rsi),%rdx
-	#     in12 = *(uint64 *) (k + 24)
-	movq	24(%rsi),%rsi
-	#     *(uint64 *) (x + 44) = in10
-	movq	%rdx,44(%rdi)
-	#     *(uint64 *) (x + 52) = in12
-	movq	%rsi,52(%rdi)
-	#     in0 = 1634760805
-	mov	$1634760805,%rsi
-	#     in4 = 857760878
-	mov	$857760878,%rdx
-	#     in10 = 2036477234
-	mov	$2036477234,%rcx
-	#     in14 = 1797285236
-	mov	$1797285236,%r8
-	#     *(uint32 *) (x + 0) = in0
-	movl	%esi,0(%rdi)
-	#     *(uint32 *) (x + 20) = in4
-	movl	%edx,20(%rdi)
-	#     *(uint32 *) (x + 40) = in10
-	movl	%ecx,40(%rdi)
-	#     *(uint32 *) (x + 60) = in14
-	movl	%r8d,60(%rdi)
-	# comment:fp stack unchanged by jump
-	#   goto keysetupdone
-	jmp	._keysetupdone
-#   kbits128:
-._kbits128:
-	#     in10 = *(uint64 *) (k + 0)
-	movq	0(%rsi),%rdx
-	#     in12 = *(uint64 *) (k + 8)
-	movq	8(%rsi),%rsi
-	#     *(uint64 *) (x + 44) = in10
-	movq	%rdx,44(%rdi)
-	#     *(uint64 *) (x + 52) = in12
-	movq	%rsi,52(%rdi)
-	#     in0 = 1634760805
-	mov	$1634760805,%rsi
-	#     in4 = 824206446
-	mov	$824206446,%rdx
-	#     in10 = 2036477238
-	mov	$2036477238,%rcx
-	#     in14 = 1797285236
-	mov	$1797285236,%r8
-	#     *(uint32 *) (x + 0) = in0
-	movl	%esi,0(%rdi)
-	#     *(uint32 *) (x + 20) = in4
-	movl	%edx,20(%rdi)
-	#     *(uint32 *) (x + 40) = in10
-	movl	%ecx,40(%rdi)
-	#     *(uint32 *) (x + 60) = in14
-	movl	%r8d,60(%rdi)
-#   keysetupdone:
-._keysetupdone:
-	# leave
-	add	%r11,%rsp
-	mov	%rdi,%rax
-	mov	%rsi,%rdx
-	ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
-	mov	%rsp,%r11
-	and	$31,%r11
-	add	$256,%r11
-	sub	%r11,%rsp
-	#   iv = arg2
-	mov	%rsi,%rsi
-	#   x = arg1
-	mov	%rdi,%rdi
-	#   in6 = *(uint64 *) (iv + 0)
-	movq	0(%rsi),%rsi
-	#   in8 = 0
-	mov	$0,%r8
-	#   *(uint64 *) (x + 24) = in6
-	movq	%rsi,24(%rdi)
-	#   *(uint64 *) (x + 32) = in8
-	movq	%r8,32(%rdi)
-	# leave
-	add	%r11,%rsp
-	mov	%rdi,%rax
-	mov	%rsi,%rdx
-	ret
-ENDPROC(salsa20_ivsetup)
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@ -1,116 +0,0 @@
-/*
- * Glue code for optimized assembly version of  Salsa20.
- *
- * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
- *
- * The assembly codes are public domain assembly codes written by Daniel. J.
- * Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
- * and to remove extraneous comments and functions that are not needed.
- * - i586 version, renamed as salsa20-i586-asm_32.S
- *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
- * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
- *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/algapi.h>
-#include <linux/module.h>
-#include <linux/crypto.h>
-
-#define SALSA20_IV_SIZE        8U
-#define SALSA20_MIN_KEY_SIZE  16U
-#define SALSA20_MAX_KEY_SIZE  32U
-
-struct salsa20_ctx
-{
-	u32 input[16];
-};
-
-asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
-				 u32 keysize, u32 ivsize);
-asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
-asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
-				      const u8 *src, u8 *dst, u32 bytes);
-
-static int setkey(struct crypto_tfm *tfm, const u8 *key,
-		  unsigned int keysize)
-{
-	struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
-	salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
-	return 0;
-}
-
-static int encrypt(struct blkcipher_desc *desc,
-		   struct scatterlist *dst, struct scatterlist *src,
-		   unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	struct crypto_blkcipher *tfm = desc->tfm;
-	struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
-	int err;
-
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, 64);
-
-	salsa20_ivsetup(ctx, walk.iv);
-
-	while (walk.nbytes >= 64) {
-		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-				      walk.dst.virt.addr,
-				      walk.nbytes - (walk.nbytes % 64));
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
-	}
-
-	if (walk.nbytes) {
-		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-				      walk.dst.virt.addr, walk.nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
-	}
-
-	return err;
-}
-
-static struct crypto_alg alg = {
-	.cra_name           =   "salsa20",
-	.cra_driver_name    =   "salsa20-asm",
-	.cra_priority       =   200,
-	.cra_flags          =   CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_type           =   &crypto_blkcipher_type,
-	.cra_blocksize      =   1,
-	.cra_ctxsize        =   sizeof(struct salsa20_ctx),
-	.cra_alignmask      =	3,
-	.cra_module         =   THIS_MODULE,
-	.cra_u              =   {
-		.blkcipher = {
-			.setkey         =   setkey,
-			.encrypt        =   encrypt,
-			.decrypt        =   encrypt,
-			.min_keysize    =   SALSA20_MIN_KEY_SIZE,
-			.max_keysize    =   SALSA20_MAX_KEY_SIZE,
-			.ivsize         =   SALSA20_IV_SIZE,
-		}
-	}
-};
-
-static int __init init(void)
-{
-	return crypto_register_alg(&alg);
-}
-
-static void __exit fini(void)
-{
-	crypto_unregister_alg(&alg);
-}
-
-module_init(init);
-module_exit(fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
-MODULE_ALIAS_CRYPTO("salsa20");
-MODULE_ALIAS_CRYPTO("salsa20-asm");
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@ -1324,32 +1324,6 @@ config CRYPTO_SALSA20
 	  The Salsa20 stream cipher algorithm is designed by Daniel J.
 	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>

-config CRYPTO_SALSA20_586
-	tristate "Salsa20 stream cipher algorithm (i586)"
-	depends on (X86 || UML_X86) && !64BIT
-	select CRYPTO_BLKCIPHER
-	help
-	  Salsa20 stream cipher algorithm.
-
-	  Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
-	  Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
-	  The Salsa20 stream cipher algorithm is designed by Daniel J.
-	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
-
-config CRYPTO_SALSA20_X86_64
-	tristate "Salsa20 stream cipher algorithm (x86_64)"
-	depends on (X86 || UML_X86) && 64BIT
-	select CRYPTO_BLKCIPHER
-	help
-	  Salsa20 stream cipher algorithm.
-
-	  Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
-	  Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
-
-	  The Salsa20 stream cipher algorithm is designed by Daniel J.
-	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
-
 config CRYPTO_CHACHA20
 	tristate "ChaCha20 cipher algorithm"
 	select CRYPTO_BLKCIPHER