1
0
Fork 0

crypto: x86/salsa20 - remove x86 salsa20 implementations

commit b7b73cd5d7 upstream.

The x86 assembly implementations of Salsa20 use the frame base pointer
register (%ebp or %rbp), which breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.
Recent (v4.10+) kernels will warn about this, e.g.

WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c
[...]

But after looking into it, I believe there's very little reason to still
retain the x86 Salsa20 code.  First, these are *not* vectorized
(SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere
close to the best Salsa20 performance on any remotely modern x86
processor; they're just regular x86 assembly.  Second, it's still
unclear that anyone is actually using the kernel's Salsa20 at all,
especially given that now ChaCha20 is supported too, and with much more
efficient SSSE3 and AVX2 implementations.  Finally, in benchmarks I did
on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the
x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic
(~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm
is only slightly faster than salsa20-generic (~15% faster on Skylake,
~20% faster on Zen).  The gcc version made little difference.

So, the x86_64 salsa20-asm is pretty clearly useless.  That leaves just
the i686 salsa20-asm, which based on my tests provides a 15-20% speed
boost.  But that's without updating the code to not use %ebp.  And given
the maintenance cost, the small speed difference vs. salsa20-generic,
the fact that few people still use i686 kernels, the doubt that anyone
is even using the kernel's Salsa20 at all, and the fact that a SSE2
implementation would almost certainly be much faster on any remotely
modern x86 processor yet no one has cared enough to add one yet, I don't
think it's worthwhile to keep.

Thus, just remove both the x86_64 and i686 salsa20-asm implementations.

Reported-by: syzbot+ffa3a158337bbc01ff09@syzkaller.appspotmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
pull/10/head
Eric Biggers 2018-05-26 00:08:58 -07:00 committed by Greg Kroah-Hartman
parent 2a017ea2ea
commit 19f39eff68
5 changed files with 0 additions and 2179 deletions

View File

@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@ -59,7 +57,6 @@ endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@ -68,7 +65,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

File diff suppressed because it is too large Load Diff

View File

@ -1,919 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# x = arg1
mov %rdi,%r8
# m = arg2
mov %rsi,%rsi
# out = arg3
mov %rdx,%rdi
# bytes = arg4
mov %rcx,%rdx
# unsigned>? bytes - 0
cmp $0,%rdx
# comment:fp stack unchanged by jump
# goto done if !unsigned>
jbe ._done
# comment:fp stack unchanged by fallthrough
# start:
._start:
# r11_stack = r11
movq %r11,0(%rsp)
# r12_stack = r12
movq %r12,8(%rsp)
# r13_stack = r13
movq %r13,16(%rsp)
# r14_stack = r14
movq %r14,24(%rsp)
# r15_stack = r15
movq %r15,32(%rsp)
# rbx_stack = rbx
movq %rbx,40(%rsp)
# rbp_stack = rbp
movq %rbp,48(%rsp)
# in0 = *(uint64 *) (x + 0)
movq 0(%r8),%rcx
# in2 = *(uint64 *) (x + 8)
movq 8(%r8),%r9
# in4 = *(uint64 *) (x + 16)
movq 16(%r8),%rax
# in6 = *(uint64 *) (x + 24)
movq 24(%r8),%r10
# in8 = *(uint64 *) (x + 32)
movq 32(%r8),%r11
# in10 = *(uint64 *) (x + 40)
movq 40(%r8),%r12
# in12 = *(uint64 *) (x + 48)
movq 48(%r8),%r13
# in14 = *(uint64 *) (x + 56)
movq 56(%r8),%r14
# j0 = in0
movq %rcx,56(%rsp)
# j2 = in2
movq %r9,64(%rsp)
# j4 = in4
movq %rax,72(%rsp)
# j6 = in6
movq %r10,80(%rsp)
# j8 = in8
movq %r11,88(%rsp)
# j10 = in10
movq %r12,96(%rsp)
# j12 = in12
movq %r13,104(%rsp)
# j14 = in14
movq %r14,112(%rsp)
# x_backup = x
movq %r8,120(%rsp)
# bytesatleast1:
._bytesatleast1:
# unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto nocopy if !unsigned<
jae ._nocopy
# ctarget = out
movq %rdi,128(%rsp)
# out = &tmp
leaq 192(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# out = &tmp
leaq 192(%rsp),%rdi
# m = &tmp
leaq 192(%rsp),%rsi
# comment:fp stack unchanged by fallthrough
# nocopy:
._nocopy:
# out_backup = out
movq %rdi,136(%rsp)
# m_backup = m
movq %rsi,144(%rsp)
# bytes_backup = bytes
movq %rdx,152(%rsp)
# x1 = j0
movq 56(%rsp),%rdi
# x0 = x1
mov %rdi,%rdx
# (uint64) x1 >>= 32
shr $32,%rdi
# x3 = j2
movq 64(%rsp),%rsi
# x2 = x3
mov %rsi,%rcx
# (uint64) x3 >>= 32
shr $32,%rsi
# x5 = j4
movq 72(%rsp),%r8
# x4 = x5
mov %r8,%r9
# (uint64) x5 >>= 32
shr $32,%r8
# x5_stack = x5
movq %r8,160(%rsp)
# x7 = j6
movq 80(%rsp),%r8
# x6 = x7
mov %r8,%rax
# (uint64) x7 >>= 32
shr $32,%r8
# x9 = j8
movq 88(%rsp),%r10
# x8 = x9
mov %r10,%r11
# (uint64) x9 >>= 32
shr $32,%r10
# x11 = j10
movq 96(%rsp),%r12
# x10 = x11
mov %r12,%r13
# x10_stack = x10
movq %r13,168(%rsp)
# (uint64) x11 >>= 32
shr $32,%r12
# x13 = j12
movq 104(%rsp),%r13
# x12 = x13
mov %r13,%r14
# (uint64) x13 >>= 32
shr $32,%r13
# x15 = j14
movq 112(%rsp),%r15
# x14 = x15
mov %r15,%rbx
# (uint64) x15 >>= 32
shr $32,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = 20
mov $20,%r15
# mainloop:
._mainloop:
# i_backup = i
movq %r15,184(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = i_backup
movq 184(%rsp),%r15
# unsigned>? i -= 4
sub $4,%r15
# comment:fp stack unchanged by jump
# goto mainloop if unsigned>
ja ._mainloop
# (uint32) x2 += j2
addl 64(%rsp),%ecx
# x3 <<= 32
shl $32,%rsi
# x3 += j2
addq 64(%rsp),%rsi
# (uint64) x3 >>= 32
shr $32,%rsi
# x3 <<= 32
shl $32,%rsi
# x2 += x3
add %rsi,%rcx
# (uint32) x6 += j6
addl 80(%rsp),%eax
# x7 <<= 32
shl $32,%r8
# x7 += j6
addq 80(%rsp),%r8
# (uint64) x7 >>= 32
shr $32,%r8
# x7 <<= 32
shl $32,%r8
# x6 += x7
add %r8,%rax
# (uint32) x8 += j8
addl 88(%rsp),%r11d
# x9 <<= 32
shl $32,%r10
# x9 += j8
addq 88(%rsp),%r10
# (uint64) x9 >>= 32
shr $32,%r10
# x9 <<= 32
shl $32,%r10
# x8 += x9
add %r10,%r11
# (uint32) x12 += j12
addl 104(%rsp),%r14d
# x13 <<= 32
shl $32,%r13
# x13 += j12
addq 104(%rsp),%r13
# (uint64) x13 >>= 32
shr $32,%r13
# x13 <<= 32
shl $32,%r13
# x12 += x13
add %r13,%r14
# (uint32) x0 += j0
addl 56(%rsp),%edx
# x1 <<= 32
shl $32,%rdi
# x1 += j0
addq 56(%rsp),%rdi
# (uint64) x1 >>= 32
shr $32,%rdi
# x1 <<= 32
shl $32,%rdi
# x0 += x1
add %rdi,%rdx
# x5 = x5_stack
movq 160(%rsp),%rdi
# (uint32) x4 += j4
addl 72(%rsp),%r9d
# x5 <<= 32
shl $32,%rdi
# x5 += j4
addq 72(%rsp),%rdi
# (uint64) x5 >>= 32
shr $32,%rdi
# x5 <<= 32
shl $32,%rdi
# x4 += x5
add %rdi,%r9
# x10 = x10_stack
movq 168(%rsp),%r8
# (uint32) x10 += j10
addl 96(%rsp),%r8d
# x11 <<= 32
shl $32,%r12
# x11 += j10
addq 96(%rsp),%r12
# (uint64) x11 >>= 32
shr $32,%r12
# x11 <<= 32
shl $32,%r12
# x10 += x11
add %r12,%r8
# x15 = x15_stack
movq 176(%rsp),%rdi
# (uint32) x14 += j14
addl 112(%rsp),%ebx
# x15 <<= 32
shl $32,%rdi
# x15 += j14
addq 112(%rsp),%rdi
# (uint64) x15 >>= 32
shr $32,%rdi
# x15 <<= 32
shl $32,%rdi
# x14 += x15
add %rdi,%rbx
# out = out_backup
movq 136(%rsp),%rdi
# m = m_backup
movq 144(%rsp),%rsi
# x0 ^= *(uint64 *) (m + 0)
xorq 0(%rsi),%rdx
# *(uint64 *) (out + 0) = x0
movq %rdx,0(%rdi)
# x2 ^= *(uint64 *) (m + 8)
xorq 8(%rsi),%rcx
# *(uint64 *) (out + 8) = x2
movq %rcx,8(%rdi)
# x4 ^= *(uint64 *) (m + 16)
xorq 16(%rsi),%r9
# *(uint64 *) (out + 16) = x4
movq %r9,16(%rdi)
# x6 ^= *(uint64 *) (m + 24)
xorq 24(%rsi),%rax
# *(uint64 *) (out + 24) = x6
movq %rax,24(%rdi)
# x8 ^= *(uint64 *) (m + 32)
xorq 32(%rsi),%r11
# *(uint64 *) (out + 32) = x8
movq %r11,32(%rdi)
# x10 ^= *(uint64 *) (m + 40)
xorq 40(%rsi),%r8
# *(uint64 *) (out + 40) = x10
movq %r8,40(%rdi)
# x12 ^= *(uint64 *) (m + 48)
xorq 48(%rsi),%r14
# *(uint64 *) (out + 48) = x12
movq %r14,48(%rdi)
# x14 ^= *(uint64 *) (m + 56)
xorq 56(%rsi),%rbx
# *(uint64 *) (out + 56) = x14
movq %rbx,56(%rdi)
# bytes = bytes_backup
movq 152(%rsp),%rdx
# in8 = j8
movq 88(%rsp),%rcx
# in8 += 1
add $1,%rcx
# j8 = in8
movq %rcx,88(%rsp)
# unsigned>? unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto bytesatleast65 if unsigned>
ja ._bytesatleast65
# comment:fp stack unchanged by jump
# goto bytesatleast64 if !unsigned<
jae ._bytesatleast64
# m = out
mov %rdi,%rsi
# out = ctarget
movq 128(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# comment:fp stack unchanged by fallthrough
# bytesatleast64:
._bytesatleast64:
# x = x_backup
movq 120(%rsp),%rdi
# in8 = j8
movq 88(%rsp),%rsi
# *(uint64 *) (x + 32) = in8
movq %rsi,32(%rdi)
# r11 = r11_stack
movq 0(%rsp),%r11
# r12 = r12_stack
movq 8(%rsp),%r12
# r13 = r13_stack
movq 16(%rsp),%r13
# r14 = r14_stack
movq 24(%rsp),%r14
# r15 = r15_stack
movq 32(%rsp),%r15
# rbx = rbx_stack
movq 40(%rsp),%rbx
# rbp = rbp_stack
movq 48(%rsp),%rbp
# comment:fp stack unchanged by fallthrough
# done:
._done:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
# bytesatleast65:
._bytesatleast65:
# bytes -= 64
sub $64,%rdx
# out += 64
add $64,%rdi
# m += 64
add $64,%rsi
# comment:fp stack unchanged by jump
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)
# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# k = arg2
mov %rsi,%rsi
# kbits = arg3
mov %rdx,%rdx
# x = arg1
mov %rdi,%rdi
# in0 = *(uint64 *) (k + 0)
movq 0(%rsi),%r8
# in2 = *(uint64 *) (k + 8)
movq 8(%rsi),%r9
# *(uint64 *) (x + 4) = in0
movq %r8,4(%rdi)
# *(uint64 *) (x + 12) = in2
movq %r9,12(%rdi)
# unsigned<? kbits - 256
cmp $256,%rdx
# comment:fp stack unchanged by jump
# goto kbits128 if unsigned<
jb ._kbits128
# kbits256:
._kbits256:
# in10 = *(uint64 *) (k + 16)
movq 16(%rsi),%rdx
# in12 = *(uint64 *) (k + 24)
movq 24(%rsi),%rsi
# *(uint64 *) (x + 44) = in10
movq %rdx,44(%rdi)
# *(uint64 *) (x + 52) = in12
movq %rsi,52(%rdi)
# in0 = 1634760805
mov $1634760805,%rsi
# in4 = 857760878
mov $857760878,%rdx
# in10 = 2036477234
mov $2036477234,%rcx
# in14 = 1797285236
mov $1797285236,%r8
# *(uint32 *) (x + 0) = in0
movl %esi,0(%rdi)
# *(uint32 *) (x + 20) = in4
movl %edx,20(%rdi)
# *(uint32 *) (x + 40) = in10
movl %ecx,40(%rdi)
# *(uint32 *) (x + 60) = in14
movl %r8d,60(%rdi)
# comment:fp stack unchanged by jump
# goto keysetupdone
jmp ._keysetupdone
# kbits128:
._kbits128:
# in10 = *(uint64 *) (k + 0)
movq 0(%rsi),%rdx
# in12 = *(uint64 *) (k + 8)
movq 8(%rsi),%rsi
# *(uint64 *) (x + 44) = in10
movq %rdx,44(%rdi)
# *(uint64 *) (x + 52) = in12
movq %rsi,52(%rdi)
# in0 = 1634760805
mov $1634760805,%rsi
# in4 = 824206446
mov $824206446,%rdx
# in10 = 2036477238
mov $2036477238,%rcx
# in14 = 1797285236
mov $1797285236,%r8
# *(uint32 *) (x + 0) = in0
movl %esi,0(%rdi)
# *(uint32 *) (x + 20) = in4
movl %edx,20(%rdi)
# *(uint32 *) (x + 40) = in10
movl %ecx,40(%rdi)
# *(uint32 *) (x + 60) = in14
movl %r8d,60(%rdi)
# keysetupdone:
._keysetupdone:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
ENDPROC(salsa20_keysetup)
# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# iv = arg2
mov %rsi,%rsi
# x = arg1
mov %rdi,%rdi
# in6 = *(uint64 *) (iv + 0)
movq 0(%rsi),%rsi
# in8 = 0
mov $0,%r8
# *(uint64 *) (x + 24) = in6
movq %rsi,24(%rdi)
# *(uint64 *) (x + 32) = in8
movq %r8,32(%rdi)
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
ENDPROC(salsa20_ivsetup)

View File

@ -1,116 +0,0 @@
/*
* Glue code for optimized assembly version of Salsa20.
*
* Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
*
* The assembly codes are public domain assembly codes written by Daniel. J.
* Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
* and to remove extraneous comments and functions that are not needed.
* - i586 version, renamed as salsa20-i586-asm_32.S
* available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
* - x86-64 version, renamed as salsa20-x86_64-asm_64.S
* available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include <crypto/algapi.h>
#include <linux/module.h>
#include <linux/crypto.h>
#define SALSA20_IV_SIZE 8U
#define SALSA20_MIN_KEY_SIZE 16U
#define SALSA20_MAX_KEY_SIZE 32U
struct salsa20_ctx
{
u32 input[16];
};
asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
u32 keysize, u32 ivsize);
asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
const u8 *src, u8 *dst, u32 bytes);
static int setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keysize)
{
struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
return 0;
}
static int encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
struct blkcipher_walk walk;
struct crypto_blkcipher *tfm = desc->tfm;
struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
int err;
blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt_block(desc, &walk, 64);
salsa20_ivsetup(ctx, walk.iv);
while (walk.nbytes >= 64) {
salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
walk.dst.virt.addr,
walk.nbytes - (walk.nbytes % 64));
err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
}
if (walk.nbytes) {
salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
walk.dst.virt.addr, walk.nbytes);
err = blkcipher_walk_done(desc, &walk, 0);
}
return err;
}
static struct crypto_alg alg = {
.cra_name = "salsa20",
.cra_driver_name = "salsa20-asm",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
.cra_type = &crypto_blkcipher_type,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct salsa20_ctx),
.cra_alignmask = 3,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
.setkey = setkey,
.encrypt = encrypt,
.decrypt = encrypt,
.min_keysize = SALSA20_MIN_KEY_SIZE,
.max_keysize = SALSA20_MAX_KEY_SIZE,
.ivsize = SALSA20_IV_SIZE,
}
}
};
static int __init init(void)
{
return crypto_register_alg(&alg);
}
static void __exit fini(void)
{
crypto_unregister_alg(&alg);
}
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
MODULE_ALIAS_CRYPTO("salsa20");
MODULE_ALIAS_CRYPTO("salsa20-asm");

View File

@ -1324,32 +1324,6 @@ config CRYPTO_SALSA20
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
config CRYPTO_SALSA20_586
tristate "Salsa20 stream cipher algorithm (i586)"
depends on (X86 || UML_X86) && !64BIT
select CRYPTO_BLKCIPHER
help
Salsa20 stream cipher algorithm.
Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
config CRYPTO_SALSA20_X86_64
tristate "Salsa20 stream cipher algorithm (x86_64)"
depends on (X86 || UML_X86) && 64BIT
select CRYPTO_BLKCIPHER
help
Salsa20 stream cipher algorithm.
Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
The Salsa20 stream cipher algorithm is designed by Daniel J.
Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
config CRYPTO_CHACHA20
tristate "ChaCha20 cipher algorithm"
select CRYPTO_BLKCIPHER